From 5a671f685921b5cc02ced87a410645e8ad1b5c98 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 13:55:13 -0400 Subject: [PATCH 01/46] [KernelInfo] Implement new LLVM IR pass for GPU code analysis This patch implements an LLVM IR pass, named kernel-info, that reports various statistics for codes compiled for GPUs. The ultimate goal of these statistics is to help identify bad code patterns and ways to mitigate them. The pass operates at the LLVM IR level so that it can, in theory, support any LLVM-based compiler for programming languages supporting GPUs. It has been tested so far with LLVM IR generated by Clang for OpenMP offload codes targeting NVIDIA GPUs and AMD GPUs. By default, the pass is disabled. For convenience, `-kernel-info-end-lto` inserts it at the end of LTO, and options like `-Rpass=kernel-info` enable its remarks. Example opt and clang command lines appear in comments in `llvm/include/llvm/Analysis/KernelInfo.h`. Remarks include summary statistics (e.g., total size of static allocas) and individual occurrences (e.g., source location of each alloca). Examples of its output appear in tests in `llvm/test/Analysis/KernelInfo`. 
--- llvm/include/llvm/Analysis/KernelInfo.h | 148 ++++ llvm/include/llvm/Target/TargetMachine.h | 3 + llvm/lib/Analysis/CMakeLists.txt | 1 + llvm/lib/Analysis/KernelInfo.cpp | 350 ++++++++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 2 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 + llvm/lib/Target/TargetMachine.cpp | 5 + llvm/test/Analysis/KernelInfo/addrspace0.ll | 152 ++++ llvm/test/Analysis/KernelInfo/allocas.ll | 78 ++ llvm/test/Analysis/KernelInfo/calls.ll | 112 +++ .../kernel-info-after-lto/amdgpu.ll | 47 + .../KernelInfo/kernel-info-after-lto/nvptx.ll | 47 + .../KernelInfo/launch-bounds/amdgpu.ll | 40 + .../KernelInfo/launch-bounds/nvptx.ll | 36 + llvm/test/Analysis/KernelInfo/linkage.ll | 51 ++ .../test/Analysis/KernelInfo/openmp/README.md | 40 + .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 217 +++++ llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 811 ++++++++++++++++++ 20 files changed, 2161 insertions(+) create mode 100644 llvm/include/llvm/Analysis/KernelInfo.h create mode 100644 llvm/lib/Analysis/KernelInfo.cpp create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0.ll create mode 100644 llvm/test/Analysis/KernelInfo/allocas.ll create mode 100644 llvm/test/Analysis/KernelInfo/calls.ll create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll create mode 100644 llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll create mode 100644 llvm/test/Analysis/KernelInfo/linkage.ll create mode 100644 llvm/test/Analysis/KernelInfo/openmp/README.md create mode 100644 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h new file mode 100644 index 
0000000000000..5495bb2fd4d92 --- /dev/null +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -0,0 +1,148 @@ +//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter +// classes used to extract function properties from a GPU kernel. +// +// To analyze a C program as it appears to an LLVM GPU backend at the end of +// LTO: +// +// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ +// -Rpass=kernel-info -mllvm -kernel-info-end-lto +// +// To analyze specified LLVM IR, perhaps previously generated by something like +// 'clang -save-temps -g -fopenmp --offload-arch=native test.c': +// +// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ +// -pass-remarks=kernel-info -passes=kernel-info +// +// kernel-info can also be inserted into a specified LLVM pass pipeline using +// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline: +// +// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ +// -Rpass=kernel-info -mllvm -kernel-info-end-lto \ +// -Xoffload-linker --lto-newpm-passes='lto' +// +// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ +// -Rpass=kernel-info \ +// -Xoffload-linker --lto-newpm-passes='lto,module(kernel-info)' +// +// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ +// -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto' +// +// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ +// -pass-remarks=kernel-info -passes='lto,module(kernel-info)' +// ===---------------------------------------------------------------------===// + 
+#ifndef LLVM_ANALYSIS_KERNELINFO_H +#define LLVM_ANALYSIS_KERNELINFO_H + +#include "llvm/Analysis/OptimizationRemarkEmitter.h" + +namespace llvm { +class DominatorTree; +class Function; + +/// Data structure holding function info for kernels. +class KernelInfo { + void updateForBB(const BasicBlock &BB, int64_t Direction, + OptimizationRemarkEmitter &ORE); + +public: + static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); + + bool operator==(const KernelInfo &FPI) const { + return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0; + } + + bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); } + + /// If false, nothing was recorded here because the supplied function didn't + /// appear in a module compiled for a GPU. + bool IsValid = false; + + /// Whether the function has external linkage and is not a kernel function. + bool ExternalNotKernel = false; + + /// OpenMP Launch bounds. + ///@{ + std::optional OmpTargetNumTeams; + std::optional OmpTargetThreadLimit; + ///@} + + /// AMDGPU launch bounds. + ///@{ + std::optional AmdgpuMaxNumWorkgroupsX; + std::optional AmdgpuMaxNumWorkgroupsY; + std::optional AmdgpuMaxNumWorkgroupsZ; + std::optional AmdgpuFlatWorkGroupSizeMin; + std::optional AmdgpuFlatWorkGroupSizeMax; + std::optional AmdgpuWavesPerEuMin; + std::optional AmdgpuWavesPerEuMax; + ///@} + + /// NVPTX launch bounds. + ///@{ + std::optional Maxclusterrank; + std::optional Maxntidx; + ///@} + + /// The number of alloca instructions inside the function, the number of those + /// with allocation sizes that cannot be determined at compile time, and the + /// sum of the sizes that can be. + /// + /// With the current implementation for at least some GPU archs, + /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in + /// case the implementation changes. 
+ int64_t Allocas = 0; + int64_t AllocasDyn = 0; + int64_t AllocasStaticSizeSum = 0; + + /// Number of direct/indirect calls (anything derived from CallBase). + int64_t DirectCalls = 0; + int64_t IndirectCalls = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + /// Number of calls of type InvokeInst. + int64_t Invokes = 0; + + /// Number of addrspace(0) memory accesses (via load, store, etc.). + int64_t AddrspaceZeroAccesses = 0; +}; + +/// Analysis class for KernelInfo. +class KernelInfoAnalysis : public AnalysisInfoMixin { +public: + static AnalysisKey Key; + + using Result = const KernelInfo; + + KernelInfo run(Function &F, FunctionAnalysisManager &FAM) { + return KernelInfo::getKernelInfo(F, FAM); + } +}; + +/// Printer pass for KernelInfoAnalysis. +/// +/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled. +class KernelInfoPrinter : public PassInfoMixin { +public: + explicit KernelInfoPrinter() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) { + AM.getResult(F); + return PreservedAnalyses::all(); + } + + static bool isRequired() { return true; } +}; +} // namespace llvm +#endif // LLVM_ANALYSIS_KERNELINFO_H diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index c3e9d41315f61..5c338a8fcd0cf 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -18,6 +18,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" #include "llvm/Target/CGPassBuilderOption.h" @@ -27,6 +28,8 @@ #include #include +extern llvm::cl::opt KernelInfoEndLTO; + namespace llvm { class AAManager; diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 
2cb3547ec4047..02e76af8d903d 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -78,6 +78,7 @@ add_llvm_component_library(LLVMAnalysis InstructionPrecedenceTracking.cpp InstructionSimplify.cpp InteractiveModelRunner.cpp + KernelInfo.cpp LazyBranchProbabilityInfo.cpp LazyBlockFrequencyInfo.cpp LazyCallGraph.cpp diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp new file mode 100644 index 0000000000000..9df3b5b32afcb --- /dev/null +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -0,0 +1,350 @@ +//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter +// classes used to extract function properties from a kernel. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/KernelInfo.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" + +using namespace llvm; + +#define DEBUG_TYPE "kernel-info" + +static bool isKernelFunction(Function &F) { + // TODO: Is this general enough? Consider languages beyond OpenMP. 
+ return F.hasFnAttribute("kernel"); +} + +static void identifyFunction(OptimizationRemark &R, const Function &F) { + if (auto *SubProgram = F.getSubprogram()) { + if (SubProgram->isArtificial()) + R << "artificial "; + } + R << "function '" << F.getName() << "'"; +} + +static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, + const AllocaInst &Alloca, + TypeSize::ScalarTy StaticSize) { + ORE.emit([&] { + StringRef Name; + DebugLoc Loc; + bool Artificial = false; + auto DVRs = findDVRDeclares(&const_cast(Alloca)); + if (!DVRs.empty()) { + const DbgVariableRecord &DVR = **DVRs.begin(); + Name = DVR.getVariable()->getName(); + Loc = DVR.getDebugLoc(); + Artificial = DVR.Variable->isArtificial(); + } + OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc), + Alloca.getParent()); + R << "in "; + identifyFunction(R, Caller); + R << ", "; + if (Artificial) + R << "artificial "; + if (Name.empty()) { + R << "unnamed alloca "; + if (DVRs.empty()) + R << "(missing debug metadata) "; + } else { + R << "alloca '" << Name << "' "; + } + R << "with "; + if (StaticSize) + R << "static size of " << itostr(StaticSize) << " bytes"; + else + R << "dynamic size"; + return R; + }); +} + +static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, + const CallBase &Call, StringRef CallKind, + StringRef RemarkKind) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); + R << "in "; + identifyFunction(R, Caller); + R << ", " << CallKind; + if (const Function *Callee = + dyn_cast_or_null(Call.getCalledOperand())) { + R << ", callee is"; + StringRef Name = Callee->getName(); + if (auto *SubProgram = Callee->getSubprogram()) { + if (SubProgram->isArtificial()) + R << " artificial"; + } + if (!Name.empty()) + R << " '" << Name << "'"; + else + R << " with unknown name"; + } + return R; + }); +} + +static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, + const Function &Caller, + const Instruction 
&Inst) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst); + R << "in "; + identifyFunction(R, Caller); + if (const IntrinsicInst *II = dyn_cast(&Inst)) { + R << ", '" << II->getCalledFunction()->getName() << "' call"; + } else { + R << ", '" << Inst.getOpcodeName() << "' instruction"; + } + if (Inst.hasName()) + R << " ('%" << Inst.getName() << "')"; + R << " accesses memory in addrspace(0)"; + return R; + }); +} + +void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, + OptimizationRemarkEmitter &ORE) { + assert(Direction == 1 || Direction == -1); + const Function &F = *BB.getParent(); + const Module &M = *F.getParent(); + const DataLayout &DL = M.getDataLayout(); + for (const Instruction &I : BB.instructionsWithoutDebug()) { + if (const AllocaInst *Alloca = dyn_cast(&I)) { + Allocas += Direction; + TypeSize::ScalarTy StaticSize = 0; + if (std::optional Size = Alloca->getAllocationSize(DL)) { + StaticSize = Size->getFixedValue(); + assert(StaticSize <= std::numeric_limits::max()); + AllocasStaticSizeSum += Direction * StaticSize; + } else { + AllocasDyn += Direction; + } + remarkAlloca(ORE, F, *Alloca, StaticSize); + } else if (const CallBase *Call = dyn_cast(&I)) { + std::string CallKind; + std::string RemarkKind; + if (Call->isIndirectCall()) { + IndirectCalls += Direction; + CallKind += "indirect"; + RemarkKind += "Indirect"; + } else { + DirectCalls += Direction; + CallKind += "direct"; + RemarkKind += "Direct"; + } + if (isa(Call)) { + Invokes += Direction; + CallKind += " invoke"; + RemarkKind += "Invoke"; + } else { + CallKind += " call"; + RemarkKind += "Call"; + } + if (!Call->isIndirectCall()) { + if (const Function *Callee = Call->getCalledFunction()) { + if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) { + DirectCallsToDefinedFunctions += Direction; + CallKind += " to defined function"; + RemarkKind += "ToDefinedFunction"; + } + } + } + remarkCall(ORE, F, *Call, CallKind, RemarkKind); + 
if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { + if (MI->getDestAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } else if (const AnyMemTransferInst *MT = + dyn_cast(MI)) { + if (MT->getSourceAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } + } + } else if (const LoadInst *Load = dyn_cast(&I)) { + if (Load->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } else if (const StoreInst *Store = dyn_cast(&I)) { + if (Store->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } else if (const AtomicRMWInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } + } +} + +static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, int64_t Value) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, Name, &F); + R << "in "; + identifyFunction(R, F); + R << ", " << Name << " = " << itostr(Value); + return R; + }); +} + +static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, std::optional Value) { + if (!Value) + return; + remarkProperty(ORE, F, Name, Value.value()); +} + +static std::vector> +parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) { + std::vector> Result(NumFields); + Attribute A = F.getFnAttribute(Name); + if (!A.isStringAttribute()) + return Result; + StringRef Rest = A.getValueAsString(); + for (unsigned I = 0; I < NumFields; ++I) { + StringRef Field; + std::tie(Field, Rest) = Rest.split(','); + if (Field.empty()) + break; + int64_t Val; + 
if (Field.getAsInteger(0, Val)) { + F.getContext().emitError("cannot parse integer in attribute '" + Name + + "': " + Field); + break; + } + Result[I] = Val; + } + if (!Rest.empty()) + F.getContext().emitError("too many fields in attribute " + Name); + return Result; +} + +static std::optional parseFnAttrAsInteger(Function &F, + StringRef Name) { + return parseFnAttrAsIntegerFields(F, Name, 1)[0]; +} + +// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp. Can we +// share? +static MDNode *getNVPTXMDNode(Function &F, StringRef Name) { + Module &M = *F.getParent(); + NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); + if (!MD) + return nullptr; + for (auto *Op : MD->operands()) { + if (Op->getNumOperands() != 3) + continue; + auto *KernelOp = dyn_cast(Op->getOperand(0)); + if (!KernelOp || KernelOp->getValue() != &F) + continue; + auto *Prop = dyn_cast(Op->getOperand(1)); + if (!Prop || Prop->getString() != Name) + continue; + return Op; + } + return nullptr; +} + +static std::optional parseNVPTXMDNodeAsInteger(Function &F, + StringRef Name) { + std::optional Result; + if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) { + auto *Op = cast(ExistingOp->getOperand(2)); + Result = cast(Op->getValue())->getZExtValue(); + } + return Result; +} + +KernelInfo KernelInfo::getKernelInfo(Function &F, + FunctionAnalysisManager &FAM) { + KernelInfo KI; + // Only analyze modules for GPUs. + // TODO: This would be more maintainable if there were an isGPU. + const std::string &TT = F.getParent()->getTargetTriple(); + llvm::Triple T(TT); + if (!T.isAMDGPU() && !T.isNVPTX()) + return KI; + KI.IsValid = true; + + // Record function properties. 
+ KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); + KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); + KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit"); + auto AmdgpuMaxNumWorkgroups = + parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3); + KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0]; + KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1]; + KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2]; + auto AmdgpuFlatWorkGroupSize = + parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2); + KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0]; + KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1]; + auto AmdgpuWavesPerEu = + parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2); + KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0]; + KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1]; + KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank"); + KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx"); + + const DominatorTree &DT = FAM.getResult(F); + auto &ORE = FAM.getResult(F); + for (const auto &BB : F) + if (DT.isReachableFromEntry(&BB)) + KI.updateForBB(BB, +1, ORE); + +#define REMARK_PROPERTY(PROP_NAME) \ + remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) + REMARK_PROPERTY(ExternalNotKernel); + REMARK_PROPERTY(OmpTargetNumTeams); + REMARK_PROPERTY(OmpTargetThreadLimit); + REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX); + REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY); + REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ); + REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin); + REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax); + REMARK_PROPERTY(AmdgpuWavesPerEuMin); + REMARK_PROPERTY(AmdgpuWavesPerEuMax); + REMARK_PROPERTY(Maxclusterrank); + REMARK_PROPERTY(Maxntidx); + REMARK_PROPERTY(Allocas); + REMARK_PROPERTY(AllocasStaticSizeSum); + REMARK_PROPERTY(AllocasDyn); + REMARK_PROPERTY(DirectCalls); + REMARK_PROPERTY(IndirectCalls); + REMARK_PROPERTY(DirectCallsToDefinedFunctions); 
+ REMARK_PROPERTY(Invokes); + REMARK_PROPERTY(AddrspaceZeroAccesses); +#undef REMARK_PROPERTY + + return KI; +} + +AnalysisKey KernelInfoAnalysis::Key; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 46f43f3de4705..61677f02783cc 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -44,6 +44,7 @@ #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/InstCount.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Lint.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 0cec9fbd7cd05..dcfa732f410b3 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -278,6 +278,7 @@ FUNCTION_ANALYSIS( MachineFunctionAnalysis(static_cast(TM))) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) +FUNCTION_ANALYSIS("kernel-info", KernelInfoAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis()) @@ -374,6 +375,7 @@ FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass()); FUNCTION_PASS("kcfi", KCFIPass()) +FUNCTION_PASS("kernel-info", KernelInfoPrinter()) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("lint", LintPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0523fee5bcf9f..3b2ed9fe4236c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -40,6 +40,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include 
"llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" @@ -772,6 +773,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return onlyAllocateVGPRs; return nullptr; }); + + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { + if (KernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter()); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + }); } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 097e29527eed9..8d77c8e53f7a6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -22,6 +22,7 @@ #include "NVPTXTargetTransformInfo.h" #include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -238,6 +239,15 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(NVVMIntrRangePass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); + + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { + if (KernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter()); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + }); } TargetTransformInfo diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index c0985f3be91a5..b235fd8f6f49a 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -26,6 
+26,11 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +cl::opt KernelInfoEndLTO( + "kernel-info-end-lto", + cl::desc("add the kernel-info pass at the end of the full LTO pipeline"), + cl::init(false), cl::Hidden); + //--------------------------------------------------------------------------- // TargetMachine Class // diff --git a/llvm/test/Analysis/KernelInfo/addrspace0.ll b/llvm/test/Analysis/KernelInfo/addrspace0.ll new file mode 100644 index 0000000000000..4c472396443f5 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/addrspace0.ll @@ -0,0 +1,152 @@ +; Check info on addrspace(0) memory accesses. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines --implicit-check-not='addrspace(0)' %s + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define void @f() !dbg !3 { +entry: + ; load + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0) + %0 = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0) + %load = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in addrspace(0) + %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 + %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 + %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 + + ; store + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + store i32 0, ptr null, align 4, !dbg !7 + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + store i32 0, ptr addrspace(0) null, align 4, !dbg !7 + store i32 0, ptr addrspace(1) null, align 4, !dbg !7 + store i32 0, ptr addrspace(8) null, align 4, !dbg !7 + + ; atomicrmw + ; 
CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 + atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 + + ; cmpxchg + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + + ; llvm.memcpy + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + 
call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.inline + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.element.unordered.atomic + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr 
addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + + ; llvm.memmove + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr 
addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + + ; llvm.memmove.element.unordered.atomic + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + + ; llvm.memset + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in 
addrspace(0) + call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.inline + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.element.unordered.atomic + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + + ret void +} +; CHECK: remark: test.c:2:0: in function 'f', AddrspaceZeroAccesses = 36 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: false, 
runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !5) +!4 = !DISubroutineType(types: !5) +!5 = !{} +!6 = !DILocation(line: 3, column: 11, scope: !3) +!7 = !DILocation(line: 4, column: 6, scope: !3) +!8 = !DILocation(line: 5, column: 1, scope: !3) +!9 = !DILocation(line: 6, column: 2, scope: !3) +!10 = !DILocation(line: 7, column: 3, scope: !3) +!11 = !DILocation(line: 8, column: 4, scope: !3) +!12 = !DILocation(line: 9, column: 5, scope: !3) diff --git a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll new file mode 100644 index 0000000000000..048d53799c33e --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -0,0 +1,78 @@ +; Check info on allocas. 
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define void @h() !dbg !3 { +entry: + ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca 'dyn_ptr' with static size of 8 bytes + %dyn_ptr.addr = alloca ptr, align 8 + ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !7, metadata !DIExpression()), !dbg !11 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !12, metadata !DIExpression()), !dbg !15 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !16, metadata !DIExpression()), !dbg !20 + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 3 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 20 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 0 + +define void @g() !dbg !21 { +entry: + ; CHECK: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !23, metadata !DIExpression()), !dbg !24 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !25, metadata !DIExpression()), !dbg !26 + ret void +} +; CHECK: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK: remark: test.c:3:0: in function 'g', AllocasDyn = 0 + +; Function Attrs: nocallback nofree nosync nounwind 
speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 + +; uselistorder directives +uselistorder ptr @llvm.dbg.declare, { 4, 3, 2, 1, 0 } + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !4, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !6) +!4 = distinct !DISubroutineType(types: !5) +!5 = !{null} +!6 = !{} +!7 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !3, type: !8, flags: DIFlagArtificial) +!8 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !9) +!9 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !10) +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!11 = !DILocation(line: 0, scope: !3) +!12 = !DILocalVariable(name: "i", scope: !13, file: !2, line: 14, type: !14) +!13 = distinct !DILexicalBlock(scope: !3, file: !2, line: 13, column: 3) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!15 = !DILocation(line: 14, column: 9, scope: !13) +!16 = !DILocalVariable(name: "a", scope: !13, file: !2, line: 15, type: !17) +!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, size: 64, elements: !18) +!18 = !{!19} +!19 = !DISubrange(count: 2) +!20 = !DILocation(line: 15, column: 9, scope: !13) +!21 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !22, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !6) +!22 = !DISubroutineType(types: !5) +!23 = 
!DILocalVariable(name: "i", scope: !21, file: !2, line: 4, type: !14) +!24 = !DILocation(line: 4, column: 7, scope: !21) +!25 = !DILocalVariable(name: "a", scope: !21, file: !2, line: 5, type: !17) +!26 = !DILocation(line: 5, column: 7, scope: !21) diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll new file mode 100644 index 0000000000000..6101a71254898 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -0,0 +1,112 @@ +; Check info on calls. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +declare void @personality() + +define void @h() personality ptr @personality !dbg !100 { +entry: + ; CHECK: remark: test.c:16:5: in artificial function 'h', direct call, callee is 'f' + call void @f(), !dbg !102 + ; CHECK: remark: test.c:17:5: in artificial function 'h', direct call to defined function, callee is 'g' + call void @g(), !dbg !104 + ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' + call void @h(), !dbg !105 + %0 = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call + call void %0(), !dbg !106 + ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 +fcont: + ; CHECK: remark: test.c:21:5: in artificial function 'h', direct invoke to defined function, callee is 'g' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !108 +gcont: + ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 +hcont: + ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke + 
invoke void %0() to label %end unwind label %cleanup, !dbg !110 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 6 +; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 4 + +declare void @f() + +define void @g() personality ptr @personality !dbg !200 { +entry: + ; CHECK: remark: test.c:6:3: in function 'g', direct call, callee is 'f' + call void @f(), !dbg !202 + ; CHECK: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' + call void @g(), !dbg !203 + ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' + call void @h(), !dbg !204 + %0 = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:9:3: in function 'g', indirect call + call void %0(), !dbg !205 + ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 +fcont: + ; CHECK: remark: test.c:11:3: in function 'g', direct invoke to defined function, callee is 'g' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !207 +gcont: + ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 +hcont: + ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke + invoke void %0() to label %end unwind label %cleanup, !dbg !209 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 6 +; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 4 +; CHECK: 
remark: test.c:3:0: in function 'g', Invokes = 4 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} + +!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) +!102 = !DILocation(line: 16, column: 5, scope: !103) +!103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3) +!104 = !DILocation(line: 17, column: 5, scope: !103) +!105 = !DILocation(line: 18, column: 5, scope: !103) +!106 = !DILocation(line: 19, column: 5, scope: !103) +!107 = !DILocation(line: 20, column: 5, scope: !103) +!108 = !DILocation(line: 21, column: 5, scope: !103) +!109 = !DILocation(line: 22, column: 5, scope: !103) +!110 = !DILocation(line: 23, column: 5, scope: !103) + +!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) +!202 = !DILocation(line: 6, column: 3, scope: !200) +!203 = !DILocation(line: 7, column: 3, scope: !200) +!204 = !DILocation(line: 8, column: 3, scope: !200) +!205 = !DILocation(line: 9, column: 3, scope: !200) +!206 = !DILocation(line: 10, column: 3, scope: !200) +!207 = !DILocation(line: 11, column: 3, scope: !200) +!208 = !DILocation(line: 12, column: 3, scope: !200) +!209 = !DILocation(line: 13, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll new file mode 
100644 index 0000000000000..7d190ece46e16 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll @@ -0,0 +1,47 @@ +; Check that -kernel-info-end-lto enables kernel-info in the AMD GPU target +; backend. + +; REQUIRES: amdgpu-registered-target + +; -kernel-info-end-lto inserts kernel-info into LTO pipeline. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; Omitting -kernel-info-end-lto disables kernel-info. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +; Omitting LTO disables kernel-info. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='default<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, 
spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll new file mode 100644 index 0000000000000..4e790123c313a --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll @@ -0,0 +1,47 @@ +; Check that -kernel-info-end-lto enables kernel-info in the NVPTX target +; backend. + +; REQUIRES: nvptx-registered-target + +; -kernel-info-end-lto inserts kernel-info into LTO pipeline. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; Omitting -kernel-info-end-lto disables kernel-info. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +; Omitting LTO disables kernel-info. 
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='default<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll new file mode 100644 index 0000000000000..0c98f4ad45950 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -0,0 +1,40 @@ +; Check info on launch bounds for AMD GPU. 
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsX = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsY = 201 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsZ = 202 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMin = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMax = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMin = 220 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMax = 221 +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" + "amdgpu-max-num-workgroups"="200,201,202" + "amdgpu-flat-work-group-size"="210,211" + "amdgpu-waves-per-eu"="220,221" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, 
type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll new file mode 100644 index 0000000000000..c7339f90e3ca9 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -0,0 +1,36 @@ +; Check info on launch bounds for NVPTX. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210 +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = distinct !{ptr null, !"kernel", i32 1} diff --git 
a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll new file mode 100644 index 0000000000000..43154d2379825 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -0,0 +1,51 @@ +; Check info on linkage. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:3:0: in function 'f', ExternalNotKernel = 1 +define external void @f() !dbg !10 { +entry: + ret void +} + +; CHECK: remark: test.c:13:0: in artificial function 'g', ExternalNotKernel = 1 +define void @g() !dbg !20 { +entry: + ret void +} + +; CHECK: remark: test.c:23:0: in function 'h', ExternalNotKernel = 0 +define external void @h() #0 !dbg !30 { +entry: + ret void +} + +; CHECK: remark: test.c:33:0: in artificial function 'i', ExternalNotKernel = 0 +define weak void @i() !dbg !40 { +entry: + ret void +} + +attributes #0 = { "kernel" } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} +!10 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!11 = !DISubroutineType(types: !3) +!20 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!21 = distinct !DISubroutineType(types: !3) +!30 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 23, type: !31, scopeLine: 
23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!31 = distinct !DISubroutineType(types: !3) +!40 = distinct !DISubprogram(name: "i", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!41 = distinct !DISubroutineType(types: !3) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md new file mode 100644 index 0000000000000..0d13950e198ed --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -0,0 +1,40 @@ +The tests in this directory check that basic KernelInfoAnalysis functionality +behaves reasonably for LLVM IR produced by Clang OpenMP codegen. + +So that these tests are straightforward to maintain and faithfully represent +Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them. Other tests +more exhaustively check KernelInfoAnalysis features using reduced LLVM IR. + +The LLVM IR in each test file `$TEST` can be regenerated as follows in the case +that Clang OpenMP codegen changes or it becomes desirable to adjust the source +OpenMP program below. First, remove the existing LLVM IR from `$TEST`. 
Then, +where `$TARGET` (e.g., `nvptx64-nvidia-cuda` or `amdgcn-amd-amdhsa`) depends on +`$TEST`: + +``` +$ cd /tmp +$ cat test.c +#pragma omp declare target +void f(); +void g() { + int i; + int a[2]; + f(); + g(); +} +#pragma omp end declare target + +void h(int i) { + #pragma omp target map(tofrom:i) + { + int i; + int a[2]; + f(); + g(); + } +} + +$ clang -g -fopenmp -fopenmp-targets=$TARGET -save-temps -c test.c +$ llvm-dis test-openmp-$TARGET.bc +$ cat test-openmp-$TARGET.ll >> $TEST +``` diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll new file mode 100644 index 0000000000000..ee5f65b8e5ab7 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -0,0 +1,217 @@ +; See ./README.md for how to maintain the LLVM IR in this test. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; For some builds, we see a warning like: +; +; opt: WARNING: failed to create target machine for 'amdgcn-amd-amdhsa': unable to get target for 'amdgcn-amd-amdhsa', see --version and --triple. +; +; But there should be no other remarks here. 
+; CHECK-NOT: remark: + +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', 
IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; 
CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NOT: {{.}} + + +; ModuleID = 'test-openmp-amdgcn-amd-amdhsa.bc' +source_filename = "test.c" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } + +@__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_71f35_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr addrspace(1) constant 
%struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 +@__omp_offloading_fd02_71f35_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_71f35_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_dynamic_environment to ptr) } +@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !16 { +entry: + %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) + %i = alloca i32, align 4, addrspace(5) + %a = alloca [2 x i32], align 4, addrspace(5) + %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr + %i.ascast = addrspacecast ptr addrspace(5) %i to ptr + %a.ascast = addrspacecast ptr addrspace(5) %a to ptr + store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 + %0 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_kernel_environment to ptr), ptr %dyn_ptr), !dbg !26 + %exec_user_code = icmp eq i32 %0, -1, !dbg !26 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 + +user_code.entry: ; preds = %entry + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !27, metadata !DIExpression()), !dbg !30 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !31, metadata !DIExpression()), !dbg !35 + call void @f() #5, !dbg !36 + call void @g() #5, !dbg !37 + call void 
@__kmpc_target_deinit(), !dbg !38 + ret void, !dbg !39 + +worker.exit: ; preds = %entry + ret void, !dbg !26 +} + +declare i32 @__kmpc_target_init(ptr, ptr) + +; Function Attrs: convergent +declare void @f(...) #1 + +declare void @__kmpc_target_deinit() + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_71f35_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { +entry: + %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) + %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr + store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 + %0 = load ptr, ptr %dyn_ptr.addr.ascast, align 8, !dbg !43 + call void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr %0) #6, !dbg !43 + ret void, !dbg !43 +} + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !44 { +entry: + %i = alloca i32, align 4, addrspace(5) + %a = alloca [2 x i32], align 4, addrspace(5) + %i.ascast = addrspacecast ptr addrspace(5) %i to ptr + %a.ascast = addrspacecast ptr addrspace(5) %a to ptr + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !47, metadata !DIExpression()), !dbg !48 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !49, metadata !DIExpression()), !dbg !50 + call void @f() #5, !dbg !51 + call void @g() #5, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #4 + +attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { convergent } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!omp_offload.info = !{!2} +!nvvm.annotations = !{!3} +!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10, !11, !12} +!llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} +!opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: 
false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "eff61a7cf33c8dd1bd6933250fc90157") +!2 = !{i32 0, i32 64770, i32 466741, !"h", i32 12, i32 0, i32 0} +!3 = !{ptr @__omp_offloading_fd02_71f35_h_l12, !"kernel", i32 1} +!4 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!5 = !{i32 7, !"Dwarf Version", i32 5} +!6 = !{i32 2, !"Debug Info Version", i32 3} +!7 = !{i32 1, !"wchar_size", i32 4} +!8 = !{i32 7, !"openmp", i32 51} +!9 = !{i32 7, !"openmp-device", i32 51} +!10 = !{i32 8, !"PIC Level", i32 2} +!11 = !{i32 7, !"frame-pointer", i32 2} +!12 = !{i32 4, !"amdgpu_hostcall", i32 1} +!13 = !{!"clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)"} +!14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} +!15 = !{i32 2, i32 0} +!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!17 = !DIFile(filename: "test.c", directory: "/tmp") +!18 = !DISubroutineType(types: !19) +!19 = !{null, !20} +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!23 = !{} +!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !16, type: !20, flags: DIFlagArtificial) +!25 = !DILocation(line: 0, scope: !16) +!26 = !DILocation(line: 13, column: 3, scope: !16) +!27 = !DILocalVariable(name: "i", scope: !28, file: !17, line: 14, type: !29) +!28 = distinct !DILexicalBlock(scope: !16, file: !17, line: 13, column: 3) +!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!30 = !DILocation(line: 14, column: 9, scope: !28) +!31 = 
!DILocalVariable(name: "a", scope: !28, file: !17, line: 15, type: !32) +!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) +!33 = !{!34} +!34 = !DISubrange(count: 2) +!35 = !DILocation(line: 15, column: 9, scope: !28) +!36 = !DILocation(line: 16, column: 5, scope: !28) +!37 = !DILocation(line: 17, column: 5, scope: !28) +!38 = !DILocation(line: 18, column: 3, scope: !28) +!39 = !DILocation(line: 18, column: 3, scope: !16) +!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) +!42 = !DILocation(line: 0, scope: !40) +!43 = !DILocation(line: 12, column: 1, scope: !40) +!44 = distinct !DISubprogram(name: "g", scope: !17, file: !17, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !23) +!45 = !DISubroutineType(types: !46) +!46 = !{null} +!47 = !DILocalVariable(name: "i", scope: !44, file: !17, line: 4, type: !29) +!48 = !DILocation(line: 4, column: 7, scope: !44) +!49 = !DILocalVariable(name: "a", scope: !44, file: !17, line: 5, type: !32) +!50 = !DILocation(line: 5, column: 7, scope: !44) +!51 = !DILocation(line: 6, column: 3, scope: !44) +!52 = !DILocation(line: 7, column: 3, scope: !44) +!53 = !DILocation(line: 8, column: 1, scope: !44) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll new file mode 100644 index 0000000000000..41d068b03548b --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -0,0 +1,811 @@ +; See ./README.md for how to maintain the LLVM IR in this test. 
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; For some builds, we see a warning like: +; +; opt: WARNING: failed to create target machine for 'nvptx64-nvidia-cuda': unable to get target for 'nvptx64-nvidia-cuda', see --version and --triple. +; +; But there should be no other remarks here. +; CHECK-NOT: remark: + +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Maxntidx = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 
20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
AddrspaceZeroAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NOT: remark: {{.*: in function 'g',.*}} + +; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't +; want to maintain a list of their allocas, calls, etc. in this test. 
+ + +; ModuleID = 'test-openmp-nvptx64-nvidia-cuda.bc' +source_filename = "test.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } +%struct.DeviceMemoryPoolTy = type { ptr, i64 } +%struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 } +%struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 } +%"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } +%"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } +%"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } +%printf_args = type { ptr, i32, ptr, ptr, ptr } +%printf_args.7 = type { ptr, i32, ptr, ptr } + +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +@0 = private unnamed_addr constant [59 x i8] c";test.c;__omp_offloading_10305_5c00dd_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 58, ptr @0 }, align 8 +@__omp_offloading_10305_5c00dd_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_10305_5c00dd_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_10305_5c00dd_h_l12_dynamic_environment } +@llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr 
@__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" +@__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 +@__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 +@__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +@__omp_rtl_device_environment = weak protected addrspace(4) global %struct.DeviceEnvironmentTy undef, align 8 +@.str = private unnamed_addr constant [40 x i8] c"%s:%u: %s: Assertion %s (`%s`) failed.\0A\00", align 1 +@.str1 = private unnamed_addr constant [35 x i8] c"%s:%u: %s: Assertion `%s` failed.\0A\00", align 1 +@.str15 = private unnamed_addr constant [43 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Kernel.cpp\00", align 1 +@__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy = private unnamed_addr constant [36 x i8] c"void genericStateMachine(IdentTy *)\00", align 1 +@.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 +@IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 +@.str1127 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 +@_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous 
namespace)::SharedMemorySmartStackTy" undef, align 16 +@.str544 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str847 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 +@.str948 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str1049 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1150 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1251 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1352 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 +@.str1553 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 +@__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 +@_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 +@_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr 
noalias noundef %dyn_ptr) #0 !dbg !17 { +entry: + %dyn_ptr.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + %a = alloca [2 x i32], align 4 + store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 + %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_10305_5c00dd_h_l12_kernel_environment, ptr %dyn_ptr), !dbg !26 + %exec_user_code = icmp eq i32 %0, -1, !dbg !26 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 + +user_code.entry: ; preds = %entry + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !27, metadata !DIExpression()), !dbg !30 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !31, metadata !DIExpression()), !dbg !35 + call void @f() #16, !dbg !36 + call void @g() #16, !dbg !37 + call void @__kmpc_target_deinit(), !dbg !38 + ret void, !dbg !39 + +worker.exit: ; preds = %entry + ret void, !dbg !26 +} + +; Function Attrs: convergent +declare void @f(...) 
#1 + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected void @__omp_offloading_10305_5c00dd_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { +entry: + %dyn_ptr.addr = alloca ptr, align 8 + store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 + %0 = load ptr, ptr %dyn_ptr.addr, align 8, !dbg !43 + call void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr %0) #17, !dbg !43 + ret void, !dbg !43 +} + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !44 { +entry: + %i = alloca i32, align 4 + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !47, metadata !DIExpression()), !dbg !48 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !49, metadata !DIExpression()), !dbg !50 + call void @f() #16, !dbg !51 + call void @g() #16, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: convergent mustprogress nounwind +define internal noundef i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %KernelEnvironment, ptr nofree noundef nonnull align 8 dereferenceable(16) %KernelLaunchEnvironment) #4 { +entry: + %WorkFn.i = alloca ptr, align 8 + %ExecMode = getelementptr inbounds i8, ptr %KernelEnvironment, i64 2 + %0 = load i8, ptr %ExecMode, align 2, !tbaa !54 + %1 = and i8 %0, 2 + %tobool.not = icmp eq i8 %1, 0 + %2 = load i8, ptr %KernelEnvironment, align 8, !tbaa !60 + %tobool3.not = icmp ne i8 %2, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i.i = icmp eq i32 %3, 0 + br i1 %cmp.i.i.i, label %if.then.i, label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge + +if.then.i: ; preds = %if.then + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + 
%idxprom.i.i = zext nneg i32 %3 to i64 + %arrayidx.i.i = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i + %4 = addrspacecast ptr %arrayidx.i.i to ptr addrspace(3) + store i8 0, ptr addrspace(3) %4, align 1, !tbaa !62 + store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to 
ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 + br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit + +_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge: ; preds = %if.then + %idxprom.i.i.c = zext i32 %3 to i64 + %arrayidx.i.i.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i.c + %5 = addrspacecast ptr %arrayidx.i.i.c to ptr addrspace(3) + store i8 0, ptr addrspace(3) %5, align 1, !tbaa !62 + br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit + +_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit: ; preds = %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge, %if.then.i + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 + br label %if.end + +if.else: ; preds = %entry + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %sub.i.i.i7 = add i32 %6, -1 + %and.i.i.i8 = and i32 %sub.i.i.i7, -32 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i.i9 = icmp eq i32 %7, %and.i.i.i8 + br i1 %cmp.i.i.i9, label %if.then.i11, label %if.end.critedge + +if.then.i11: ; preds = %if.else + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %idxprom.i.i13 = zext i32 %7 to i64 + %arrayidx.i.i14 = getelementptr inbounds 
[1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13 + %8 = addrspacecast ptr %arrayidx.i.i14 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %8, align 1, !tbaa !62 + store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) 
addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 + br label %if.end + +if.end.critedge: ; preds = %if.else + %idxprom.i.i13.c = zext i32 %7 to i64 + %arrayidx.i.i14.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13.c + %9 = addrspacecast ptr %arrayidx.i.i14.c to ptr addrspace(3) + store i8 0, ptr addrspace(3) %9, align 1, !tbaa !62 + br label %if.end + +if.end: ; preds = %if.end.critedge, %if.then.i11, %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit + br i1 %tobool.not, label %if.end9, label %if.then7 + +if.then7: ; preds = %if.end + %10 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 + %11 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 + %and.i.i.i21 = and i32 %10, 1 + %and.i.i = and i32 %and.i.i.i21, %11 + %tobool.i.i = icmp ne i32 %and.i.i, 0 + %.pre67.i.i.i = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !80 + %cmp.i.i.i22 = icmp ne i32 %.pre67.i.i.i, 0 + %or.cond.not.i.i.i = select i1 %tobool.i.i, i1 %cmp.i.i.i22, i1 false + br i1 %or.cond.not.i.i.i, label %if.then.i.i.i, label %if.else.i.i.i + +if.then.i.i.i: ; preds = %if.then7 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str847, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) 
@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else.i.i.i: ; preds = %if.then7 + %cmp5.i.i.i = icmp eq i32 %.pre67.i.i.i, 0 + tail call void @llvm.assume(i1 noundef %cmp5.i.i.i) #21 + %12 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !81 + br i1 %tobool.i.i, label %land.lhs.true7.i.i.i, label %if.else11.i.i.i + +land.lhs.true7.i.i.i: ; preds = %if.else.i.i.i + %cmp9.i.i.i = icmp eq i32 %12, 0 + br i1 %cmp9.i.i.i, label %if.else11.i.i.i, label %if.then10.i.i.i + +if.then10.i.i.i: ; preds = %land.lhs.true7.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str948, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else11.i.i.i: ; preds = %land.lhs.true7.i.i.i, %if.else.i.i.i + %13 = phi i32 [ 0, %land.lhs.true7.i.i.i ], [ %12, %if.else.i.i.i ] + %cmp14.i.i.i = icmp eq i32 %13, 0 + tail call void @llvm.assume(i1 noundef %cmp14.i.i.i) #21 + %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !82 + br i1 %tobool.i.i, label %land.lhs.true17.i.i.i, label %if.else21.i.i.i + +land.lhs.true17.i.i.i: ; preds = %if.else11.i.i.i + %cmp19.i.i.i = icmp eq i32 %14, 0 + br i1 %cmp19.i.i.i, label %if.else21.i.i.i, label %if.then20.i.i.i + +if.then20.i.i.i: ; preds = %land.lhs.true17.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1049, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 195, ptr nofree noundef nonnull 
dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else21.i.i.i: ; preds = %land.lhs.true17.i.i.i, %if.else11.i.i.i + %15 = phi i32 [ 0, %land.lhs.true17.i.i.i ], [ %14, %if.else11.i.i.i ] + %cmp24.i.i.i = icmp eq i32 %15, 0 + tail call void @llvm.assume(i1 noundef %cmp24.i.i.i) #21 + %16 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !83 + br i1 %tobool.i.i, label %land.lhs.true27.i.i.i, label %if.else31.i.i.i + +land.lhs.true27.i.i.i: ; preds = %if.else21.i.i.i + %cmp29.i.i.i = icmp eq i32 %16, 1 + br i1 %cmp29.i.i.i, label %if.else31.i.i.i, label %if.then30.i.i.i + +if.then30.i.i.i: ; preds = %land.lhs.true27.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1150, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else31.i.i.i: ; preds = %land.lhs.true27.i.i.i, %if.else21.i.i.i + %17 = phi i32 [ 1, %land.lhs.true27.i.i.i ], [ %16, %if.else21.i.i.i ] + %cmp34.i.i.i = icmp eq i32 %17, 1 + tail call void @llvm.assume(i1 noundef %cmp34.i.i.i) #21 + %18 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !84 + br i1 %tobool.i.i, label %land.lhs.true37.i.i.i, label %if.else.critedge.i.critedge.critedge.critedge + +land.lhs.true37.i.i.i: ; preds = %if.else31.i.i.i + %cmp39.i.i.i = icmp eq i32 %18, 1 + br i1 %cmp39.i.i.i, label %if.else41.i.i.i, label %if.then40.i.i.i + +if.then40.i.i.i: ; preds = %land.lhs.true37.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull 
dereferenceable(33) @.str1251, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else41.i.i.i: ; preds = %land.lhs.true37.i.i.i + %cmp44.i.i.i = icmp eq i32 1, 1 + tail call void @llvm.assume(i1 noundef %cmp44.i.i.i) #21 + br i1 %tobool.i.i, label %land.lhs.true47.i.i.i, label %if.else.critedge.i.critedge + +land.lhs.true47.i.i.i: ; preds = %if.else41.i.i.i + %19 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !85 + %cmp49.i.i.i = icmp eq i32 %19, 1 + br i1 %cmp49.i.i.i, label %if.else51.i.i.i, label %if.then50.i.i.i + +if.then50.i.i.i: ; preds = %land.lhs.true47.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1352, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else51.i.i.i: ; preds = %land.lhs.true47.i.i.i + br i1 %tobool.i.i, label %land.lhs.true.i.i, label %if.else.critedge.i.critedge + +land.lhs.true.i.i: ; preds = %if.else51.i.i.i + %20 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 + %cmp.i.i = icmp eq i32 %20, 1 + br i1 %cmp.i.i, label %land.lhs.true8.i.i, label %if.then.i.i + +if.then.i.i: ; preds = %land.lhs.true.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 222, ptr nofree noundef nonnull 
dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + unreachable + +land.lhs.true8.i.i: ; preds = %land.lhs.true.i.i + %21 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 + %cmp10.i.i = icmp eq i32 %21, 0 + br i1 %cmp10.i.i, label %land.lhs.true.i24, label %if.then11.i.i + +if.then11.i.i: ; preds = %land.lhs.true8.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1553, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + unreachable + +land.lhs.true.i24: ; preds = %land.lhs.true8.i.i + %22 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %tobool.i25.i.not = icmp eq i32 %22, 0 + br i1 %tobool.i25.i.not, label %if.then.i25, label %_ZN4ompx5state18assumeInitialStateEb.exit + +if.then.i25: ; preds = %land.lhs.true.i24 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 + unreachable + +if.else.critedge.i.critedge.critedge.critedge: ; preds = %if.else31.i.i.i + %cmp44.i.i.i.c = icmp eq i32 %18, 1 + tail call void @llvm.assume(i1 noundef %cmp44.i.i.i.c) #21 + br label %if.else.critedge.i.critedge + +if.else.critedge.i.critedge: ; preds = %if.else41.i.i.i, %if.else.critedge.i.critedge.critedge.critedge, %if.else51.i.i.i + %.pre.i = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %23 = icmp ne i32 %.pre.i, 0 + br label %_ZN4ompx5state18assumeInitialStateEb.exit + 
+_ZN4ompx5state18assumeInitialStateEb.exit: ; preds = %land.lhs.true.i24, %if.else.critedge.i.critedge + %cmp8.i = phi i1 [ %23, %if.else.critedge.i.critedge ], [ true, %land.lhs.true.i24 ] + tail call void @llvm.assume(i1 noundef %cmp8.i) #21 + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 + br label %cleanup + +if.end9: ; preds = %if.end + %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %sub.i.i = add i32 %24, -1 + %and.i.i26 = and i32 %sub.i.i, -32 + %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i27 = icmp eq i32 %25, %and.i.i26 + br i1 %cmp.i.i27, label %cleanup, label %if.end12 + +if.end12: ; preds = %if.end9 + %sub.i = add i32 %24, -32 + %cmp = icmp ult i32 %25, %sub.i + %or.cond33 = and i1 %tobool3.not, %cmp + br i1 %or.cond33, label %do.body.i.preheader, label %cleanup + +do.body.i.preheader: ; preds = %if.end12 + %26 = load i32, ptr @__omp_rtl_debug_kind, align 4 + %27 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8 + %and.i.i29 = and i32 %26, 1 + %and.i = and i32 %and.i.i29, %27 + %tobool.i = icmp ne i32 %and.i, 0 + br label %do.body.i + +do.body.i: ; preds = %do.body.i.preheader, %if.end9.i + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn.i) #22 + store ptr null, ptr %WorkFn.i, align 8, !tbaa !76 + tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 + %call1.i = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn.i) #22 + %28 = load ptr, ptr %WorkFn.i, align 8, !tbaa !76 + %tobool.not.not.i = icmp eq ptr %28, null + br i1 %tobool.not.not.i, label %_ZL19genericStateMachineP7IdentTy.exit, label %if.end.i + +if.end.i: ; preds = %do.body.i + br i1 %call1.i, label %if.then3.i, label %if.end9.i + +if.then3.i: ; preds = %if.end.i + %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %tobool.i30 = icmp ne i32 %29, 0 + 
%or.cond = select i1 %tobool.i, i1 %tobool.i30, i1 false + br i1 %or.cond, label %if.then6.i, label %if.else.i + +if.then6.i: ; preds = %if.then3.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 58, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 + unreachable + +if.else.i: ; preds = %if.then3.i + %tobool.i31.not = icmp eq i32 %29, 0 + tail call void @llvm.assume(i1 noundef %tobool.i31.not) #21 + tail call void %28(i32 noundef 0, i32 noundef %25) #23 + tail call void @__kmpc_kernel_end_parallel() #24 + br label %if.end9.i + +if.end9.i: ; preds = %if.else.i, %if.end.i + tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 + br label %do.body.i, !llvm.loop !86 + +_ZL19genericStateMachineP7IdentTy.exit: ; preds = %do.body.i + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 + br label %cleanup + +cleanup: ; preds = %if.end12, %_ZL19genericStateMachineP7IdentTy.exit, %if.end9, %_ZN4ompx5state18assumeInitialStateEb.exit + %retval.0 = phi i32 [ -1, %_ZN4ompx5state18assumeInitialStateEb.exit ], [ -1, %if.end9 ], [ %25, %_ZL19genericStateMachineP7IdentTy.exit ], [ %25, %if.end12 ] + ret i32 %retval.0 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5 + +; Function Attrs: convergent mustprogress noinline norecurse nounwind +define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %Ordering) local_unnamed_addr #6 { +entry: + tail call void @llvm.nvvm.barrier0() #25 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5 + 
+; Function Attrs: convergent mustprogress noreturn nounwind +define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %expr, ptr noundef %msg, ptr nofree noundef nonnull dereferenceable(69) %file, i32 noundef %line, ptr nofree noundef nonnull dereferenceable(20) %function) unnamed_addr #7 { +entry: + %tmp = alloca %printf_args, align 8 + %tmp1 = alloca %printf_args.7, align 8 + %tobool.not = icmp eq ptr %msg, null + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + store ptr %file, ptr %tmp, align 8 + %0 = getelementptr inbounds i8, ptr %tmp, i64 8 + store i32 %line, ptr %0, align 8 + %1 = getelementptr inbounds i8, ptr %tmp, i64 16 + store ptr %function, ptr %1, align 8 + br label %if.end + +if.else: ; preds = %entry + store ptr %file, ptr %tmp1, align 8 + %2 = getelementptr inbounds i8, ptr %tmp1, i64 8 + store i32 %line, ptr %2, align 8 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %.sink12 = phi i64 [ 16, %if.else ], [ 24, %if.then ] + %tmp1.sink11 = phi ptr [ %tmp1, %if.else ], [ %tmp, %if.then ] + %function.sink = phi ptr [ %function, %if.else ], [ %msg, %if.then ] + %.sink9 = phi i64 [ 24, %if.else ], [ 32, %if.then ] + %.str1.sink = phi ptr [ @.str1, %if.else ], [ @.str, %if.then ] + %3 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink12 + store ptr %function.sink, ptr %3, align 8 + %4 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink9 + store ptr %expr, ptr %4, align 8 + %call.i.i = call noundef i32 @vprintf(ptr noundef nonnull %.str1.sink, ptr noundef nonnull %tmp1.sink11) #24 + call void @llvm.trap() #26 + unreachable +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #8 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #9 + +; Function Attrs: convergent 
nocallback nounwind +declare void @llvm.nvvm.barrier.sync(i32) #10 + +; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) +define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) local_unnamed_addr #11 { +entry: + %0 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 + store ptr %0, ptr %WorkFn, align 8, !tbaa !76 + %tobool.not = icmp eq ptr %0, null + br i1 %tobool.not, label %return, label %if.end + +if.end: ; preds = %entry + %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 + %2 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !61 + %tobool.not.i = icmp eq i32 %2, 0 + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %4 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %tobool.i.not.i.i = icmp eq i32 %4, 0 + %mul.neg.i.i.i = select i1 %tobool.i.not.i.i, i32 -32, i32 0 + %sub.i.i.i = add i32 %mul.neg.i.i.i, %3 + %cond.i = select i1 %tobool.not.i, i32 %sub.i.i.i, i32 %2 + %cmp = icmp ult i32 %1, %cond.i + br label %return + +return: ; preds = %if.end, %entry + %retval.0 = phi i1 [ %cmp, %if.end ], [ false, %entry ] + ret i1 %retval.0 +} + +; Function Attrs: convergent mustprogress noinline nounwind +define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #12 { +entry: + %0 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 + %1 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 + %and.i.i = and i32 %0, 1 + %and.i = and i32 %and.i.i, %1 + %tobool.i = icmp ne i32 %and.i, 0 + %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + 
%tobool.i1 = icmp ne i32 %2, 0 + %or.cond = select i1 %tobool.i, i1 %tobool.i1, i1 false + br i1 %or.cond, label %if.then, label %if.else + +if.then: ; preds = %entry + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 297, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + unreachable + +if.else: ; preds = %entry + %tobool.i2.not = icmp eq i32 %2, 0 + tail call void @llvm.assume(i1 noundef %tobool.i2.not) #21 + %3 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !61 + %tobool.not.i.i = icmp eq i32 %3, 0 + %4 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8 + %tobool.not.i = icmp ne i32 %4, 0 + %or.cond.not.i = select i1 %tobool.not.i.i, i1 %tobool.not.i, i1 false + br i1 %or.cond.not.i, label %lor.rhs.i, label %_ZN4ompx5state19resetStateForThreadEj.exit + +lor.rhs.i: ; preds = %if.else + %5 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 + %6 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + %idxprom.i = zext i32 %5 to i64 + %arrayidx.i = getelementptr inbounds ptr, ptr %6, i64 %idxprom.i + %7 = load ptr, ptr %arrayidx.i, align 8, !tbaa !76 + %tobool1.not.i = icmp eq ptr %7, null + br i1 %tobool1.not.i, label %_ZN4ompx5state19resetStateForThreadEj.exit, label %if.end4.i, !prof !88 + +if.end4.i: ; preds = %lor.rhs.i + %PreviousThreadState7.i = getelementptr inbounds i8, ptr %7, i64 32 + %8 = load ptr, ptr %PreviousThreadState7.i, align 8, !tbaa !89 + tail call void @free(ptr noundef nonnull dereferenceable(40) %7) #28 + %9 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + %arrayidx11.i = getelementptr inbounds ptr, ptr %9, i64 %idxprom.i + store ptr 
%8, ptr %arrayidx11.i, align 8, !tbaa !76 + %.pre = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + br label %_ZN4ompx5state19resetStateForThreadEj.exit + +_ZN4ompx5state19resetStateForThreadEj.exit: ; preds = %if.else, %lor.rhs.i, %if.end4.i + %10 = phi i32 [ 0, %if.else ], [ 0, %lor.rhs.i ], [ %.pre, %if.end4.i ] + %tobool.i6 = icmp ne i32 %10, 0 + %or.cond8 = select i1 %tobool.i, i1 %tobool.i6, i1 false + br i1 %or.cond8, label %if.then7, label %if.else8 + +if.then7: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 300, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + unreachable + +if.else8: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit + %tobool.i7.not = icmp eq i32 %10, 0 + tail call void @llvm.assume(i1 noundef %tobool.i7.not) #21 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #9 + +; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) +declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #13 + +; Function Attrs: convergent +declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #14 + +; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write) +declare void @llvm.trap() #15 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #10 + +; Function Attrs: convergent mustprogress nounwind +define internal void @__kmpc_target_deinit() #4 { +entry: + %WorkFn = alloca ptr, align 8 + %0 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %tobool.i.not = icmp eq i32 %0, 0 + br i1 %tobool.i.not, label %if.end, label 
%cleanup + +if.end: ; preds = %entry + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %sub.i.i = add i32 %1, -1 + %and.i.i = and i32 %sub.i.i, -32 + %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i = icmp eq i32 %2, %and.i.i + br i1 %cmp.i.i, label %if.then3, label %if.else + +if.then3: ; preds = %if.end + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 + br label %cleanup + +if.else: ; preds = %if.end + %3 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + %4 = load i8, ptr %3, align 8, !tbaa !91 + %tobool6.not = icmp eq i8 %4, 0 + br i1 %tobool6.not, label %if.then7, label %cleanup + +if.then7: ; preds = %if.else + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn) #29 + store ptr null, ptr %WorkFn, align 8, !tbaa !76 + %call8 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) #22 + %5 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 + %6 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 + %and.i.i1 = and i32 %5, 1 + %and.i = and i32 %and.i.i1, %6 + %tobool.i2.not = icmp eq i32 %and.i, 0 + %7 = load ptr, ptr %WorkFn, align 8 + %cmp = icmp eq ptr %7, null + %or.cond = select i1 %tobool.i2.not, i1 true, i1 %cmp + br i1 %or.cond, label %if.else11, label %if.then10 + +if.then10: ; preds = %if.then7 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 150, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 + unreachable + +if.else11: ; preds = %if.then7 + tail call void @llvm.assume(i1 noundef %cmp) #21 + call void 
@llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn) #22 + br label %cleanup + +cleanup: ; preds = %if.else11, %if.else, %if.then3, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #5 + +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #7 = { convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes 
#8 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #10 = { convergent nocallback nounwind } +attributes #11 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #12 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #13 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #14 = { convergent "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #15 = { cold noreturn nounwind memory(inaccessiblemem: write) } +attributes #16 = { convergent } +attributes #17 = { nounwind } +attributes #18 = { "llvm.assume"="ompx_no_call_asm" } +attributes #19 = { convergent nounwind "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #20 = { noreturn nounwind "llvm.assume"="ompx_no_call_asm" } +attributes #21 = { memory(write) "llvm.assume"="ompx_no_call_asm" } +attributes #22 = { nounwind "llvm.assume"="ompx_no_call_asm" } +attributes #23 = { convergent nounwind } +attributes #24 = { convergent nounwind "llvm.assume"="ompx_no_call_asm" } +attributes #25 = { 
"llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #26 = { noreturn "llvm.assume"="ompx_no_call_asm" } +attributes #27 = { nofree willreturn "llvm.assume"="ompx_no_call_asm" } +attributes #28 = { convergent nounwind willreturn "llvm.assume"="ompx_no_call_asm" } +attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} +!llvm.dbg.cu = !{!10} +!nvvm.annotations = !{!12, !13} +!omp_offload.info = !{!14} +!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} +!1 = !{i32 7, !"Dwarf Version", i32 2} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"wchar_size", i32 4} +!4 = !{i32 7, !"openmp", i32 51} +!5 = !{i32 7, !"openmp-device", i32 51} +!6 = !{i32 8, !"PIC Level", i32 2} +!7 = !{i32 7, !"frame-pointer", i32 2} +!8 = !{i32 1, !"ThinLTO", i32 0} +!9 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!10 = distinct !DICompileUnit(language: DW_LANG_C11, file: !11, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!11 = !DIFile(filename: "test.c", directory: "/tmp") +!12 = !{ptr @__omp_offloading_10305_5c00dd_h_l12_debug__, !"maxntidx", i32 128} +!13 = !{ptr @__omp_offloading_10305_5c00dd_h_l12, !"kernel", i32 1} +!14 = !{i32 0, i32 66309, i32 6029533, !"h", i32 12, i32 0, i32 0} +!15 = !{!"clang version 19.0.0git"} +!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!17 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12_debug__", scope: !11, file: !11, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) +!18 = !DISubroutineType(types: !19) +!19 = !{null, !20} +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = 
!DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!23 = !{} +!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !17, type: !20, flags: DIFlagArtificial) +!25 = !DILocation(line: 0, scope: !17) +!26 = !DILocation(line: 13, column: 3, scope: !17) +!27 = !DILocalVariable(name: "i", scope: !28, file: !11, line: 14, type: !29) +!28 = distinct !DILexicalBlock(scope: !17, file: !11, line: 13, column: 3) +!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!30 = !DILocation(line: 14, column: 9, scope: !28) +!31 = !DILocalVariable(name: "a", scope: !28, file: !11, line: 15, type: !32) +!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) +!33 = !{!34} +!34 = !DISubrange(count: 2) +!35 = !DILocation(line: 15, column: 9, scope: !28) +!36 = !DILocation(line: 16, column: 5, scope: !28) +!37 = !DILocation(line: 17, column: 5, scope: !28) +!38 = !DILocation(line: 18, column: 3, scope: !28) +!39 = !DILocation(line: 18, column: 3, scope: !17) +!40 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12", scope: !11, file: !11, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) +!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) +!42 = !DILocation(line: 0, scope: !40) +!43 = !DILocation(line: 12, column: 1, scope: !40) +!44 = distinct !DISubprogram(name: "g", scope: !11, file: !11, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !10, retainedNodes: !23) +!45 = !DISubroutineType(types: !46) +!46 = !{null} +!47 = !DILocalVariable(name: "i", scope: !44, file: !11, line: 4, type: !29) +!48 = !DILocation(line: 4, column: 7, scope: !44) +!49 = !DILocalVariable(name: "a", scope: !44, file: !11, line: 5, type: !32) +!50 = !DILocation(line: 5, column: 7, scope: !44) +!51 = 
!DILocation(line: 6, column: 3, scope: !44) +!52 = !DILocation(line: 7, column: 3, scope: !44) +!53 = !DILocation(line: 8, column: 1, scope: !44) +!54 = !{!55, !58, i64 2} +!55 = !{!"_ZTS26ConfigurationEnvironmentTy", !56, i64 0, !56, i64 1, !58, i64 2, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} +!56 = !{!"omnipotent char", !57, i64 0} +!57 = !{!"Simple C++ TBAA"} +!58 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !56, i64 0} +!59 = !{!"int", !56, i64 0} +!60 = !{!55, !56, i64 0} +!61 = !{!59, !59, i64 0} +!62 = !{!56, !56, i64 0} +!63 = !{!64, !59, i64 0} +!64 = !{!"_ZTSN4ompx5state11TeamStateTyE", !65, i64 0, !59, i64 28, !59, i64 32, !66, i64 40} +!65 = !{!"_ZTSN4ompx5state10ICVStateTyE", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} +!66 = !{!"any pointer", !56, i64 0} +!67 = !{!64, !59, i64 4} +!68 = !{!64, !59, i64 8} +!69 = !{!64, !59, i64 12} +!70 = !{!64, !59, i64 16} +!71 = !{!64, !59, i64 20} +!72 = !{!64, !59, i64 24} +!73 = !{!64, !59, i64 28} +!74 = !{!64, !59, i64 32} +!75 = !{!64, !66, i64 40} +!76 = !{!66, !66, i64 0} +!77 = !{!78, !59, i64 0} +!78 = !{!"_ZTS19DeviceEnvironmentTy", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !79, i64 16, !79, i64 24, !79, i64 32, !79, i64 40} +!79 = !{!"long", !56, i64 0} +!80 = !{!65, !59, i64 0} +!81 = !{!65, !59, i64 4} +!82 = !{!65, !59, i64 8} +!83 = !{!65, !59, i64 16} +!84 = !{!65, !59, i64 20} +!85 = !{!65, !59, i64 24} +!86 = distinct !{!86, !87} +!87 = !{!"llvm.loop.mustprogress"} +!88 = !{!"branch_weights", i32 2000, i32 1} +!89 = !{!90, !66, i64 32} +!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !65, i64 0, !66, i64 32} +!91 = !{!92, !56, i64 0} +!92 = !{!"_ZTS19KernelEnvironmentTy", !55, i64 0, !66, i64 32, !66, i64 40} From a7656de882610df9a7f1e60c65ce214cef70a32a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 12 Aug 2024 17:40:35 -0400 Subject: [PATCH 02/46] Move docs to KernelInfo.rst --- llvm/docs/KernelInfo.rst | 61 +++++++++++++++++++++++++ llvm/include/llvm/Analysis/KernelInfo.h | 29 +----------- 2 files changed, 62 insertions(+), 28 deletions(-) create mode 100644 llvm/docs/KernelInfo.rst diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst new file mode 100644 index 0000000000000..397b32602bce2 --- /dev/null +++ b/llvm/docs/KernelInfo.rst @@ -0,0 +1,61 @@ +========== +KernelInfo +========== + +.. contents:: + :local: + +Introduction +============ + +This LLVM IR pass reports various statistics for code compiled for GPUs. The +goal of these statistics is to help identify bad code patterns and ways to +mitigate them. The pass operates at the LLVM IR level so that it can, in +theory, support any LLVM-based compiler for programming languages supporting +GPUs. + +By default, the pass is disabled. For convenience, the command-line option +``-kernel-info-end-lto`` inserts it at the end of LTO, and options like +``-Rpass=kernel-info`` enable its remarks. Example ``opt`` and ``clang`` +command lines appear in the next section. + +Remarks include summary statistics (e.g., total size of static allocas) and +individual occurrences (e.g., source location of each alloca). Examples of the +output appear in tests in ``llvm/test/Analysis/KernelInfo``. + +Example Command Lines +===================== + +To analyze a C program as it appears to an LLVM GPU backend at the end of LTO: + +.. code-block:: shell + + $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info -mllvm -kernel-info-end-lto + +To analyze specified LLVM IR, perhaps previously generated by something like +``clang -save-temps -g -fopenmp --offload-arch=native test.c``: + +.. 
code-block:: shell + + $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ + -pass-remarks=kernel-info -passes=kernel-info + +kernel-info can also be inserted into a specified LLVM pass pipeline using +``-kernel-info-end-lto``, or it can be positioned explicitly in that pipeline: + +.. code-block:: shell + + $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info -mllvm -kernel-info-end-lto \ + -Xoffload-linker --lto-newpm-passes='lto' + + $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info \ + -Xoffload-linker --lto-newpm-passes='lto,module(kernel-info)' + + $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ + -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto' + + $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ + -pass-remarks=kernel-info -passes='lto,module(kernel-info)' diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 5495bb2fd4d92..96cd5f68af646 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -9,34 +9,7 @@ // This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter // classes used to extract function properties from a GPU kernel. 
// -// To analyze a C program as it appears to an LLVM GPU backend at the end of -// LTO: -// -// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -// -Rpass=kernel-info -mllvm -kernel-info-end-lto -// -// To analyze specified LLVM IR, perhaps previously generated by something like -// 'clang -save-temps -g -fopenmp --offload-arch=native test.c': -// -// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -// -pass-remarks=kernel-info -passes=kernel-info -// -// kernel-info can also be inserted into a specified LLVM pass pipeline using -// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline: -// -// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -// -Rpass=kernel-info -mllvm -kernel-info-end-lto \ -// -Xoffload-linker --lto-newpm-passes='lto' -// -// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -// -Rpass=kernel-info \ -// -Xoffload-linker --lto-newpm-passes='lto,module(kernel-info)' -// -// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -// -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto' -// -// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -// -pass-remarks=kernel-info -passes='lto,module(kernel-info)' +// See llvm/docs/KernelInfo.rst. // ===---------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_KERNELINFO_H From d92856ec609d4bdf7642b8186cf0458dadd80f4a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 12 Aug 2024 17:41:02 -0400 Subject: [PATCH 03/46] Move conditional outside registration call --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 +++++----- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3b2ed9fe4236c..93d1d6b1b80b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -774,14 +774,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return nullptr; }); - PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { - if (KernelInfoEndLTO) { + if (KernelInfoEndLTO) { + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(KernelInfoPrinter()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - } - }); + }); + } } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8d77c8e53f7a6..1a4a9781db333 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -240,14 +240,14 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); - PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { - if (KernelInfoEndLTO) { + if (KernelInfoEndLTO) { + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(KernelInfoPrinter()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - } - }); + }); + } } TargetTransformInfo From 
6ac3f419b94e5c5ecd4e7a33b16e1f7e89fa1790 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 18:03:51 -0400 Subject: [PATCH 04/46] Use llvm::SmallString --- llvm/lib/Analysis/KernelInfo.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 9df3b5b32afcb..caeada91c31af 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/KernelInfo.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/DebugInfo.h" @@ -139,8 +140,8 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } remarkAlloca(ORE, F, *Alloca, StaticSize); } else if (const CallBase *Call = dyn_cast(&I)) { - std::string CallKind; - std::string RemarkKind; + SmallString<40> CallKind; + SmallString<40> RemarkKind; if (Call->isIndirectCall()) { IndirectCalls += Direction; CallKind += "indirect"; From 6367ad7ea65d7ef1da51b4fe8cf6e50af90b1f36 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 19:22:45 -0400 Subject: [PATCH 05/46] Use TTI.getFlatAddressSpace for addrspace(0) We have to be more careful about targets in the test suite now because `getFlatAddressSpace` returns garbage for unsupported targets. Should we change the remarks to say flat addrspace instead of addrspace(0)? 
--- llvm/include/llvm/Analysis/KernelInfo.h | 4 +++- llvm/lib/Analysis/KernelInfo.cpp | 18 ++++++++++-------- .../Inputs/test.ll} | 9 --------- .../Analysis/KernelInfo/addrspace0/amdgpu.ll | 12 ++++++++++++ .../Analysis/KernelInfo/addrspace0/nvptx.ll | 12 ++++++++++++ llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 10 +++------- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 10 +++------- 7 files changed, 43 insertions(+), 32 deletions(-) rename llvm/test/Analysis/KernelInfo/{addrspace0.ll => addrspace0/Inputs/test.ll} (97%) create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 96cd5f68af646..c4a18d47723ab 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -16,6 +16,7 @@ #define LLVM_ANALYSIS_KERNELINFO_H #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" namespace llvm { class DominatorTree; @@ -24,7 +25,8 @@ class Function; /// Data structure holding function info for kernels. 
class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo &TTI); public: static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index caeada91c31af..de08bd49aacfc 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -122,7 +122,8 @@ static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, } void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE) { + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo &TTI) { assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); @@ -170,34 +171,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { - if (MI->getDestAddressSpace() == 0) { + if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { - if (MT->getSourceAddressSpace() == 0) { + if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { - if (Load->getPointerAddressSpace() == 0) { + if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } else if (const StoreInst *Store = dyn_cast(&I)) { - if (Store->getPointerAddressSpace() == 0) { + if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) 
{ - if (At->getPointerAddressSpace() == 0) { + if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { - if (At->getPointerAddressSpace() == 0) { + if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } @@ -286,6 +287,7 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, KernelInfo KernelInfo::getKernelInfo(Function &F, FunctionAnalysisManager &FAM) { + const TargetTransformInfo &TTI = FAM.getResult(F); KernelInfo KI; // Only analyze modules for GPUs. // TODO: This would be more maintainable if there were an isGPU. @@ -319,7 +321,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, auto &ORE = FAM.getResult(F); for (const auto &BB : F) if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE); + KI.updateForBB(BB, +1, ORE, TTI); #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) diff --git a/llvm/test/Analysis/KernelInfo/addrspace0.ll b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll similarity index 97% rename from llvm/test/Analysis/KernelInfo/addrspace0.ll rename to llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll index 4c472396443f5..79d3cd2562e90 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0.ll +++ b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll @@ -1,12 +1,3 @@ -; Check info on addrspace(0) memory accesses. 
- -; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ -; RUN: -disable-output %s 2>&1 | \ -; RUN: FileCheck -match-full-lines --implicit-check-not='addrspace(0)' %s - -target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - define void @f() !dbg !3 { entry: ; load diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll b/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll new file mode 100644 index 0000000000000..b7a26d6cb47ba --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll @@ -0,0 +1,12 @@ +; Check info on addrspace(0) memory accesses when the target is amdgpu. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll b/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll new file mode 100644 index 0000000000000..43bb985744e0c --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll @@ -0,0 +1,12 @@ +; Check info on addrspace(0) memory accesses when the target is nvptx. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). 
+ +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index ee5f65b8e5ab7..d417f8b866f73 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -1,16 +1,12 @@ ; See ./README.md for how to maintain the LLVM IR in this test. +; REQUIRES: amdgpu-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s -; For some builds, we see a warning like: -; -; opt: WARNING: failed to create target machine for 'amdgcn-amd-amdhsa': unable to get target for 'amdgcn-amd-amdhsa', see --version and --triple. -; -; But there should be no other remarks here. -; CHECK-NOT: remark: - +; CHECK-NOT: remark: ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 41d068b03548b..1222267a8fe57 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -1,16 +1,12 @@ ; See ./README.md for how to maintain the LLVM IR in this test. 
+; REQUIRES: nvptx-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s -; For some builds, we see a warning like: -; -; opt: WARNING: failed to create target machine for 'nvptx64-nvidia-cuda': unable to get target for 'nvptx64-nvidia-cuda', see --version and --triple. -; -; But there should be no other remarks here. -; CHECK-NOT: remark: - +; CHECK-NOT: remark: ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes From 78446bbb9e1caed303288a2962dd7c78a8779c06 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 19:31:58 -0400 Subject: [PATCH 06/46] Avoid repetition between amdgpu and nvptx tests --- .../kernel-info-after-lto/Inputs/test.ll | 22 ++++++++++ .../kernel-info-after-lto/amdgpu.ll | 43 ++++--------------- .../KernelInfo/kernel-info-after-lto/nvptx.ll | 43 ++++--------------- 3 files changed, 40 insertions(+), 68 deletions(-) create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll new file mode 100644 index 0000000000000..b85e3c581867c --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll @@ -0,0 +1,22 @@ +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} 
+!nvvm.annotations = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll index 7d190ece46e16..6d6e83e8d317f 100644 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll @@ -4,44 +4,19 @@ ; REQUIRES: amdgpu-registered-target ; -kernel-info-end-lto inserts kernel-info into LTO pipeline. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %s +; RUN: FileCheck -match-full-lines %S/Inputs/test.ll ; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll ; Omitting LTO disables kernel-info. 
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" -target triple = "amdgcn-amd-amdhsa" - -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; NONE-NOT: remark: -define void @test() #0 !dbg !5 { -entry: - ret void -} - -attributes #0 = { - "omp_target_num_teams"="100" -} - -!llvm.module.flags = !{!0} -!llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!2 = !DIFile(filename: "test.c", directory: "/tmp") -!3 = !{} -!4 = !DISubroutineType(types: !3) -!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) -!6 = !{ptr @test, !"maxclusterrank", i32 200} -!7 = !{ptr @test, !"maxntidx", i32 210} -!8 = distinct !{ptr null, !"kernel", i32 1} +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll index 4e790123c313a..1e427daed671e 100644 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll @@ -4,44 +4,19 @@ ; 
REQUIRES: nvptx-registered-target ; -kernel-info-end-lto inserts kernel-info into LTO pipeline. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %s +; RUN: FileCheck -match-full-lines %S/Inputs/test.ll ; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll ; Omitting LTO disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s - -target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; NONE-NOT: remark: -define void @test() #0 !dbg !5 { -entry: - ret void -} - -attributes #0 = { - "omp_target_num_teams"="100" -} - -!llvm.module.flags = !{!0} -!llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!2 = !DIFile(filename: "test.c", directory: "/tmp") -!3 = !{} -!4 = !DISubroutineType(types: !3) -!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: 
DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) -!6 = !{ptr @test, !"maxclusterrank", i32 200} -!7 = !{ptr @test, !"maxntidx", i32 210} -!8 = distinct !{ptr null, !"kernel", i32 1} +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll From fede524269915edb51b7d6680a7280a79ca0f710 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 19:39:14 -0400 Subject: [PATCH 07/46] Use named values in tests --- .../Analysis/KernelInfo/addrspace0/Inputs/test.ll | 2 +- llvm/test/Analysis/KernelInfo/calls.ll | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll index 79d3cd2562e90..0821fde8e25b1 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll @@ -1,6 +1,6 @@ define void @f() !dbg !3 { entry: - ; load + ; load: check remarks for both unnamed and named values. 
; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0) %0 = load i32, ptr null, align 4, !dbg !6 ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0) diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index 6101a71254898..25b8e3d880303 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -17,9 +17,9 @@ entry: call void @g(), !dbg !104 ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !105 - %0 = load ptr, ptr null, align 8 + %fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call - call void %0(), !dbg !106 + call void %fnPtr(), !dbg !106 ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 fcont: @@ -30,7 +30,7 @@ gcont: invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke - invoke void %0() to label %end unwind label %cleanup, !dbg !110 + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: %ll = landingpad { ptr, i32 } cleanup @@ -53,9 +53,9 @@ entry: call void @g(), !dbg !203 ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !204 - %0 = load ptr, ptr null, align 8 + %fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:9:3: in function 'g', indirect call - call void %0(), !dbg !205 + call void %fnPtr(), !dbg !205 ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 fcont: @@ -66,7 +66,7 @@ gcont: invoke void @h() to label %hcont unwind label %cleanup, !dbg 
!208 hcont: ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke - invoke void %0() to label %end unwind label %cleanup, !dbg !209 + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 cleanup: %ll = landingpad { ptr, i32 } cleanup From 4c30b8a767c8e5fcaa4c6e8979d5515b9f4656f1 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 12:03:06 -0400 Subject: [PATCH 08/46] Say flat address space instead of addrspace(0) --- llvm/include/llvm/Analysis/KernelInfo.h | 4 +- llvm/lib/Analysis/KernelInfo.cpp | 32 ++++---- .../Inputs/test.ll | 74 +++++++++---------- .../{addrspace0 => flat-addrspace}/amdgpu.ll | 6 +- .../{addrspace0 => flat-addrspace}/nvptx.ll | 6 +- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 12 +-- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 12 +-- 7 files changed, 73 insertions(+), 73 deletions(-) rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/Inputs/test.ll (82%) rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/amdgpu.ll (53%) rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/nvptx.ll (54%) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index c4a18d47723ab..66dd95046dd97 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -89,8 +89,8 @@ class KernelInfo { /// Number of calls of type InvokeInst. int64_t Invokes = 0; - /// Number of addrspace(0) memory accesses (via load, store, etc.). - int64_t AddrspaceZeroAccesses = 0; + /// Number of flat addrspace memory accesses (via load, store, etc.). + int64_t FlatAddrspaceAccesses = 0; }; /// Analysis class for KernelInfo. 
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index de08bd49aacfc..4eccc8807106b 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -102,11 +102,11 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, }); } -static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, +static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, const Function &Caller, const Instruction &Inst) { ORE.emit([&] { - OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst); + OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst); R << "in "; identifyFunction(R, Caller); if (const IntrinsicInst *II = dyn_cast(&Inst)) { @@ -116,7 +116,7 @@ static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, } if (Inst.hasName()) R << " ('%" << Inst.getName() << "')"; - R << " accesses memory in addrspace(0)"; + R << " accesses memory in flat address space"; return R; }); } @@ -172,35 +172,35 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const StoreInst *Store 
= dyn_cast(&I)) { if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } } @@ -344,7 +344,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, REMARK_PROPERTY(IndirectCalls); REMARK_PROPERTY(DirectCallsToDefinedFunctions); REMARK_PROPERTY(Invokes); - REMARK_PROPERTY(AddrspaceZeroAccesses); + REMARK_PROPERTY(FlatAddrspaceAccesses); #undef REMARK_PROPERTY return KI; diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll similarity index 82% rename from llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll index 0821fde8e25b1..07c884792f45c 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -1,129 +1,129 @@ define void @f() !dbg !3 { entry: ; load: check remarks for both unnamed and named values. 
- ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in flat address space %0 = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0) + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space %load = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in addrspace(0) + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 ; store - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space store i32 0, ptr null, align 4, !dbg !7 - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space store i32 0, ptr addrspace(0) null, align 4, !dbg !7 store i32 0, ptr addrspace(1) null, align 4, !dbg !7 store i32 0, ptr addrspace(8) null, align 4, !dbg !7 ; atomicrmw - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction 
accesses memory in flat address space atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 ; cmpxchg - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 ; llvm.memcpy - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: 
test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 ; llvm.memcpy.inline - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: 
test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 ; llvm.memcpy.element.unordered.atomic - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 
null, i64 10, i32 4), !dbg !10 call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 ; llvm.memmove - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 
null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 ; llvm.memmove.element.unordered.atomic - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr 
addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 ; llvm.memset - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 ; llvm.memset.inline - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 ; llvm.memset.element.unordered.atomic - ; CHECK: remark: test.c:9:5: in function 'f', 
'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12 - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12 call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12 call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12 ret void } -; CHECK: remark: test.c:2:0: in function 'f', AddrspaceZeroAccesses = 36 +; CHECK: remark: test.c:2:0: in function 'f', FlatAddrspaceAccesses = 36 !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2} diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll similarity index 53% rename from llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll index b7a26d6cb47ba..7447dcf51cc89 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll @@ -1,12 +1,12 @@ -; Check info on addrspace(0) memory accesses when the target is amdgpu. +; Check info on flat address space memory accesses when the target is amdgpu. ; ; The target matters because kernel-info calls -; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. 
; REQUIRES: amdgpu-registered-target ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ -; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ ; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll similarity index 54% rename from llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll index 43bb985744e0c..02321c19e022d 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll @@ -1,12 +1,12 @@ -; Check info on addrspace(0) memory accesses when the target is nvptx. +; Check info on flat address space memory accesses when the target is nvptx. ; ; The target matters because kernel-info calls -; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. 
; REQUIRES: nvptx-registered-target ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ -; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ ; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index d417f8b866f73..56ee35810ef26 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -26,11 +26,11 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 
0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 @@ -40,7 +40,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes @@ -54,7 +54,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in 
function 'g', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 1222267a8fe57..ee76ecdf5d795 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -25,11 +25,11 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function 
'[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 @@ -39,7 +39,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes @@ -53,7 +53,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: 
remark: test.c:3:0: in function 'g', Invokes = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: remark: {{.*: in function 'g',.*}} ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't From 33f0d4dd276eda64f495cdf66411bc77d20517c6 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 12:23:42 -0400 Subject: [PATCH 09/46] Cache the flat address space --- llvm/include/llvm/Analysis/KernelInfo.h | 8 +++++--- llvm/lib/Analysis/KernelInfo.cpp | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 66dd95046dd97..3cf5bec58cf55 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -25,8 +25,7 @@ class Function; /// Data structure holding function info for kernels. class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE, - const TargetTransformInfo &TTI); + OptimizationRemarkEmitter &ORE); public: static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); @@ -89,7 +88,10 @@ class KernelInfo { /// Number of calls of type InvokeInst. int64_t Invokes = 0; - /// Number of flat addrspace memory accesses (via load, store, etc.). + /// Target-specific flat address space. + unsigned FlatAddrspace; + + /// Number of flat address space memory accesses (via load, store, etc.). 
int64_t FlatAddrspaceAccesses = 0; }; diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 4eccc8807106b..b5b9145641550 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -122,8 +122,7 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, } void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE, - const TargetTransformInfo &TTI) { + OptimizationRemarkEmitter &ORE) { assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); @@ -171,34 +170,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { - if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) { + if (MI->getDestAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { - if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) { + if (MT->getSourceAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { - if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (Load->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const StoreInst *Store = dyn_cast(&I)) { - if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (Store->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) { - if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (At->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; 
remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { - if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (At->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } @@ -287,7 +286,6 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, KernelInfo KernelInfo::getKernelInfo(Function &F, FunctionAnalysisManager &FAM) { - const TargetTransformInfo &TTI = FAM.getResult(F); KernelInfo KI; // Only analyze modules for GPUs. // TODO: This would be more maintainable if there were an isGPU. @@ -297,6 +295,8 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, return KI; KI.IsValid = true; + KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); + // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); @@ -321,7 +321,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, auto &ORE = FAM.getResult(F); for (const auto &BB : F) if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE, TTI); + KI.updateForBB(BB, +1, ORE); #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) From a2a512c5bfbea1bbe14f4db2574631b0703106ea Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 13:18:20 -0400 Subject: [PATCH 10/46] Link KernelInfo.rst from Passes.rst --- llvm/docs/Passes.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index 49f633e98d16f..939aeabd599b9 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -5,6 +5,11 @@ LLVM's Analysis and Transform Passes .. contents:: :local: +.. toctree:: + :hidden: + + KernelInfo + Introduction ============ .. warning:: This document is not updated frequently, and the list of passes @@ -148,6 +153,12 @@ This pass collects the count of all instructions and reports them. 
Bookkeeping for "interesting" users of expressions computed from induction variables. +``kernel-info``: GPU Kernel Info +-------------------------------- + +Reports various statistics for codes compiled for GPUs. This pass is +:doc:`documented separately<KernelInfo>`. + ``lazy-value-info``: Lazy Value Information Analysis ---------------------------------------------------- From de04ac4fee83f24bad8510f055cc7b303cf76939 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 13:48:32 -0400 Subject: [PATCH 11/46] Don't filter out cpus -kernel-info-end-lto doesn't insert kernel-info for cpu modules. If the user explicitly specifies the pass for a cpu module, then it will run now. --- llvm/include/llvm/Analysis/KernelInfo.h | 4 ---- llvm/lib/Analysis/KernelInfo.cpp | 8 -------- 2 files changed, 12 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 3cf5bec58cf55..951c58cfc0218 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -36,10 +36,6 @@ class KernelInfo { bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); } - /// If false, nothing was recorded here because the supplied function didn't - /// appear in a module compiled for a GPU. - bool IsValid = false; - /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index b5b9145641550..b29c3c3fecd16 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -287,14 +287,6 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, KernelInfo KernelInfo::getKernelInfo(Function &F, FunctionAnalysisManager &FAM) { KernelInfo KI; - // Only analyze modules for GPUs. - // TODO: This would be more maintainable if there were an isGPU.
- const std::string &TT = F.getParent()->getTargetTriple(); - llvm::Triple T(TT); - if (!T.isAMDGPU() && !T.isNVPTX()) - return KI; - KI.IsValid = true; - KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); // Record function properties. From ec5d2bd00ed0c9305a0820d56f69f1be25ebdd6b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 12:19:16 -0400 Subject: [PATCH 12/46] Include less in header --- llvm/include/llvm/Analysis/KernelInfo.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 951c58cfc0218..c3bc0849efa0f 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -15,12 +15,11 @@ #ifndef LLVM_ANALYSIS_KERNELINFO_H #define LLVM_ANALYSIS_KERNELINFO_H -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/PassManager.h" namespace llvm { -class DominatorTree; -class Function; +class BasicBlock; +class OptimizationRemarkEmitter; /// Data structure holding function info for kernels. class KernelInfo { From c06b9052e6f18e2f290f54eb1ca2583aa3bbeee0 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 12:19:42 -0400 Subject: [PATCH 13/46] Removed unused comparison operators They wouldn't have worked reliably anyway given uninitialized padding in the struct. 
--- llvm/include/llvm/Analysis/KernelInfo.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index c3bc0849efa0f..6d4edfb3525cc 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -29,12 +29,6 @@ class KernelInfo { public: static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); - bool operator==(const KernelInfo &FPI) const { - return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0; - } - - bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); } - /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; From d83d22a1079eb66487b084905af114ec384a8319 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 12:19:52 -0400 Subject: [PATCH 14/46] Remove redundant null check --- llvm/lib/Analysis/KernelInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index b29c3c3fecd16..c039d495ee6ed 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -161,7 +161,7 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } if (!Call->isIndirectCall()) { if (const Function *Callee = Call->getCalledFunction()) { - if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) { + if (!Callee->isIntrinsic() && !Callee->isDeclaration()) { DirectCallsToDefinedFunctions += Direction; CallKind += " to defined function"; RemarkKind += "ToDefinedFunction"; From 1649cf8d3af43fd4bdcb5bf6335fffb52f9d92af Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 15:41:36 -0400 Subject: [PATCH 15/46] Move KernelInfo to KernelInfo.cpp, remove KernelInfoAnalysis For now, analysis results will not be used beyond emitting remarks. If that changes, we can revert. 
--- llvm/include/llvm/Analysis/KernelInfo.h | 90 +------------------ llvm/lib/Analysis/KernelInfo.cpp | 73 ++++++++++++++- llvm/lib/Passes/PassRegistry.def | 1 - .../test/Analysis/KernelInfo/openmp/README.md | 4 +- 4 files changed, 75 insertions(+), 93 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 6d4edfb3525cc..c5c33fac34655 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter -// classes used to extract function properties from a GPU kernel. +// This file defines the KernelInfoPrinter class used to emit remarks about +// function properties from a GPU kernel. // // See llvm/docs/KernelInfo.rst. // ===---------------------------------------------------------------------===// @@ -18,95 +18,11 @@ #include "llvm/IR/PassManager.h" namespace llvm { -class BasicBlock; -class OptimizationRemarkEmitter; - -/// Data structure holding function info for kernels. -class KernelInfo { - void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); - -public: - static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); - - /// Whether the function has external linkage and is not a kernel function. - bool ExternalNotKernel = false; - - /// OpenMP Launch bounds. - ///@{ - std::optional OmpTargetNumTeams; - std::optional OmpTargetThreadLimit; - ///@} - - /// AMDGPU launch bounds. - ///@{ - std::optional AmdgpuMaxNumWorkgroupsX; - std::optional AmdgpuMaxNumWorkgroupsY; - std::optional AmdgpuMaxNumWorkgroupsZ; - std::optional AmdgpuFlatWorkGroupSizeMin; - std::optional AmdgpuFlatWorkGroupSizeMax; - std::optional AmdgpuWavesPerEuMin; - std::optional AmdgpuWavesPerEuMax; - ///@} - - /// NVPTX launch bounds. 
- ///@{ - std::optional Maxclusterrank; - std::optional Maxntidx; - ///@} - - /// The number of alloca instructions inside the function, the number of those - /// with allocation sizes that cannot be determined at compile time, and the - /// sum of the sizes that can be. - /// - /// With the current implementation for at least some GPU archs, - /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in - /// case the implementation changes. - int64_t Allocas = 0; - int64_t AllocasDyn = 0; - int64_t AllocasStaticSizeSum = 0; - - /// Number of direct/indirect calls (anything derived from CallBase). - int64_t DirectCalls = 0; - int64_t IndirectCalls = 0; - - /// Number of direct calls made from this function to other functions - /// defined in this module. - int64_t DirectCallsToDefinedFunctions = 0; - - /// Number of calls of type InvokeInst. - int64_t Invokes = 0; - - /// Target-specific flat address space. - unsigned FlatAddrspace; - - /// Number of flat address space memory accesses (via load, store, etc.). - int64_t FlatAddrspaceAccesses = 0; -}; - -/// Analysis class for KernelInfo. -class KernelInfoAnalysis : public AnalysisInfoMixin { -public: - static AnalysisKey Key; - - using Result = const KernelInfo; - - KernelInfo run(Function &F, FunctionAnalysisManager &FAM) { - return KernelInfo::getKernelInfo(F, FAM); - } -}; - -/// Printer pass for KernelInfoAnalysis. -/// -/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled. 
class KernelInfoPrinter : public PassInfoMixin { public: explicit KernelInfoPrinter() {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) { - AM.getResult(F); - return PreservedAnalyses::all(); - } + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); static bool isRequired() { return true; } }; diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index c039d495ee6ed..a628f370c802e 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter -// classes used to extract function properties from a kernel. +// This file defines the KernelInfoPrinter class used to emit remarks about +// function properties from a GPU kernel. // //===----------------------------------------------------------------------===// @@ -27,6 +27,69 @@ using namespace llvm; #define DEBUG_TYPE "kernel-info" +/// Data structure holding function info for kernels. +class KernelInfo { + void updateForBB(const BasicBlock &BB, int64_t Direction, + OptimizationRemarkEmitter &ORE); + +public: + static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); + + /// Whether the function has external linkage and is not a kernel function. + bool ExternalNotKernel = false; + + /// OpenMP Launch bounds. + ///@{ + std::optional OmpTargetNumTeams; + std::optional OmpTargetThreadLimit; + ///@} + + /// AMDGPU launch bounds. + ///@{ + std::optional AmdgpuMaxNumWorkgroupsX; + std::optional AmdgpuMaxNumWorkgroupsY; + std::optional AmdgpuMaxNumWorkgroupsZ; + std::optional AmdgpuFlatWorkGroupSizeMin; + std::optional AmdgpuFlatWorkGroupSizeMax; + std::optional AmdgpuWavesPerEuMin; + std::optional AmdgpuWavesPerEuMax; + ///@} + + /// NVPTX launch bounds. 
+ ///@{ + std::optional Maxclusterrank; + std::optional Maxntidx; + ///@} + + /// The number of alloca instructions inside the function, the number of those + /// with allocation sizes that cannot be determined at compile time, and the + /// sum of the sizes that can be. + /// + /// With the current implementation for at least some GPU archs, + /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in + /// case the implementation changes. + int64_t Allocas = 0; + int64_t AllocasDyn = 0; + int64_t AllocasStaticSizeSum = 0; + + /// Number of direct/indirect calls (anything derived from CallBase). + int64_t DirectCalls = 0; + int64_t IndirectCalls = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + /// Number of calls of type InvokeInst. + int64_t Invokes = 0; + + /// Target-specific flat address space. + unsigned FlatAddrspace; + + /// Number of flat address space memory accesses (via load, store, etc.). + int64_t FlatAddrspaceAccesses = 0; +}; + static bool isKernelFunction(Function &F) { // TODO: Is this general enough? Consider languages beyond OpenMP. 
return F.hasFnAttribute("kernel"); @@ -342,4 +405,8 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, return KI; } -AnalysisKey KernelInfoAnalysis::Key; +PreservedAnalyses KernelInfoPrinter::run(Function &F, + FunctionAnalysisManager &AM) { + KernelInfo::getKernelInfo(F, AM); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index dcfa732f410b3..391cca0da2ea1 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -278,7 +278,6 @@ FUNCTION_ANALYSIS( MachineFunctionAnalysis(static_cast(TM))) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) -FUNCTION_ANALYSIS("kernel-info", KernelInfoAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis()) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md index 0d13950e198ed..5471b2e1b220d 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/README.md +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -1,9 +1,9 @@ -The tests in this directory check that basic KernelInfoAnalysis functionality +The tests in this directory check that basic KernelInfoPrinter functionality behaves reasonably for LLVM IR produced by Clang OpenMP codegen. So that these tests are straightforward to maintain and faithfully represent Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them. Other tests -more exhaustively check KernelInfoAnalysis features using reduced LLVM IR. +more exhaustively check KernelInfoPrinter features using reduced LLVM IR. The LLVM IR in each test file `$TEST` can be regenerated as follows in the case that Clang OpenMP codegen changes or it becomes desirable to adjust the source From 1a3c0aef034087e235fda909c69cc9e75b0bb874 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 16 Aug 2024 15:42:20 -0400 Subject: [PATCH 16/46] Use printAsOperand not getName to identify instruction --- llvm/lib/Analysis/KernelInfo.cpp | 8 ++++++-- .../Analysis/KernelInfo/flat-addrspace/Inputs/test.ll | 10 +++++----- llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 2 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index a628f370c802e..41acde725b471 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -177,8 +177,12 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, } else { R << ", '" << Inst.getOpcodeName() << "' instruction"; } - if (Inst.hasName()) - R << " ('%" << Inst.getName() << "')"; + if (!Inst.getType()->isVoidTy()) { + std::string Name; + raw_string_ostream OS(Name); + Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << " ('" << Name << "')"; + } R << " accesses memory in flat address space"; return R; }); diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll index 07c884792f45c..b54c3a18f3e70 100644 --- a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -1,7 +1,7 @@ define void @f() !dbg !3 { entry: ; load: check remarks for both unnamed and named values. 
- ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space %0 = load i32, ptr null, align 4, !dbg !6 ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space %load = load i32, ptr null, align 4, !dbg !6 @@ -19,17 +19,17 @@ entry: store i32 0, ptr addrspace(8) null, align 4, !dbg !7 ; atomicrmw - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 ; cmpxchg - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 diff --git 
a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 56ee35810ef26..82f6f243264bc 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index ee76ecdf5d795..eb2cba596be22 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 
direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 From ea89a81b0ebf30fa331f3bcd0dbfced21478846d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 15:42:28 -0400 Subject: [PATCH 17/46] Use printAsOperand to report indirect callee --- llvm/lib/Analysis/KernelInfo.cpp | 21 +++++++++++---------- llvm/test/Analysis/KernelInfo/calls.ll | 8 ++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 41acde725b471..9768fe90b1433 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -147,20 +147,21 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); R << "in "; identifyFunction(R, Caller); - R << ", " << CallKind; - if (const Function *Callee = - dyn_cast_or_null(Call.getCalledOperand())) { - R << ", callee is"; - StringRef Name = Callee->getName(); - if (auto *SubProgram = Callee->getSubprogram()) { + R << ", " << CallKind << ", callee is"; + Value *Callee = Call.getCalledOperand(); + std::string Name; + if (const Function *FnCallee = dyn_cast(Callee)) { + if (auto *SubProgram = FnCallee->getSubprogram()) { if (SubProgram->isArtificial()) R << " artificial"; } - if (!Name.empty()) - R << " '" << Name << "'"; - else - R << " with unknown name"; + Name = FnCallee->getName(); } + if (Name.empty()) { + raw_string_ostream OS(Name); + Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + } + R << " '" << Name << "'"; return R; }); } diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index 25b8e3d880303..d00ab2b74d398 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ 
b/llvm/test/Analysis/KernelInfo/calls.ll @@ -18,7 +18,7 @@ entry: ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !105 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call + ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !106 ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 @@ -29,7 +29,7 @@ gcont: ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: - ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke + ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: %ll = landingpad { ptr, i32 } @@ -54,7 +54,7 @@ entry: ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !204 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:9:3: in function 'g', indirect call + ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !205 ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 @@ -65,7 +65,7 @@ gcont: ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 hcont: - ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke + ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind 
label %cleanup, !dbg !209 cleanup: %ll = landingpad { ptr, i32 } From 8da602b92369af0d9a4f794b1956bd15ecac0263 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 16:36:36 -0400 Subject: [PATCH 18/46] Report inline assembly calls --- llvm/lib/Analysis/KernelInfo.cpp | 8 +++++++ llvm/test/Analysis/KernelInfo/calls.ll | 24 +++++++++++++++---- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 3 +++ llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 3 +++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 9768fe90b1433..034194e27f9fb 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -80,6 +80,9 @@ class KernelInfo { /// defined in this module. int64_t DirectCallsToDefinedFunctions = 0; + /// Number of direct calls to inline assembly. + int64_t InlineAssemblyCalls = 0; + /// Number of calls of type InvokeInst. int64_t Invokes = 0; @@ -234,6 +237,10 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, CallKind += " to defined function"; RemarkKind += "ToDefinedFunction"; } + } else if (Call->isInlineAsm()) { + InlineAssemblyCalls += Direction; + CallKind += " to inline assembly"; + RemarkKind += "ToInlineAssembly"; } } remarkCall(ORE, F, *Call, CallKind, RemarkKind); @@ -403,6 +410,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, REMARK_PROPERTY(DirectCalls); REMARK_PROPERTY(IndirectCalls); REMARK_PROPERTY(DirectCallsToDefinedFunctions); + REMARK_PROPERTY(InlineAssemblyCalls); REMARK_PROPERTY(Invokes); REMARK_PROPERTY(FlatAddrspaceAccesses); #undef REMARK_PROPERTY diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index d00ab2b74d398..2a2672c70b85c 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -17,6 +17,8 @@ entry: call void @g(), !dbg !104 ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to 
defined function, callee is artificial 'h' call void @h(), !dbg !105 + ; CHECK: remark: test.c:24:5: in artificial function 'h', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !111 %fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !106 @@ -29,6 +31,9 @@ gcont: ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: + ; CHECK: remark: test.c:25:5: in artificial function 'h', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112 +asmcont: ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: @@ -38,10 +43,11 @@ cleanup: end: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 6 +; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 8 ; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2 ; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 5 declare void @f() @@ -53,6 +59,8 @@ entry: call void @g(), !dbg !203 ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !204 + ; CHECK: remark: test.c:14:3: in function 'g', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !210 
%fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !205 @@ -65,6 +73,9 @@ gcont: ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 hcont: + ; CHECK: remark: test.c:15:3: in function 'g', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211 +asmcont: ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 cleanup: @@ -74,10 +85,11 @@ cleanup: end: ret void } -; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 6 +; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 8 ; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2 ; CHECK: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:3:0: in function 'g', Invokes = 4 +; CHECK: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g', Invokes = 5 !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -99,6 +111,8 @@ end: !108 = !DILocation(line: 21, column: 5, scope: !103) !109 = !DILocation(line: 22, column: 5, scope: !103) !110 = !DILocation(line: 23, column: 5, scope: !103) +!111 = !DILocation(line: 24, column: 5, scope: !103) +!112 = !DILocation(line: 25, column: 5, scope: !103) !200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !201 = !DISubroutineType(types: !3) @@ -110,3 +124,5 @@ end: !207 = !DILocation(line: 11, column: 3, scope: !200) !208 = !DILocation(line: 12, column: 3, scope: !200) !209 = !DILocation(line: 13, column: 3, scope: !200) +!210 = !DILocation(line: 14, column: 3, 
scope: !200) +!211 = !DILocation(line: 15, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 82f6f243264bc..be3b357cc4530 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 @@ -39,6 +40,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 @@ -53,6 +55,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', 
FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index eb2cba596be22..2dbd04b2536c4 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -24,6 +24,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 @@ -38,6 +39,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 @@ -52,6 +54,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: 
remark: {{.*: in function 'g',.*}} From 45114fd9d85d614f2f3bc18543fb6779cab1053d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 16:41:34 -0400 Subject: [PATCH 19/46] Use llvm::SmallString --- llvm/lib/Analysis/KernelInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 034194e27f9fb..96a77c96dc1f3 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -152,7 +152,7 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, identifyFunction(R, Caller); R << ", " << CallKind << ", callee is"; Value *Callee = Call.getCalledOperand(); - std::string Name; + SmallString<100> Name; // might be function name or asm expression if (const Function *FnCallee = dyn_cast(Callee)) { if (auto *SubProgram = FnCallee->getSubprogram()) { if (SubProgram->isArtificial()) @@ -161,7 +161,7 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, Name = FnCallee->getName(); } if (Name.empty()) { - raw_string_ostream OS(Name); + raw_svector_ostream OS(Name); Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); } R << " '" << Name << "'"; From eea139c63cde6f900962c5e999ffce79568b4391 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 16 Aug 2024 16:47:20 -0400 Subject: [PATCH 20/46] Use llvm::SmallString --- llvm/lib/Analysis/KernelInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 96a77c96dc1f3..ff71323516238 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -182,8 +182,8 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, R << ", '" << Inst.getOpcodeName() << "' instruction"; } if (!Inst.getType()->isVoidTy()) { - std::string Name; - raw_string_ostream OS(Name); + SmallString<20> Name; + raw_svector_ostream OS(Name); Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); R << " ('" << Name << "')"; } From 8bf6e4e4bb262e0866d3e2098bb1a16c7293e2be Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 17:17:07 -0400 Subject: [PATCH 21/46] getKernelInfo -> emitKernelInfo because return is unused --- llvm/lib/Analysis/KernelInfo.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index ff71323516238..282dc092bfd62 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -33,7 +33,7 @@ class KernelInfo { OptimizationRemarkEmitter &ORE); public: - static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); + static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM); /// Whether the function has external linkage and is not a kernel function. 
bool ExternalNotKernel = false; @@ -359,8 +359,7 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, return Result; } -KernelInfo KernelInfo::getKernelInfo(Function &F, - FunctionAnalysisManager &FAM) { +void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { KernelInfo KI; KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); @@ -415,11 +414,11 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, REMARK_PROPERTY(FlatAddrspaceAccesses); #undef REMARK_PROPERTY - return KI; + return; } PreservedAnalyses KernelInfoPrinter::run(Function &F, FunctionAnalysisManager &AM) { - KernelInfo::getKernelInfo(F, AM); + KernelInfo::emitKernelInfo(F, AM); return PreservedAnalyses::all(); } From 62d494d9a9f13e5b58d71f083a6cb9f67f19579b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 13 Sep 2024 17:30:23 -0400 Subject: [PATCH 22/46] Clean up launch bounds * For amdgpu, use AMDGPUSubtarget functions to query values. Thus, we end up with logical values that don't appear explicitly in the IR, and we ignore some impossible values that do appear explicitly. * For nvptx, use NVPTXUtilities.h functions to query values. Thus, drop KernelInfo.cpp's implementation of NVVM annotation parsing. Also, add support for a few more launch bounds. * Move target-specific collection of launch bounds to target-specific classes (GCNSubtarget and NVPTXSubtarget). While making the above changes, I struggled to find the right headers to enable keeping the implementation in KernelInfo.cpp, and one reviewer wanted to see some reorganization along these lines anyway. 
--- llvm/include/llvm/Analysis/KernelInfo.h | 8 +- .../llvm/CodeGen/TargetSubtargetInfo.h | 5 + llvm/lib/Analysis/KernelInfo.cpp | 127 ++++-------------- llvm/lib/Passes/PassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 16 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 16 +++ llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 4 +- .../KernelInfo/launch-bounds/amdgpu.ll | 67 +++++++-- .../KernelInfo/launch-bounds/nvptx.ll | 10 +- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 19 +++ 13 files changed, 166 insertions(+), 120 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index c5c33fac34655..6633c28858a2f 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -18,9 +18,15 @@ #include "llvm/IR/PassManager.h" namespace llvm { + +class TargetMachine; + class KernelInfoPrinter : public PassInfoMixin { +private: + TargetMachine *TM; + public: - explicit KernelInfoPrinter() {} + explicit KernelInfoPrinter(TargetMachine *TM) : TM(TM) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index b4b018f080914..5d75510e91513 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -338,6 +338,11 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// the pass, with architecture specific overrides providing the information /// where they are implemented. virtual bool supportsInitUndef() const { return false; } + + /// For \p F, call \p Body with the name and value of each launch bound. 
+ virtual void forEachLaunchBound( + const Function &F, + std::function Body) const {} }; } // end namespace llvm diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 282dc092bfd62..6d0efdfec8344 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -22,6 +23,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -33,7 +35,8 @@ class KernelInfo { OptimizationRemarkEmitter &ORE); public: - static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM); + static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM); /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; @@ -44,23 +47,6 @@ class KernelInfo { std::optional OmpTargetThreadLimit; ///@} - /// AMDGPU launch bounds. - ///@{ - std::optional AmdgpuMaxNumWorkgroupsX; - std::optional AmdgpuMaxNumWorkgroupsY; - std::optional AmdgpuMaxNumWorkgroupsZ; - std::optional AmdgpuFlatWorkGroupSizeMin; - std::optional AmdgpuFlatWorkGroupSizeMax; - std::optional AmdgpuWavesPerEuMin; - std::optional AmdgpuWavesPerEuMax; - ///@} - - /// NVPTX launch bounds. - ///@{ - std::optional Maxclusterrank; - std::optional Maxntidx; - ///@} - /// The number of alloca instructions inside the function, the number of those /// with allocation sizes that cannot be determined at compile time, and the /// sum of the sizes that can be. 
@@ -298,68 +284,23 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, remarkProperty(ORE, F, Name, Value.value()); } -static std::vector> -parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) { - std::vector> Result(NumFields); - Attribute A = F.getFnAttribute(Name); - if (!A.isStringAttribute()) - return Result; - StringRef Rest = A.getValueAsString(); - for (unsigned I = 0; I < NumFields; ++I) { - StringRef Field; - std::tie(Field, Rest) = Rest.split(','); - if (Field.empty()) - break; - int64_t Val; - if (Field.getAsInteger(0, Val)) { - F.getContext().emitError("cannot parse integer in attribute '" + Name + - "': " + Field); - break; - } - Result[I] = Val; - } - if (!Rest.empty()) - F.getContext().emitError("too many fields in attribute " + Name); - return Result; -} - static std::optional parseFnAttrAsInteger(Function &F, StringRef Name) { - return parseFnAttrAsIntegerFields(F, Name, 1)[0]; -} - -// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp. Can we -// share? 
-static MDNode *getNVPTXMDNode(Function &F, StringRef Name) { - Module &M = *F.getParent(); - NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); - if (!MD) - return nullptr; - for (auto *Op : MD->operands()) { - if (Op->getNumOperands() != 3) - continue; - auto *KernelOp = dyn_cast(Op->getOperand(0)); - if (!KernelOp || KernelOp->getValue() != &F) - continue; - auto *Prop = dyn_cast(Op->getOperand(1)); - if (!Prop || Prop->getString() != Name) - continue; - return Op; - } - return nullptr; -} - -static std::optional parseNVPTXMDNodeAsInteger(Function &F, - StringRef Name) { - std::optional Result; - if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) { - auto *Op = cast(ExistingOp->getOperand(2)); - Result = cast(Op->getValue())->getZExtValue(); + Attribute A = F.getFnAttribute(Name); + if (!A.isStringAttribute()) + return std::nullopt; + StringRef Field = A.getValueAsString(); + int64_t Val; + if (Field.getAsInteger(0, Val)) { + F.getContext().emitError("cannot parse integer in attribute '" + Name + + "': " + Field); + return std::nullopt; } - return Result; + return Val; } -void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { +void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM) { KernelInfo KI; KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); @@ -367,21 +308,6 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit"); - auto AmdgpuMaxNumWorkgroups = - parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3); - KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0]; - KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1]; - KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2]; - auto AmdgpuFlatWorkGroupSize = - 
parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2); - KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0]; - KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1]; - auto AmdgpuWavesPerEu = - parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2); - KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0]; - KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1]; - KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank"); - KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx"); const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); @@ -394,15 +320,16 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { REMARK_PROPERTY(ExternalNotKernel); REMARK_PROPERTY(OmpTargetNumTeams); REMARK_PROPERTY(OmpTargetThreadLimit); - REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX); - REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY); - REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ); - REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin); - REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax); - REMARK_PROPERTY(AmdgpuWavesPerEuMin); - REMARK_PROPERTY(AmdgpuWavesPerEuMax); - REMARK_PROPERTY(Maxclusterrank); - REMARK_PROPERTY(Maxntidx); + // TM might be nullptr if support for the target was not built. For example, + // we currently have some KernelInfo tests where the choice of target isn't + // important, so they arbitrarily choose a target triple. Those tests are + // expected to run successfully even if support for that target was not built. 
+ if (TM) { + TM->getSubtargetImpl(F)->forEachLaunchBound( + F, [&](StringRef Name, unsigned Value) { + remarkProperty(ORE, F, Name, Value); + }); + } REMARK_PROPERTY(Allocas); REMARK_PROPERTY(AllocasStaticSizeSum); REMARK_PROPERTY(AllocasDyn); @@ -419,6 +346,6 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { PreservedAnalyses KernelInfoPrinter::run(Function &F, FunctionAnalysisManager &AM) { - KernelInfo::emitKernelInfo(F, AM); + KernelInfo::emitKernelInfo(F, AM, TM); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 6b3ccfccf3ae0..10b0b3f57c289 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -382,7 +382,7 @@ FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass()); FUNCTION_PASS("kcfi", KCFIPass()) -FUNCTION_PASS("kernel-info", KernelInfoPrinter()) +FUNCTION_PASS("kernel-info", KernelInfoPrinter(TM)) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("lint", LintPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 555302b290da2..c0e3df93264c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -790,9 +790,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (KernelInfoEndLTO) { PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(KernelInfoPrinter()); + FPM.addPass(KernelInfoPrinter(this)); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 
52c24a5c25ec2..f1eb5fcb2c06f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -711,6 +711,22 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { return NSAThreshold; } +void GCNSubtarget::forEachLaunchBound( + const Function &F, + std::function Body) const { + auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F); + Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); + Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); + Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); + auto AmdgpuFlatWorkGroupSize = getFlatWorkGroupSizes(F); + Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); + Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second); + auto AmdgpuWavesPerEU = getWavesPerEU(F); + Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); + Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); + // TODO: Any others we should add? +} + GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST) : ST(ST) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7b74eab96c567..a514945a5e6f5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1587,6 +1587,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // the nop. 
return true; } + + virtual void forEachLaunchBound( + const Function &F, + std::function<void(StringRef Name, unsigned Value)> Body) const override; }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 420065585b384..fccb3de453734 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -12,6 +12,7 @@ #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" +#include "NVPTXUtilities.h" using namespace llvm; @@ -69,3 +70,18 @@ bool NVPTXSubtarget::hasImageHandles() const { bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } + +void NVPTXSubtarget::forEachLaunchBound( + const Function &F, + std::function<void(StringRef Name, unsigned Value)> Body) const { + unsigned Val; + if (getMaxClusterRank(F, Val)) + Body("Maxclusterrank", Val); + if (auto Val = getMaxNTIDx(F)) + Body("Maxntidx", *Val); + if (auto Val = getMaxNTIDy(F)) + Body("Maxntidy", *Val); + if (auto Val = getMaxNTIDz(F)) + Body("Maxntidz", *Val); + // TODO: Any others we should add? 
+} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 457f10f1d64a2..6cc8b6764cf8e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -119,6 +119,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + + virtual void forEachLaunchBound( + const Function &F, + std::function<void(StringRef Name, unsigned Value)> Body) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 777d1215214ec..8fd3dacbab87e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -242,9 +242,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (KernelInfoEndLTO) { PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(KernelInfoPrinter()); + FPM.addPass(KernelInfoPrinter(this)); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll index 0c98f4ad45950..472d7c0286b01 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -1,5 +1,7 @@ ; Check info on launch bounds for AMD GPU. 
+; REQUIRES: amdgpu-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s @@ -7,16 +9,44 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsX = 200 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsY = 201 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsZ = 202 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMin = 210 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMax = 211 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMin = 220 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMax = 221 -define void @test() #0 !dbg !5 { +; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetThreadLimit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsX = 200 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsY = 201 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsZ = 202 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMin = 210 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMax = 211 +; CHECK: remark: test.c:10:0: in 
artificial function 'all', AmdgpuWavesPerEUMin = 2 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuWavesPerEUMax = 9 +define void @all() #0 !dbg !5 { +entry: + ret void +} + +; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetNumTeams = {{.*}} +; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetThreadLimit = {{.*}} +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsZ = 0 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMin = 4 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMax = 10 +define void @none() !dbg !6 { +entry: + ret void +} + +; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetNumTeams = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetThreadLimit = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsX = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsY = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsZ = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMin = 4 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMax = 10 +define void @bogus() #1 !dbg !7 { entry: ret void } @@ -26,7 +56,18 @@ attributes #0 = { "omp_target_thread_limit"="101" "amdgpu-max-num-workgroups"="200,201,202" "amdgpu-flat-work-group-size"="210,211" - "amdgpu-waves-per-eu"="220,221" + "amdgpu-waves-per-eu"="2,9" +} + +; 
We choose values that are small enough to parse successfully but that are +; impossibly large. For values that are validated, we check that they are +; overridden with realistic values. +attributes #1 = { + "omp_target_num_teams"="987654321" + "omp_target_thread_limit"="987654321" + "amdgpu-max-num-workgroups"="987654321,987654321,987654321" + "amdgpu-flat-work-group-size"="987654321,987654321" + "amdgpu-waves-per-eu"="987654321,987654321" } !llvm.module.flags = !{!0} @@ -37,4 +78,6 @@ attributes #0 = { !2 = !DIFile(filename: "test.c", directory: "/tmp") !3 = !{} !4 = !DISubroutineType(types: !3) -!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!5 = distinct !DISubprogram(name: "all", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !DISubprogram(name: "none", scope: !2, file: !2, line: 11, type: !4, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!7 = distinct !DISubprogram(name: "bogus", scope: !2, file: !2, line: 12, type: !4, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll index c7339f90e3ca9..d9a024f38652e 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -1,5 +1,7 @@ ; Check info on launch bounds for NVPTX. 
+; REQUIRES: nvptx-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s @@ -11,6 +13,8 @@ target triple = "nvptx64-nvidia-cuda" ; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 ; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200 ; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidy = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidz = 212 define void @test() #0 !dbg !5 { entry: ret void @@ -23,7 +27,7 @@ attributes #0 = { !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8} +!nvvm.annotations = !{!6, !7, !8, !9, !10} !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) @@ -33,4 +37,6 @@ attributes #0 = { !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) !6 = !{ptr @test, !"maxclusterrank", i32 200} !7 = !{ptr @test, !"maxntidx", i32 210} -!8 = distinct !{ptr null, !"kernel", i32 1} +!8 = !{ptr @test, !"maxntidy", i32 211} +!9 = !{ptr @test, !"maxntidz", i32 212} +!10 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index be3b357cc4530..d21dde10f979a 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -17,8 +17,13 @@ ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 
'__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsZ = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMin = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMax = 10 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 @@ -34,6 +39,13 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
AmdgpuMaxNumWorkgroupsZ = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMin = 4 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMax = 10 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 @@ -49,6 +61,13 @@ ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsZ = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMin = 4 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMax = 10 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 From 94d90d17e156f6a8e89cf3155bde2138a65c4f42 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 16 Sep 2024 16:21:32 -0400 Subject: [PATCH 23/46] Adjust forEachLaunchBound param * std::function -> llvm::function_ref * unsigned -> int64_t --- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 ++++--- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 7 ++++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index 485aa7e13fe69..d301304a47275 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -343,7 +343,7 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// For \p F, call \p Body with the name and value of each launch bound. virtual void forEachLaunchBound( const Function &F, - std::function Body) const {} + llvm::function_ref Body) const {} }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index f1eb5fcb2c06f..1ec7a6f64bbf5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -713,7 +713,7 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { void GCNSubtarget::forEachLaunchBound( const Function &F, - std::function Body) const { + llvm::function_ref Body) const { auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F); Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7fb7fcd496ade..0df0a3e8ecca6 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1594,9 +1594,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return false; } - virtual void forEachLaunchBound( - const 
Function &F, - std::function<void(StringRef Name, unsigned Value)> Body) const override; + virtual void + forEachLaunchBound(const Function &F, + llvm::function_ref<void(StringRef Name, int64_t Value)> + Body) const override; }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index fccb3de453734..ab68f54f0473c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -73,7 +73,7 @@ bool NVPTXSubtarget::allowFP16Math() const { void NVPTXSubtarget::forEachLaunchBound( const Function &F, - std::function<void(StringRef Name, unsigned Value)> Body) const { + llvm::function_ref<void(StringRef Name, int64_t Value)> Body) const { unsigned Val; if (getMaxClusterRank(F, Val)) Body("Maxclusterrank", Val); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 6cc8b6764cf8e..710faf0665054 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -120,9 +120,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); - virtual void forEachLaunchBound( - const Function &F, - std::function<void(StringRef Name, unsigned Value)> Body) const override; + virtual void + forEachLaunchBound(const Function &F, + llvm::function_ref<void(StringRef Name, int64_t Value)> + Body) const override; }; } // End llvm namespace From 762a217705f0ffd90723e2d8d9d54f1c39975c2a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 16 Sep 2024 16:21:58 -0400 Subject: [PATCH 24/46] Reuse Function::getFnAttributeAsParsedInteger --- llvm/lib/Analysis/KernelInfo.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 6d0efdfec8344..85d923a97740d 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -286,17 +286,9 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, static std::optional parseFnAttrAsInteger(Function &F, StringRef Name) { - Attribute A = F.getFnAttribute(Name); - if (!A.isStringAttribute()) + if (!F.hasFnAttribute(Name)) return std::nullopt; - StringRef Field = A.getValueAsString(); - int64_t Val; - if (Field.getAsInteger(0, Val)) { - F.getContext().emitError("cannot parse integer in attribute '" + Name + - "': " + Field); - return std::nullopt; - } - return Val; + return F.getFnAttributeAsParsedInteger(Name); } void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, From df66a3d2c28339f2f3d6cc515a550894e5a05bef Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 16 Sep 2024 16:22:09 -0400 Subject: [PATCH 25/46] Move forEachLaunchBound to TargetTransformInfo --- .../include/llvm/Analysis/TargetTransformInfo.h | 15 +++++++++++++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 4 ++++ llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 5 ----- llvm/lib/Analysis/KernelInfo.cpp | 17 +++++------------ llvm/lib/Analysis/TargetTransformInfo.cpp | 6 ++++++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 16 ++++++++++++++++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 16 ---------------- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 ----- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 16 ---------------- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 5 ----- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 15 +++++++++++++++ .../lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 3 +++ 13 files changed, 67 insertions(+), 59 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index b2124c6106198..e55aed11e53c9 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1788,6 +1788,11 @@ class TargetTransformInfo { /// @} + /// For \p F, call \p Body with the name and value of each launch bound. + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const; + private: /// The abstract base class used to type erase specific TTI /// implementations. 
@@ -2179,6 +2184,9 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const = 0; }; template @@ -2952,6 +2960,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + void + forEachLaunchBound(const Function &F, + llvm::function_ref + Body) const override { + return Impl.forEachLaunchBound(F, Body); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 90eef93a2a54d..684aa44cb945e 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -973,6 +973,10 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const {} + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index d301304a47275..bfaa6450779ae 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -339,11 +339,6 @@ class TargetSubtargetInfo : public MCSubtargetInfo { // Conservatively assume such instructions exist by default. return true; } - - /// For \p F, call \p Body with the name and value of each launch bound. 
- virtual void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const {} }; } // end namespace llvm diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 85d923a97740d..a71d8b3acd09f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -294,7 +293,8 @@ static std::optional parseFnAttrAsInteger(Function &F, void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, TargetMachine *TM) { KernelInfo KI; - KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); + TargetTransformInfo &TheTTI = FAM.getResult(F); + KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); @@ -312,16 +312,9 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, REMARK_PROPERTY(ExternalNotKernel); REMARK_PROPERTY(OmpTargetNumTeams); REMARK_PROPERTY(OmpTargetThreadLimit); - // TM might be nullptr if support for the target was not built. For example, - // we currently have some KernelInfo tests where the choice of target isn't - // important, so they arbitrarily choose a target triple. Those tests are - // expected to run successfully even if support for that target was not built. 
- if (TM) { - TM->getSubtargetImpl(F)->forEachLaunchBound( - F, [&](StringRef Name, unsigned Value) { - remarkProperty(ORE, F, Name, Value); - }); - } + TheTTI.forEachLaunchBound(F, [&](StringRef Name, unsigned Value) { + remarkProperty(ORE, F, Name, Value); + }); REMARK_PROPERTY(Allocas); REMARK_PROPERTY(AllocasStaticSizeSum); REMARK_PROPERTY(AllocasDyn); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 2c26493bd3f1c..cf48fa7614173 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1348,6 +1348,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +void TargetTransformInfo::forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const { + return TTIImpl->forEachLaunchBound(F, Body); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 4cf7733a260ff..fe362f40cf56f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1390,3 +1390,19 @@ unsigned GCNTTIImpl::getPrefetchDistance() const { bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } + +void GCNTTIImpl::forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const { + auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); + Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); + Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); + auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); + Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); + 
Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second); + auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); + Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); + Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); + // TODO: Any others we should add? +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 01df2e6caaba1..529170888f2e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -266,6 +266,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const override; + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 1ec7a6f64bbf5..52c24a5c25ec2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -711,22 +711,6 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { return NSAThreshold; } -void GCNSubtarget::forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const { - auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F); - Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); - Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); - Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); - auto AmdgpuFlatWorkGroupSize = getFlatWorkGroupSizes(F); - Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); - Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second); - auto AmdgpuWavesPerEU = getWavesPerEU(F); - Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); - Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); - // TODO: Any others we should add? 
-} - GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST) : ST(ST) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0df0a3e8ecca6..a4ae8a1be3225 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1593,11 +1593,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // to the same register. return false; } - - virtual void - forEachLaunchBound(const Function &F, - llvm::function_ref - Body) const override; }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index ab68f54f0473c..420065585b384 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -12,7 +12,6 @@ #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "NVPTXUtilities.h" using namespace llvm; @@ -70,18 +69,3 @@ bool NVPTXSubtarget::hasImageHandles() const { bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } - -void NVPTXSubtarget::forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const { - unsigned Val; - if (getMaxClusterRank(F, Val)) - Body("Maxclusterrank", Val); - if (auto Val = getMaxNTIDx(F)) - Body("Maxntidx", *Val); - if (auto Val = getMaxNTIDy(F)) - Body("Maxntidy", *Val); - if (auto Val = getMaxNTIDz(F)) - Body("Maxntidz", *Val); - // TODO: Any others we should add? 
-} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 710faf0665054..457f10f1d64a2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -119,11 +119,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); - - virtual void - forEachLaunchBound(const Function &F, - llvm::function_ref - Body) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 9a8ea8f87896a..50cc2c8e22d4f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -442,3 +442,18 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) { BaseT::getPeelingPreferences(L, SE, PP); } + +void NVPTXTTIImpl::forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const { + unsigned Val; + if (getMaxClusterRank(F, Val)) + Body("Maxclusterrank", Val); + if (auto Val = getMaxNTIDx(F)) + Body("Maxntidx", *Val); + if (auto Val = getMaxNTIDy(F)) + Body("Maxntidy", *Val); + if (auto Val = getMaxNTIDz(F)) + Body("Maxntidz", *Val); + // TODO: Any others we should add? +} diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 4160f5f6bfae7..2d794f1d80050 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -124,6 +124,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase { return true; } } + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const; }; } // end namespace llvm From 3f63d532fa99a59a3be58e31d09943b143b1c889 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Thu, 26 Sep 2024 14:03:13 -0400 Subject: [PATCH 26/46] forEachLaunchBound -> collectLaunchBounds Return the launch bounds instead of passing them to a callback. --- .../llvm/Analysis/TargetTransformInfo.h | 21 +++++++------- .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-- llvm/lib/Analysis/KernelInfo.cpp | 28 ++++++------------- llvm/lib/Analysis/TargetTransformInfo.cpp | 6 ++-- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 18 ++++++------ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6 ++-- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 12 ++++---- .../Target/NVPTX/NVPTXTargetTransformInfo.h | 6 ++-- 8 files changed, 45 insertions(+), 56 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 27798ca4747e6..106fef4ef820b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1794,10 +1794,10 @@ class TargetTransformInfo { /// @} - /// For \p F, call \p Body with the name and value of each launch bound. - void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const; + /// Collect launch bounds for \p F into \p LB. 
+ void + collectLaunchBounds(const Function &F, + SmallVectorImpl> &LB) const; private: /// The abstract base class used to type erase specific TTI @@ -2191,9 +2191,9 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; - virtual void forEachLaunchBound( + virtual void collectLaunchBounds( const Function &F, - llvm::function_ref Body) const = 0; + SmallVectorImpl> &LB) const = 0; }; template @@ -2973,11 +2973,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getMaxNumArgs(); } - void - forEachLaunchBound(const Function &F, - llvm::function_ref - Body) const override { - return Impl.forEachLaunchBound(F, Body); + void collectLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const override { + Impl.collectLaunchBounds(F, LB); } }; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7832c2f2c2803..1e05fa7200fe7 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -979,9 +979,9 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } - void forEachLaunchBound( + void collectLaunchBounds( const Function &F, - llvm::function_ref Body) const {} + SmallVectorImpl> &LB) const {} protected: // Obtain the minimum required size to hold the value (without the sign) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index a71d8b3acd09f..826340ca8401d 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -40,11 +40,8 @@ class KernelInfo { /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; - /// OpenMP Launch bounds. 
- ///@{ - std::optional OmpTargetNumTeams; - std::optional OmpTargetThreadLimit; - ///@} + /// Launch bounds. + SmallVector> LaunchBounds; /// The number of alloca instructions inside the function, the number of those /// with allocation sizes that cannot be determined at compile time, and the @@ -276,13 +273,6 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, }); } -static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, - StringRef Name, std::optional Value) { - if (!Value) - return; - remarkProperty(ORE, F, Name, Value.value()); -} - static std::optional parseFnAttrAsInteger(Function &F, StringRef Name) { if (!F.hasFnAttribute(Name)) @@ -298,8 +288,11 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); - KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); - KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit"); + if (auto Val = parseFnAttrAsInteger(F, "omp_target_num_teams")) + KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val}); + if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit")) + KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val}); + TheTTI.collectLaunchBounds(F, KI.LaunchBounds); const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); @@ -310,11 +303,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) REMARK_PROPERTY(ExternalNotKernel); - REMARK_PROPERTY(OmpTargetNumTeams); - REMARK_PROPERTY(OmpTargetThreadLimit); - TheTTI.forEachLaunchBound(F, [&](StringRef Name, unsigned Value) { - remarkProperty(ORE, F, Name, Value); - }); + for (auto LB : KI.LaunchBounds) + remarkProperty(ORE, F, LB.first, LB.second); REMARK_PROPERTY(Allocas); REMARK_PROPERTY(AllocasStaticSizeSum); 
REMARK_PROPERTY(AllocasDyn); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 946754735efcb..6c24ec34d80b2 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1354,10 +1354,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } -void TargetTransformInfo::forEachLaunchBound( +void TargetTransformInfo::collectLaunchBounds( const Function &F, - llvm::function_ref Body) const { - return TTIImpl->forEachLaunchBound(F, Body); + SmallVectorImpl> &LB) const { + return TTIImpl->collectLaunchBounds(F, LB); } TargetTransformInfo::Concept::~Concept() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index fe362f40cf56f..6094e5a42f4bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1391,18 +1391,18 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } -void GCNTTIImpl::forEachLaunchBound( +void GCNTTIImpl::collectLaunchBounds( const Function &F, - llvm::function_ref Body) const { + SmallVectorImpl> &LB) const { auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); - Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); - Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); - Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); + LB.push_back({"AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]}); + LB.push_back({"AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]}); + LB.push_back({"AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]}); auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); - Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); - Body("AmdgpuFlatWorkGroupSizeMax", 
AmdgpuFlatWorkGroupSize.second); + LB.push_back({"AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first}); + LB.push_back({"AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second}); auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); - Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); - Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); + LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first}); + LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second}); // TODO: Any others we should add? } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 34944e6c478aa..4b30ac71ccd33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -265,9 +265,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const override; - void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const; + void + collectLaunchBounds(const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 1a99a1cf91144..4752cf01dd205 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -443,16 +443,16 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } -void NVPTXTTIImpl::forEachLaunchBound( +void NVPTXTTIImpl::collectLaunchBounds( const Function &F, - llvm::function_ref Body) const { + SmallVectorImpl> &LB) const { if (auto Val = getMaxClusterRank(F)) - Body("Maxclusterrank", *Val); + LB.push_back({"Maxclusterrank", *Val}); if (auto Val = getMaxNTIDx(F)) - Body("Maxntidx", *Val); + LB.push_back({"Maxntidx", *Val}); if (auto Val = getMaxNTIDy(F)) - 
Body("Maxntidy", *Val); + LB.push_back({"Maxntidy", *Val}); if (auto Val = getMaxNTIDz(F)) - Body("Maxntidz", *Val); + LB.push_back({"Maxntidz", *Val}); // TODO: Any others we should add? } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 7e7d1dd588855..07c14e88cc786 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -123,9 +123,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase { return true; } } - void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const; + void + collectLaunchBounds(const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm From feeaa3780cf725f0da1404b99b3f8634dbce75de Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Sat, 28 Sep 2024 12:50:23 -0400 Subject: [PATCH 27/46] Remove redundant private --- llvm/include/llvm/Analysis/KernelInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 6633c28858a2f..75d92c202212b 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -22,7 +22,6 @@ namespace llvm { class TargetMachine; class KernelInfoPrinter : public PassInfoMixin { -private: TargetMachine *TM; public: From 116f1c9f14961b287d3393f9706badbcd63c515d Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 14:51:24 -0400 Subject: [PATCH 28/46] Remove todos, as requested --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 - llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index f730738494219..880497908df27 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1424,5 +1424,4 @@ void GCNTTIImpl::collectLaunchBounds( auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first}); LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second}); - // TODO: Any others we should add? } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 47510912fd1f7..3004620b40cbe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -455,5 +455,4 @@ void NVPTXTTIImpl::collectLaunchBounds( LB.push_back({"Maxntidy", *Val}); if (auto Val = getMaxNTIDz(F)) LB.push_back({"Maxntidz", *Val}); - // TODO: Any others we should add? } From 2094465ae367d35a5cc05bdc2e1703d806491976 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 15:22:56 -0400 Subject: [PATCH 29/46] Combine registerFullLinkTimeOptimizationLastEPCallback calls --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 65771e145ff11..1264749059359 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -825,6 +825,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPULowerModuleLDSPass(*this)); if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0) PM.addPass(AMDGPUAttributorPass(*this)); + if (KernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter(this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } }); PB.registerRegClassFilterParsingCallback( @@ -836,14 +841,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return nullptr; }); - if (KernelInfoEndLTO) { - PB.registerFullLinkTimeOptimizationLastEPCallback( - [this](ModulePassManager &PM, OptimizationLevel Level) { - FunctionPassManager FPM; - FPM.addPass(KernelInfoPrinter(this)); - PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - }); - } } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { From 39bce7c5a18c94ff8085d169322d0683e266a792 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 15:28:34 -0400 Subject: [PATCH 30/46] collectLaunchBounds -> collectKernelLaunchBounds --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 14 +++++++------- .../llvm/Analysis/TargetTransformInfoImpl.h | 2 +- llvm/lib/Analysis/KernelInfo.cpp | 2 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++-- .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6 +++--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 6 +++--- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 29d96b4333ef2..e6ceb19ef045e 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1815,10 +1815,10 @@ class TargetTransformInfo { /// @} - /// Collect launch bounds for \p F into \p LB. - void - collectLaunchBounds(const Function &F, - SmallVectorImpl> &LB) const; + /// Collect kernel launch bounds for \p F into \p LB. 
+ void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; private: /// The abstract base class used to type erase specific TTI @@ -2220,7 +2220,7 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; - virtual void collectLaunchBounds( + virtual void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const = 0; }; @@ -3020,10 +3020,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getMaxNumArgs(); } - void collectLaunchBounds( + void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const override { - Impl.collectLaunchBounds(F, LB); + Impl.collectKernelLaunchBounds(F, LB); } }; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 89317ac2ec32d..1e6cb7841ccdf 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1001,7 +1001,7 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } - void collectLaunchBounds( + void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const {} diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 826340ca8401d..49e001c85b08f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -292,7 +292,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val}); if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit")) KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val}); - TheTTI.collectLaunchBounds(F, KI.LaunchBounds); + TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8f05a19644d31..24fea39e44f05 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1372,10 +1372,10 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const { return TTIImpl->isVectorShiftByScalarCheap(Ty); } -void TargetTransformInfo::collectLaunchBounds( +void TargetTransformInfo::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { - return TTIImpl->collectLaunchBounds(F, LB); + return TTIImpl->collectKernelLaunchBounds(F, LB); } TargetTransformInfo::Concept::~Concept() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 880497908df27..98dbb064ac79f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1411,7 +1411,7 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } -void GCNTTIImpl::collectLaunchBounds( +void GCNTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 60bc829f5242a..0081748253c92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -269,9 +269,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. 
bool shouldPrefetchAddressSpace(unsigned AS) const override; - void - collectLaunchBounds(const Function &F, - SmallVectorImpl> &LB) const; + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3004620b40cbe..c590d4a2c7d06 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -444,7 +444,7 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } -void NVPTXTTIImpl::collectLaunchBounds( +void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { if (auto Val = getMaxClusterRank(F)) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 07c14e88cc786..5a8db82616ce1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -123,9 +123,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase { return true; } } - void - collectLaunchBounds(const Function &F, - SmallVectorImpl> &LB) const; + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm From 14345cf13bd071efa4bbff695351846560647d5d Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 16:02:04 -0400 Subject: [PATCH 31/46] Spell kernel-info properties like their IR attributes --- llvm/lib/Analysis/KernelInfo.cpp | 8 +-- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 20 +++---- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 8 +-- .../kernel-info-after-lto/Inputs/test.ll | 2 +- .../KernelInfo/launch-bounds/amdgpu.ll | 54 +++++++++---------- .../KernelInfo/launch-bounds/nvptx.ll | 12 ++--- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 44 +++++++-------- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 4 +- 8 files changed, 76 insertions(+), 76 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 49e001c85b08f..3aca4c59105ce 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -288,10 +288,10 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); - if (auto Val = parseFnAttrAsInteger(F, "omp_target_num_teams")) - KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val}); - if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit")) - KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val}); + for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { + if (auto Val = parseFnAttrAsInteger(F, Name)) + KI.LaunchBounds.push_back({Name, *Val}); + } TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); const DominatorTree &DT = FAM.getResult(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 98dbb064ac79f..6d92dccad076f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1414,14 +1414,14 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { void GCNTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { - 
auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); - LB.push_back({"AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]}); - LB.push_back({"AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]}); - LB.push_back({"AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]}); - auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); - LB.push_back({"AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first}); - LB.push_back({"AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second}); - auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); - LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first}); - LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second}); + auto MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]}); + LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]}); + LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]}); + auto FlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); + LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first}); + LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second}); + auto WavesPerEU = ST->getWavesPerEU(F); + LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); + LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index c590d4a2c7d06..d230a66449063 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -448,11 +448,11 @@ void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { if (auto Val = getMaxClusterRank(F)) - LB.push_back({"Maxclusterrank", *Val}); + LB.push_back({"maxclusterrank", *Val}); if (auto Val = getMaxNTIDx(F)) - LB.push_back({"Maxntidx", *Val}); + LB.push_back({"maxntidx", *Val}); if (auto Val = getMaxNTIDy(F)) - LB.push_back({"Maxntidy", 
*Val}); + LB.push_back({"maxntidy", *Val}); if (auto Val = getMaxNTIDz(F)) - LB.push_back({"Maxntidz", *Val}); + LB.push_back({"maxntidz", *Val}); } diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll index b85e3c581867c..461544e44d538 100644 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll @@ -1,4 +1,4 @@ -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 ; NONE-NOT: remark: define void @test() #0 !dbg !5 { entry: diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll index 472d7c0286b01..d37dceec003f9 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -9,43 +9,43 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" -; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetNumTeams = 100 -; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetThreadLimit = 101 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsX = 200 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsY = 201 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsZ = 202 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMin = 210 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMax = 211 -; CHECK: remark: test.c:10:0: in 
artificial function 'all', AmdgpuWavesPerEUMin = 2 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuWavesPerEUMax = 9 +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[0] = 200 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[1] = 201 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[2] = 202 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[0] = 210 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[1] = 211 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[0] = 2 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[1] = 9 define void @all() #0 !dbg !5 { entry: ret void } -; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetNumTeams = {{.*}} -; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetThreadLimit = {{.*}} -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMin = 4 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMax = 10 +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_num_teams = {{.*}} +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_thread_limit = {{.*}} +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] 
= 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[1] = 10 define void @none() !dbg !6 { entry: ret void } -; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetNumTeams = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetThreadLimit = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsX = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsY = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsZ = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMin = 4 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMax = 10 +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_num_teams = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_thread_limit = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[0] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[1] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[2] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:12:0: in function 'bogus', 
amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-waves-per-eu[1] = 10 define void @bogus() #1 !dbg !7 { entry: ret void diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll index d9a024f38652e..7a055c7152ec8 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -9,12 +9,12 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidy = 211 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidz = 212 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxclusterrank = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidx = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidy = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidz = 212 define void @test() #0 !dbg !5 { entry: ret void diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index d21dde10f979a..17ded0b6d3753 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -16,14 +16,14 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; 
CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMin = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMax = 10 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function 
'[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 @@ -39,13 +39,13 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMin = 4 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMax = 10 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 @@ -61,13 +61,13 @@ ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMin = 4 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMax = 10 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 
2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 2dbd04b2536c4..68c416acd6388 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -16,8 +16,8 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 128 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Maxntidx = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', maxntidx = 128 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 From ad393d25109d16b9ce8bdb718eb1b7d3b02b1319 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 18:53:41 -0400 Subject: [PATCH 32/46] Replace -kernel-info-end-lto with -no-kernel-info-end-lto --- llvm/docs/KernelInfo.rst | 22 ++++++++++--------- llvm/include/llvm/Target/TargetMachine.h | 2 +- llvm/lib/Analysis/KernelInfo.cpp | 4 +++- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- llvm/lib/Target/TargetMachine.cpp | 6 ++--- .../Inputs/test.ll | 0 .../KernelInfo/enable-kernel-info/amdgpu.ll | 18 +++++++++++++++ .../KernelInfo/enable-kernel-info/nvptx.ll | 18 +++++++++++++++ .../kernel-info-after-lto/amdgpu.ll | 22 ------------------- .../KernelInfo/kernel-info-after-lto/nvptx.ll | 22 ------------------- 11 files changed, 57 insertions(+), 61 deletions(-) rename llvm/test/Analysis/KernelInfo/{kernel-info-after-lto => enable-kernel-info}/Inputs/test.ll (100%) create mode 100644 llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll delete mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll delete mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index 397b32602bce2..dac642f1ffc65 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -14,8 +14,7 @@ mitigate them. The pass operates at the LLVM IR level so that it can, in theory, support any LLVM-based compiler for programming languages supporting GPUs. -By default, the pass is disabled. For convenience, the command-line option -``-kernel-info-end-lto`` inserts it at the end of LTO, and options like +By default, the pass runs at the end of LTO, and options like ``-Rpass=kernel-info`` enable its remarks. Example ``opt`` and ``clang`` command lines appear in the next section. @@ -31,7 +30,7 @@ To analyze a C program as it appears to an LLVM GPU backend at the end of LTO: .. 
code-block:: shell $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info -mllvm -kernel-info-end-lto + -Rpass=kernel-info To analyze specified LLVM IR, perhaps previously generated by something like ``clang -save-temps -g -fopenmp --offload-arch=native test.c``: @@ -41,21 +40,24 @@ To analyze specified LLVM IR, perhaps previously generated by something like $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -pass-remarks=kernel-info -passes=kernel-info -kernel-info can also be inserted into a specified LLVM pass pipeline using -``-kernel-info-end-lto``, or it can be positioned explicitly in that pipeline: +When specifying an LLVM pass pipeline on the command line, ``kernel-info`` still +runs at the end of LTO by default. ``-no-kernel-info-end-lto`` disables that +behavior so you can position ``kernel-info`` explicitly: .. code-block:: shell $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info -mllvm -kernel-info-end-lto \ + -Rpass=kernel-info \ -Xoffload-linker --lto-newpm-passes='lto<O2>' $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info \ - -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)' + -Rpass=kernel-info -mllvm -no-kernel-info-end-lto \ + -Xoffload-linker --lto-newpm-passes='module(kernel-info),lto<O2>' $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ - -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>' + -pass-remarks=kernel-info \ + -passes='lto<O2>' $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ - -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)' + -pass-remarks=kernel-info -no-kernel-info-end-lto \ + -passes='module(kernel-info),lto<O2>' diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 5c338a8fcd0cf..f34f4c3528dfe 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -28,7
+28,7 @@ #include #include -extern llvm::cl::opt KernelInfoEndLTO; +extern llvm::cl::opt NoKernelInfoEndLTO; namespace llvm { diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 3aca4c59105ce..81085c8c6beba 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -321,6 +321,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, PreservedAnalyses KernelInfoPrinter::run(Function &F, FunctionAnalysisManager &AM) { - KernelInfo::emitKernelInfo(F, AM, TM); + // Skip it if remarks are not enabled as it will do nothing useful. + if (F.getContext().getDiagHandlerPtr()->isPassedOptRemarkEnabled(DEBUG_TYPE)) + KernelInfo::emitKernelInfo(F, AM, TM); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1264749059359..936de58633b87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -825,7 +825,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPULowerModuleLDSPass(*this)); if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0) PM.addPass(AMDGPUAttributorPass(*this)); - if (KernelInfoEndLTO) { + if (!NoKernelInfoEndLTO) { FunctionPassManager FPM; FPM.addPass(KernelInfoPrinter(this)); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 3955d173b48f2..db1ea2b38bb54 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -240,7 +240,7 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); - if (KernelInfoEndLTO) { + if (!NoKernelInfoEndLTO) { PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, 
OptimizationLevel Level) { FunctionPassManager FPM; diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index b235fd8f6f49a..07e2a44c21cdd 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -26,9 +26,9 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; -cl::opt KernelInfoEndLTO( - "kernel-info-end-lto", - cl::desc("add the kernel-info pass at the end of the full LTO pipeline"), +cl::opt NoKernelInfoEndLTO( + "no-kernel-info-end-lto", + cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), cl::init(false), cl::Hidden); //--------------------------------------------------------------------------- diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll similarity index 100% rename from llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll rename to llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll new file mode 100644 index 0000000000000..e969eabfe7cd8 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the AMD GPU target backend. + +; REQUIRES: amdgpu-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="amdgcn-amd-amdhsa" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. 
+; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll new file mode 100644 index 0000000000000..65249b4d92e34 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the NVPTX target backend. + +; REQUIRES: nvptx-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="nvptx64-nvidia-cuda" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll deleted file mode 100644 index 6d6e83e8d317f..0000000000000 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll +++ /dev/null @@ -1,22 +0,0 @@ -; Check that -kernel-info-end-lto enables kernel-info in the AMD GPU target -; backend. - -; REQUIRES: amdgpu-registered-target - -; -kernel-info-end-lto inserts kernel-info into LTO pipeline. 
-; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="amdgcn-amd-amdhsa" \ -; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %S/Inputs/test.ll - -; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="amdgcn-amd-amdhsa" \ -; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll - -; Omitting LTO disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="amdgcn-amd-amdhsa" \ -; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll deleted file mode 100644 index 1e427daed671e..0000000000000 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll +++ /dev/null @@ -1,22 +0,0 @@ -; Check that -kernel-info-end-lto enables kernel-info in the NVPTX target -; backend. - -; REQUIRES: nvptx-registered-target - -; -kernel-info-end-lto inserts kernel-info into LTO pipeline. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="nvptx64-nvidia-cuda" \ -; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %S/Inputs/test.ll - -; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="nvptx64-nvidia-cuda" \ -; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll - -; Omitting LTO disables kernel-info. 
-; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="nvptx64-nvidia-cuda" \ -; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll From d3beccfe9eb34636cd0015b867d6fcda8fa6ea26 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 11 Oct 2024 19:39:37 -0400 Subject: [PATCH 33/46] Apply clang-format --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 936de58633b87..bdf13d33c11c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -840,7 +840,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return onlyAllocateVGPRs; return nullptr; }); - } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { From 5a4b873d99545a5865a8577ea8f48f0aac4623d5 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 14 Oct 2024 17:57:50 -0400 Subject: [PATCH 34/46] Avoid auto, as requested --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 7 ++++--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6d92dccad076f..254fb72c0fb71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1414,14 +1414,15 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { void GCNTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { - auto MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + SmallVector MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]}); LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]}); LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]}); - auto FlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); + std::pair FlatWorkGroupSize = + ST->getFlatWorkGroupSizes(F); LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first}); LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second}); - auto WavesPerEU = ST->getWavesPerEU(F); + std::pair WavesPerEU = ST->getWavesPerEU(F); LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index d230a66449063..f0229c202c283 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -447,12 +447,13 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) 
const { - if (auto Val = getMaxClusterRank(F)) + std::optional Val; + if ((Val = getMaxClusterRank(F))) LB.push_back({"maxclusterrank", *Val}); - if (auto Val = getMaxNTIDx(F)) + if ((Val = getMaxNTIDx(F))) LB.push_back({"maxntidx", *Val}); - if (auto Val = getMaxNTIDy(F)) + if ((Val = getMaxNTIDy(F))) LB.push_back({"maxntidy", *Val}); - if (auto Val = getMaxNTIDz(F)) + if ((Val = getMaxNTIDz(F))) LB.push_back({"maxntidz", *Val}); } From 571181b4e1a5a5dd2f08841e9c637a933ab4451e Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 14 Oct 2024 17:58:03 -0400 Subject: [PATCH 35/46] For function name, use debug info or keep @ --- llvm/lib/Analysis/KernelInfo.cpp | 42 ++++++----- llvm/test/Analysis/KernelInfo/calls.ll | 75 +++++++++++-------- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 8 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 8 +- 4 files changed, 73 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 81085c8c6beba..8c25b3b901047 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -80,12 +80,27 @@ static bool isKernelFunction(Function &F) { return F.hasFnAttribute("kernel"); } -static void identifyFunction(OptimizationRemark &R, const Function &F) { - if (auto *SubProgram = F.getSubprogram()) { - if (SubProgram->isArtificial()) - R << "artificial "; +static void identifyCallee(OptimizationRemark &R, const Module *M, + const Value *V, StringRef Kind = "") { + SmallString<100> Name; // might be function name or asm expression + if (const Function *F = dyn_cast(V)) { + if (auto *SubProgram = F->getSubprogram()) { + if (SubProgram->isArtificial()) + R << "artificial "; + Name = SubProgram->getName(); + } } - R << "function '" << F.getName() << "'"; + if (Name.empty()) { + raw_svector_ostream OS(Name); + V->printAsOperand(OS, /*PrintType=*/false, M); + } + if (!Kind.empty()) + R << Kind << " "; + R << "'" << Name << "'"; +} + +static void 
identifyFunction(OptimizationRemark &R, const Function &F) { + identifyCallee(R, F.getParent(), &F, "function"); } static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, @@ -132,21 +147,8 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); R << "in "; identifyFunction(R, Caller); - R << ", " << CallKind << ", callee is"; - Value *Callee = Call.getCalledOperand(); - SmallString<100> Name; // might be function name or asm expression - if (const Function *FnCallee = dyn_cast(Callee)) { - if (auto *SubProgram = FnCallee->getSubprogram()) { - if (SubProgram->isArtificial()) - R << " artificial"; - } - Name = FnCallee->getName(); - } - if (Name.empty()) { - raw_svector_ostream OS(Name); - Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); - } - R << " '" << Name << "'"; + R << ", " << CallKind << ", callee is "; + identifyCallee(R, Caller.getParent(), Call.getCalledOperand()); return R; }); } diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index 2a2672c70b85c..6a2a5c426b78b 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -11,30 +11,30 @@ declare void @personality() define void @h() personality ptr @personality !dbg !100 { entry: - ; CHECK: remark: test.c:16:5: in artificial function 'h', direct call, callee is 'f' + ; CHECK: remark: test.c:16:5: in artificial function 'h_dbg', direct call, callee is '@f' call void @f(), !dbg !102 - ; CHECK: remark: test.c:17:5: in artificial function 'h', direct call to defined function, callee is 'g' + ; CHECK: remark: test.c:17:5: in artificial function 'h_dbg', direct call to defined function, callee is 'g_dbg' call void @g(), !dbg !104 - ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:18:5: in artificial function 'h_dbg', 
direct call to defined function, callee is artificial 'h_dbg' call void @h(), !dbg !105 - ; CHECK: remark: test.c:24:5: in artificial function 'h', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:24:5: in artificial function 'h_dbg', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' call void asm sideeffect "eieio", ""(), !dbg !111 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr' + ; CHECK: remark: test.c:19:5: in artificial function 'h_dbg', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !106 - ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' + ; CHECK: remark: test.c:20:5: in artificial function 'h_dbg', direct invoke, callee is '@f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 fcont: - ; CHECK: remark: test.c:21:5: in artificial function 'h', direct invoke to defined function, callee is 'g' + ; CHECK: remark: test.c:21:5: in artificial function 'h_dbg', direct invoke to defined function, callee is 'g_dbg' invoke void @g() to label %gcont unwind label %cleanup, !dbg !108 gcont: - ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:22:5: in artificial function 'h_dbg', direct invoke to defined function, callee is artificial 'h_dbg' invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: - ; CHECK: remark: test.c:25:5: in artificial function 'h', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:25:5: in artificial function 'h_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112 asmcont: - ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is 
'%fnPtr' + ; CHECK: remark: test.c:23:5: in artificial function 'h_dbg', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: %ll = landingpad { ptr, i32 } @@ -43,40 +43,40 @@ cleanup: end: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 8 -; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2 -; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:13:0: in artificial function 'h', InlineAssemblyCalls = 2 -; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 5 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCalls = 8 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', Invokes = 5 declare void @f() define void @g() personality ptr @personality !dbg !200 { entry: - ; CHECK: remark: test.c:6:3: in function 'g', direct call, callee is 'f' + ; CHECK: remark: test.c:6:3: in function 'g_dbg', direct call, callee is '@f' call void @f(), !dbg !202 - ; CHECK: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' + ; CHECK: remark: test.c:7:3: in function 'g_dbg', direct call to defined function, callee is 'g_dbg' call void @g(), !dbg !203 - ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:8:3: in function 'g_dbg', direct call to defined function, callee is artificial 'h_dbg' call void @h(), !dbg !204 - ; CHECK: remark: test.c:14:3: in function 'g', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:14:3: in function 'g_dbg', direct call 
to inline assembly, callee is 'asm sideeffect "eieio", ""' call void asm sideeffect "eieio", ""(), !dbg !210 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr' + ; CHECK: remark: test.c:9:3: in function 'g_dbg', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !205 - ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' + ; CHECK: remark: test.c:10:3: in function 'g_dbg', direct invoke, callee is '@f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 fcont: - ; CHECK: remark: test.c:11:3: in function 'g', direct invoke to defined function, callee is 'g' + ; CHECK: remark: test.c:11:3: in function 'g_dbg', direct invoke to defined function, callee is 'g_dbg' invoke void @g() to label %gcont unwind label %cleanup, !dbg !207 gcont: - ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:12:3: in function 'g_dbg', direct invoke to defined function, callee is artificial 'h_dbg' invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 hcont: - ; CHECK: remark: test.c:15:3: in function 'g', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:15:3: in function 'g_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211 asmcont: - ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr' + ; CHECK: remark: test.c:13:3: in function 'g_dbg', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 cleanup: %ll = landingpad { ptr, i32 } @@ -85,11 +85,22 @@ cleanup: end: ret void } -; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 8 -; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2 -; CHECK: remark: test.c:3:0: in function 
'g', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 2 -; CHECK: remark: test.c:3:0: in function 'g', Invokes = 5 +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCalls = 8 +; CHECK: remark: test.c:3:0: in function 'g_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:3:0: in function 'g_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', Invokes = 5 + +define void @i() { + ; CHECK: remark: :0:0: in function '@i', direct call, callee is '@f' + call void @f() + ret void +} +; CHECK: remark: :0:0: in function '@i', DirectCalls = 1 +; CHECK: remark: :0:0: in function '@i', IndirectCalls = 0 +; CHECK: remark: :0:0: in function '@i', DirectCallsToDefinedFunctions = 0 +; CHECK: remark: :0:0: in function '@i', InlineAssemblyCalls = 0 +; CHECK: remark: :0:0: in function '@i', Invokes = 0 !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -100,7 +111,7 @@ end: !3 = !{null} !4 = !{} -!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!100 = distinct !DISubprogram(name: "h_dbg", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) !101 = distinct !DISubroutineType(types: !3) !102 = !DILocation(line: 16, column: 5, scope: !103) !103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3) @@ -114,7 +125,7 @@ end: !111 = !DILocation(line: 24, column: 5, scope: !103) !112 = !DILocation(line: 25, column: 5, scope: !103) -!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!200 = distinct 
!DISubprogram(name: "g_dbg", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !201 = !DISubroutineType(types: !3) !202 = !DILocation(line: 6, column: 3, scope: !200) !203 = !DILocation(line: 7, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 17ded0b6d3753..10bfa164e2386 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -11,10 +11,10 @@ ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' -; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' -; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', 
omp_target_thread_limit = 256 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 @@ -58,7 +58,7 @@ ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 0 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 68c416acd6388..0d55cbbe79135 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -11,10 +11,10 @@ ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' -; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function 
'[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' -; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 128 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', maxntidx = 128 @@ -45,7 +45,7 @@ ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 From a5ce5477d12f0ca1c6020cde5c51a96887945b17 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 16 Oct 2024 12:40:30 -0400 Subject: [PATCH 36/46] Use anonymous namespace --- llvm/lib/Analysis/KernelInfo.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 8c25b3b901047..2f52e819036cb 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -28,6 +28,8 @@ using namespace llvm; #define DEBUG_TYPE "kernel-info" +namespace { + /// Data structure holding function info for kernels. 
class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, @@ -75,6 +77,8 @@ class KernelInfo { int64_t FlatAddrspaceAccesses = 0; }; +} // end anonymous namespace + static bool isKernelFunction(Function &F) { // TODO: Is this general enough? Consider languages beyond OpenMP. return F.hasFnAttribute("kernel"); From 4d60911942d3aa876599e4f9ad2a0e23f5b92bc3 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 16 Oct 2024 12:55:22 -0400 Subject: [PATCH 37/46] Remove currently unused capabilities, as requested They were originally copied from FunctionPropertiesAnalysis.cpp. --- llvm/lib/Analysis/KernelInfo.cpp | 38 ++++++++++++++------------------ 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 2f52e819036cb..3658f54923e3f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -32,8 +32,7 @@ namespace { /// Data structure holding function info for kernels. 
class KernelInfo { - void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); + void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE); public: static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, @@ -180,38 +179,37 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, }); } -void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, +void KernelInfo::updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE) { - assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); const DataLayout &DL = M.getDataLayout(); for (const Instruction &I : BB.instructionsWithoutDebug()) { if (const AllocaInst *Alloca = dyn_cast(&I)) { - Allocas += Direction; + ++Allocas; TypeSize::ScalarTy StaticSize = 0; if (std::optional Size = Alloca->getAllocationSize(DL)) { StaticSize = Size->getFixedValue(); assert(StaticSize <= std::numeric_limits::max()); - AllocasStaticSizeSum += Direction * StaticSize; + AllocasStaticSizeSum += StaticSize; } else { - AllocasDyn += Direction; + ++AllocasDyn; } remarkAlloca(ORE, F, *Alloca, StaticSize); } else if (const CallBase *Call = dyn_cast(&I)) { SmallString<40> CallKind; SmallString<40> RemarkKind; if (Call->isIndirectCall()) { - IndirectCalls += Direction; + ++IndirectCalls; CallKind += "indirect"; RemarkKind += "Indirect"; } else { - DirectCalls += Direction; + ++DirectCalls; CallKind += "direct"; RemarkKind += "Direct"; } if (isa(Call)) { - Invokes += Direction; + ++Invokes; CallKind += " invoke"; RemarkKind += "Invoke"; } else { @@ -221,12 +219,12 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, if (!Call->isIndirectCall()) { if (const Function *Callee = Call->getCalledFunction()) { if (!Callee->isIntrinsic() && !Callee->isDeclaration()) { - DirectCallsToDefinedFunctions += Direction; + ++DirectCallsToDefinedFunctions; CallKind += " to defined function"; RemarkKind 
+= "ToDefinedFunction"; } } else if (Call->isInlineAsm()) { - InlineAssemblyCalls += Direction; + ++InlineAssemblyCalls; CallKind += " to inline assembly"; RemarkKind += "ToInlineAssembly"; } @@ -234,34 +232,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { if (MI->getDestAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { if (MT->getSourceAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { if (Load->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const StoreInst *Store = dyn_cast(&I)) { if (Store->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } @@ -300,11 +298,9 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, } TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); - const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); for (const auto &BB : F) - if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE); + KI.updateForBB(BB, ORE); #define REMARK_PROPERTY(PROP_NAME) \ 
remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) From 0c30e7ceeb36294f4523da2590101314ca1c662d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 16 Oct 2024 13:11:12 -0400 Subject: [PATCH 38/46] Rename test files without LLVM IR to .test --- .../KernelInfo/enable-kernel-info/{amdgpu.ll => amdgpu.test} | 0 .../KernelInfo/enable-kernel-info/{nvptx.ll => nvptx.test} | 0 .../Analysis/KernelInfo/flat-addrspace/{amdgpu.ll => amdgpu.test} | 0 .../Analysis/KernelInfo/flat-addrspace/{nvptx.ll => nvptx.test} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/Analysis/KernelInfo/enable-kernel-info/{amdgpu.ll => amdgpu.test} (100%) rename llvm/test/Analysis/KernelInfo/enable-kernel-info/{nvptx.ll => nvptx.test} (100%) rename llvm/test/Analysis/KernelInfo/flat-addrspace/{amdgpu.ll => amdgpu.test} (100%) rename llvm/test/Analysis/KernelInfo/flat-addrspace/{nvptx.ll => nvptx.test} (100%) diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test similarity index 100% rename from llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll rename to llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test similarity index 100% rename from llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll rename to llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test similarity index 100% rename from llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test similarity index 100% rename from 
llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test From f5a6fbd408b111570e8d9b2e37655704b36a9ca3 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 17 Oct 2024 13:26:06 -0400 Subject: [PATCH 39/46] Regenerate OpenMP tests from current clang See llvm/test/Analysis/KernelInfo/openmp/README.md. --- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 125 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 1141 ++++++++--------- 2 files changed, 627 insertions(+), 639 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 10bfa164e2386..c2caf8267cae7 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -16,13 +16,12 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 256 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 256 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 1 +; 
CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 @@ -39,12 +38,13 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 1024 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 @@ -95,82 +95,75 @@ target triple = "amdgcn-amd-amdhsa" @__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 -@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_71f35_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_6f0c0_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 -@__omp_offloading_fd02_71f35_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_71f35_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_dynamic_environment to ptr) } +@__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_6f0c0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment to ptr) } @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 
500 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !16 { -entry: - %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) - %i = alloca i32, align 4, addrspace(5) - %a = alloca [2 x i32], align 4, addrspace(5) - %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr - %i.ascast = addrspacecast ptr addrspace(5) %i to ptr - %a.ascast = addrspacecast ptr addrspace(5) %a to ptr - store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 - %0 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_kernel_environment to ptr), ptr %dyn_ptr), !dbg !26 - %exec_user_code = icmp eq i32 %0, -1, !dbg !26 - br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 - -user_code.entry: ; preds = %entry - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !27, metadata !DIExpression()), !dbg !30 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !31, metadata !DIExpression()), !dbg !35 - call void @f() #5, !dbg !36 - call void @g() #5, !dbg !37 +define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = alloca i32, align 4, addrspace(5) + %4 = alloca [2 x i32], align 4, addrspace(5) + %5 = addrspacecast ptr addrspace(5) %2 to ptr + %6 = addrspacecast ptr addrspace(5) %3 to ptr + %7 = addrspacecast ptr addrspace(5) %4 to ptr + store ptr %0, ptr %5, align 8 + #dbg_declare(ptr addrspace(5) %2, !24, !DIExpression(), !25) + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_6f0c0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 + %9 = icmp eq i32 %8, -1, !dbg !26 + br i1 %9, label %10, label 
%11, !dbg !26 + +10: ; preds = %1 + #dbg_declare(ptr addrspace(5) %3, !27, !DIExpression(), !30) + #dbg_declare(ptr addrspace(5) %4, !31, !DIExpression(), !35) + call void @f() #4, !dbg !36 + call void @g() #4, !dbg !37 call void @__kmpc_target_deinit(), !dbg !38 ret void, !dbg !39 -worker.exit: ; preds = %entry +11: ; preds = %1 ret void, !dbg !26 } +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_6f0c0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %2 to ptr + store ptr %0, ptr %3, align 8 + #dbg_declare(ptr addrspace(5) %2, !41, !DIExpression(), !42) + %4 = load ptr, ptr %3, align 8, !dbg !43 + call void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr %4) #5, !dbg !43 + ret void, !dbg !43 +} + declare i32 @__kmpc_target_init(ptr, ptr) ; Function Attrs: convergent -declare void @f(...) #1 +declare void @f(...) #2 declare void @__kmpc_target_deinit() -; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_71f35_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { -entry: - %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) - %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr - store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 - %0 = load ptr, ptr %dyn_ptr.addr.ascast, align 8, !dbg !43 - call void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr %0) #6, !dbg !43 - ret void, !dbg !43 -} - ; Function Attrs: convergent noinline nounwind optnone define hidden void @g() #3 !dbg !44 { -entry: - %i = alloca i32, align 4, addrspace(5) - %a = alloca [2 x i32], align 4, addrspace(5) - %i.ascast = addrspacecast ptr addrspace(5) %i to ptr - %a.ascast = addrspacecast ptr 
addrspace(5) %a to ptr - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !47, metadata !DIExpression()), !dbg !48 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !49, metadata !DIExpression()), !dbg !50 - call void @f() #5, !dbg !51 - call void @g() #5, !dbg !52 + %1 = alloca i32, align 4, addrspace(5) + %2 = alloca [2 x i32], align 4, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %1 to ptr + %4 = addrspacecast ptr addrspace(5) %2 to ptr + #dbg_declare(ptr addrspace(5) %1, !47, !DIExpression(), !48) + #dbg_declare(ptr addrspace(5) %2, !49, !DIExpression(), !50) + call void @f() #4, !dbg !51 + call void @g() #4, !dbg !52 ret void, !dbg !53 } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #4 - -attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" 
"uniform-work-group-size"="true" } +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -attributes #5 = { convergent } -attributes #6 = { nounwind } +attributes #4 = { convergent } +attributes #5 = { nounwind } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!2} @@ -179,10 +172,10 @@ attributes #6 = { nounwind } !llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} !opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, 
!15, !15, !15, !15, !15, !15, !15, !15} -!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "eff61a7cf33c8dd1bd6933250fc90157") -!2 = !{i32 0, i32 64770, i32 466741, !"h", i32 12, i32 0, i32 0} -!3 = !{ptr @__omp_offloading_fd02_71f35_h_l12, !"kernel", i32 1} +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "854099697e49b3ca7d3b3c08503e6fef") +!2 = !{i32 0, i32 64770, i32 454848, !"h", i32 12, i32 0, i32 0} +!3 = !{ptr @__omp_offloading_fd02_6f0c0_h_l12, !"kernel", i32 1} !4 = !{i32 1, !"amdhsa_code_object_version", i32 500} !5 = !{i32 7, !"Dwarf Version", i32 5} !6 = !{i32 2, !"Debug Info Version", i32 3} @@ -192,10 +185,10 @@ attributes #6 = { nounwind } !10 = !{i32 8, !"PIC Level", i32 2} !11 = !{i32 7, !"frame-pointer", i32 2} !12 = !{i32 4, !"amdgpu_hostcall", i32 1} -!13 = !{!"clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)"} +!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} !14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} !15 = !{i32 2, i32 0} -!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: 
!0, retainedNodes: !23) +!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !17 = !DIFile(filename: "test.c", directory: "/tmp") !18 = !DISubroutineType(types: !19) !19 = !{null, !20} @@ -219,7 +212,7 @@ attributes #6 = { nounwind } !37 = !DILocation(line: 17, column: 5, scope: !28) !38 = !DILocation(line: 18, column: 3, scope: !28) !39 = !DILocation(line: 18, column: 3, scope: !16) -!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) !42 = !DILocation(line: 0, scope: !40) !43 = !DILocation(line: 12, column: 1, scope: !40) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 0d55cbbe79135..e717599aab687 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -16,8 +16,6 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in 
artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 128 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', maxntidx = 128 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 @@ -33,6 +31,8 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 128 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', maxntidx = 128 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 @@ -83,10 +83,10 @@ target triple = "nvptx64-nvidia-cuda" @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 -@0 = private unnamed_addr constant [59 x i8] c";test.c;__omp_offloading_10305_5c00dd_h_l12_debug__;13;3;;\00", align 1 -@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 58, ptr @0 }, align 8 -@__omp_offloading_10305_5c00dd_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_10305_5c00dd_h_l12_kernel_environment = weak_odr protected constant 
%struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_10305_5c00dd_h_l12_dynamic_environment } +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_10d1d6_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 +@__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_10d1d6_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment } @llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" @__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 @__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 @@ -101,371 +101,372 @@ target triple = "nvptx64-nvidia-cuda" @.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 @__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 @IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 -@.str1127 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str1124 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 @.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 
@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 @_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 -@.str544 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 -@.str847 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@.str541 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str844 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 -@.str948 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 -@.str1049 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 -@.str1150 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 -@.str1251 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 -@.str1352 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str945 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str1046 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1147 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1248 = private unnamed_addr constant [33 x i8] c"RunSchedVar == 
Other.RunSchedVar\00", align 1 +@.str1349 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 @.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 -@.str1553 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str1550 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 @.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 +@_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 @_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !17 { -entry: - %dyn_ptr.addr = alloca ptr, align 8 - %i = alloca i32, align 4 - %a = alloca [2 x i32], align 4 - store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 - tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 - %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_10305_5c00dd_h_l12_kernel_environment, ptr %dyn_ptr), !dbg !26 - %exec_user_code = icmp eq i32 %0, -1, !dbg !26 - br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 - -user_code.entry: ; preds = %entry - tail call void @llvm.dbg.declare(metadata ptr %i, metadata !27, metadata 
!DIExpression()), !dbg !30 - tail call void @llvm.dbg.declare(metadata ptr %a, metadata !31, metadata !DIExpression()), !dbg !35 - call void @f() #16, !dbg !36 - call void @g() #16, !dbg !37 - call void @__kmpc_target_deinit(), !dbg !38 - ret void, !dbg !39 - -worker.exit: ; preds = %entry - ret void, !dbg !26 +define internal void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { + %2 = alloca ptr, align 8 + %3 = alloca i32, align 4 + %4 = alloca [2 x i32], align 4 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !26, !DIExpression(), !27) + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_10d1d6_h_l12_kernel_environment, ptr %0), !dbg !28 + %6 = icmp eq i32 %5, -1, !dbg !28 + br i1 %6, label %7, label %8, !dbg !28 + +7: ; preds = %1 + #dbg_declare(ptr %3, !29, !DIExpression(), !32) + #dbg_declare(ptr %4, !33, !DIExpression(), !37) + call void @f() #16, !dbg !38 + call void @g() #16, !dbg !39 + call void @__kmpc_target_deinit(), !dbg !40 + ret void, !dbg !41 + +8: ; preds = %1 + ret void, !dbg !28 } -; Function Attrs: convergent -declare void @f(...) 
#1 - ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected void @__omp_offloading_10305_5c00dd_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { -entry: - %dyn_ptr.addr = alloca ptr, align 8 - store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 - tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 - %0 = load ptr, ptr %dyn_ptr.addr, align 8, !dbg !43 - call void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr %0) #17, !dbg !43 - ret void, !dbg !43 +define weak_odr protected void @__omp_offloading_fd02_10d1d6_h_l12(ptr noalias noundef %0) #1 !dbg !42 { + %2 = alloca ptr, align 8 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !43, !DIExpression(), !44) + %3 = load ptr, ptr %2, align 8, !dbg !45 + call void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr %3) #17, !dbg !45 + ret void, !dbg !45 } +; Function Attrs: convergent +declare void @f(...) #2 + ; Function Attrs: convergent noinline nounwind optnone -define hidden void @g() #3 !dbg !44 { -entry: - %i = alloca i32, align 4 - %a = alloca [2 x i32], align 4 - tail call void @llvm.dbg.declare(metadata ptr %i, metadata !47, metadata !DIExpression()), !dbg !48 - tail call void @llvm.dbg.declare(metadata ptr %a, metadata !49, metadata !DIExpression()), !dbg !50 - call void @f() #16, !dbg !51 - call void @g() #16, !dbg !52 - ret void, !dbg !53 +define hidden void @g() #3 !dbg !46 { + %1 = alloca i32, align 4 + %2 = alloca [2 x i32], align 4 + #dbg_declare(ptr %1, !49, !DIExpression(), !50) + #dbg_declare(ptr %2, !51, !DIExpression(), !52) + call void @f() #16, !dbg !53 + call void @g() #16, !dbg !54 + ret void, !dbg !55 } ; Function Attrs: convergent mustprogress nounwind -define internal noundef i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %KernelEnvironment, ptr nofree noundef nonnull align 8 dereferenceable(16) %KernelLaunchEnvironment) #4 { -entry: - %WorkFn.i = alloca 
ptr, align 8 - %ExecMode = getelementptr inbounds i8, ptr %KernelEnvironment, i64 2 - %0 = load i8, ptr %ExecMode, align 2, !tbaa !54 - %1 = and i8 %0, 2 - %tobool.not = icmp eq i8 %1, 0 - %2 = load i8, ptr %KernelEnvironment, align 8, !tbaa !60 - %tobool3.not = icmp ne i8 %2, 0 - br i1 %tobool.not, label %if.else, label %if.then - -if.then: ; preds = %entry - %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i.i = icmp eq i32 %3, 0 - br i1 %cmp.i.i.i, label %if.then.i, label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge - -if.then.i: ; preds = %if.then - store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %idxprom.i.i = zext nneg i32 %3 to i64 - %arrayidx.i.i = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i - %4 = addrspacecast ptr %arrayidx.i.i to ptr addrspace(3) - store i8 0, ptr addrspace(3) %4, align 1, !tbaa !62 - store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, 
ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 - store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 - br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit - -_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge: ; preds = %if.then - %idxprom.i.i.c = zext i32 %3 to i64 - %arrayidx.i.i.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i.c - %5 = addrspacecast ptr %arrayidx.i.i.c to ptr addrspace(3) - store i8 0, ptr addrspace(3) %5, align 1, !tbaa !62 - br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit - -_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit: ; preds = 
%_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge, %if.then.i +define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %0, ptr nofree noundef nonnull align 8 dereferenceable(16) %1) #4 { + %3 = alloca ptr, align 8 + %4 = getelementptr inbounds nuw i8, ptr %0, i64 2 + %5 = load i8, ptr %4, align 2, !tbaa !56 + %6 = and i8 %5, 2 + %7 = icmp eq i8 %6, 0 + %8 = load i8, ptr %0, align 8, !tbaa !62 + %9 = icmp ne i8 %8, 0 + br i1 %7, label %21, label %10 + +10: ; preds = %2 + %11 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %12 = icmp eq i32 %11, 0 + br i1 %12, label %13, label %14 + +13: ; preds = %10 + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !64 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 
28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + br label %18 + +14: ; preds = %10 + %15 = zext nneg i32 %11 to i64 + %16 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %15 + %17 = addrspacecast ptr %16 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %17, align 1, !tbaa !64 + br label %18 + +18: ; preds = %14, %13 + br i1 %12, label %19, label %20 + +19: ; preds = %18 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + br label %20 + +20: ; preds = %18, %19 tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 - br label %if.end - -if.else: ; preds = %entry - %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %sub.i.i.i7 = add i32 %6, -1 - %and.i.i.i8 = and i32 %sub.i.i.i7, -32 - %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i.i9 = icmp eq i32 %7, %and.i.i.i8 - br i1 %cmp.i.i.i9, label %if.then.i11, label %if.end.critedge - -if.then.i11: ; preds = %if.else - store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %idxprom.i.i13 = zext i32 %7 to i64 - %arrayidx.i.i14 = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) 
@_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13 - %8 = addrspacecast ptr %arrayidx.i.i14 to ptr addrspace(3) - store i8 0, ptr addrspace(3) %8, align 1, !tbaa !62 - store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) 
@_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 - store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 - br label %if.end - -if.end.critedge: ; preds = %if.else - %idxprom.i.i13.c = zext i32 %7 to i64 - %arrayidx.i.i14.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13.c - %9 = addrspacecast ptr %arrayidx.i.i14.c to ptr addrspace(3) - store i8 0, ptr addrspace(3) %9, align 1, !tbaa !62 - br label %if.end - -if.end: ; preds = %if.end.critedge, %if.then.i11, %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit - br i1 %tobool.not, label %if.end9, label %if.then7 - -if.then7: ; preds = %if.end - %10 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 - %11 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 - %and.i.i.i21 = and i32 %10, 1 - %and.i.i = and i32 %and.i.i.i21, %11 - %tobool.i.i = icmp ne i32 %and.i.i, 0 - %.pre67.i.i.i = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !80 - %cmp.i.i.i22 = icmp ne i32 %.pre67.i.i.i, 0 - %or.cond.not.i.i.i = select i1 %tobool.i.i, i1 %cmp.i.i.i22, i1 false - br i1 %or.cond.not.i.i.i, label %if.then.i.i.i, label %if.else.i.i.i - -if.then.i.i.i: ; preds = %if.then7 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str847, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + br label %37 + +21: ; preds = %2 + %22 = tail call range(i32 
1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %23 = add nsw i32 %22, -1 + %24 = and i32 %23, -32 + %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %26 = icmp eq i32 %25, %24 + br i1 %26, label %27, label %31 + +27: ; preds = %21 + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %28 = zext nneg i32 %25 to i64 + %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 + %30 = addrspacecast ptr %29 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %30, align 1, !tbaa !64 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr 
addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + br label %35 + +31: ; preds = %21 + %32 = zext nneg i32 %25 to i64 + %33 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %32 + %34 = addrspacecast ptr %33 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %34, align 1, !tbaa !64 + br label %35 + +35: ; preds = %31, %27 + br i1 %26, label %36, label %37 + +36: ; preds = %35 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + br label %37 + +37: ; preds = %36, %35, %20 + br i1 %7, label %100, label %38 + +38: ; preds = %37 + %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 + %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %41 = and i32 %39, 1 + %42 = and i32 %41, %40 + %43 = icmp ne i32 %42, 0 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !79 + %45 = icmp ne i32 %44, 0 + %46 = select i1 %43, i1 %45, i1 false + br i1 %46, label %47, label %48 + +47: ; preds = %38 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str844, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else.i.i.i: ; preds = %if.then7 - %cmp5.i.i.i = icmp eq i32 %.pre67.i.i.i, 0 - tail call void @llvm.assume(i1 noundef %cmp5.i.i.i) #21 - %12 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr 
addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !81 - br i1 %tobool.i.i, label %land.lhs.true7.i.i.i, label %if.else11.i.i.i +48: ; preds = %38 + %49 = icmp eq i32 %44, 0 + tail call void @llvm.assume(i1 noundef %49) #21 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !80 + br i1 %43, label %51, label %54 -land.lhs.true7.i.i.i: ; preds = %if.else.i.i.i - %cmp9.i.i.i = icmp eq i32 %12, 0 - br i1 %cmp9.i.i.i, label %if.else11.i.i.i, label %if.then10.i.i.i +51: ; preds = %48 + %52 = icmp eq i32 %50, 0 + br i1 %52, label %54, label %53 -if.then10.i.i.i: ; preds = %land.lhs.true7.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str948, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +53: ; preds = %51 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str945, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else11.i.i.i: ; preds = %land.lhs.true7.i.i.i, %if.else.i.i.i - %13 = phi i32 [ 0, %land.lhs.true7.i.i.i ], [ %12, %if.else.i.i.i ] - %cmp14.i.i.i = icmp eq i32 %13, 0 - tail call void @llvm.assume(i1 noundef %cmp14.i.i.i) #21 - %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !82 - br i1 %tobool.i.i, label %land.lhs.true17.i.i.i, label %if.else21.i.i.i +54: ; preds = %51, 
%48 + %55 = phi i32 [ 0, %51 ], [ %50, %48 ] + %56 = icmp eq i32 %55, 0 + tail call void @llvm.assume(i1 noundef %56) #21 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !81 + br i1 %43, label %58, label %61 -land.lhs.true17.i.i.i: ; preds = %if.else11.i.i.i - %cmp19.i.i.i = icmp eq i32 %14, 0 - br i1 %cmp19.i.i.i, label %if.else21.i.i.i, label %if.then20.i.i.i +58: ; preds = %54 + %59 = icmp eq i32 %57, 0 + br i1 %59, label %61, label %60 -if.then20.i.i.i: ; preds = %land.lhs.true17.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1049, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +60: ; preds = %58 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1046, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else21.i.i.i: ; preds = %land.lhs.true17.i.i.i, %if.else11.i.i.i - %15 = phi i32 [ 0, %land.lhs.true17.i.i.i ], [ %14, %if.else11.i.i.i ] - %cmp24.i.i.i = icmp eq i32 %15, 0 - tail call void @llvm.assume(i1 noundef %cmp24.i.i.i) #21 - %16 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !83 - br i1 %tobool.i.i, label %land.lhs.true27.i.i.i, label %if.else31.i.i.i +61: ; preds = %58, %54 + %62 = phi i32 [ 0, %58 ], [ %57, %54 ] + %63 = icmp eq i32 %62, 0 + tail call void @llvm.assume(i1 noundef %63) #21 + %64 = load i32, ptr 
addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !82 + br i1 %43, label %65, label %68 -land.lhs.true27.i.i.i: ; preds = %if.else21.i.i.i - %cmp29.i.i.i = icmp eq i32 %16, 1 - br i1 %cmp29.i.i.i, label %if.else31.i.i.i, label %if.then30.i.i.i +65: ; preds = %61 + %66 = icmp eq i32 %64, 1 + br i1 %66, label %68, label %67 -if.then30.i.i.i: ; preds = %land.lhs.true27.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1150, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +67: ; preds = %65 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1147, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else31.i.i.i: ; preds = %land.lhs.true27.i.i.i, %if.else21.i.i.i - %17 = phi i32 [ 1, %land.lhs.true27.i.i.i ], [ %16, %if.else21.i.i.i ] - %cmp34.i.i.i = icmp eq i32 %17, 1 - tail call void @llvm.assume(i1 noundef %cmp34.i.i.i) #21 - %18 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !84 - br i1 %tobool.i.i, label %land.lhs.true37.i.i.i, label %if.else.critedge.i.critedge.critedge.critedge +68: ; preds = %65, %61 + %69 = phi i32 [ 1, %65 ], [ %64, %61 ] + %70 = icmp eq i32 %69, 1 + tail call void @llvm.assume(i1 noundef %70) #21 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) 
@_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !83 + br i1 %43, label %72, label %93 -land.lhs.true37.i.i.i: ; preds = %if.else31.i.i.i - %cmp39.i.i.i = icmp eq i32 %18, 1 - br i1 %cmp39.i.i.i, label %if.else41.i.i.i, label %if.then40.i.i.i +72: ; preds = %68 + %73 = icmp eq i32 %71, 1 + br i1 %73, label %75, label %74 -if.then40.i.i.i: ; preds = %land.lhs.true37.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1251, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +74: ; preds = %72 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1248, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else41.i.i.i: ; preds = %land.lhs.true37.i.i.i - %cmp44.i.i.i = icmp eq i32 1, 1 - tail call void @llvm.assume(i1 noundef %cmp44.i.i.i) #21 - br i1 %tobool.i.i, label %land.lhs.true47.i.i.i, label %if.else.critedge.i.critedge +75: ; preds = %72 + %76 = icmp eq i32 1, 1 + tail call void @llvm.assume(i1 noundef %76) #21 + br i1 %43, label %77, label %95 -land.lhs.true47.i.i.i: ; preds = %if.else41.i.i.i - %19 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !85 - %cmp49.i.i.i = icmp eq i32 %19, 1 - br i1 %cmp49.i.i.i, label %if.else51.i.i.i, label %if.then50.i.i.i +77: ; preds = %75 + %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), 
align 8, !tbaa !84 + %79 = icmp eq i32 %78, 1 + br i1 %79, label %81, label %80 -if.then50.i.i.i: ; preds = %land.lhs.true47.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1352, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +80: ; preds = %77 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1349, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else51.i.i.i: ; preds = %land.lhs.true47.i.i.i - br i1 %tobool.i.i, label %land.lhs.true.i.i, label %if.else.critedge.i.critedge +81: ; preds = %77 + %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + %83 = icmp eq i32 %82, 1 + br i1 %83, label %85, label %84 -land.lhs.true.i.i: ; preds = %if.else51.i.i.i - %20 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 - %cmp.i.i = icmp eq i32 %20, 1 - br i1 %cmp.i.i, label %land.lhs.true8.i.i, label %if.then.i.i - -if.then.i.i: ; preds = %land.lhs.true.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 +84: ; preds = %81 + tail call fastcc void @__assert_fail_internal(ptr 
nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable -land.lhs.true8.i.i: ; preds = %land.lhs.true.i.i - %21 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 - %cmp10.i.i = icmp eq i32 %21, 0 - br i1 %cmp10.i.i, label %land.lhs.true.i24, label %if.then11.i.i +85: ; preds = %81 + %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + %87 = icmp eq i32 %86, 0 + br i1 %87, label %89, label %88 -if.then11.i.i: ; preds = %land.lhs.true8.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1553, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 +88: ; preds = %85 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1550, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable -land.lhs.true.i24: ; preds = %land.lhs.true8.i.i - %22 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %tobool.i25.i.not = icmp eq i32 %22, 0 - br i1 %tobool.i25.i.not, label %if.then.i25, label %_ZN4ompx5state18assumeInitialStateEb.exit +89: ; preds = %85 + %90 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %91 = icmp eq i32 %90, 0 + 
br i1 %91, label %92, label %98 -if.then.i25: ; preds = %land.lhs.true.i24 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 +92: ; preds = %89 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 unreachable -if.else.critedge.i.critedge.critedge.critedge: ; preds = %if.else31.i.i.i - %cmp44.i.i.i.c = icmp eq i32 %18, 1 - tail call void @llvm.assume(i1 noundef %cmp44.i.i.i.c) #21 - br label %if.else.critedge.i.critedge +93: ; preds = %68 + %94 = icmp eq i32 %71, 1 + tail call void @llvm.assume(i1 noundef %94) #21 + br label %95 -if.else.critedge.i.critedge: ; preds = %if.else41.i.i.i, %if.else.critedge.i.critedge.critedge.critedge, %if.else51.i.i.i - %.pre.i = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %23 = icmp ne i32 %.pre.i, 0 - br label %_ZN4ompx5state18assumeInitialStateEb.exit +95: ; preds = %75, %93 + %96 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %97 = icmp ne i32 %96, 0 + br label %98 -_ZN4ompx5state18assumeInitialStateEb.exit: ; preds = %land.lhs.true.i24, %if.else.critedge.i.critedge - %cmp8.i = phi i1 [ %23, %if.else.critedge.i.critedge ], [ true, %land.lhs.true.i24 ] - tail call void @llvm.assume(i1 noundef %cmp8.i) #21 +98: ; preds = %89, %95 + %99 = phi i1 [ %97, %95 ], [ true, %89 ] + tail call void @llvm.assume(i1 noundef %99) #21 tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 - br label %cleanup - -if.end9: ; preds = %if.end - %24 = tail call i32 
@llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %sub.i.i = add i32 %24, -1 - %and.i.i26 = and i32 %sub.i.i, -32 - %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i27 = icmp eq i32 %25, %and.i.i26 - br i1 %cmp.i.i27, label %cleanup, label %if.end12 - -if.end12: ; preds = %if.end9 - %sub.i = add i32 %24, -32 - %cmp = icmp ult i32 %25, %sub.i - %or.cond33 = and i1 %tobool3.not, %cmp - br i1 %or.cond33, label %do.body.i.preheader, label %cleanup - -do.body.i.preheader: ; preds = %if.end12 - %26 = load i32, ptr @__omp_rtl_debug_kind, align 4 - %27 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8 - %and.i.i29 = and i32 %26, 1 - %and.i = and i32 %and.i.i29, %27 - %tobool.i = icmp ne i32 %and.i, 0 - br label %do.body.i - -do.body.i: ; preds = %do.body.i.preheader, %if.end9.i - call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn.i) #22 - store ptr null, ptr %WorkFn.i, align 8, !tbaa !76 + br label %130 + +100: ; preds = %37 + %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %102 = add nsw i32 %101, -1 + %103 = and i32 %102, -32 + %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %105 = icmp eq i32 %104, %103 + br i1 %105, label %130, label %106 + +106: ; preds = %100 + %107 = add nsw i32 %101, -32 + %108 = icmp ult i32 %104, %107 + %109 = select i1 %9, i1 %108, i1 false + br i1 %109, label %110, label %130 + +110: ; preds = %106 + %111 = load i32, ptr @__omp_rtl_debug_kind, align 4 + %112 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8 + %113 = and i32 %111, 1 + %114 = and i32 %113, %112 + %115 = icmp ne i32 %114, 0 + br label %116 + +116: ; preds = %110, %128 + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #22 + store ptr null, ptr %3, align 8, !tbaa !74 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 - %call1.i = call zeroext 
i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn.i) #22 - %28 = load ptr, ptr %WorkFn.i, align 8, !tbaa !76 - %tobool.not.not.i = icmp eq ptr %28, null - br i1 %tobool.not.not.i, label %_ZL19genericStateMachineP7IdentTy.exit, label %if.end.i - -if.end.i: ; preds = %do.body.i - br i1 %call1.i, label %if.then3.i, label %if.end9.i - -if.then3.i: ; preds = %if.end.i - %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - %tobool.i30 = icmp ne i32 %29, 0 - %or.cond = select i1 %tobool.i, i1 %tobool.i30, i1 false - br i1 %or.cond, label %if.then6.i, label %if.else.i - -if.then6.i: ; preds = %if.then3.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 58, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 + %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #22 + %118 = load ptr, ptr %3, align 8, !tbaa !74 + %119 = icmp eq ptr %118, null + br i1 %119, label %129, label %120 + +120: ; preds = %116 + br i1 %117, label %121, label %128 + +121: ; preds = %120 + %122 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %123 = icmp ne i32 %122, 0 + %124 = select i1 %115, i1 %123, i1 false + br i1 %124, label %125, label %126 + +125: ; preds = %121 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(67) @.str15, i32 noundef 60, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 unreachable -if.else.i: ; preds = %if.then3.i - %tobool.i31.not = icmp eq i32 %29, 0 - tail call void @llvm.assume(i1 noundef %tobool.i31.not) #21 - tail call void %28(i32 noundef 0, i32 noundef 
%25) #23 +126: ; preds = %121 + %127 = icmp eq i32 %122, 0 + tail call void @llvm.assume(i1 noundef %127) #21 + tail call void %118(i32 noundef 0, i32 noundef %104) #23 tail call void @__kmpc_kernel_end_parallel() #24 - br label %if.end9.i + br label %128 -if.end9.i: ; preds = %if.else.i, %if.end.i +128: ; preds = %126, %120 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 - br label %do.body.i, !llvm.loop !86 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 + br label %116, !llvm.loop !86 -_ZL19genericStateMachineP7IdentTy.exit: ; preds = %do.body.i - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 - br label %cleanup +129: ; preds = %116 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 + br label %130 -cleanup: ; preds = %if.end12, %_ZL19genericStateMachineP7IdentTy.exit, %if.end9, %_ZN4ompx5state18assumeInitialStateEb.exit - %retval.0 = phi i32 [ -1, %_ZN4ompx5state18assumeInitialStateEb.exit ], [ -1, %if.end9 ], [ %25, %_ZL19genericStateMachineP7IdentTy.exit ], [ %25, %if.end12 ] - ret i32 %retval.0 +130: ; preds = %106, %129, %100, %98 + %131 = phi i32 [ -1, %98 ], [ -1, %100 ], [ %104, %129 ], [ %104, %106 ] + ret i32 %131 } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5 +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #6 + ; Function Attrs: convergent mustprogress noinline norecurse nounwind -define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %Ordering) local_unnamed_addr #6 { -entry: +define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %0) local_unnamed_addr #7 { tail call void @llvm.nvvm.barrier0() #25 
ret void } @@ -473,338 +474,332 @@ entry: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5 -; Function Attrs: convergent mustprogress noreturn nounwind -define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %expr, ptr noundef %msg, ptr nofree noundef nonnull dereferenceable(69) %file, i32 noundef %line, ptr nofree noundef nonnull dereferenceable(20) %function) unnamed_addr #7 { -entry: - %tmp = alloca %printf_args, align 8 - %tmp1 = alloca %printf_args.7, align 8 - %tobool.not = icmp eq ptr %msg, null - br i1 %tobool.not, label %if.else, label %if.then - -if.then: ; preds = %entry - store ptr %file, ptr %tmp, align 8 - %0 = getelementptr inbounds i8, ptr %tmp, i64 8 - store i32 %line, ptr %0, align 8 - %1 = getelementptr inbounds i8, ptr %tmp, i64 16 - store ptr %function, ptr %1, align 8 - br label %if.end - -if.else: ; preds = %entry - store ptr %file, ptr %tmp1, align 8 - %2 = getelementptr inbounds i8, ptr %tmp1, i64 8 - store i32 %line, ptr %2, align 8 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %.sink12 = phi i64 [ 16, %if.else ], [ 24, %if.then ] - %tmp1.sink11 = phi ptr [ %tmp1, %if.else ], [ %tmp, %if.then ] - %function.sink = phi ptr [ %function, %if.else ], [ %msg, %if.then ] - %.sink9 = phi i64 [ 24, %if.else ], [ 32, %if.then ] - %.str1.sink = phi ptr [ @.str1, %if.else ], [ @.str, %if.then ] - %3 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink12 - store ptr %function.sink, ptr %3, align 8 - %4 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink9 - store ptr %expr, ptr %4, align 8 - %call.i.i = call noundef i32 @vprintf(ptr noundef nonnull %.str1.sink, ptr noundef nonnull %tmp1.sink11) #24 +; Function Attrs: cold convergent mustprogress noreturn nounwind +define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %0, ptr noundef %1, ptr nofree 
noundef nonnull dereferenceable(66) %2, i32 noundef range(i32 60, 905) %3, ptr nofree noundef nonnull dereferenceable(20) %4) unnamed_addr #8 { + %6 = alloca %printf_args, align 8 + %7 = alloca %printf_args.7, align 8 + %8 = icmp eq ptr %1, null + br i1 %8, label %12, label %9 + +9: ; preds = %5 + store ptr %2, ptr %6, align 8 + %10 = getelementptr inbounds nuw i8, ptr %6, i64 8 + store i32 %3, ptr %10, align 8 + %11 = getelementptr inbounds nuw i8, ptr %6, i64 16 + store ptr %4, ptr %11, align 8 + br label %14 + +12: ; preds = %5 + store ptr %2, ptr %7, align 8 + %13 = getelementptr inbounds nuw i8, ptr %7, i64 8 + store i32 %3, ptr %13, align 8 + br label %14 + +14: ; preds = %12, %9 + %15 = phi i64 [ 16, %12 ], [ 24, %9 ] + %16 = phi ptr [ %7, %12 ], [ %6, %9 ] + %17 = phi ptr [ %4, %12 ], [ %1, %9 ] + %18 = phi i64 [ 24, %12 ], [ 32, %9 ] + %19 = phi ptr [ @.str1, %12 ], [ @.str, %9 ] + %20 = getelementptr inbounds nuw i8, ptr %16, i64 %15 + store ptr %17, ptr %20, align 8 + %21 = getelementptr inbounds nuw i8, ptr %16, i64 %18 + store ptr %0, ptr %21, align 8 + %22 = call i32 @vprintf(ptr noundef nonnull %19, ptr noundef nonnull %16) #22 call void @llvm.trap() #26 unreachable } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) -declare void @llvm.assume(i1 noundef) #8 +declare void @llvm.assume(i1 noundef) #9 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #9 +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #10 ; Function Attrs: convergent nocallback nounwind -declare void @llvm.nvvm.barrier.sync(i32) #10 +declare void @llvm.nvvm.barrier.sync(i32) #11 ; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) -define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull 
writeonly align 8 dereferenceable(8) %WorkFn) local_unnamed_addr #11 { -entry: - %0 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 - store ptr %0, ptr %WorkFn, align 8, !tbaa !76 - %tobool.not = icmp eq ptr %0, null - br i1 %tobool.not, label %return, label %if.end - -if.end: ; preds = %entry - %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 - %2 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !61 - %tobool.not.i = icmp eq i32 %2, 0 - %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %4 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - %tobool.i.not.i.i = icmp eq i32 %4, 0 - %mul.neg.i.i.i = select i1 %tobool.i.not.i.i, i32 -32, i32 0 - %sub.i.i.i = add i32 %mul.neg.i.i.i, %3 - %cond.i = select i1 %tobool.not.i, i32 %sub.i.i.i, i32 %2 - %cmp = icmp ult i32 %1, %cond.i - br label %return - -return: ; preds = %if.end, %entry - %retval.0 = phi i1 [ %cmp, %if.end ], [ false, %entry ] - ret i1 %retval.0 +define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %0) local_unnamed_addr #12 { + %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr %2, ptr %0, align 8, !tbaa !74 + %3 = icmp eq ptr %2, null + br i1 %3, label %15, label %4 + +4: ; preds = %1 + %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 + %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr 
addrspace(3)), align 4, !tbaa !63 + %7 = icmp eq i32 %6, 0 + %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %10 = icmp eq i32 %9, 0 + %11 = select i1 %10, i32 -32, i32 0 + %12 = add nsw i32 %11, %8 + %13 = select i1 %7, i32 %12, i32 %6 + %14 = icmp ult i32 %5, %13 + br label %15 + +15: ; preds = %4, %1 + %16 = phi i1 [ %14, %4 ], [ false, %1 ] + ret i1 %16 } ; Function Attrs: convergent mustprogress noinline nounwind -define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #12 { -entry: - %0 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 - %1 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 - %and.i.i = and i32 %0, 1 - %and.i = and i32 %and.i.i, %1 - %tobool.i = icmp ne i32 %and.i, 0 - %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - %tobool.i1 = icmp ne i32 %2, 0 - %or.cond = select i1 %tobool.i, i1 %tobool.i1, i1 false - br i1 %or.cond, label %if.then, label %if.else - -if.then: ; preds = %entry - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 297, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 +define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { + %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 + %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %3 = and i32 %1, 1 + %4 = and i32 %3, %2 + %5 = icmp ne i32 %4, 0 + %6 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %7 = icmp ne i32 %6, 0 + %8 = select i1 %5, i1 %7, i1 false + br i1 %8, label %9, label %10 + +9: ; preds = %0 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) 
@.str1124, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable -if.else: ; preds = %entry - %tobool.i2.not = icmp eq i32 %2, 0 - tail call void @llvm.assume(i1 noundef %tobool.i2.not) #21 - %3 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !61 - %tobool.not.i.i = icmp eq i32 %3, 0 - %4 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8 - %tobool.not.i = icmp ne i32 %4, 0 - %or.cond.not.i = select i1 %tobool.not.i.i, i1 %tobool.not.i, i1 false - br i1 %or.cond.not.i, label %lor.rhs.i, label %_ZN4ompx5state19resetStateForThreadEj.exit - -lor.rhs.i: ; preds = %if.else - %5 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 - %6 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - %idxprom.i = zext i32 %5 to i64 - %arrayidx.i = getelementptr inbounds ptr, ptr %6, i64 %idxprom.i - %7 = load ptr, ptr %arrayidx.i, align 8, !tbaa !76 - %tobool1.not.i = icmp eq ptr %7, null - br i1 %tobool1.not.i, label %_ZN4ompx5state19resetStateForThreadEj.exit, label %if.end4.i, !prof !88 - -if.end4.i: ; preds = %lor.rhs.i - %PreviousThreadState7.i = getelementptr inbounds i8, ptr %7, i64 32 - %8 = load ptr, ptr %PreviousThreadState7.i, align 8, !tbaa !89 - tail call void @free(ptr noundef nonnull dereferenceable(40) %7) #28 - %9 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - %arrayidx11.i = getelementptr inbounds ptr, ptr %9, i64 %idxprom.i - store ptr %8, ptr %arrayidx11.i, align 8, !tbaa !76 - %.pre = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - br label %_ZN4ompx5state19resetStateForThreadEj.exit - -_ZN4ompx5state19resetStateForThreadEj.exit: ; preds = %if.else, %lor.rhs.i, %if.end4.i - %10 = phi i32 [ 0, %if.else ], [ 0, %lor.rhs.i ], [ %.pre, %if.end4.i ] - %tobool.i6 = icmp ne 
i32 %10, 0 - %or.cond8 = select i1 %tobool.i, i1 %tobool.i6, i1 false - br i1 %or.cond8, label %if.then7, label %if.else8 - -if.then7: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 300, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 +10: ; preds = %0 + %11 = icmp eq i32 %6, 0 + tail call void @llvm.assume(i1 noundef %11) #21 + %12 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !63 + %13 = icmp eq i32 %12, 0 + %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8 + %15 = icmp ne i32 %14, 0 + %16 = select i1 %13, i1 %15, i1 false + br i1 %16, label %17, label %30 + +17: ; preds = %10 + %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 + %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %20 = zext nneg i32 %18 to i64 + %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20 + %22 = load ptr, ptr %21, align 8, !tbaa !74 + %23 = icmp eq ptr %22, null + br i1 %23, label %30, label %24, !prof !88 + +24: ; preds = %17 + %25 = getelementptr inbounds nuw i8, ptr %22, i64 32 + %26 = load ptr, ptr %25, align 8, !tbaa !89 + tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28 + %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20 + store ptr %26, ptr %28, align 8, !tbaa !74 + %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + br label %30 + +30: ; preds = %10, %17, %24 + %31 = phi i32 [ 0, %10 ], [ 0, %17 ], [ %29, %24 ] + %32 = icmp ne i32 %31, 0 + %33 = select i1 %5, i1 %32, i1 false 
+ br i1 %33, label %34, label %35 + +34: ; preds = %30 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1124, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable -if.else8: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit - %tobool.i7.not = icmp eq i32 %10, 0 - tail call void @llvm.assume(i1 noundef %tobool.i7.not) #21 +35: ; preds = %30 + %36 = icmp eq i32 %31, 0 + tail call void @llvm.assume(i1 noundef %36) #21 ret void } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #9 +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #10 ; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) -declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #13 +declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #14 ; Function Attrs: convergent -declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #14 +declare i32 @vprintf(ptr, ptr) local_unnamed_addr #2 ; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write) declare void @llvm.trap() #15 ; Function Attrs: convergent nocallback nounwind -declare void @llvm.nvvm.barrier0() #10 +declare void @llvm.nvvm.barrier0() #11 ; Function Attrs: convergent mustprogress nounwind define internal void @__kmpc_target_deinit() #4 { -entry: - %WorkFn = alloca ptr, align 8 - %0 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %tobool.i.not = icmp eq i32 %0, 0 - br i1 %tobool.i.not, label %if.end, label %cleanup - -if.end: ; preds = %entry - %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %sub.i.i = add i32 %1, -1 - %and.i.i = and i32 %sub.i.i, -32 - 
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i = icmp eq i32 %2, %and.i.i - br i1 %cmp.i.i, label %if.then3, label %if.else - -if.then3: ; preds = %if.end - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 - br label %cleanup - -if.else: ; preds = %if.end - %3 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 - %4 = load i8, ptr %3, align 8, !tbaa !91 - %tobool6.not = icmp eq i8 %4, 0 - br i1 %tobool6.not, label %if.then7, label %cleanup - -if.then7: ; preds = %if.else - call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn) #29 - store ptr null, ptr %WorkFn, align 8, !tbaa !76 - %call8 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) #22 - %5 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 - %6 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 - %and.i.i1 = and i32 %5, 1 - %and.i = and i32 %and.i.i1, %6 - %tobool.i2.not = icmp eq i32 %and.i, 0 - %7 = load ptr, ptr %WorkFn, align 8 - %cmp = icmp eq ptr %7, null - %or.cond = select i1 %tobool.i2.not, i1 true, i1 %cmp - br i1 %or.cond, label %if.else11, label %if.then10 - -if.then10: ; preds = %if.then7 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 150, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 + %1 = alloca ptr, align 8 + %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %4, label %27 + +4: ; preds = %0 + %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %6 = add 
nsw i32 %5, -1 + %7 = and i32 %6, -32 + %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %9 = icmp eq i32 %8, %7 + br i1 %9, label %10, label %11 + +10: ; preds = %4 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + br label %27 + +11: ; preds = %4 + %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 + %13 = load i8, ptr %12, align 8, !tbaa !91 + %14 = icmp eq i8 %13, 0 + br i1 %14, label %15, label %27 + +15: ; preds = %11 + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29 + store ptr null, ptr %1, align 8, !tbaa !74 + %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #22 + %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 + %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %19 = and i32 %17, 1 + %20 = and i32 %19, %18 + %21 = icmp eq i32 %20, 0 + %22 = load ptr, ptr %1, align 8 + %23 = icmp eq ptr %22, null + %24 = select i1 %21, i1 true, i1 %23 + br i1 %24, label %26, label %25 + +25: ; preds = %15 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(67) @.str15, i32 noundef 152, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 unreachable -if.else11: ; preds = %if.then7 - tail call void @llvm.assume(i1 noundef %cmp) #21 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn) #22 - br label %cleanup +26: ; preds = %15 + tail call void @llvm.assume(i1 noundef %23) #21 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %1) #22 + br label %27 -cleanup: ; preds = 
%if.else11, %if.else, %if.then3, %entry +27: ; preds = %26, %11, %10, %0 ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #5 - -attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #3 = { convergent noinline nounwind optnone 
"frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -attributes #6 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #7 = { convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #8 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -attributes #10 = { convergent nocallback nounwind } -attributes #11 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #12 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #13 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #14 = { convergent "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #6 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #7 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #8 = { cold convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #10 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #11 = { convergent nocallback nounwind } +attributes #12 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #13 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #14 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } attributes #15 = { cold noreturn nounwind memory(inaccessiblemem: write) } attributes #16 = { convergent } attributes #17 = { nounwind } attributes #18 = { "llvm.assume"="ompx_no_call_asm" } -attributes #19 = { convergent nounwind "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #19 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" } attributes #20 = { noreturn nounwind "llvm.assume"="ompx_no_call_asm" } attributes #21 = { memory(write) "llvm.assume"="ompx_no_call_asm" } attributes #22 = { nounwind "llvm.assume"="ompx_no_call_asm" } attributes #23 = { convergent nounwind } attributes #24 = { convergent nounwind "llvm.assume"="ompx_no_call_asm" } -attributes #25 = { "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #25 = { "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" } attributes #26 = { noreturn "llvm.assume"="ompx_no_call_asm" } attributes #27 = { nofree willreturn "llvm.assume"="ompx_no_call_asm" } attributes #28 = { convergent nounwind willreturn "llvm.assume"="ompx_no_call_asm" } attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } -!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} -!llvm.dbg.cu = !{!10} -!nvvm.annotations = !{!12, !13} -!omp_offload.info = !{!14} -!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10} +!llvm.dbg.cu = !{!11} +!nvvm.annotations = !{!13, !14} +!omp_offload.info = !{!15} +!llvm.ident = !{!16, !17, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16} +!nvvmir.version = !{!18} -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]} !1 = !{i32 7, !"Dwarf Version", i32 2} !2 = !{i32 2, !"Debug Info Version", i32 3} !3 = !{i32 1, 
!"wchar_size", i32 4} -!4 = !{i32 7, !"openmp", i32 51} -!5 = !{i32 7, !"openmp-device", i32 51} -!6 = !{i32 8, !"PIC Level", i32 2} -!7 = !{i32 7, !"frame-pointer", i32 2} -!8 = !{i32 1, !"ThinLTO", i32 0} -!9 = !{i32 1, !"EnableSplitLTOUnit", i32 1} -!10 = distinct !DICompileUnit(language: DW_LANG_C11, file: !11, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!11 = !DIFile(filename: "test.c", directory: "/tmp") -!12 = !{ptr @__omp_offloading_10305_5c00dd_h_l12_debug__, !"maxntidx", i32 128} -!13 = !{ptr @__omp_offloading_10305_5c00dd_h_l12, !"kernel", i32 1} -!14 = !{i32 0, i32 66309, i32 6029533, !"h", i32 12, i32 0, i32 0} -!15 = !{!"clang version 19.0.0git"} -!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} -!17 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12_debug__", scope: !11, file: !11, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) -!18 = !DISubroutineType(types: !19) -!19 = !{null, !20} -!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) -!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) -!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!23 = !{} -!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !17, type: !20, flags: DIFlagArtificial) -!25 = !DILocation(line: 0, scope: !17) -!26 = !DILocation(line: 13, column: 3, scope: !17) -!27 = !DILocalVariable(name: "i", scope: !28, file: !11, line: 14, type: !29) -!28 = distinct !DILexicalBlock(scope: !17, file: !11, line: 13, column: 3) -!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!30 = !DILocation(line: 14, column: 9, scope: !28) -!31 = !DILocalVariable(name: "a", scope: !28, file: !11, line: 15, type: !32) -!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) 
-!33 = !{!34} -!34 = !DISubrange(count: 2) -!35 = !DILocation(line: 15, column: 9, scope: !28) -!36 = !DILocation(line: 16, column: 5, scope: !28) -!37 = !DILocation(line: 17, column: 5, scope: !28) -!38 = !DILocation(line: 18, column: 3, scope: !28) -!39 = !DILocation(line: 18, column: 3, scope: !17) -!40 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12", scope: !11, file: !11, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) -!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) -!42 = !DILocation(line: 0, scope: !40) -!43 = !DILocation(line: 12, column: 1, scope: !40) -!44 = distinct !DISubprogram(name: "g", scope: !11, file: !11, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !10, retainedNodes: !23) -!45 = !DISubroutineType(types: !46) -!46 = !{null} -!47 = !DILocalVariable(name: "i", scope: !44, file: !11, line: 4, type: !29) -!48 = !DILocation(line: 4, column: 7, scope: !44) -!49 = !DILocalVariable(name: "a", scope: !44, file: !11, line: 5, type: !32) -!50 = !DILocation(line: 5, column: 7, scope: !44) -!51 = !DILocation(line: 6, column: 3, scope: !44) -!52 = !DILocation(line: 7, column: 3, scope: !44) -!53 = !DILocation(line: 8, column: 1, scope: !44) -!54 = !{!55, !58, i64 2} -!55 = !{!"_ZTS26ConfigurationEnvironmentTy", !56, i64 0, !56, i64 1, !58, i64 2, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} -!56 = !{!"omnipotent char", !57, i64 0} -!57 = !{!"Simple C++ TBAA"} -!58 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !56, i64 0} -!59 = !{!"int", !56, i64 0} -!60 = !{!55, !56, i64 0} -!61 = !{!59, !59, i64 0} -!62 = !{!56, !56, i64 0} -!63 = !{!64, !59, i64 0} -!64 = !{!"_ZTSN4ompx5state11TeamStateTyE", !65, i64 0, !59, i64 28, !59, i64 32, !66, i64 40} -!65 = !{!"_ZTSN4ompx5state10ICVStateTyE", !59, i64 0, !59, i64 4, !59, i64 8, 
!59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} -!66 = !{!"any pointer", !56, i64 0} -!67 = !{!64, !59, i64 4} -!68 = !{!64, !59, i64 8} -!69 = !{!64, !59, i64 12} -!70 = !{!64, !59, i64 16} -!71 = !{!64, !59, i64 20} -!72 = !{!64, !59, i64 24} -!73 = !{!64, !59, i64 28} -!74 = !{!64, !59, i64 32} -!75 = !{!64, !66, i64 40} -!76 = !{!66, !66, i64 0} -!77 = !{!78, !59, i64 0} -!78 = !{!"_ZTS19DeviceEnvironmentTy", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !79, i64 16, !79, i64 24, !79, i64 32, !79, i64 40} -!79 = !{!"long", !56, i64 0} -!80 = !{!65, !59, i64 0} -!81 = !{!65, !59, i64 4} -!82 = !{!65, !59, i64 8} -!83 = !{!65, !59, i64 16} -!84 = !{!65, !59, i64 20} -!85 = !{!65, !59, i64 24} +!4 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!5 = !{i32 7, !"openmp", i32 51} +!6 = !{i32 7, !"openmp-device", i32 51} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = !{i32 1, !"ThinLTO", i32 0} +!10 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!12 = !DIFile(filename: "test.c", directory: "/tmp") +!13 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"maxntidx", i32 128} +!14 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"kernel", i32 1} +!15 = !{i32 0, i32 64770, i32 1102294, !"h", i32 12, i32 0, i32 0} +!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} +!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!18 = !{i32 2, i32 0} +!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_10d1d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!20 = !DISubroutineType(types: 
!21) +!21 = !{null, !22} +!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !23) +!23 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !24) +!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!25 = !{} +!26 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !19, type: !22, flags: DIFlagArtificial) +!27 = !DILocation(line: 0, scope: !19) +!28 = !DILocation(line: 13, column: 3, scope: !19) +!29 = !DILocalVariable(name: "i", scope: !30, file: !12, line: 14, type: !31) +!30 = distinct !DILexicalBlock(scope: !19, file: !12, line: 13, column: 3) +!31 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!32 = !DILocation(line: 14, column: 9, scope: !30) +!33 = !DILocalVariable(name: "a", scope: !30, file: !12, line: 15, type: !34) +!34 = !DICompositeType(tag: DW_TAG_array_type, baseType: !31, size: 64, elements: !35) +!35 = !{!36} +!36 = !DISubrange(count: 2) +!37 = !DILocation(line: 15, column: 9, scope: !30) +!38 = !DILocation(line: 16, column: 5, scope: !30) +!39 = !DILocation(line: 17, column: 5, scope: !30) +!40 = !DILocation(line: 18, column: 3, scope: !30) +!41 = !DILocation(line: 18, column: 3, scope: !19) +!42 = distinct !DISubprogram(name: "__omp_offloading_fd02_10d1d6_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!43 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !42, type: !22, flags: DIFlagArtificial) +!44 = !DILocation(line: 0, scope: !42) +!45 = !DILocation(line: 12, column: 1, scope: !42) +!46 = distinct !DISubprogram(name: "g", scope: !12, file: !12, line: 3, type: !47, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !11, retainedNodes: !25) +!47 = !DISubroutineType(types: !48) +!48 = !{null} +!49 = !DILocalVariable(name: "i", scope: !46, file: !12, line: 4, type: !31) +!50 = !DILocation(line: 4, column: 7, scope: !46) +!51 = !DILocalVariable(name: "a", scope: 
!46, file: !12, line: 5, type: !34) +!52 = !DILocation(line: 5, column: 7, scope: !46) +!53 = !DILocation(line: 6, column: 3, scope: !46) +!54 = !DILocation(line: 7, column: 3, scope: !46) +!55 = !DILocation(line: 8, column: 1, scope: !46) +!56 = !{!57, !60, i64 2} +!57 = !{!"_ZTS26ConfigurationEnvironmentTy", !58, i64 0, !58, i64 1, !60, i64 2, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} +!58 = !{!"omnipotent char", !59, i64 0} +!59 = !{!"Simple C++ TBAA"} +!60 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !58, i64 0} +!61 = !{!"int", !58, i64 0} +!62 = !{!57, !58, i64 0} +!63 = !{!61, !61, i64 0} +!64 = !{!58, !58, i64 0} +!65 = !{!66, !61, i64 16} +!66 = !{!"_ZTSN4ompx5state11TeamStateTyE", !67, i64 0, !61, i64 28, !61, i64 32, !68, i64 40} +!67 = !{!"_ZTSN4ompx5state10ICVStateTyE", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} +!68 = !{!"any pointer", !58, i64 0} +!69 = !{!66, !61, i64 20} +!70 = !{!66, !61, i64 24} +!71 = !{!66, !61, i64 28} +!72 = !{!66, !61, i64 32} +!73 = !{!66, !68, i64 40} +!74 = !{!68, !68, i64 0} +!75 = !{i32 1, i32 1025} +!76 = !{!77, !61, i64 0} +!77 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !78, i64 16, !78, i64 24, !78, i64 32, !78, i64 40} +!78 = !{!"long", !58, i64 0} +!79 = !{!67, !61, i64 0} +!80 = !{!67, !61, i64 4} +!81 = !{!67, !61, i64 8} +!82 = !{!67, !61, i64 16} +!83 = !{!67, !61, i64 20} +!84 = !{!67, !61, i64 24} +!85 = !{i32 0, i32 1024} !86 = distinct !{!86, !87} !87 = !{!"llvm.loop.mustprogress"} -!88 = !{!"branch_weights", i32 2000, i32 1} -!89 = !{!90, !66, i64 32} -!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !65, i64 0, !66, i64 32} -!91 = !{!92, !56, i64 0} -!92 = !{!"_ZTS19KernelEnvironmentTy", !55, i64 0, !66, i64 32, !66, i64 40} +!88 = !{!"branch_weights", !"expected", i32 2000, i32 1} +!89 = !{!90, !68, i64 32} +!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !68, i64 32} +!91 = !{!92, 
!58, i64 0} +!92 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !68, i64 32, !68, i64 40} From baad223fc0772766f6a9463635a4bd681d435b2b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 17 Oct 2024 15:44:34 -0400 Subject: [PATCH 40/46] Include LLVM value name in alloca report --- llvm/lib/Analysis/KernelInfo.cpp | 19 ++++++++++--------- llvm/test/Analysis/KernelInfo/allocas.ll | 16 +++++++++------- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 12 ++++++------ llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 12 ++++++------ 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 3658f54923e3f..f9832a6deb75a 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -110,13 +110,13 @@ static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, const AllocaInst &Alloca, TypeSize::ScalarTy StaticSize) { ORE.emit([&] { - StringRef Name; + StringRef DbgName; DebugLoc Loc; bool Artificial = false; auto DVRs = findDVRDeclares(&const_cast(Alloca)); if (!DVRs.empty()) { const DbgVariableRecord &DVR = **DVRs.begin(); - Name = DVR.getVariable()->getName(); + DbgName = DVR.getVariable()->getName(); Loc = DVR.getDebugLoc(); Artificial = DVR.Variable->isArtificial(); } @@ -127,13 +127,14 @@ static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, R << ", "; if (Artificial) R << "artificial "; - if (Name.empty()) { - R << "unnamed alloca "; - if (DVRs.empty()) - R << "(missing debug metadata) "; - } else { - R << "alloca '" << Name << "' "; - } + SmallString<20> ValName; + raw_svector_ostream OS(ValName); + Alloca.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << "alloca ('" << ValName << "') "; + if (!DbgName.empty()) + R << "for '" << DbgName << "' "; + else + R << "without debug info "; R << "with "; if (StaticSize) R << "static size of " << itostr(StaticSize) << " bytes"; diff --git 
a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll index 048d53799c33e..3ecde004a9b2a 100644 --- a/llvm/test/Analysis/KernelInfo/allocas.ll +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -9,26 +9,28 @@ target triple = "nvptx64-nvidia-cuda" define void @h() !dbg !3 { entry: - ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca 'dyn_ptr' with static size of 8 bytes + ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca ('%dyn_ptr.addr') for 'dyn_ptr' with static size of 8 bytes %dyn_ptr.addr = alloca ptr, align 8 - ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca 'i' with static size of 4 bytes + ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca ('%i') for 'i' with static size of 4 bytes %i = alloca i32, align 4 - ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca 'a' with static size of 8 bytes + ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca ('%a') for 'a' with static size of 8 bytes %a = alloca [2 x i32], align 4 + ; CHECK: remark: :0:0: in artificial function 'h', alloca ('%nodbg') without debug info with static size of 4 bytes + %nodbg = alloca i32, align 4 tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !7, metadata !DIExpression()), !dbg !11 tail call void @llvm.dbg.declare(metadata ptr %i, metadata !12, metadata !DIExpression()), !dbg !15 tail call void @llvm.dbg.declare(metadata ptr %a, metadata !16, metadata !DIExpression()), !dbg !20 ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 3 -; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 20 +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 24 ; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 0 define void @g() !dbg !21 { entry: - ; CHECK: remark: test.c:4:7: in 
function 'g', alloca 'i' with static size of 4 bytes + ; CHECK: remark: test.c:4:7: in function 'g', alloca ('%i') for 'i' with static size of 4 bytes %i = alloca i32, align 4 - ; CHECK: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes + ; CHECK: remark: test.c:5:7: in function 'g', alloca ('%a') for 'a' with static size of 8 bytes %a = alloca [2 x i32], align 4 tail call void @llvm.dbg.declare(metadata ptr %i, metadata !23, metadata !DIExpression()), !dbg !24 tail call void @llvm.dbg.declare(metadata ptr %a, metadata !25, metadata !DIExpression()), !dbg !26 diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index c2caf8267cae7..246eccaac2fc0 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -7,9 +7,9 @@ ; RUN: FileCheck -match-full-lines %s ; CHECK-NOT: remark: -; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function
'[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' @@ -33,7 +33,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 -; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' @@ -56,8 +56,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 -; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll
b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index e717599aab687..656171896a4ff 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -7,9 +7,9 @@ ; RUN: FileCheck -match-full-lines %s ; CHECK-NOT: remark: -; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' @@ -26,7 +26,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 -; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with
static size of 8 bytes ; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' @@ -43,8 +43,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 -; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 From c9aebce3b1d7fd489970e68f51621c1009559a62 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 27 Nov 2024 13:08:55 -0500 Subject: [PATCH 41/46] Update expected amdgpu-max-num-workgroups default values Due to 0b40f979298a.
--- .../KernelInfo/launch-bounds/amdgpu.ll | 6 +++--- llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll index d37dceec003f9..7fbdb923d8800 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -25,9 +25,9 @@ entry: ; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_num_teams = {{.*}} ; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_thread_limit = {{.*}} -; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] = 0 -; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 0 -; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[0] = 1 ; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[1] = 1024 ; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[0] = 4 diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 246eccaac2fc0..f9aadb21825f9 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: 
test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4 @@ -39,9 +39,9 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in 
artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1 @@ -61,9 +61,9 @@ ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4 From 151bfb3529c8bf62ad98243c7583450a6d1354b7 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 27 Dec 2024 14:54:58 -0500 Subject: [PATCH 42/46] Regenerate OpenMP tests from current clang See llvm/test/Analysis/KernelInfo/openmp/README.md. 
--- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 29 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 247 ++++++++++-------- 2 files changed, 145 insertions(+), 131 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index f9aadb21825f9..6016919ec8280 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -79,7 +79,6 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} - ; ModuleID = 'test-openmp-amdgcn-amd-amdhsa.bc' source_filename = "test.c" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" @@ -95,14 +94,14 @@ target triple = "amdgcn-amd-amdhsa" @__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 -@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_6f0c0_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_624a0_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 -@__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_6f0c0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) 
@__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment to ptr) } +@__omp_offloading_fd02_624a0_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_624a0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_624a0_h_l12_dynamic_environment to ptr) } @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { +define internal void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { %2 = alloca ptr, align 8, addrspace(5) %3 = alloca i32, align 4, addrspace(5) %4 = alloca [2 x i32], align 4, addrspace(5) @@ -111,7 +110,7 @@ define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noun %7 = addrspacecast ptr addrspace(5) %4 to ptr store ptr %0, ptr %5, align 8 #dbg_declare(ptr addrspace(5) %2, !24, !DIExpression(), !25) - %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_6f0c0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_624a0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 %9 = icmp eq i32 %8, -1, !dbg !26 br i1 %9, label %10, label %11, !dbg !26 @@ -128,13 +127,13 @@ define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noun } ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_6f0c0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { +define 
weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_624a0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { %2 = alloca ptr, align 8, addrspace(5) %3 = addrspacecast ptr addrspace(5) %2 to ptr store ptr %0, ptr %3, align 8 #dbg_declare(ptr addrspace(5) %2, !41, !DIExpression(), !42) %4 = load ptr, ptr %3, align 8, !dbg !43 - call void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr %4) #5, !dbg !43 + call void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr %4) #5, !dbg !43 ret void, !dbg !43 } @@ -172,10 +171,10 @@ attributes #5 = { nounwind } !llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} !opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} -!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "854099697e49b3ca7d3b3c08503e6fef") -!2 = !{i32 0, i32 64770, i32 454848, !"h", i32 12, i32 0, i32 0} -!3 = !{ptr @__omp_offloading_fd02_6f0c0_h_l12, !"kernel", i32 1} +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "44c4bbdbb9b7a9c7492ced3432d74b0c") +!2 = !{i32 0, i32 64770, i32 402592, !"h", i32 12, i32 0, i32 0} +!3 = !{ptr @__omp_offloading_fd02_624a0_h_l12, !"kernel", i32 1} !4 = !{i32 1, !"amdhsa_code_object_version", i32 500} !5 = !{i32 7, !"Dwarf Version", i32 5} !6 = !{i32 2, !"Debug Info Version", i32 3} @@ -185,10 +184,10 @@ attributes #5 = { nounwind } !10 
= !{i32 8, !"PIC Level", i32 2} !11 = !{i32 7, !"frame-pointer", i32 2} !12 = !{i32 4, !"amdgpu_hostcall", i32 1} -!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} +!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)"} !14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} !15 = !{i32 2, i32 0} -!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_624a0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !17 = !DIFile(filename: "test.c", directory: "/tmp") !18 = !DISubroutineType(types: !19) !19 = !{null, !20} @@ -212,7 +211,7 @@ attributes #5 = { nounwind } !37 = !DILocation(line: 17, column: 5, scope: !28) !38 = !DILocation(line: 18, column: 3, scope: !28) !39 = !DILocation(line: 18, column: 3, scope: !16) -!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_624a0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) !42 = !DILocation(line: 0, scope: !40) !43 = !DILocation(line: 12, column: 
1, scope: !40) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 656171896a4ff..0633c3fa687c1 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -62,7 +62,6 @@ ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't ; want to maintain a list of their allocas, calls, etc. in this test. - ; ModuleID = 'test-openmp-nvptx64-nvidia-cuda.bc' source_filename = "test.c" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" @@ -75,6 +74,8 @@ target triple = "nvptx64-nvidia-cuda" %struct.DeviceMemoryPoolTy = type { ptr, i64 } %struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 } %struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 } +%"struct.rpc::Client" = type { %"struct.rpc::Process" } +%"struct.rpc::Process" = type { i32, ptr, ptr, ptr, ptr, [128 x i32] } %"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } %"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } %"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } @@ -83,11 +84,11 @@ target triple = "nvptx64-nvidia-cuda" @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 -@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_10d1d6_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_100102_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 -@__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_10d1d6_h_l12_kernel_environment = weak_odr protected constant 
%struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment } -@llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" +@__omp_offloading_fd02_100102_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_100102_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_100102_h_l12_dynamic_environment } +@llvm.used = appending global [4 x ptr] [ptr @__llvm_rpc_client, ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" @__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 @__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 @@ -101,23 +102,24 @@ target triple = "nvptx64-nvidia-cuda" @.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 @__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 @IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 -@.str1124 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@__llvm_rpc_client = weak protected global %"struct.rpc::Client" zeroinitializer, align 8 +@.str1125 = private unnamed_addr constant [48 x i8] 
c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 @.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 @_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 -@.str541 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 -@.str844 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@.str542 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str845 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 -@.str945 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 -@.str1046 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 -@.str1147 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 -@.str1248 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 -@.str1349 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str946 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str1047 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1148 = private unnamed_addr 
constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1249 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1350 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 @.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 -@.str1550 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str1551 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 @.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 @_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 @@ -125,13 +127,13 @@ target triple = "nvptx64-nvidia-cuda" @_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { +define internal void @__omp_offloading_fd02_100102_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { %2 = alloca ptr, align 8 %3 = alloca i32, align 4 %4 = alloca [2 x i32], align 4 store ptr %0, ptr %2, align 8 #dbg_declare(ptr %2, !26, !DIExpression(), !27) - %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_10d1d6_h_l12_kernel_environment, ptr %0), !dbg !28 + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_100102_h_l12_kernel_environment, ptr %0), !dbg !28 %6 = icmp eq i32 %5, -1, !dbg !28 br i1 %6, label %7, label 
%8, !dbg !28 @@ -148,12 +150,12 @@ define internal void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr noalias nou } ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected void @__omp_offloading_fd02_10d1d6_h_l12(ptr noalias noundef %0) #1 !dbg !42 { +define weak_odr protected void @__omp_offloading_fd02_100102_h_l12(ptr noalias noundef %0) #1 !dbg !42 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 #dbg_declare(ptr %2, !43, !DIExpression(), !44) %3 = load ptr, ptr %2, align 8, !dbg !45 - call void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr %3) #17, !dbg !45 + call void @__omp_offloading_fd02_100102_h_l12_debug__(ptr %3) #17, !dbg !45 ret void, !dbg !45 } @@ -190,16 +192,16 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 13: ; preds = %10 store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !64 - tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr 
getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 - store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 - store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 
40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !79 br label %18 14: ; preds = %10 @@ -213,7 +215,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %12, label %19, label %20 19: ; preds = %18 - store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !81 br label %20 20: ; preds = %18, %19 @@ -221,7 +223,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br label %37 21: ; preds = %2 - %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %23 = add nsw i32 %22, -1 %24 = and i32 %23, -32 %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 @@ -234,16 +236,16 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 %30 = addrspacecast ptr %29 to ptr addrspace(3) store i8 0, ptr addrspace(3) %30, align 1, !tbaa !64 - tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr 
addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 - store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 - store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa 
!72 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !79 br label %35 31: ; preds = %21 @@ -257,7 +259,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %26, label %36, label %37 36: ; preds = %35 - store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !81 br label %37 37: ; preds = %36, %35, %20 @@ -265,23 +267,23 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 38: ; preds = %37 %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 %41 = and i32 %39, 1 %42 = and i32 %41, %40 %43 = icmp ne i32 %42, 0 - %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !79 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !87 %45 = icmp ne i32 %44, 0 %46 = select i1 %43, i1 %45, i1 false br i1 %46, label %47, label %48 47: ; preds = %38 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str844, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) 
@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str845, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 48: ; preds = %38 %49 = icmp eq i32 %44, 0 tail call void @llvm.assume(i1 noundef %49) #21 - %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !80 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !88 br i1 %43, label %51, label %54 51: ; preds = %48 @@ -289,14 +291,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %52, label %54, label %53 53: ; preds = %51 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str945, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str946, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 54: ; preds = %51, %48 %55 = phi i32 [ 0, %51 ], [ %50, %48 ] %56 = icmp eq i32 %55, 0 tail call void @llvm.assume(i1 noundef %56) #21 - %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr 
addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !81 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !89 br i1 %43, label %58, label %61 58: ; preds = %54 @@ -304,14 +306,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %59, label %61, label %60 60: ; preds = %58 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1046, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1047, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 61: ; preds = %58, %54 %62 = phi i32 [ 0, %58 ], [ %57, %54 ] %63 = icmp eq i32 %62, 0 tail call void @llvm.assume(i1 noundef %63) #21 - %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !82 + %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !90 br i1 %43, label %65, label %68 65: ; preds = %61 @@ -319,14 +321,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %66, label %68, label %67 67: ; preds = %65 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull 
dereferenceable(47) @.str1147, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1148, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 68: ; preds = %65, %61 %69 = phi i32 [ 1, %65 ], [ %64, %61 ] %70 = icmp eq i32 %69, 1 tail call void @llvm.assume(i1 noundef %70) #21 - %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !83 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !91 br i1 %43, label %72, label %93 72: ; preds = %68 @@ -334,7 +336,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %73, label %75, label %74 74: ; preds = %72 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1248, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1249, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 75: ; preds = %72 @@ -343,30 
+345,30 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %43, label %77, label %95 77: ; preds = %75 - %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !84 + %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !92 %79 = icmp eq i32 %78, 1 br i1 %79, label %81, label %80 80: ; preds = %77 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1349, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1350, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 81: ; preds = %77 - %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 %83 = icmp eq i32 %82, 1 br i1 %83, label %85, label %84 84: ; preds = %81 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 222, ptr nofree 
noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable 85: ; preds = %81 - %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 %87 = icmp eq i32 %86, 0 br i1 %87, label %89, label %88 88: ; preds = %85 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1550, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1551, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable 89: ; preds = %85 @@ -375,7 +377,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %91, label %92, label %98 92: ; preds = %89 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 326, ptr nofree noundef nonnull 
dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 unreachable 93: ; preds = %68 @@ -395,10 +397,10 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br label %130 100: ; preds = %37 - %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %102 = add nsw i32 %101, -1 %103 = and i32 %102, -32 - %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !93 %105 = icmp eq i32 %104, %103 br i1 %105, label %130, label %106 @@ -418,10 +420,10 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 116: ; preds = %110, %128 call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #22 - store ptr null, ptr %3, align 8, !tbaa !74 + store ptr null, ptr %3, align 8, !tbaa !94 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #22 - %118 = load ptr, ptr %3, align 8, !tbaa !74 + %118 = load ptr, ptr %3, align 8, !tbaa !94 %119 = icmp eq ptr %118, null br i1 %119, label %129, label %120 @@ -448,7 +450,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 128: ; preds = %126, %120 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 - br label %116, 
!llvm.loop !86 + br label %116, !llvm.loop !95 129: ; preds = %116 call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 @@ -520,17 +522,17 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #10 declare void @llvm.nvvm.barrier.sync(i32) #11 ; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) -define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %0) local_unnamed_addr #12 { - %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr %2, ptr %0, align 8, !tbaa !74 +define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) initializes((0, 8)) %0) local_unnamed_addr #12 { + %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !94 + store ptr %2, ptr %0, align 8, !tbaa !94 %3 = icmp eq ptr %2, null br i1 %3, label %15, label %4 4: ; preds = %1 - %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 + %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !93 %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !63 %7 = icmp eq i32 %6, 0 - %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 %10 = 
icmp eq i32 %9, 0 %11 = select i1 %10, i32 -32, i32 0 @@ -547,7 +549,7 @@ define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree ; Function Attrs: convergent mustprogress noinline nounwind define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 %3 = and i32 %1, 1 %4 = and i32 %3, %2 %5 = icmp ne i32 %4, 0 @@ -557,7 +559,7 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %8, label %9, label %10 9: ; preds = %0 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1124, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1125, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable 10: ; preds = %0 @@ -571,21 +573,21 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %16, label %17, label %30 17: ; preds = %10 - %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 - %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !93 + %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 %20 = zext nneg i32 %18 to i64 %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20 - %22 = load ptr, ptr %21, align 8, !tbaa !74 + %22 
= load ptr, ptr %21, align 8, !tbaa !97 %23 = icmp eq ptr %22, null - br i1 %23, label %30, label %24, !prof !88 + br i1 %23, label %30, label %24, !prof !99 24: ; preds = %17 %25 = getelementptr inbounds nuw i8, ptr %22, i64 32 - %26 = load ptr, ptr %25, align 8, !tbaa !89 + %26 = load ptr, ptr %25, align 8, !tbaa !100 tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28 - %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20 - store ptr %26, ptr %28, align 8, !tbaa !74 + store ptr %26, ptr %28, align 8, !tbaa !97 %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 br label %30 @@ -596,7 +598,7 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %33, label %34, label %35 34: ; preds = %30 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1124, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1125, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable 35: ; preds = %30 @@ -628,29 +630,29 @@ define internal void @__kmpc_target_deinit() #4 { br i1 %3, label %4, label %27 4: ; preds = %0 - %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %6 = add nsw i32 %5, -1 %7 = and i32 %6, -32 - %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %8 = tail call 
range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !93 %9 = icmp eq i32 %8, %7 br i1 %9, label %10, label %11 10: ; preds = %4 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !94 br label %27 11: ; preds = %4 - %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 - %13 = load i8, ptr %12, align 8, !tbaa !91 + %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 + %13 = load i8, ptr %12, align 8, !tbaa !102 %14 = icmp eq i8 %13, 0 br i1 %14, label %15, label %27 15: ; preds = %11 call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29 - store ptr null, ptr %1, align 8, !tbaa !74 + store ptr null, ptr %1, align 8, !tbaa !94 %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #22 %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 %19 = and i32 %17, 1 %20 = and i32 %19, %18 %21 = icmp eq i32 %20, 0 @@ -721,15 +723,15 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !8 = !{i32 7, !"frame-pointer", i32 2} !9 = !{i32 1, !"ThinLTO", i32 0} !10 = !{i32 1, !"EnableSplitLTOUnit", i32 1} -!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, 
splitDebugInlining: false, nameTableKind: None) +!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !12 = !DIFile(filename: "test.c", directory: "/tmp") -!13 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"maxntidx", i32 128} -!14 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"kernel", i32 1} -!15 = !{i32 0, i32 64770, i32 1102294, !"h", i32 12, i32 0, i32 0} -!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} +!13 = !{ptr @__omp_offloading_fd02_100102_h_l12, !"maxntidx", i32 128} +!14 = !{ptr @__omp_offloading_fd02_100102_h_l12, !"kernel", i32 1} +!15 = !{i32 0, i32 64770, i32 1048834, !"h", i32 12, i32 0, i32 0} +!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)"} !17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} !18 = !{i32 2, i32 0} -!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_10d1d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_100102_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) !20 = !DISubroutineType(types: !21) !21 = !{null, !22} !22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !23) @@ -752,7 +754,7 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !39 = !DILocation(line: 17, column: 5, scope: !30) !40 = !DILocation(line: 18, column: 3, scope: !30) !41 = !DILocation(line: 18, column: 3, scope: !19) -!42 = distinct !DISubprogram(name: 
"__omp_offloading_fd02_10d1d6_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!42 = distinct !DISubprogram(name: "__omp_offloading_fd02_100102_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) !43 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !42, type: !22, flags: DIFlagArtificial) !44 = !DILocation(line: 0, scope: !42) !45 = !DILocation(line: 12, column: 1, scope: !42) @@ -778,28 +780,41 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !65 = !{!66, !61, i64 16} !66 = !{!"_ZTSN4ompx5state11TeamStateTyE", !67, i64 0, !61, i64 28, !61, i64 32, !68, i64 40} !67 = !{!"_ZTSN4ompx5state10ICVStateTyE", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} -!68 = !{!"any pointer", !58, i64 0} -!69 = !{!66, !61, i64 20} -!70 = !{!66, !61, i64 24} -!71 = !{!66, !61, i64 28} -!72 = !{!66, !61, i64 32} -!73 = !{!66, !68, i64 40} -!74 = !{!68, !68, i64 0} -!75 = !{i32 1, i32 1025} -!76 = !{!77, !61, i64 0} -!77 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !78, i64 16, !78, i64 24, !78, i64 32, !78, i64 40} -!78 = !{!"long", !58, i64 0} -!79 = !{!67, !61, i64 0} -!80 = !{!67, !61, i64 4} -!81 = !{!67, !61, i64 8} -!82 = !{!67, !61, i64 16} -!83 = !{!67, !61, i64 20} -!84 = !{!67, !61, i64 24} -!85 = !{i32 0, i32 1024} -!86 = distinct !{!86, !87} -!87 = !{!"llvm.loop.mustprogress"} -!88 = !{!"branch_weights", !"expected", i32 2000, i32 1} -!89 = !{!90, !68, i64 32} -!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !68, i64 32} -!91 = !{!92, !58, i64 0} -!92 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !68, i64 32, !68, i64 40} +!68 = !{!"p1 void", !69, i64 0} +!69 = !{!"any 
pointer", !58, i64 0} +!70 = !{!66, !61, i64 20} +!71 = !{!66, !61, i64 24} +!72 = !{!66, !61, i64 28} +!73 = !{!66, !61, i64 32} +!74 = !{!66, !68, i64 40} +!75 = !{!76, !76, i64 0} +!76 = !{!"p2 _ZTSN4ompx5state13ThreadStateTyE", !69, i64 0} +!77 = !{!78, !78, i64 0} +!78 = !{!"p1 _ZTS19KernelEnvironmentTy", !69, i64 0} +!79 = !{!80, !80, i64 0} +!80 = !{!"p1 _ZTS25KernelLaunchEnvironmentTy", !69, i64 0} +!81 = !{!82, !82, i64 0} +!82 = !{!"p2 _ZTS22DynamicScheduleTracker", !69, i64 0} +!83 = !{i32 1, i32 1025} +!84 = !{!85, !61, i64 0} +!85 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !86, i64 16, !86, i64 24, !86, i64 32, !86, i64 40} +!86 = !{!"long", !58, i64 0} +!87 = !{!67, !61, i64 0} +!88 = !{!67, !61, i64 4} +!89 = !{!67, !61, i64 8} +!90 = !{!67, !61, i64 16} +!91 = !{!67, !61, i64 20} +!92 = !{!67, !61, i64 24} +!93 = !{i32 0, i32 1024} +!94 = !{!68, !68, i64 0} +!95 = distinct !{!95, !96} +!96 = !{!"llvm.loop.mustprogress"} +!97 = !{!98, !98, i64 0} +!98 = !{!"p1 _ZTSN4ompx5state13ThreadStateTyE", !69, i64 0} +!99 = !{!"branch_weights", !"expected", i32 2000, i32 1} +!100 = !{!101, !98, i64 32} +!101 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !98, i64 32} +!102 = !{!103, !58, i64 0} +!103 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !104, i64 32, !105, i64 40} +!104 = !{!"p1 _ZTS7IdentTy", !69, i64 0} +!105 = !{!"p1 _ZTS20DynamicEnvironmentTy", !69, i64 0} From bb9d5c20c869d3429ee8c9ea3c16ca2551bb2734 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 6 Jan 2025 18:58:10 -0500 Subject: [PATCH 43/46] Relocate and use llvm::omp::getDeviceKernels --- llvm/include/llvm/IR/Module.h | 4 ++ llvm/lib/Analysis/KernelInfo.cpp | 8 +--- llvm/lib/IR/Module.cpp | 26 ++++++++++++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 24 ++--------- llvm/test/Analysis/KernelInfo/linkage.ll | 51 ++++++++++++++++++------ 5 files changed, 74 insertions(+), 39 deletions(-) diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 12b50fc506516..9a50cb8ce320d 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -16,6 +16,7 @@ #include "llvm-c/Types.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" @@ -513,6 +514,9 @@ class LLVM_ABI Module { /// Remove the given NamedMDNode from this module and delete it. void eraseNamedMetadata(NamedMDNode *NMD); + /// Get device kernels in the module. + SetVector<Function *> getDeviceKernels(); + /// @} /// @name Comdat Accessors /// @{ diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index f9832a6deb75a..f9683d06c8737 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -78,11 +78,6 @@ class KernelInfo { } // end anonymous namespace -static bool isKernelFunction(Function &F) { - // TODO: Is this general enough? Consider languages beyond OpenMP. - return F.hasFnAttribute("kernel"); -} - static void identifyCallee(OptimizationRemark &R, const Module *M, const Value *V, StringRef Kind = "") { SmallString<100> Name; // might be function name or asm expression @@ -292,7 +287,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); // Record function properties. 
- KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); + KI.ExternalNotKernel = + F.hasExternalLinkage() && !F.getParent()->getDeviceKernels().contains(&F); for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { if (auto Val = parseFnAttrAsInteger(F, Name)) KI.LaunchBounds.push_back({Name, *Val}); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index c7b9f8744d8d3..b48a2466d7ecb 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -322,6 +322,32 @@ void Module::eraseNamedMetadata(NamedMDNode *NMD) { eraseNamedMDNode(NMD); } +SetVector Module::getDeviceKernels() { + // TODO: Create a more cross-platform way of determining device kernels. + NamedMDNode *MD = getNamedMetadata("nvvm.annotations"); + SetVector Kernels; + + if (!MD) + return Kernels; + + for (auto *Op : MD->operands()) { + if (Op->getNumOperands() < 2) + continue; + MDString *KindID = dyn_cast(Op->getOperand(1)); + if (!KindID || KindID->getString() != "kernel") + continue; + + Function *KernelFn = + mdconst::dyn_extract_or_null(Op->getOperand(0)); + if (!KernelFn) + continue; + + Kernels.insert(KernelFn); + } + + return Kernels; +} + bool Module::isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB) { if (ConstantInt *Behavior = mdconst::dyn_extract_or_null(MD)) { uint64_t Val = Behavior->getLimitedValue(); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index b40ab357670b8..6777549f06d0a 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5910,34 +5910,16 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) { } KernelSet llvm::omp::getDeviceKernels(Module &M) { - // TODO: Create a more cross-platform way of determining device kernels. 
- NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); KernelSet Kernels; - - if (!MD) - return Kernels; - - for (auto *Op : MD->operands()) { - if (Op->getNumOperands() < 2) - continue; - MDString *KindID = dyn_cast(Op->getOperand(1)); - if (!KindID || KindID->getString() != "kernel") - continue; - - Function *KernelFn = - mdconst::dyn_extract_or_null(Op->getOperand(0)); - if (!KernelFn) - continue; - + for (Kernel K : M.getDeviceKernels()) { // We are only interested in OpenMP target regions. Others, such as kernels // generated by CUDA but linked together, are not interesting to this pass. - if (isOpenMPKernel(*KernelFn)) { + if (isOpenMPKernel(*K)) { ++NumOpenMPTargetRegionKernels; - Kernels.insert(KernelFn); + Kernels.insert(K); } else ++NumNonOpenMPTargetRegionKernels; } - return Kernels; } diff --git a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll index 43154d2379825..ae3657d9bcdc8 100644 --- a/llvm/test/Analysis/KernelInfo/linkage.ll +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -7,32 +7,46 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK: remark: test.c:3:0: in function 'f', ExternalNotKernel = 1 -define external void @f() !dbg !10 { +; CHECK: remark: test.c:3:0: in function 'extNotKer', ExternalNotKernel = 1 +define external void @extNotKer() !dbg !10 { entry: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'g', ExternalNotKernel = 1 -define void @g() !dbg !20 { +; CHECK: remark: test.c:13:0: in artificial function 'impNotKer', ExternalNotKernel = 1 +define void @impNotKer() !dbg !20 { entry: ret void } -; CHECK: remark: test.c:23:0: in function 'h', ExternalNotKernel = 0 -define external void @h() #0 !dbg !30 { +; CHECK: remark: test.c:23:0: in artificial function 'weakNotKer', ExternalNotKernel = 0 +define weak void @weakNotKer() !dbg !30 { entry: ret void } -; CHECK: remark: test.c:33:0: in artificial function 'i', 
ExternalNotKernel = 0 -define weak void @i() !dbg !40 { +; CHECK: remark: test.c:33:0: in function 'extKerAttr', ExternalNotKernel = 0 +define external void @extKerAttr() #0 !dbg !40 { +entry: + ret void +} + +; CHECK: remark: test.c:43:0: in function 'extKer', ExternalNotKernel = 0 +define external void @extKer() !dbg !50 { +entry: + ret void +} + +; CHECK: remark: test.c:53:0: in artificial function 'weakKer', ExternalNotKernel = 0 +define weak void @weakKer() !dbg !60 { entry: ret void } attributes #0 = { "kernel" } +!nvvm.annotations = !{!42, !52, !62} + !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -41,11 +55,24 @@ attributes #0 = { "kernel" } !2 = !DIFile(filename: "test.c", directory: "/tmp") !3 = !{null} !4 = !{} -!10 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) + +!10 = distinct !DISubprogram(name: "extNotKer", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !11 = !DISubroutineType(types: !3) -!20 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) + +!20 = distinct !DISubprogram(name: "impNotKer", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) !21 = distinct !DISubroutineType(types: !3) -!30 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 23, type: !31, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) + +!30 = distinct !DISubprogram(name: "weakNotKer", scope: !2, file: !2, line: 23, type: !31, scopeLine: 23, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) !31 = distinct 
!DISubroutineType(types: !3) -!40 = distinct !DISubprogram(name: "i", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) + +!40 = distinct !DISubprogram(name: "extKerAttr", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !41 = distinct !DISubroutineType(types: !3) +!42 = !{ptr @extKerAttr, !"kernel", i32 1} + +!50 = distinct !DISubprogram(name: "extKer", scope: !2, file: !2, line: 43, type: !51, scopeLine: 43, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!51 = distinct !DISubroutineType(types: !3) +!52 = !{ptr @extKer, !"kernel", i32 1} + +!60 = distinct !DISubprogram(name: "weakKer", scope: !2, file: !2, line: 53, type: !61, scopeLine: 53, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!61 = distinct !DISubroutineType(types: !3) +!62 = !{ptr @weakKer, !"kernel", i32 1} From 0a347cf1fe2adc6cc6ef7bcaa6d66b432941d2fb Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 6 Jan 2025 20:17:59 -0500 Subject: [PATCH 44/46] Extend test to cover dyn and non-entry allocas --- llvm/test/Analysis/KernelInfo/allocas.ll | 107 +++++++++++++++-------- 1 file changed, 72 insertions(+), 35 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll index 3ecde004a9b2a..94506645f7ec6 100644 --- a/llvm/test/Analysis/KernelInfo/allocas.ll +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -7,7 +7,7 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -define void @h() !dbg !3 { +define void @h() !dbg !100 { entry: ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca ('%dyn_ptr.addr') for 'dyn_ptr' with static size of 8 bytes %dyn_ptr.addr = alloca ptr, align 8 @@ -15,25 +15,39 @@ entry: %i = alloca i32, align 4 ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca ('%a') for 'a' with static size of 8 bytes %a = alloca [2 x i32], align 4 + %size = load i32, ptr %i, align 4 + ; CHECK: remark: test.c:16:9: in artificial function 'h', alloca ('%adyn') for 'adyn' with dynamic size + %adyn = alloca i32, i32 %size, align 4 ; CHECK: remark: :0:0: in artificial function 'h', alloca ('%nodbg') without debug info with static size of 4 bytes %nodbg = alloca i32, align 4 - tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !7, metadata !DIExpression()), !dbg !11 - tail call void @llvm.dbg.declare(metadata ptr %i, metadata !12, metadata !DIExpression()), !dbg !15 - tail call void @llvm.dbg.declare(metadata ptr %a, metadata !16, metadata !DIExpression()), !dbg !20 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !110, metadata !DIExpression()), !dbg !114 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !120, metadata !DIExpression()), !dbg !121 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !130, metadata !DIExpression()), !dbg !131 + tail call 
void @llvm.dbg.declare(metadata ptr %adyn, metadata !140, metadata !DIExpression()), !dbg !141 + br label %non-entry + +non-entry: + ; CHECK: remark: test.c:17:9: in artificial function 'h', alloca ('%i2') for 'i2' with static size of 4 bytes + %i2 = alloca i32, align 4 + %size2 = load i32, ptr %i2, align 4 + ; CHECK: remark: test.c:18:9: in artificial function 'h', alloca ('%adyn2') for 'adyn2' with dynamic size + %adyn2 = alloca i32, i32 %size, align 4 + tail call void @llvm.dbg.declare(metadata ptr %i2, metadata !150, metadata !DIExpression()), !dbg !151 + tail call void @llvm.dbg.declare(metadata ptr %adyn2, metadata !160, metadata !DIExpression()), !dbg !161 ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 4 -; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 24 -; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 0 +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 7 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 28 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 2 -define void @g() !dbg !21 { +define void @g() !dbg !200 { entry: ; CHECK: remark: test.c:4:7: in function 'g', alloca ('%i') for 'i' with static size of 4 bytes %i = alloca i32, align 4 ; CHECK: remark: test.c:5:7: in function 'g', alloca ('%a') for 'a' with static size of 8 bytes %a = alloca [2 x i32], align 4 - tail call void @llvm.dbg.declare(metadata ptr %i, metadata !23, metadata !DIExpression()), !dbg !24 - tail call void @llvm.dbg.declare(metadata ptr %a, metadata !25, metadata !DIExpression()), !dbg !26 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !210, metadata !DIExpression()), !dbg !211 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !220, metadata !DIExpression()), !dbg !221 ret void } ; CHECK: remark: test.c:3:0: in function 'g', Allocas = 2 @@ -44,7 +58,7 @@ entry: declare void 
@llvm.dbg.declare(metadata, metadata, metadata) #0 ; uselistorder directives -uselistorder ptr @llvm.dbg.declare, { 4, 3, 2, 1, 0 } +uselistorder ptr @llvm.dbg.declare, { 7, 6, 5, 4, 3, 2, 1, 0 } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } @@ -54,27 +68,50 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !2 = !DIFile(filename: "test.c", directory: "/tmp") -!3 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !4, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !6) -!4 = distinct !DISubroutineType(types: !5) -!5 = !{null} -!6 = !{} -!7 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !3, type: !8, flags: DIFlagArtificial) -!8 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !9) -!9 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !10) -!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!11 = !DILocation(line: 0, scope: !3) -!12 = !DILocalVariable(name: "i", scope: !13, file: !2, line: 14, type: !14) -!13 = distinct !DILexicalBlock(scope: !3, file: !2, line: 13, column: 3) -!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!15 = !DILocation(line: 14, column: 9, scope: !13) -!16 = !DILocalVariable(name: "a", scope: !13, file: !2, line: 15, type: !17) -!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, size: 64, elements: !18) -!18 = !{!19} -!19 = !DISubrange(count: 2) -!20 = !DILocation(line: 15, column: 9, scope: !13) -!21 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !22, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !6) 
-!22 = !DISubroutineType(types: !5) -!23 = !DILocalVariable(name: "i", scope: !21, file: !2, line: 4, type: !14) -!24 = !DILocation(line: 4, column: 7, scope: !21) -!25 = !DILocalVariable(name: "a", scope: !21, file: !2, line: 5, type: !17) -!26 = !DILocation(line: 5, column: 7, scope: !21) +!3 = !{null} +!4 = !{} + +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) + +!30 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 64, elements: !31) +!31 = !{!32} +!32 = !DISubrange(count: 2) + +!40 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, elements: !41) +!41 = !{!42} +!42 = !DISubrange(count: !43) +!43 = !DILocalVariable(name: "__vla_expr0", scope: !100, type: !10, flags: DIFlagArtificial) + +!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) + +!110 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !100, type: !20, flags: DIFlagArtificial) +!114 = !DILocation(line: 0, scope: !100) + +!120 = !DILocalVariable(name: "i", scope: !100, file: !2, line: 14, type: !10) +!121 = !DILocation(line: 14, column: 9, scope: !100) + +!130 = !DILocalVariable(name: "a", scope: !100, file: !2, line: 15, type: !30) +!131 = !DILocation(line: 15, column: 9, scope: !100) + +!140 = !DILocalVariable(name: "adyn", scope: !100, file: !2, line: 16, type: !40) +!141 = !DILocation(line: 16, column: 9, scope: !100) + +!150 = !DILocalVariable(name: "i2", scope: !100, file: !2, line: 17, type: !10) +!151 = !DILocation(line: 17, column: 9, scope: !100) + +!160 = !DILocalVariable(name: "adyn2", scope: !100, file: !2, line: 18, type: !40) +!161 = 
!DILocation(line: 18, column: 9, scope: !100) + +!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) + +!210 = !DILocalVariable(name: "i", scope: !200, file: !2, line: 4, type: !10) +!211 = !DILocation(line: 4, column: 7, scope: !200) + +!220 = !DILocalVariable(name: "a", scope: !200, file: !2, line: 5, type: !30) +!221 = !DILocation(line: 5, column: 7, scope: !200) From 2d321ce3cd51ac182f8e53a8f56881fba59c149b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 27 Jan 2025 16:55:51 -0500 Subject: [PATCH 45/46] Revert "Relocate and use llvm::omp::getDeviceKernels" This reverts commit bb9d5c20c869d3429ee8c9ea3c16ca2551bb2734. This will facilitate merging main due to 07ed818 (PR #122320), which changes llvm::omp::getDeviceKernels. Will rewrite and reapply after merging main. --- llvm/include/llvm/IR/Module.h | 4 -- llvm/lib/Analysis/KernelInfo.cpp | 8 +++- llvm/lib/IR/Module.cpp | 26 ------------ llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 24 +++++++++-- llvm/test/Analysis/KernelInfo/linkage.ll | 51 ++++++------------------ 5 files changed, 39 insertions(+), 74 deletions(-) diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 9a50cb8ce320d..12b50fc506516 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -16,7 +16,6 @@ #include "llvm-c/Types.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" @@ -514,9 +513,6 @@ class LLVM_ABI Module { /// Remove the given NamedMDNode from this module and delete it. void eraseNamedMetadata(NamedMDNode *NMD); - /// Get device kernels in the module. 
- SetVector getDeviceKernels(); - /// @} /// @name Comdat Accessors /// @{ diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index f9683d06c8737..f9832a6deb75a 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -78,6 +78,11 @@ class KernelInfo { } // end anonymous namespace +static bool isKernelFunction(Function &F) { + // TODO: Is this general enough? Consider languages beyond OpenMP. + return F.hasFnAttribute("kernel"); +} + static void identifyCallee(OptimizationRemark &R, const Module *M, const Value *V, StringRef Kind = "") { SmallString<100> Name; // might be function name or asm expression @@ -287,8 +292,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); // Record function properties. - KI.ExternalNotKernel = - F.hasExternalLinkage() && !F.getParent()->getDeviceKernels().contains(&F); + KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { if (auto Val = parseFnAttrAsInteger(F, Name)) KI.LaunchBounds.push_back({Name, *Val}); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index b48a2466d7ecb..c7b9f8744d8d3 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -322,32 +322,6 @@ void Module::eraseNamedMetadata(NamedMDNode *NMD) { eraseNamedMDNode(NMD); } -SetVector Module::getDeviceKernels() { - // TODO: Create a more cross-platform way of determining device kernels. 
- NamedMDNode *MD = getNamedMetadata("nvvm.annotations"); - SetVector Kernels; - - if (!MD) - return Kernels; - - for (auto *Op : MD->operands()) { - if (Op->getNumOperands() < 2) - continue; - MDString *KindID = dyn_cast(Op->getOperand(1)); - if (!KindID || KindID->getString() != "kernel") - continue; - - Function *KernelFn = - mdconst::dyn_extract_or_null(Op->getOperand(0)); - if (!KernelFn) - continue; - - Kernels.insert(KernelFn); - } - - return Kernels; -} - bool Module::isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB) { if (ConstantInt *Behavior = mdconst::dyn_extract_or_null(MD)) { uint64_t Val = Behavior->getLimitedValue(); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 6777549f06d0a..b40ab357670b8 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5910,16 +5910,34 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) { } KernelSet llvm::omp::getDeviceKernels(Module &M) { + // TODO: Create a more cross-platform way of determining device kernels. + NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); KernelSet Kernels; - for (Kernel K : M.getDeviceKernels()) { + + if (!MD) + return Kernels; + + for (auto *Op : MD->operands()) { + if (Op->getNumOperands() < 2) + continue; + MDString *KindID = dyn_cast(Op->getOperand(1)); + if (!KindID || KindID->getString() != "kernel") + continue; + + Function *KernelFn = + mdconst::dyn_extract_or_null(Op->getOperand(0)); + if (!KernelFn) + continue; + // We are only interested in OpenMP target regions. Others, such as kernels // generated by CUDA but linked together, are not interesting to this pass. 
- if (isOpenMPKernel(*K)) { + if (isOpenMPKernel(*KernelFn)) { ++NumOpenMPTargetRegionKernels; - Kernels.insert(K); + Kernels.insert(KernelFn); } else ++NumNonOpenMPTargetRegionKernels; } + return Kernels; } diff --git a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll index ae3657d9bcdc8..43154d2379825 100644 --- a/llvm/test/Analysis/KernelInfo/linkage.ll +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -7,46 +7,32 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK: remark: test.c:3:0: in function 'extNotKer', ExternalNotKernel = 1 -define external void @extNotKer() !dbg !10 { +; CHECK: remark: test.c:3:0: in function 'f', ExternalNotKernel = 1 +define external void @f() !dbg !10 { entry: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'impNotKer', ExternalNotKernel = 1 -define void @impNotKer() !dbg !20 { +; CHECK: remark: test.c:13:0: in artificial function 'g', ExternalNotKernel = 1 +define void @g() !dbg !20 { entry: ret void } -; CHECK: remark: test.c:23:0: in artificial function 'weakNotKer', ExternalNotKernel = 0 -define weak void @weakNotKer() !dbg !30 { +; CHECK: remark: test.c:23:0: in function 'h', ExternalNotKernel = 0 +define external void @h() #0 !dbg !30 { entry: ret void } -; CHECK: remark: test.c:33:0: in function 'extKerAttr', ExternalNotKernel = 0 -define external void @extKerAttr() #0 !dbg !40 { -entry: - ret void -} - -; CHECK: remark: test.c:43:0: in function 'extKer', ExternalNotKernel = 0 -define external void @extKer() !dbg !50 { -entry: - ret void -} - -; CHECK: remark: test.c:53:0: in artificial function 'weakKer', ExternalNotKernel = 0 -define weak void @weakKer() !dbg !60 { +; CHECK: remark: test.c:33:0: in artificial function 'i', ExternalNotKernel = 0 +define weak void @i() !dbg !40 { entry: ret void } attributes #0 = { "kernel" } -!nvvm.annotations = !{!42, !52, !62} - !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ 
-55,24 +41,11 @@ attributes #0 = { "kernel" } !2 = !DIFile(filename: "test.c", directory: "/tmp") !3 = !{null} !4 = !{} - -!10 = distinct !DISubprogram(name: "extNotKer", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!10 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !11 = !DISubroutineType(types: !3) - -!20 = distinct !DISubprogram(name: "impNotKer", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!20 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) !21 = distinct !DISubroutineType(types: !3) - -!30 = distinct !DISubprogram(name: "weakNotKer", scope: !2, file: !2, line: 23, type: !31, scopeLine: 23, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!30 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 23, type: !31, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !31 = distinct !DISubroutineType(types: !3) - -!40 = distinct !DISubprogram(name: "extKerAttr", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!40 = distinct !DISubprogram(name: "i", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) !41 = distinct !DISubroutineType(types: !3) -!42 = !{ptr @extKerAttr, !"kernel", i32 1} - -!50 = distinct !DISubprogram(name: "extKer", scope: !2, file: !2, line: 43, type: !51, scopeLine: 43, spFlags: 
DISPFlagDefinition, unit: !1, retainedNodes: !4) -!51 = distinct !DISubroutineType(types: !3) -!52 = !{ptr @extKer, !"kernel", i32 1} - -!60 = distinct !DISubprogram(name: "weakKer", scope: !2, file: !2, line: 53, type: !61, scopeLine: 53, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) -!61 = distinct !DISubroutineType(types: !3) -!62 = !{ptr @weakKer, !"kernel", i32 1} From 1f1ca6cfc17def5595adb71ea00282154d803fc7 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 27 Jan 2025 19:33:21 -0500 Subject: [PATCH 46/46] Relocate and use OpenMPOpt.cpp's isKernelCC Also, regenerate OpenMP tests from current clang so they see the new kernel calling conventions. --- llvm/include/llvm/IR/Function.h | 12 + llvm/lib/Analysis/KernelInfo.cpp | 7 +- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 13 +- llvm/test/Analysis/KernelInfo/linkage.ll | 51 +- .../test/Analysis/KernelInfo/openmp/README.md | 6 +- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 170 +++-- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 597 +++++++++--------- 7 files changed, 430 insertions(+), 426 deletions(-) diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index e7afcbd31420c..fcd5396ccfdbc 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -284,6 +284,18 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { setValueSubclassData((getSubclassDataFromValue() & 0xc00f) | (ID << 4)); } + /// Does it have a kernel calling convention? + bool hasKernelCallingConv() const { + switch (getCallingConv()) { + default: + return false; + case CallingConv::PTX_Kernel: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + } + } + enum ProfileCountType { PCT_Real, PCT_Synthetic }; /// Class to represent profile counts. 
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index f9832a6deb75a..4a06fd5943089 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -78,11 +78,6 @@ class KernelInfo { } // end anonymous namespace -static bool isKernelFunction(Function &F) { - // TODO: Is this general enough? Consider languages beyond OpenMP. - return F.hasFnAttribute("kernel"); -} - static void identifyCallee(OptimizationRemark &R, const Module *M, const Value *V, StringRef Kind = "") { SmallString<100> Name; // might be function name or asm expression @@ -292,7 +287,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); // Record function properties. - KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); + KI.ExternalNotKernel = F.hasExternalLinkage() && !F.hasKernelCallingConv(); for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { if (auto Val = parseFnAttrAsInteger(F, Name)) KI.LaunchBounds.push_back({Name, *Val}); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 10008130016c3..682227916e712 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -5905,17 +5905,6 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) { return Fn.hasFnAttribute("kernel"); } -static bool isKernelCC(Function &F) { - switch (F.getCallingConv()) { - default: - return false; - case CallingConv::PTX_Kernel: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return true; - } -} - KernelSet llvm::omp::getDeviceKernels(Module &M) { // TODO: Create a more cross-platform way of determining device kernels. 
KernelSet Kernels; @@ -5948,7 +5937,7 @@ KernelSet llvm::omp::getDeviceKernels(Module &M) { } for (Function &F : M) - if (isKernelCC(F)) + if (F.hasKernelCallingConv()) ProcessKernel(F); return Kernels; diff --git a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll index 43154d2379825..8679d366d0cb7 100644 --- a/llvm/test/Analysis/KernelInfo/linkage.ll +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -7,31 +7,47 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK: remark: test.c:3:0: in function 'f', ExternalNotKernel = 1 -define external void @f() !dbg !10 { +; CHECK: remark: test.c:13:0: in artificial function 'extNotKer', ExternalNotKernel = 1 +define external void @extNotKer() !dbg !10 { entry: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'g', ExternalNotKernel = 1 -define void @g() !dbg !20 { +; CHECK: remark: test.c:23:0: in function 'impNotKer', ExternalNotKernel = 1 +define void @impNotKer() !dbg !20 { entry: ret void } -; CHECK: remark: test.c:23:0: in function 'h', ExternalNotKernel = 0 -define external void @h() #0 !dbg !30 { +; CHECK: remark: test.c:33:0: in artificial function 'weakNotKer', ExternalNotKernel = 0 +define weak void @weakNotKer() !dbg !30 { entry: ret void } -; CHECK: remark: test.c:33:0: in artificial function 'i', ExternalNotKernel = 0 -define weak void @i() !dbg !40 { +; CHECK: remark: test.c:43:0: in function 'extPtxKer', ExternalNotKernel = 0 +define external ptx_kernel void @extPtxKer() !dbg !40 { entry: ret void } -attributes #0 = { "kernel" } +; CHECK: remark: test.c:53:0: in artificial function 'extAmdgpuKer', ExternalNotKernel = 0 +define external amdgpu_kernel void @extAmdgpuKer() !dbg !50 { +entry: + ret void +} + +; CHECK: remark: test.c:63:0: in function 'extSpirKer', ExternalNotKernel = 0 +define external spir_kernel void @extSpirKer() !dbg !60 { +entry: + ret void +} + +; CHECK: remark: test.c:73:0: in 
artificial function 'weakKer', ExternalNotKernel = 0 +define weak ptx_kernel void @weakKer() !dbg !70 { +entry: + ret void +} !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -41,11 +57,12 @@ attributes #0 = { "kernel" } !2 = !DIFile(filename: "test.c", directory: "/tmp") !3 = !{null} !4 = !{} -!10 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) -!11 = !DISubroutineType(types: !3) -!20 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) -!21 = distinct !DISubroutineType(types: !3) -!30 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 23, type: !31, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) -!31 = distinct !DISubroutineType(types: !3) -!40 = distinct !DISubprogram(name: "i", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) -!41 = distinct !DISubroutineType(types: !3) +!5 = !DISubroutineType(types: !3) + +!10 = distinct !DISubprogram(name: "extNotKer", scope: !2, file: !2, line: 13, type: !5, scopeLine: 13, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!20 = distinct !DISubprogram(name: "impNotKer", scope: !2, file: !2, line: 23, type: !5, scopeLine: 23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!30 = distinct !DISubprogram(name: "weakNotKer", scope: !2, file: !2, line: 33, type: !5, scopeLine: 33, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!40 = distinct !DISubprogram(name: "extPtxKer", scope: !2, file: !2, line: 43, type: !5, scopeLine: 43, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!50 = distinct !DISubprogram(name: 
"extAmdgpuKer", scope: !2, file: !2, line: 53, type: !5, scopeLine: 53, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!60 = distinct !DISubprogram(name: "extSpirKer", scope: !2, file: !2, line: 63, type: !5, scopeLine: 63, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!70 = distinct !DISubprogram(name: "weakKer", scope: !2, file: !2, line: 73, type: !5, scopeLine: 73, flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md index 5471b2e1b220d..0aeb52f83c5c7 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/README.md +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -8,8 +8,8 @@ more exhaustively check KernelInfoPrinter features using reduced LLVM IR. The LLVM IR in each test file `$TEST` can be regenerated as follows in the case that Clang OpenMP codegen changes or it becomes desirable to adjust the source OpenMP program below. First, remove the existing LLVM IR from `$TEST`. 
Then, -where `$TARGET` (e.g., `nvptx64-nvidia-cuda` or `amdgcn-amd-amdhsa`) depends on -`$TEST`: +where `$TARGET` (e.g., `nvptx64-nvidia-cuda-sm_70` or `amdgcn-amd-amdhsa-gfx906`) +depends on `$TEST`: ``` $ cd /tmp @@ -34,7 +34,7 @@ void h(int i) { } } -$ clang -g -fopenmp -fopenmp-targets=$TARGET -save-temps -c test.c +$ clang -g -fopenmp --offload-arch=native -save-temps -c test.c $ llvm-dis test-openmp-$TARGET.bc $ cat test-openmp-$TARGET.ll >> $TEST ``` diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 6016919ec8280..4843408bdda49 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -79,7 +79,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} -; ModuleID = 'test-openmp-amdgcn-amd-amdhsa.bc' +; ModuleID = 'test-openmp-amdgcn-amd-amdhsa-gfx906.bc' source_filename = "test.c" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" @@ -94,14 +94,14 @@ target triple = "amdgcn-amd-amdhsa" @__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 -@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_624a0_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_727e9_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 -@__omp_offloading_fd02_624a0_h_l12_dynamic_environment = weak_odr protected 
addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_624a0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_624a0_h_l12_dynamic_environment to ptr) } +@__omp_offloading_fd02_727e9_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_727e9_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_dynamic_environment to ptr) } @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { +define internal void @__omp_offloading_fd02_727e9_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !15 { %2 = alloca ptr, align 8, addrspace(5) %3 = alloca i32, align 4, addrspace(5) %4 = alloca [2 x i32], align 4, addrspace(5) @@ -109,32 +109,32 @@ define internal void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr noalias noun %6 = addrspacecast ptr addrspace(5) %3 to ptr %7 = addrspacecast ptr addrspace(5) %4 to ptr store ptr %0, ptr %5, align 8 - #dbg_declare(ptr addrspace(5) %2, !24, !DIExpression(), !25) - %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_624a0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 - %9 = icmp eq i32 %8, -1, !dbg !26 - br i1 %9, label %10, label %11, !dbg !26 + #dbg_declare(ptr addrspace(5) 
%2, !23, !DIExpression(), !24) + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_727e9_h_l12_kernel_environment to ptr), ptr %0), !dbg !25 + %9 = icmp eq i32 %8, -1, !dbg !25 + br i1 %9, label %10, label %11, !dbg !25 10: ; preds = %1 - #dbg_declare(ptr addrspace(5) %3, !27, !DIExpression(), !30) - #dbg_declare(ptr addrspace(5) %4, !31, !DIExpression(), !35) - call void @f() #4, !dbg !36 - call void @g() #4, !dbg !37 - call void @__kmpc_target_deinit(), !dbg !38 - ret void, !dbg !39 + #dbg_declare(ptr addrspace(5) %3, !26, !DIExpression(), !29) + #dbg_declare(ptr addrspace(5) %4, !30, !DIExpression(), !34) + call void @f() #4, !dbg !35 + call void @g() #4, !dbg !36 + call void @__kmpc_target_deinit(), !dbg !37 + ret void, !dbg !38 11: ; preds = %1 - ret void, !dbg !26 + ret void, !dbg !25 } ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_624a0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_727e9_h_l12(ptr noalias noundef %0) #1 !dbg !39 { %2 = alloca ptr, align 8, addrspace(5) %3 = addrspacecast ptr addrspace(5) %2 to ptr store ptr %0, ptr %3, align 8 - #dbg_declare(ptr addrspace(5) %2, !41, !DIExpression(), !42) - %4 = load ptr, ptr %3, align 8, !dbg !43 - call void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr %4) #5, !dbg !43 - ret void, !dbg !43 + #dbg_declare(ptr addrspace(5) %2, !40, !DIExpression(), !41) + %4 = load ptr, ptr %3, align 8, !dbg !42 + call void @__omp_offloading_fd02_727e9_h_l12_debug__(ptr %4) #5, !dbg !42 + ret void, !dbg !42 } declare i32 @__kmpc_target_init(ptr, ptr) @@ -145,16 +145,16 @@ declare void @f(...) 
#2 declare void @__kmpc_target_deinit() ; Function Attrs: convergent noinline nounwind optnone -define hidden void @g() #3 !dbg !44 { +define hidden void @g() #3 !dbg !43 { %1 = alloca i32, align 4, addrspace(5) %2 = alloca [2 x i32], align 4, addrspace(5) %3 = addrspacecast ptr addrspace(5) %1 to ptr %4 = addrspacecast ptr addrspace(5) %2 to ptr - #dbg_declare(ptr addrspace(5) %1, !47, !DIExpression(), !48) - #dbg_declare(ptr addrspace(5) %2, !49, !DIExpression(), !50) - call void @f() #4, !dbg !51 - call void @g() #4, !dbg !52 - ret void, !dbg !53 + #dbg_declare(ptr addrspace(5) %1, !46, !DIExpression(), !47) + #dbg_declare(ptr addrspace(5) %2, !48, !DIExpression(), !49) + call void @f() #4, !dbg !50 + call void @g() #4, !dbg !51 + ret void, !dbg !52 } attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } @@ -166,62 +166,60 @@ attributes #5 = { nounwind } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!2} -!nvvm.annotations = !{!3} -!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10, !11, !12} -!llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} -!opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} - -!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "44c4bbdbb9b7a9c7492ced3432d74b0c") -!2 = !{i32 0, i32 64770, i32 402592, !"h", i32 12, i32 0, i32 0} -!3 = !{ptr 
@__omp_offloading_fd02_624a0_h_l12, !"kernel", i32 1} -!4 = !{i32 1, !"amdhsa_code_object_version", i32 500} -!5 = !{i32 7, !"Dwarf Version", i32 5} -!6 = !{i32 2, !"Debug Info Version", i32 3} -!7 = !{i32 1, !"wchar_size", i32 4} -!8 = !{i32 7, !"openmp", i32 51} -!9 = !{i32 7, !"openmp-device", i32 51} -!10 = !{i32 8, !"PIC Level", i32 2} -!11 = !{i32 7, !"frame-pointer", i32 2} -!12 = !{i32 4, !"amdgpu_hostcall", i32 1} -!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)"} -!14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} -!15 = !{i32 2, i32 0} -!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_624a0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) -!17 = !DIFile(filename: "test.c", directory: "/tmp") -!18 = !DISubroutineType(types: !19) -!19 = !{null, !20} -!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) -!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) -!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!23 = !{} -!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !16, type: !20, flags: DIFlagArtificial) -!25 = !DILocation(line: 0, scope: !16) -!26 = !DILocation(line: 13, column: 3, scope: !16) -!27 = !DILocalVariable(name: "i", scope: !28, file: !17, line: 14, type: !29) -!28 = distinct !DILexicalBlock(scope: !16, file: !17, line: 13, column: 3) -!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!30 = !DILocation(line: 14, column: 9, scope: !28) -!31 = !DILocalVariable(name: "a", scope: !28, file: !17, line: 15, type: !32) -!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) -!33 = !{!34} -!34 = !DISubrange(count: 2) -!35 = !DILocation(line: 15, column: 9, scope: 
!28) -!36 = !DILocation(line: 16, column: 5, scope: !28) -!37 = !DILocation(line: 17, column: 5, scope: !28) -!38 = !DILocation(line: 18, column: 3, scope: !28) -!39 = !DILocation(line: 18, column: 3, scope: !16) -!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_624a0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) -!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) -!42 = !DILocation(line: 0, scope: !40) -!43 = !DILocation(line: 12, column: 1, scope: !40) -!44 = distinct !DISubprogram(name: "g", scope: !17, file: !17, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !23) -!45 = !DISubroutineType(types: !46) -!46 = !{null} -!47 = !DILocalVariable(name: "i", scope: !44, file: !17, line: 4, type: !29) -!48 = !DILocation(line: 4, column: 7, scope: !44) -!49 = !DILocalVariable(name: "a", scope: !44, file: !17, line: 5, type: !32) -!50 = !DILocation(line: 5, column: 7, scope: !44) -!51 = !DILocation(line: 6, column: 3, scope: !44) -!52 = !DILocation(line: 7, column: 3, scope: !44) -!53 = !DILocation(line: 8, column: 1, scope: !44) +!llvm.module.flags = !{!3, !4, !5, !6, !7, !8, !9, !10, !11} +!llvm.ident = !{!12, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13, !13} +!opencl.ocl.version = !{!14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "27a878d5e894ab6d41bfe96f997f8821") +!2 = !{i32 0, i32 64770, i32 468969, !"h", i32 12, i32 
0, i32 0} +!3 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"openmp", i32 51} +!8 = !{i32 7, !"openmp-device", i32 51} +!9 = !{i32 8, !"PIC Level", i32 2} +!10 = !{i32 7, !"frame-pointer", i32 2} +!11 = !{i32 4, !"amdgpu_hostcall", i32 1} +!12 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"} +!13 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} +!14 = !{i32 2, i32 0} +!15 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12_debug__", scope: !16, file: !16, line: 13, type: !17, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22) +!16 = !DIFile(filename: "test.c", directory: "/tmp") +!17 = !DISubroutineType(types: !18) +!18 = !{null, !19} +!19 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !20) +!20 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!22 = !{} +!23 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !15, type: !19, flags: DIFlagArtificial) +!24 = !DILocation(line: 0, scope: !15) +!25 = !DILocation(line: 13, column: 3, scope: !15) +!26 = !DILocalVariable(name: "i", scope: !27, file: !16, line: 14, type: !28) +!27 = distinct !DILexicalBlock(scope: !15, file: !16, line: 13, column: 3) +!28 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!29 = !DILocation(line: 14, column: 9, scope: !27) +!30 = !DILocalVariable(name: "a", scope: !27, file: !16, line: 15, type: !31) +!31 = !DICompositeType(tag: DW_TAG_array_type, baseType: !28, size: 64, elements: !32) +!32 = !{!33} +!33 = !DISubrange(count: 2) +!34 = !DILocation(line: 15, column: 9, scope: !27) +!35 = !DILocation(line: 16, column: 5, 
scope: !27) +!36 = !DILocation(line: 17, column: 5, scope: !27) +!37 = !DILocation(line: 18, column: 3, scope: !27) +!38 = !DILocation(line: 18, column: 3, scope: !15) +!39 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12", scope: !16, file: !16, line: 12, type: !17, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22) +!40 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !39, type: !19, flags: DIFlagArtificial) +!41 = !DILocation(line: 0, scope: !39) +!42 = !DILocation(line: 12, column: 1, scope: !39) +!43 = distinct !DISubprogram(name: "g", scope: !16, file: !16, line: 3, type: !44, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !22) +!44 = !DISubroutineType(types: !45) +!45 = !{null} +!46 = !DILocalVariable(name: "i", scope: !43, file: !16, line: 4, type: !28) +!47 = !DILocation(line: 4, column: 7, scope: !43) +!48 = !DILocalVariable(name: "a", scope: !43, file: !16, line: 5, type: !31) +!49 = !DILocation(line: 5, column: 7, scope: !43) +!50 = !DILocation(line: 6, column: 3, scope: !43) +!51 = !DILocation(line: 7, column: 3, scope: !43) +!52 = !DILocation(line: 8, column: 1, scope: !43) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 0633c3fa687c1..bd46741b24e8c 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -62,7 +62,7 @@ ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't ; want to maintain a list of their allocas, calls, etc. in this test. 
-; ModuleID = 'test-openmp-nvptx64-nvidia-cuda.bc' +; ModuleID = 'test-openmp-nvptx64-nvidia-cuda-sm_70.bc' source_filename = "test.c" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -79,15 +79,13 @@ target triple = "nvptx64-nvidia-cuda" %"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } %"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } %"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } -%printf_args = type { ptr, i32, ptr, ptr, ptr } -%printf_args.7 = type { ptr, i32, ptr, ptr } @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 -@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_100102_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_1116d6_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 -@__omp_offloading_fd02_100102_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_100102_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_100102_h_l12_dynamic_environment } +@__omp_offloading_fd02_1116d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_1116d6_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_1116d6_h_l12_dynamic_environment } @llvm.used = 
appending global [4 x ptr] [ptr @__llvm_rpc_client, ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" @__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 @__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 @@ -103,187 +101,187 @@ target triple = "nvptx64-nvidia-cuda" @__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 @IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 @__llvm_rpc_client = weak protected global %"struct.rpc::Client" zeroinitializer, align 8 -@.str1125 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 -@.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 +@.str1027 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str12 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 @_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 -@.str542 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 -@.str845 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@.str444 = private unnamed_addr constant [42 x i8] 
c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str747 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 -@.str946 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 -@.str1047 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 -@.str1148 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 -@.str1249 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 -@.str1350 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 -@.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 +@.str848 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str949 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1050 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1151 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1252 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str13 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 -@.str1551 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 -@.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == 
IsSPMD\00", align 1 +@.str14 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str23 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 @_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 @_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_100102_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { +define internal void @__omp_offloading_fd02_1116d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !18 { %2 = alloca ptr, align 8 %3 = alloca i32, align 4 %4 = alloca [2 x i32], align 4 store ptr %0, ptr %2, align 8 - #dbg_declare(ptr %2, !26, !DIExpression(), !27) - %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_100102_h_l12_kernel_environment, ptr %0), !dbg !28 - %6 = icmp eq i32 %5, -1, !dbg !28 - br i1 %6, label %7, label %8, !dbg !28 + #dbg_declare(ptr %2, !25, !DIExpression(), !26) + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_1116d6_h_l12_kernel_environment, ptr %0), !dbg !27 + %6 = icmp eq i32 %5, -1, !dbg !27 + br i1 %6, label %7, label %8, !dbg !27 7: ; preds = %1 - #dbg_declare(ptr %3, !29, !DIExpression(), !32) - #dbg_declare(ptr %4, !33, !DIExpression(), !37) - call void @f() #16, !dbg !38 - call void @g() #16, !dbg !39 - call void @__kmpc_target_deinit(), !dbg !40 - ret void, !dbg !41 + #dbg_declare(ptr %3, !28, !DIExpression(), !31) + #dbg_declare(ptr %4, !32, !DIExpression(), !36) + call void @f() #19, !dbg !37 + call void @g() #19, !dbg !38 + call void @__kmpc_target_deinit(), !dbg !39 + ret void, 
!dbg !40 8: ; preds = %1 - ret void, !dbg !28 + ret void, !dbg !27 } ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected void @__omp_offloading_fd02_100102_h_l12(ptr noalias noundef %0) #1 !dbg !42 { +define weak_odr protected ptx_kernel void @__omp_offloading_fd02_1116d6_h_l12(ptr noalias noundef %0) #1 !dbg !41 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 - #dbg_declare(ptr %2, !43, !DIExpression(), !44) - %3 = load ptr, ptr %2, align 8, !dbg !45 - call void @__omp_offloading_fd02_100102_h_l12_debug__(ptr %3) #17, !dbg !45 - ret void, !dbg !45 + #dbg_declare(ptr %2, !42, !DIExpression(), !43) + %3 = load ptr, ptr %2, align 8, !dbg !44 + call void @__omp_offloading_fd02_1116d6_h_l12_debug__(ptr %3) #20, !dbg !44 + ret void, !dbg !44 } ; Function Attrs: convergent declare void @f(...) #2 ; Function Attrs: convergent noinline nounwind optnone -define hidden void @g() #3 !dbg !46 { +define hidden void @g() #3 !dbg !45 { %1 = alloca i32, align 4 %2 = alloca [2 x i32], align 4 - #dbg_declare(ptr %1, !49, !DIExpression(), !50) - #dbg_declare(ptr %2, !51, !DIExpression(), !52) - call void @f() #16, !dbg !53 - call void @g() #16, !dbg !54 - ret void, !dbg !55 + #dbg_declare(ptr %1, !48, !DIExpression(), !49) + #dbg_declare(ptr %2, !50, !DIExpression(), !51) + call void @f() #19, !dbg !52 + call void @g() #19, !dbg !53 + ret void, !dbg !54 } ; Function Attrs: convergent mustprogress nounwind define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %0, ptr nofree noundef nonnull align 8 dereferenceable(16) %1) #4 { %3 = alloca ptr, align 8 %4 = getelementptr inbounds nuw i8, ptr %0, i64 2 - %5 = load i8, ptr %4, align 2, !tbaa !56 + %5 = load i8, ptr %4, align 2, !tbaa !55 %6 = and i8 %5, 2 %7 = icmp eq i8 %6, 0 - %8 = load i8, ptr %0, align 8, !tbaa !62 + %8 = load i8, ptr %0, align 8, !tbaa !61 %9 = icmp ne i8 %8, 0 br i1 %7, label %21, label 
%10 10: ; preds = %2 - %11 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %11 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() %12 = icmp eq i32 %11, 0 br i1 %12, label %13, label %14 13: ; preds = %10 - store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 - store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !64 - tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !71 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr null, ptr addrspace(3) 
@_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 - store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 - store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !79 + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 + store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !63 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !64 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, 
align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !78 br label %18 14: ; preds = %10 %15 = zext nneg i32 %11 to i64 %16 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %15 %17 = addrspacecast ptr %16 to ptr addrspace(3) - store i8 0, ptr addrspace(3) %17, align 1, !tbaa !64 + store i8 0, ptr addrspace(3) %17, align 1, !tbaa !63 br label %18 18: ; preds = %14, %13 br i1 %12, label %19, label %20 19: ; preds = %18 - store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !81 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !80 br label %20 20: ; preds = %18, %19 - tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #21 br label %37 21: ; preds = %2 - %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 + %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82 %23 = add nsw i32 %22, -1 %24 = and i32 %23, -32 - %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() %26 = icmp eq i32 %25, %24 br i1 %26, label %27, label %31 27: ; preds = %21 - store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 %28 = zext nneg i32 %25 to i64 %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 %30 = addrspacecast ptr %29 to ptr addrspace(3) - store i8 0, ptr addrspace(3) %30, align 1, !tbaa !64 
- tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !71 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 - store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 - store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !79 + store i8 0, ptr addrspace(3) %30, align 1, !tbaa !63 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) + store i32 1, ptr addrspace(3) addrspacecast (ptr 
getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !64 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !78 br label %35 31: ; preds = %21 %32 = zext nneg i32 %25 to i64 %33 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %32 %34 = addrspacecast ptr %33 to ptr addrspace(3) - store i8 0, ptr addrspace(3) %34, align 1, !tbaa !64 + store i8 0, ptr addrspace(3) %34, align 1, !tbaa !63 br label %35 35: ; preds = %31, %27 br i1 %26, label %36, label %37 36: ; preds = %35 - store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !81 
+ store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !80 br label %37 37: ; preds = %36, %35, %20 br i1 %7, label %100, label %38 38: ; preds = %37 - %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 + %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62 + %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83 %41 = and i32 %39, 1 %42 = and i32 %41, %40 %43 = icmp ne i32 %42, 0 - %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !87 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !86 %45 = icmp ne i32 %44, 0 %46 = select i1 %43, i1 %45, i1 false br i1 %46, label %47, label %48 47: ; preds = %38 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str845, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(33) @.str747, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 193, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 unreachable 48: ; preds = %38 %49 = icmp eq i32 %44, 0 - tail call void @llvm.assume(i1 noundef %49) #21 - %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !88 + tail call void @llvm.assume(i1 noundef %49) #23 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !87 br i1 %43, 
label %51, label %54 51: ; preds = %48 @@ -291,14 +289,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %52, label %54, label %53 53: ; preds = %51 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str946, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(27) @.str848, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 194, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 unreachable 54: ; preds = %51, %48 %55 = phi i32 [ 0, %51 ], [ %50, %48 ] %56 = icmp eq i32 %55, 0 - tail call void @llvm.assume(i1 noundef %56) #21 - %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !89 + tail call void @llvm.assume(i1 noundef %56) #23 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !88 br i1 %43, label %58, label %61 58: ; preds = %54 @@ -306,14 +304,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %59, label %61, label %60 60: ; preds = %58 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1047, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef 
nonnull dereferenceable(39) @.str949, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 195, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 unreachable 61: ; preds = %58, %54 %62 = phi i32 [ 0, %58 ], [ %57, %54 ] %63 = icmp eq i32 %62, 0 - tail call void @llvm.assume(i1 noundef %63) #21 - %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !90 + tail call void @llvm.assume(i1 noundef %63) #23 + %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !89 br i1 %43, label %65, label %68 65: ; preds = %61 @@ -321,14 +319,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %66, label %68, label %67 67: ; preds = %65 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1148, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(47) @.str1050, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 196, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 unreachable 68: ; preds = %65, %61 %69 = phi i32 [ 1, %65 ], [ %64, %61 ] %70 = icmp eq i32 %69, 1 - tail call void @llvm.assume(i1 noundef %70) #21 - %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr 
addrspace(3)), align 4, !tbaa !91 + tail call void @llvm.assume(i1 noundef %70) #23 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !90 br i1 %43, label %72, label %93 72: ; preds = %68 @@ -336,71 +334,71 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %73, label %75, label %74 74: ; preds = %72 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1249, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(33) @.str1151, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 197, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 unreachable 75: ; preds = %72 %76 = icmp eq i32 1, 1 - tail call void @llvm.assume(i1 noundef %76) #21 + tail call void @llvm.assume(i1 noundef %76) #23 br i1 %43, label %77, label %95 77: ; preds = %75 - %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !92 + %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !91 %79 = icmp eq i32 %78, 1 br i1 %79, label %81, label %80 80: ; preds = %77 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1350, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 198, 
ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(43) @.str1252, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 198, ptr noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #22 unreachable 81: ; preds = %77 - %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 + %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 %83 = icmp eq i32 %82, 1 br i1 %83, label %85, label %84 84: ; preds = %81 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(43) @.str13, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 222, ptr noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #22 unreachable 85: ; preds = %81 - %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 + %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 %87 = 
icmp eq i32 %86, 0 br i1 %87, label %89, label %88 88: ; preds = %85 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1551, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(39) @.str14, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 223, ptr noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #22 unreachable 89: ; preds = %85 - %90 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %90 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 %91 = icmp eq i32 %90, 0 br i1 %91, label %92, label %98 92: ; preds = %89 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(32) @.str23, ptr noundef null, ptr noundef nonnull dereferenceable(66) @.str444, i32 noundef 326, ptr noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #22 unreachable 93: ; preds = %68 %94 = icmp eq i32 %71, 1 - tail call void @llvm.assume(i1 noundef %94) #21 + tail call void @llvm.assume(i1 noundef %94) #23 br label %95 95: ; preds = %75, %93 - %96 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %96 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 %97 = icmp ne i32 %96, 0 br label %98 98: ; preds = %89, %95 %99 = phi i1 [ %97, %95 ], [ true, %89 ] - tail call void @llvm.assume(i1 noundef %99) #21 - tail 
call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 + tail call void @llvm.assume(i1 noundef %99) #23 + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #21 br label %130 100: ; preds = %37 - %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 + %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82 %102 = add nsw i32 %101, -1 %103 = and i32 %102, -32 - %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !93 + %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !92 %105 = icmp eq i32 %104, %103 br i1 %105, label %130, label %106 @@ -419,11 +417,10 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br label %116 116: ; preds = %110, %128 - call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #22 - store ptr null, ptr %3, align 8, !tbaa !94 - tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 - %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #22 - %118 = load ptr, ptr %3, align 8, !tbaa !94 + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #20 + tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) + %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #20 + %118 = load ptr, ptr %3, align 8, !tbaa !93 %119 = icmp eq ptr %118, null br i1 %119, label %129, label %120 @@ -437,23 +434,23 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %124, label %125, label %126 125: ; preds = %121 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef 
nonnull dereferenceable(67) @.str15, i32 noundef 60, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(67) @.str15, i32 noundef 60, ptr noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #22 unreachable 126: ; preds = %121 %127 = icmp eq i32 %122, 0 - tail call void @llvm.assume(i1 noundef %127) #21 - tail call void %118(i32 noundef 0, i32 noundef %104) #23 + tail call void @llvm.assume(i1 noundef %127) #23 + tail call void %118(i32 noundef 0, i32 noundef %104) #24 tail call void @__kmpc_kernel_end_parallel() #24 br label %128 128: ; preds = %126, %120 - tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 - br label %116, !llvm.loop !95 + tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #20 + br label %116, !llvm.loop !94 129: ; preds = %116 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #20 br label %130 130: ; preds = %106, %129, %100, %98 @@ -477,38 +474,20 @@ define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10Ordering declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5 ; Function Attrs: cold convergent mustprogress noreturn nounwind -define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %0, ptr noundef %1, ptr nofree noundef nonnull dereferenceable(66) %2, i32 noundef range(i32 60, 905) %3, ptr nofree noundef nonnull dereferenceable(20) %4) unnamed_addr #8 { - %6 = alloca %printf_args, align 8 - %7 = alloca %printf_args.7, align 8 - %8 = icmp eq ptr %1, null - br i1 %8, label %12, label %9 
+define internal fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(8) %0, ptr noundef %1, ptr noundef nonnull dereferenceable(66) %2, i32 noundef range(i32 60, 905) %3, ptr noundef nonnull dereferenceable(20) %4) unnamed_addr #8 { + %6 = icmp eq ptr %1, null + br i1 %6, label %9, label %7 + +7: ; preds = %5 + %8 = tail call noundef i32 (ptr, ...) @_ZN4ompx6printfEPKcz(ptr noundef nonnull dereferenceable(40) @.str, ptr noundef nonnull dereferenceable(66) %2, i32 noundef %3, ptr noundef nonnull dereferenceable(20) %4, ptr noundef nonnull %1, ptr noundef nonnull dereferenceable(8) %0) #24 + br label %11 9: ; preds = %5 - store ptr %2, ptr %6, align 8 - %10 = getelementptr inbounds nuw i8, ptr %6, i64 8 - store i32 %3, ptr %10, align 8 - %11 = getelementptr inbounds nuw i8, ptr %6, i64 16 - store ptr %4, ptr %11, align 8 - br label %14 - -12: ; preds = %5 - store ptr %2, ptr %7, align 8 - %13 = getelementptr inbounds nuw i8, ptr %7, i64 8 - store i32 %3, ptr %13, align 8 - br label %14 - -14: ; preds = %12, %9 - %15 = phi i64 [ 16, %12 ], [ 24, %9 ] - %16 = phi ptr [ %7, %12 ], [ %6, %9 ] - %17 = phi ptr [ %4, %12 ], [ %1, %9 ] - %18 = phi i64 [ 24, %12 ], [ 32, %9 ] - %19 = phi ptr [ @.str1, %12 ], [ @.str, %9 ] - %20 = getelementptr inbounds nuw i8, ptr %16, i64 %15 - store ptr %17, ptr %20, align 8 - %21 = getelementptr inbounds nuw i8, ptr %16, i64 %18 - store ptr %0, ptr %21, align 8 - %22 = call i32 @vprintf(ptr noundef nonnull %19, ptr noundef nonnull %16) #22 - call void @llvm.trap() #26 + %10 = tail call noundef i32 (ptr, ...) 
@_ZN4ompx6printfEPKcz(ptr noundef nonnull dereferenceable(35) @.str1, ptr noundef nonnull dereferenceable(66) %2, i32 noundef %3, ptr noundef nonnull dereferenceable(20) %4, ptr noundef nonnull dereferenceable(8) %0) #24 + br label %11 + +11: ; preds = %9, %7 + tail call void @llvm.trap() #26 unreachable } @@ -523,16 +502,16 @@ declare void @llvm.nvvm.barrier.sync(i32) #11 ; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) initializes((0, 8)) %0) local_unnamed_addr #12 { - %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !94 - store ptr %2, ptr %0, align 8, !tbaa !94 + %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !93 + store ptr %2, ptr %0, align 8, !tbaa !93 %3 = icmp eq ptr %2, null br i1 %3, label %15, label %4 4: ; preds = %1 - %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !93 - %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !63 + %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !92 + %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !62 %7 = icmp eq i32 %6, 0 - %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 + %8 = 
tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82 %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 %10 = icmp eq i32 %9, 0 %11 = select i1 %10, i32 -32, i32 0 @@ -548,8 +527,8 @@ define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree ; Function Attrs: convergent mustprogress noinline nounwind define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { - %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 + %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !62 + %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83 %3 = and i32 %1, 1 %4 = and i32 %3, %2 %5 = icmp ne i32 %4, 0 @@ -559,13 +538,13 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %8, label %9, label %10 9: ; preds = %0 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1125, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(72) @.str1027, i32 noundef 299, ptr noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #22 unreachable 10: ; preds = %0 %11 = icmp eq i32 %6, 0 - tail call void @llvm.assume(i1 noundef %11) #21 - %12 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !63 + tail call void @llvm.assume(i1 noundef %11) #23 + %12 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !62 %13 = icmp eq i32 %12, 0 %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to 
ptr addrspace(3)), align 8 %15 = icmp ne i32 %14, 0 @@ -573,21 +552,21 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %16, label %17, label %30 17: ; preds = %10 - %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !93 - %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 + %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !92 + %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 %20 = zext nneg i32 %18 to i64 %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20 - %22 = load ptr, ptr %21, align 8, !tbaa !97 + %22 = load ptr, ptr %21, align 8, !tbaa !96 %23 = icmp eq ptr %22, null - br i1 %23, label %30, label %24, !prof !99 + br i1 %23, label %30, label %24, !prof !98 24: ; preds = %17 %25 = getelementptr inbounds nuw i8, ptr %22, i64 32 - %26 = load ptr, ptr %25, align 8, !tbaa !100 + %26 = load ptr, ptr %25, align 8, !tbaa !99 tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28 - %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 + %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20 - store ptr %26, ptr %28, align 8, !tbaa !97 + store ptr %26, ptr %28, align 8, !tbaa !96 %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 br label %30 @@ -598,12 +577,12 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %33, label %34, label %35 34: ; preds = %30 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1125, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull 
dereferenceable(23) @.str12, ptr noundef null, ptr noundef nonnull dereferenceable(72) @.str1027, i32 noundef 302, ptr noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #22 unreachable 35: ; preds = %30 %36 = icmp eq i32 %31, 0 - tail call void @llvm.assume(i1 noundef %36) #21 + tail call void @llvm.assume(i1 noundef %36) #23 ret void } @@ -613,11 +592,25 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #10 ; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #14 -; Function Attrs: convergent -declare i32 @vprintf(ptr, ptr) local_unnamed_addr #2 +; Function Attrs: convergent mustprogress nounwind +define internal noundef i32 @_ZN4ompx6printfEPKcz(ptr noundef %0, ...) local_unnamed_addr #15 { + %2 = alloca ptr, align 8 + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 %2) #29 + call void @llvm.va_start.p0(ptr noundef nonnull align 8 %2) #27 + %3 = load ptr, ptr %2, align 8, !tbaa !101 + %4 = call i32 @vprintf(ptr noundef %0, ptr noundef %3) #24 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %2) #20 + ret i32 %4 +} ; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write) -declare void @llvm.trap() #15 +declare void @llvm.trap() #16 + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.va_start.p0(ptr) #17 + +; Function Attrs: convergent nounwind +declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #18 ; Function Attrs: convergent nocallback nounwind declare void @llvm.nvvm.barrier0() #11 @@ -625,34 +618,33 @@ declare void @llvm.nvvm.barrier0() #11 ; Function Attrs: convergent mustprogress nounwind define internal void @__kmpc_target_deinit() #4 { %1 = alloca ptr, align 8 - %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %2 = load 
i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !62 %3 = icmp eq i32 %2, 0 br i1 %3, label %4, label %27 4: ; preds = %0 - %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 + %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !82 %6 = add nsw i32 %5, -1 %7 = and i32 %6, -32 - %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !93 + %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !92 %9 = icmp eq i32 %8, %7 br i1 %9, label %10, label %11 10: ; preds = %4 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !94 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !93 br label %27 11: ; preds = %4 - %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 - %13 = load i8, ptr %12, align 8, !tbaa !102 + %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + %13 = load i8, ptr %12, align 8, !tbaa !103 %14 = icmp eq i8 %13, 0 br i1 %14, label %15, label %27 15: ; preds = %11 call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29 - store ptr null, ptr %1, align 8, !tbaa !94 - %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #22 - %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 + %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #20 + %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa 
!62 + %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !83 %19 = and i32 %17, 1 %20 = and i32 %19, %18 %21 = icmp eq i32 %20, 0 @@ -662,12 +654,12 @@ define internal void @__kmpc_target_deinit() #4 { br i1 %24, label %26, label %25 25: ; preds = %15 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(67) @.str15, i32 noundef 152, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 + tail call fastcc void @__assert_fail_internal(ptr noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr noundef nonnull dereferenceable(67) @.str15, i32 noundef 152, ptr noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #22 unreachable 26: ; preds = %15 - tail call void @llvm.assume(i1 noundef %23) #21 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %1) #22 + tail call void @llvm.assume(i1 noundef %23) #23 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %1) #20 br label %27 27: ; preds = %26, %11, %10, %0 @@ -678,39 +670,39 @@ attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer" attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } -attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #6 = { nocallback nofree nounwind willreturn memory(argmem: write) } -attributes #7 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } -attributes #8 = { cold convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #7 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #8 = { cold convergent mustprogress noreturn nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } attributes #9 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } attributes #10 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } attributes #11 = { convergent nocallback nounwind } -attributes #12 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" 
"target-features"="+ptx63,+ptx83,+sm_70" } -attributes #13 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } -attributes #14 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } -attributes #15 = { cold noreturn nounwind memory(inaccessiblemem: write) } -attributes #16 = { convergent } -attributes #17 = { nounwind } -attributes #18 = { "llvm.assume"="ompx_no_call_asm" } -attributes #19 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" } -attributes #20 = { noreturn nounwind "llvm.assume"="ompx_no_call_asm" } -attributes #21 = { memory(write) "llvm.assume"="ompx_no_call_asm" } -attributes #22 = { nounwind "llvm.assume"="ompx_no_call_asm" } -attributes #23 = { convergent nounwind } -attributes #24 = { convergent nounwind "llvm.assume"="ompx_no_call_asm" } -attributes #25 = { "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" } -attributes #26 = { noreturn "llvm.assume"="ompx_no_call_asm" } -attributes #27 = { nofree willreturn "llvm.assume"="ompx_no_call_asm" } -attributes #28 = { convergent nounwind willreturn "llvm.assume"="ompx_no_call_asm" } -attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } +attributes #12 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #13 = { convergent mustprogress noinline nounwind "frame-pointer"="all" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #14 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #15 = { convergent mustprogress nounwind "frame-pointer"="all" "no-builtin-printf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #16 = { cold noreturn nounwind memory(inaccessiblemem: write) } +attributes #17 = { nocallback nofree nosync nounwind willreturn } +attributes #18 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #19 = { convergent } +attributes #20 = { nounwind } +attributes #21 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier" } +attributes #22 = { convergent noreturn nounwind } +attributes #23 = { memory(write) } +attributes #24 = { convergent nounwind } +attributes #25 = { "llvm.assume"="ompx_aligned_barrier" } +attributes #26 = { noreturn } +attributes #27 = { nofree willreturn } +attributes #28 = { convergent nounwind willreturn } +attributes #29 = { nofree nounwind willreturn } !llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10} !llvm.dbg.cu = !{!11} -!nvvm.annotations = !{!13, !14} -!omp_offload.info = !{!15} -!llvm.ident = !{!16, !17, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16} -!nvvmir.version = !{!18} +!nvvm.annotations = !{!13} +!omp_offload.info = !{!14} +!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} +!nvvmir.version = !{!17} !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 
3]} !1 = !{i32 7, !"Dwarf Version", i32 2} @@ -723,98 +715,99 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !8 = !{i32 7, !"frame-pointer", i32 2} !9 = !{i32 1, !"ThinLTO", i32 0} !10 = !{i32 1, !"EnableSplitLTOUnit", i32 1} -!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !12 = !DIFile(filename: "test.c", directory: "/tmp") -!13 = !{ptr @__omp_offloading_fd02_100102_h_l12, !"maxntidx", i32 128} -!14 = !{ptr @__omp_offloading_fd02_100102_h_l12, !"kernel", i32 1} -!15 = !{i32 0, i32 64770, i32 1048834, !"h", i32 12, i32 0, i32 0} -!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)"} -!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} -!18 = !{i32 2, i32 0} -!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_100102_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) -!20 = !DISubroutineType(types: !21) -!21 = !{null, !22} -!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !23) -!23 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !24) -!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!25 = !{} -!26 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !19, type: !22, flags: DIFlagArtificial) -!27 = !DILocation(line: 0, scope: !19) -!28 = !DILocation(line: 13, column: 3, scope: !19) -!29 = !DILocalVariable(name: 
"i", scope: !30, file: !12, line: 14, type: !31) -!30 = distinct !DILexicalBlock(scope: !19, file: !12, line: 13, column: 3) -!31 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!32 = !DILocation(line: 14, column: 9, scope: !30) -!33 = !DILocalVariable(name: "a", scope: !30, file: !12, line: 15, type: !34) -!34 = !DICompositeType(tag: DW_TAG_array_type, baseType: !31, size: 64, elements: !35) -!35 = !{!36} -!36 = !DISubrange(count: 2) -!37 = !DILocation(line: 15, column: 9, scope: !30) -!38 = !DILocation(line: 16, column: 5, scope: !30) -!39 = !DILocation(line: 17, column: 5, scope: !30) -!40 = !DILocation(line: 18, column: 3, scope: !30) -!41 = !DILocation(line: 18, column: 3, scope: !19) -!42 = distinct !DISubprogram(name: "__omp_offloading_fd02_100102_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) -!43 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !42, type: !22, flags: DIFlagArtificial) -!44 = !DILocation(line: 0, scope: !42) -!45 = !DILocation(line: 12, column: 1, scope: !42) -!46 = distinct !DISubprogram(name: "g", scope: !12, file: !12, line: 3, type: !47, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !11, retainedNodes: !25) -!47 = !DISubroutineType(types: !48) -!48 = !{null} -!49 = !DILocalVariable(name: "i", scope: !46, file: !12, line: 4, type: !31) -!50 = !DILocation(line: 4, column: 7, scope: !46) -!51 = !DILocalVariable(name: "a", scope: !46, file: !12, line: 5, type: !34) -!52 = !DILocation(line: 5, column: 7, scope: !46) -!53 = !DILocation(line: 6, column: 3, scope: !46) -!54 = !DILocation(line: 7, column: 3, scope: !46) -!55 = !DILocation(line: 8, column: 1, scope: !46) -!56 = !{!57, !60, i64 2} -!57 = !{!"_ZTS26ConfigurationEnvironmentTy", !58, i64 0, !58, i64 1, !60, i64 2, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} -!58 = !{!"omnipotent char", 
!59, i64 0} -!59 = !{!"Simple C++ TBAA"} -!60 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !58, i64 0} -!61 = !{!"int", !58, i64 0} -!62 = !{!57, !58, i64 0} -!63 = !{!61, !61, i64 0} -!64 = !{!58, !58, i64 0} -!65 = !{!66, !61, i64 16} -!66 = !{!"_ZTSN4ompx5state11TeamStateTyE", !67, i64 0, !61, i64 28, !61, i64 32, !68, i64 40} -!67 = !{!"_ZTSN4ompx5state10ICVStateTyE", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} -!68 = !{!"p1 void", !69, i64 0} -!69 = !{!"any pointer", !58, i64 0} -!70 = !{!66, !61, i64 20} -!71 = !{!66, !61, i64 24} -!72 = !{!66, !61, i64 28} -!73 = !{!66, !61, i64 32} -!74 = !{!66, !68, i64 40} -!75 = !{!76, !76, i64 0} -!76 = !{!"p2 _ZTSN4ompx5state13ThreadStateTyE", !69, i64 0} -!77 = !{!78, !78, i64 0} -!78 = !{!"p1 _ZTS19KernelEnvironmentTy", !69, i64 0} -!79 = !{!80, !80, i64 0} -!80 = !{!"p1 _ZTS25KernelLaunchEnvironmentTy", !69, i64 0} -!81 = !{!82, !82, i64 0} -!82 = !{!"p2 _ZTS22DynamicScheduleTracker", !69, i64 0} -!83 = !{i32 1, i32 1025} -!84 = !{!85, !61, i64 0} -!85 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !86, i64 16, !86, i64 24, !86, i64 32, !86, i64 40} -!86 = !{!"long", !58, i64 0} -!87 = !{!67, !61, i64 0} -!88 = !{!67, !61, i64 4} -!89 = !{!67, !61, i64 8} -!90 = !{!67, !61, i64 16} -!91 = !{!67, !61, i64 20} -!92 = !{!67, !61, i64 24} -!93 = !{i32 0, i32 1024} -!94 = !{!68, !68, i64 0} -!95 = distinct !{!95, !96} -!96 = !{!"llvm.loop.mustprogress"} -!97 = !{!98, !98, i64 0} -!98 = !{!"p1 _ZTSN4ompx5state13ThreadStateTyE", !69, i64 0} -!99 = !{!"branch_weights", !"expected", i32 2000, i32 1} -!100 = !{!101, !98, i64 32} -!101 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !98, i64 32} -!102 = !{!103, !58, i64 0} -!103 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !104, i64 32, !105, i64 40} -!104 = !{!"p1 _ZTS7IdentTy", !69, i64 0} -!105 = !{!"p1 _ZTS20DynamicEnvironmentTy", !69, i64 0} +!13 = !{ptr 
@__omp_offloading_fd02_1116d6_h_l12, !"maxntidx", i32 128} +!14 = !{i32 0, i32 64770, i32 1119958, !"h", i32 12, i32 0, i32 0} +!15 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"} +!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!17 = !{i32 2, i32 0} +!18 = distinct !DISubprogram(name: "__omp_offloading_fd02_1116d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !19, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !24) +!19 = !DISubroutineType(types: !20) +!20 = !{null, !21} +!21 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !23) +!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!24 = !{} +!25 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !18, type: !21, flags: DIFlagArtificial) +!26 = !DILocation(line: 0, scope: !18) +!27 = !DILocation(line: 13, column: 3, scope: !18) +!28 = !DILocalVariable(name: "i", scope: !29, file: !12, line: 14, type: !30) +!29 = distinct !DILexicalBlock(scope: !18, file: !12, line: 13, column: 3) +!30 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!31 = !DILocation(line: 14, column: 9, scope: !29) +!32 = !DILocalVariable(name: "a", scope: !29, file: !12, line: 15, type: !33) +!33 = !DICompositeType(tag: DW_TAG_array_type, baseType: !30, size: 64, elements: !34) +!34 = !{!35} +!35 = !DISubrange(count: 2) +!36 = !DILocation(line: 15, column: 9, scope: !29) +!37 = !DILocation(line: 16, column: 5, scope: !29) +!38 = !DILocation(line: 17, column: 5, scope: !29) +!39 = !DILocation(line: 18, column: 3, scope: !29) +!40 = !DILocation(line: 18, column: 3, scope: !18) +!41 = distinct !DISubprogram(name: "__omp_offloading_fd02_1116d6_h_l12", scope: !12, file: !12, line: 12, type: !19, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, 
unit: !11, retainedNodes: !24) +!42 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !41, type: !21, flags: DIFlagArtificial) +!43 = !DILocation(line: 0, scope: !41) +!44 = !DILocation(line: 12, column: 1, scope: !41) +!45 = distinct !DISubprogram(name: "g", scope: !12, file: !12, line: 3, type: !46, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !11, retainedNodes: !24) +!46 = !DISubroutineType(types: !47) +!47 = !{null} +!48 = !DILocalVariable(name: "i", scope: !45, file: !12, line: 4, type: !30) +!49 = !DILocation(line: 4, column: 7, scope: !45) +!50 = !DILocalVariable(name: "a", scope: !45, file: !12, line: 5, type: !33) +!51 = !DILocation(line: 5, column: 7, scope: !45) +!52 = !DILocation(line: 6, column: 3, scope: !45) +!53 = !DILocation(line: 7, column: 3, scope: !45) +!54 = !DILocation(line: 8, column: 1, scope: !45) +!55 = !{!56, !59, i64 2} +!56 = !{!"_ZTS26ConfigurationEnvironmentTy", !57, i64 0, !57, i64 1, !59, i64 2, !60, i64 4, !60, i64 8, !60, i64 12, !60, i64 16, !60, i64 20, !60, i64 24} +!57 = !{!"omnipotent char", !58, i64 0} +!58 = !{!"Simple C++ TBAA"} +!59 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !57, i64 0} +!60 = !{!"int", !57, i64 0} +!61 = !{!56, !57, i64 0} +!62 = !{!60, !60, i64 0} +!63 = !{!57, !57, i64 0} +!64 = !{!65, !60, i64 16} +!65 = !{!"_ZTSN4ompx5state11TeamStateTyE", !66, i64 0, !60, i64 28, !60, i64 32, !67, i64 40} +!66 = !{!"_ZTSN4ompx5state10ICVStateTyE", !60, i64 0, !60, i64 4, !60, i64 8, !60, i64 12, !60, i64 16, !60, i64 20, !60, i64 24} +!67 = !{!"p1 void", !68, i64 0} +!68 = !{!"any pointer", !57, i64 0} +!69 = !{!65, !60, i64 20} +!70 = !{!65, !60, i64 24} +!71 = !{!65, !60, i64 28} +!72 = !{!65, !60, i64 32} +!73 = !{!65, !67, i64 40} +!74 = !{!75, !75, i64 0} +!75 = !{!"p2 _ZTSN4ompx5state13ThreadStateTyE", !68, i64 0} +!76 = !{!77, !77, i64 0} +!77 = !{!"p1 _ZTS19KernelEnvironmentTy", !68, i64 0} +!78 = !{!79, !79, i64 0} +!79 = !{!"p1 _ZTS25KernelLaunchEnvironmentTy", !68, i64 0} +!80 = !{!81, !81, i64 
0} +!81 = !{!"p2 _ZTS22DynamicScheduleTracker", !68, i64 0} +!82 = !{i32 1, i32 1025} +!83 = !{!84, !60, i64 0} +!84 = !{!"_ZTS19DeviceEnvironmentTy", !60, i64 0, !60, i64 4, !60, i64 8, !60, i64 12, !85, i64 16, !85, i64 24, !85, i64 32, !85, i64 40} +!85 = !{!"long", !57, i64 0} +!86 = !{!66, !60, i64 0} +!87 = !{!66, !60, i64 4} +!88 = !{!66, !60, i64 8} +!89 = !{!66, !60, i64 16} +!90 = !{!66, !60, i64 20} +!91 = !{!66, !60, i64 24} +!92 = !{i32 0, i32 1024} +!93 = !{!67, !67, i64 0} +!94 = distinct !{!94, !95} +!95 = !{!"llvm.loop.mustprogress"} +!96 = !{!97, !97, i64 0} +!97 = !{!"p1 _ZTSN4ompx5state13ThreadStateTyE", !68, i64 0} +!98 = !{!"branch_weights", !"expected", i32 2000, i32 1} +!99 = !{!100, !97, i64 32} +!100 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !66, i64 0, !97, i64 32} +!101 = !{!102, !102, i64 0} +!102 = !{!"p1 omnipotent char", !68, i64 0} +!103 = !{!104, !57, i64 0} +!104 = !{!"_ZTS19KernelEnvironmentTy", !56, i64 0, !105, i64 32, !106, i64 40} +!105 = !{!"p1 _ZTS7IdentTy", !68, i64 0} +!106 = !{!"p1 _ZTS20DynamicEnvironmentTy", !68, i64 0}