From 530eb982b9770190377bb0bd09c5cb715f34d484 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 15 Dec 2023 20:38:38 -0600 Subject: [PATCH 001/114] Add profiling functions to libomptarget --- .../include/llvm/Frontend/OpenMP/OMPKinds.def | 3 +++ openmp/libomptarget/DeviceRTL/CMakeLists.txt | 2 ++ .../DeviceRTL/include/Profiling.h | 21 +++++++++++++++++++ .../libomptarget/DeviceRTL/src/Profiling.cpp | 19 +++++++++++++++++ 4 files changed, 45 insertions(+) create mode 100644 openmp/libomptarget/DeviceRTL/include/Profiling.h create mode 100644 openmp/libomptarget/DeviceRTL/src/Profiling.cpp diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index d22d2a8e948b0..1d887d5cb5812 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -503,6 +503,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,) __OMP_RTL(__kmpc_syncwarp, false, Void, Int64) +__OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr) +__OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64) + __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt index 1ce3e1e40a80a..55ee15d068c67 100644 --- a/openmp/libomptarget/DeviceRTL/CMakeLists.txt +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -89,6 +89,7 @@ set(include_files ${include_directory}/Interface.h ${include_directory}/LibC.h ${include_directory}/Mapping.h + ${include_directory}/Profiling.h ${include_directory}/State.h ${include_directory}/Synchronization.h ${include_directory}/Types.h @@ -104,6 +105,7 @@ set(src_files ${source_directory}/Mapping.cpp ${source_directory}/Misc.cpp ${source_directory}/Parallelism.cpp + ${source_directory}/Profiling.cpp ${source_directory}/Reduction.cpp ${source_directory}/State.cpp 
${source_directory}/Synchronization.cpp diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h new file mode 100644 index 0000000000000..68c7744cd6075 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h @@ -0,0 +1,21 @@ +//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_PROFILING_H +#define OMPTARGET_DEVICERTL_PROFILING_H + +extern "C" { + +void __llvm_profile_register_function(void *ptr); +void __llvm_profile_register_names_function(void *ptr, long int i); +} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/src/Profiling.cpp b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp new file mode 100644 index 0000000000000..799477f5e47d2 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Profiling.cpp @@ -0,0 +1,19 @@ +//===------- Profiling.cpp ---------------------------------------- C++ ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Profiling.h" + +#pragma omp begin declare target device_type(nohost) + +extern "C" { + +void __llvm_profile_register_function(void *ptr) {} +void __llvm_profile_register_names_function(void *ptr, long int i) {} +} + +#pragma omp end declare target From fb067d4ffe604fd68cf90b705db1942bce49dbb1 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Sat, 16 Dec 2023 01:18:41 -0600 Subject: [PATCH 002/114] Fix PGO instrumentation for GPU targets --- clang/lib/CodeGen/CodeGenPGO.cpp | 10 ++++++++-- .../lib/Transforms/Instrumentation/InstrProfiling.cpp | 11 ++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 81bf8ea696b16..edae6885b528a 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -959,8 +959,14 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, unsigned Counter = (*RegionCounterMap)[S]; - llvm::Value *Args[] = {FuncNameVar, - Builder.getInt64(FunctionHash), + // Make sure that pointer to global is passed in with zero addrspace + // This is relevant during GPU profiling + auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext()); + auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty); + auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, I8PtrTy); + + llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV}; if (!StepV) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index fe5a0578bd972..d2cb8155c1796 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1658,10 +1658,13 @@ 
void InstrLowerer::emitRegistration() { IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", RegisterF)); for (Value *Data : CompilerUsedVars) if (!isa(Data)) - IRB.CreateCall(RuntimeRegisterF, Data); + // Check for addrspace cast when profiling GPU + IRB.CreateCall(RuntimeRegisterF, + IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy)); for (Value *Data : UsedVars) if (Data != NamesVar && !isa(Data)) - IRB.CreateCall(RuntimeRegisterF, Data); + IRB.CreateCall(RuntimeRegisterF, + IRB.CreatePointerBitCastOrAddrSpaceCast(Data, VoidPtrTy)); if (NamesVar) { Type *ParamTypes[] = {VoidPtrTy, Int64Ty}; @@ -1670,7 +1673,9 @@ void InstrLowerer::emitRegistration() { auto *NamesRegisterF = Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage, getInstrProfNamesRegFuncName(), M); - IRB.CreateCall(NamesRegisterF, {NamesVar, IRB.getInt64(NamesSize)}); + IRB.CreateCall(NamesRegisterF, {IRB.CreatePointerBitCastOrAddrSpaceCast( + NamesVar, VoidPtrTy), + IRB.getInt64(NamesSize)}); } IRB.CreateRetVoid(); From 7a0e0efa178cc4de6a22a8f5cc3f53cd1c81ea3a Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 21 Dec 2023 00:25:46 -0600 Subject: [PATCH 003/114] Change global visibility on GPU targets --- llvm/include/llvm/ProfileData/InstrProf.h | 4 ++++ llvm/lib/ProfileData/InstrProf.cpp | 17 +++++++++++++++-- .../Instrumentation/InstrProfiling.cpp | 15 +++++++++++---- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 288dc71d756ae..bf9899d867e3d 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -171,6 +171,10 @@ inline StringRef getInstrProfCounterBiasVarName() { /// Return the marker used to separate PGO names during serialization. 
inline StringRef getInstrProfNameSeparator() { return "\01"; } +/// Determines whether module targets a GPU eligable for PGO +/// instrumentation +bool isGPUProfTarget(const Module &M); + /// Return the modified name for function \c F suitable to be /// used the key for profile lookup. Variable \c InLTO indicates if this /// is called in LTO optimization passes. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 649d814cfd9de..0d6717aeb0142 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -410,13 +410,22 @@ std::string getPGOFuncNameVarName(StringRef FuncName, return VarName; } +bool isGPUProfTarget(const Module &M) { + const auto &triple = M.getTargetTriple(); + return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 || + triple.rfind("r600", 0) == 0; +} + GlobalVariable *createPGOFuncNameVar(Module &M, GlobalValue::LinkageTypes Linkage, StringRef PGOFuncName) { + // Ensure profiling variables on GPU are visible to be read from host + if (isGPUProfTarget(M)) + Linkage = GlobalValue::ExternalLinkage; // We generally want to match the function's linkage, but available_externally // and extern_weak both have the wrong semantics, and anything that doesn't // need to link across compilation units doesn't need to be visible at all. 
- if (Linkage == GlobalValue::ExternalWeakLinkage) + else if (Linkage == GlobalValue::ExternalWeakLinkage) Linkage = GlobalValue::LinkOnceAnyLinkage; else if (Linkage == GlobalValue::AvailableExternallyLinkage) Linkage = GlobalValue::LinkOnceODRLinkage; @@ -430,8 +439,12 @@ GlobalVariable *createPGOFuncNameVar(Module &M, new GlobalVariable(M, Value->getType(), true, Linkage, Value, getPGOFuncNameVarName(PGOFuncName, Linkage)); + // If the target is a GPU, make the symbol protected so it can + // be read from the host device + if (isGPUProfTarget(M)) + FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); // Hide the symbol so that we correctly get a copy for each executable. - if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); return FuncNameVar; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index d2cb8155c1796..3b582b6519080 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1481,6 +1481,10 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); + if (isGPUProfTarget(M)) { + Linkage = GlobalValue::ExternalLinkage; + Visibility = GlobalValue::ProtectedVisibility; + } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. 
This @@ -1492,9 +1496,9 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. - if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && - (TT.isOSBinFormatELF() || - (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { + else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + (TT.isOSBinFormatELF() || + (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } @@ -1696,7 +1700,10 @@ bool InstrLowerer::emitRuntimeHook() { auto *Var = new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); - Var->setVisibility(GlobalValue::HiddenVisibility); + if (isGPUProfTarget(M)) + Var->setVisibility(GlobalValue::ProtectedVisibility); + else + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. 
From fddc07908ed9aa698fe3250ddbfc5621ab4d049d Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 22 Dec 2023 23:43:29 -0600 Subject: [PATCH 004/114] Make names global public on GPU --- llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 3b582b6519080..61fba7be3ee0e 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1621,6 +1621,13 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); + + // Make names variable public if current target is a GPU + if (isGPUProfTarget(M)) { + NamesVar->setLinkage(GlobalValue::ExternalLinkage); + NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility); + } + NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); NamesVar->setSection( From e9db03c70bf79f4f4ddad4b48a5aa63a37e0d4f6 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 29 Dec 2023 12:54:50 -0600 Subject: [PATCH 005/114] Read and print GPU device PGO globals --- .../common/include/GlobalHandler.h | 27 ++++++ .../common/src/GlobalHandler.cpp | 82 +++++++++++++++++++ .../common/src/PluginInterface.cpp | 14 ++++ 3 files changed, 123 insertions(+) diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h index fa079ac9660ee..a82cd53648765 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h @@ -14,9 +14,11 @@ #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H #include +#include #include "llvm/ADT/DenseMap.h" #include 
"llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/InstrProf.h" #include "Shared/Debug.h" #include "Shared/Utils.h" @@ -58,6 +60,22 @@ class GlobalTy { void setPtr(void *P) { Ptr = P; } }; +typedef void *IntPtrT; +struct __llvm_profile_data { +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + +/// PGO profiling data extracted from a GPU device +struct GPUProfGlobals { + std::string names; + std::vector> counts; + std::vector<__llvm_profile_data> data; + Triple targetTriple; + + void dump() const; +}; + /// Subclass of GlobalTy that holds the memory for a global of \p Ty. template class StaticGlobalTy : public GlobalTy { Ty Data; @@ -172,6 +190,15 @@ class GenericGlobalHandlerTy { return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /* D2H */ false); } + + /// Checks whether a given image contains profiling globals. + bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image); + + /// Reads profiling data from a GPU image to supplied profdata struct. + /// Iterates through the image symbol table and stores global values + /// with profiling prefixes. 
+ Expected readProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image); }; } // namespace plugin diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index 3a272e228c7df..5dd5daec468ca 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -176,3 +176,85 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, return Plugin::success(); } + +bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image) { + GlobalTy global(getInstrProfNamesVarName().str(), 0); + if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) { + consumeError(std::move(Err)); + return false; + } + return true; +} + +Expected +GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, + DeviceImageTy &Image) { + GPUProfGlobals profdata; + const auto *elf = getOrCreateELFObjectFile(Device, Image); + profdata.targetTriple = elf->makeTriple(); + // Iterate through + for (auto &sym : elf->symbols()) { + if (auto name = sym.getName()) { + // Check if given current global is a profiling global based + // on name + if (name->equals(getInstrProfNamesVarName())) { + // Read in profiled function names + std::vector chars(sym.getSize() / sizeof(char), ' '); + GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data()); + if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal)) + return Err; + std::string names(chars.begin(), chars.end()); + profdata.names = std::move(names); + } else if (name->starts_with(getInstrProfCountersVarPrefix())) { + // Read global variable profiling counts + std::vector counts(sym.getSize() / sizeof(int64_t), 0); + GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data()); + if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) + return Err; + 
profdata.counts.push_back(std::move(counts)); + } else if (name->starts_with(getInstrProfDataVarPrefix())) { + // Read profiling data for this global variable + __llvm_profile_data data{}; + GlobalTy DataGlobal(name->str(), sym.getSize(), &data); + if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) + return Err; + profdata.data.push_back(std::move(data)); + } + } + } + return profdata; +} + +void GPUProfGlobals::dump() const { + llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str() + << "\n"; + + llvm::outs() << "======== Counters =========\n"; + for (const auto &count : counts) { + llvm::outs() << "["; + for (size_t i = 0; i < count.size(); i++) { + if (i == 0) + llvm::outs() << " "; + llvm::outs() << count[i] << " "; + } + llvm::outs() << "]\n"; + } + + llvm::outs() << "========== Data ===========\n"; + for (const auto &d : data) { + llvm::outs() << "{ "; +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ + llvm::outs() << d.Name << " "; +#include "llvm/ProfileData/InstrProfData.inc" + llvm::outs() << " }\n"; + } + + llvm::outs() << "======== Functions ========\n"; + InstrProfSymtab symtab; + if (Error Err = symtab.create(StringRef(names))) { + consumeError(std::move(Err)); + } + symtab.dumpNames(llvm::outs()); + llvm::outs() << "===========================\n"; +} diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 3c7d1ca899878..84ed90f03f84f 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -811,6 +811,20 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { DeviceMemoryPoolTracking.AllocationMax); } + for (auto *Image : LoadedImages) { + GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); + if (!Handler.hasProfilingGlobals(*this, *Image)) + continue; + + GPUProfGlobals profdata; + auto 
ProfOrErr = Handler.readProfilingGlobals(*this, *Image); + if (!ProfOrErr) + return ProfOrErr.takeError(); + + // TODO: write data to profiling file + ProfOrErr->dump(); + } + // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. if (MemoryManager) From e4687605d1a6ca932312025826db09dba84845a3 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 3 Jan 2024 17:06:15 -0600 Subject: [PATCH 006/114] Fix rebase bug --- .../plugins-nextgen/common/src/GlobalHandler.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index cb71b61f4a9c4..86742d0f77a2f 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -178,10 +178,12 @@ Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { GPUProfGlobals profdata; - const auto *elf = getOrCreateELFObjectFile(Device, Image); - profdata.targetTriple = elf->makeTriple(); - // Iterate through - for (auto &sym : elf->symbols()) { + auto ELFObj = getELFObjectFile(Image); + if (!ELFObj) + return ELFObj.takeError(); + profdata.targetTriple = ELFObj->makeTriple(); + // Iterate through elf symbols + for (auto &sym : ELFObj->symbols()) { if (auto name = sym.getName()) { // Check if given current global is a profiling global based // on name From ec18ce94c227e1d43927955fa1c67360ecfcfca6 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 3 Jan 2024 17:10:19 -0600 Subject: [PATCH 007/114] Refactor portions to be more idiomatic --- clang/lib/CodeGen/CodeGenPGO.cpp | 4 +--- llvm/lib/ProfileData/InstrProf.cpp | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp 
b/clang/lib/CodeGen/CodeGenPGO.cpp index edae6885b528a..7bfcec43ee4c9 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -961,10 +961,8 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *I8Ty = llvm::Type::getInt8Ty(CGM.getLLVMContext()); - auto *I8PtrTy = llvm::PointerType::getUnqual(I8Ty); auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - FuncNameVar, I8PtrTy); + FuncNameVar, llvm::PointerType::getUnqual(CGM.getLLVMContext())); llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), Builder.getInt32(NumRegionCounters), diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index cdcd6840bb510..1d88da16a5ff9 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -429,9 +429,8 @@ std::string getPGOFuncNameVarName(StringRef FuncName, } bool isGPUProfTarget(const Module &M) { - const auto &triple = M.getTargetTriple(); - return triple.rfind("nvptx", 0) == 0 || triple.rfind("amdgcn", 0) == 0 || - triple.rfind("r600", 0) == 0; + const auto &Triple = llvm::Triple(M.getTargetTriple()); + return Triple.isAMDGPU() || Triple.isNVPTX(); } GlobalVariable *createPGOFuncNameVar(Module &M, From 0872556f597056361b0a2c23cdd0be3d9745aef3 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 3 Jan 2024 17:18:47 -0600 Subject: [PATCH 008/114] Reformat DeviceRTL prof functions --- openmp/libomptarget/DeviceRTL/include/Profiling.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/openmp/libomptarget/DeviceRTL/include/Profiling.h b/openmp/libomptarget/DeviceRTL/include/Profiling.h index 68c7744cd6075..9efc1554c176b 100644 --- a/openmp/libomptarget/DeviceRTL/include/Profiling.h +++ b/openmp/libomptarget/DeviceRTL/include/Profiling.h @@ -13,9 +13,8 @@ #define 
OMPTARGET_DEVICERTL_PROFILING_H extern "C" { - -void __llvm_profile_register_function(void *ptr); -void __llvm_profile_register_names_function(void *ptr, long int i); +void __llvm_profile_register_function(void *Ptr); +void __llvm_profile_register_names_function(void *Ptr, long int I); } #endif From 62f31d1c71b5d100f38d6dc584cc138b3904581b Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 9 Jan 2024 11:52:29 -0600 Subject: [PATCH 009/114] Style changes + catch name error --- .../common/include/GlobalHandler.h | 16 ++-- .../common/src/GlobalHandler.cpp | 87 ++++++++++--------- 2 files changed, 56 insertions(+), 47 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h index a803b3f76d8b2..755bb23a414e3 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h @@ -13,8 +13,7 @@ #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H -#include -#include +#include #include "llvm/ADT/DenseMap.h" #include "llvm/Object/ELFObjectFile.h" @@ -60,18 +59,19 @@ class GlobalTy { void setPtr(void *P) { Ptr = P; } }; -typedef void *IntPtrT; +using IntPtrT = void *; struct __llvm_profile_data { -#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) Type Name; +#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ + std::remove_const::type Name; #include "llvm/ProfileData/InstrProfData.inc" }; /// PGO profiling data extracted from a GPU device struct GPUProfGlobals { - std::string names; - std::vector> counts; - std::vector<__llvm_profile_data> data; - Triple targetTriple; + SmallVector NamesData; + SmallVector> Counts; + SmallVector<__llvm_profile_data> Data; + Triple TargetTriple; void dump() const; }; diff --git 
a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index 86742d0f77a2f..7cb672e7b2683 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/Error.h" #include +#include using namespace llvm; using namespace omp; @@ -177,73 +178,81 @@ bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device, Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { - GPUProfGlobals profdata; + GPUProfGlobals DeviceProfileData; auto ELFObj = getELFObjectFile(Image); if (!ELFObj) return ELFObj.takeError(); - profdata.targetTriple = ELFObj->makeTriple(); + DeviceProfileData.TargetTriple = ELFObj->makeTriple(); + // Iterate through elf symbols - for (auto &sym : ELFObj->symbols()) { - if (auto name = sym.getName()) { - // Check if given current global is a profiling global based - // on name - if (name->equals(getInstrProfNamesVarName())) { - // Read in profiled function names - std::vector chars(sym.getSize() / sizeof(char), ' '); - GlobalTy NamesGlobal(name->str(), sym.getSize(), chars.data()); - if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal)) - return Err; - std::string names(chars.begin(), chars.end()); - profdata.names = std::move(names); - } else if (name->starts_with(getInstrProfCountersVarPrefix())) { - // Read global variable profiling counts - std::vector counts(sym.getSize() / sizeof(int64_t), 0); - GlobalTy CountGlobal(name->str(), sym.getSize(), counts.data()); - if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) - return Err; - profdata.counts.push_back(std::move(counts)); - } else if (name->starts_with(getInstrProfDataVarPrefix())) { - // Read profiling data for this global variable - __llvm_profile_data data{}; - GlobalTy DataGlobal(name->str(), sym.getSize(), 
&data); - if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) - return Err; - profdata.data.push_back(std::move(data)); - } + for (auto &Sym : ELFObj->symbols()) { + auto NameOrErr = Sym.getName(); + if (!NameOrErr) + return ELFObj.takeError(); + + // Check if given current global is a profiling global based + // on name + if (NameOrErr->equals(getInstrProfNamesVarName())) { + // Read in profiled function names + DeviceProfileData.NamesData = SmallVector(Sym.getSize(), 0); + GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(), + DeviceProfileData.NamesData.data()); + if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal)) + return Err; + } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) { + // Read global variable profiling counts + SmallVector Counts(Sym.getSize() / sizeof(int64_t), 0); + GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data()); + if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) + return Err; + DeviceProfileData.Counts.push_back(std::move(Counts)); + } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) { + // Read profiling data for this global variable + __llvm_profile_data Data{}; + GlobalTy DataGlobal(NameOrErr->str(), Sym.getSize(), &Data); + if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) + return Err; + DeviceProfileData.Data.push_back(std::move(Data)); } } - return profdata; + return DeviceProfileData; } void GPUProfGlobals::dump() const { - llvm::outs() << "======= GPU Profile =======\nTarget: " << targetTriple.str() + llvm::outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() << "\n"; llvm::outs() << "======== Counters =========\n"; - for (const auto &count : counts) { + for (const auto &Count : Counts) { llvm::outs() << "["; - for (size_t i = 0; i < count.size(); i++) { + for (size_t i = 0; i < Count.size(); i++) { if (i == 0) llvm::outs() << " "; - llvm::outs() << count[i] << " "; + llvm::outs() << Count[i] << " "; } 
llvm::outs() << "]\n"; } llvm::outs() << "========== Data ===========\n"; - for (const auto &d : data) { + for (const auto &ProfData : Data) { llvm::outs() << "{ "; #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ - llvm::outs() << d.Name << " "; + llvm::outs() << ProfData.Name << " "; #include "llvm/ProfileData/InstrProfData.inc" llvm::outs() << " }\n"; } llvm::outs() << "======== Functions ========\n"; - InstrProfSymtab symtab; - if (Error Err = symtab.create(StringRef(names))) { + std::string s; + s.reserve(NamesData.size()); + for (uint8_t Name : NamesData) { + s.push_back((char)Name); + } + + InstrProfSymtab Symtab; + if (Error Err = Symtab.create(StringRef(s))) { consumeError(std::move(Err)); } - symtab.dumpNames(llvm::outs()); + Symtab.dumpNames(llvm::outs()); llvm::outs() << "===========================\n"; } From 0c4bbeb54d189c1461affd37853aa86c3e3ca7d8 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 17 Jan 2024 19:59:06 -0600 Subject: [PATCH 010/114] Add GPU PGO test --- .../common/src/GlobalHandler.cpp | 2 +- openmp/libomptarget/test/CMakeLists.txt | 6 +++ openmp/libomptarget/test/lit.cfg | 3 ++ openmp/libomptarget/test/lit.site.cfg.in | 2 +- openmp/libomptarget/test/offloading/pgo1.c | 39 +++++++++++++++++++ 5 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 openmp/libomptarget/test/offloading/pgo1.c diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index 7cb672e7b2683..e5eb653d02228 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -239,7 +239,7 @@ void GPUProfGlobals::dump() const { #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ llvm::outs() << ProfData.Name << " "; #include "llvm/ProfileData/InstrProfData.inc" - llvm::outs() << " }\n"; + llvm::outs() << "}\n"; } llvm::outs() << "======== Functions 
========\n"; diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt index a0ba233eaa572..21233f3e252eb 100644 --- a/openmp/libomptarget/test/CMakeLists.txt +++ b/openmp/libomptarget/test/CMakeLists.txt @@ -12,6 +12,12 @@ else() set(LIBOMPTARGET_DEBUG False) endif() +if (OPENMP_STANDALONE_BUILD) + set(LIBOMPTARGET_TEST_GPU_PGO False) +else() + set(LIBOMPTARGET_TEST_GPU_PGO True) +endif() + # Replace the space from user's input with ";" in case that CMake add escape # char into the lit command. string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}") diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 19c5e5c457222..49743f9fed7f2 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -104,6 +104,9 @@ config.available_features.add(config.libomptarget_current_target) if config.libomptarget_has_libc: config.available_features.add('libc') +if config.libomptarget_test_pgo: + config.available_features.add('pgo') + # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher. # For all other targets, we currently assume it is. diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in index 2d63811883872..494d1636af304 100644 --- a/openmp/libomptarget/test/lit.site.cfg.in +++ b/openmp/libomptarget/test/lit.site.cfg.in @@ -25,6 +25,6 @@ config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@" config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@ config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@ - +config.libomptarget_test_pgo = @LIBOMPTARGET_TEST_GPU_PGO@ # Let the main config do the real work. 
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c new file mode 100644 index 0000000000000..ca8a6f502a06a --- /dev/null +++ b/openmp/libomptarget/test/offloading/pgo1.c @@ -0,0 +1,39 @@ +// RUN: %libomptarget-compile-generic -fprofile-instr-generate -Xclang "-fprofile-instrument=clang" +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic + +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// REQUIRES: pgo + +#ifdef _OPENMP +#include <omp.h> +#endif + +int test1(int a) { return a / 2; } +int test2(int a) { return a * 2; } + +int main() { + int m = 2; +#pragma omp target + for (int i = 0; i < 10; i++) { + m = test1(m); + for (int j = 0; j < 2; j++) { + m = test2(m); + } + } +} + +// CHECK: ======== Counters ========= +// CHECK-NEXT: [ 0 11 20 ] +// CHECK-NEXT: [ 10 ] +// CHECK-NEXT: [ 20 ] +// CHECK-NEXT: ========== Data =========== +// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CHECK-NEXT: ======== Functions ======== +// CHECK-NEXT: pgo1.c:__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} +// CHECK-NEXT: test1 +// CHECK-NEXT: test2 From c7ae2a74daa93b05058fcc9bba64e0734359362c Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 17 Jan 2024 23:12:27 -0600 Subject: [PATCH 011/114] Fix PGO test formatting --- openmp/libomptarget/test/offloading/pgo1.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4
deletions(-) diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c index ca8a6f502a06a..389be19b670d7 100644 --- a/openmp/libomptarget/test/offloading/pgo1.c +++ b/openmp/libomptarget/test/offloading/pgo1.c @@ -1,4 +1,5 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate -Xclang "-fprofile-instrument=clang" +// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ +// RUN: -Xclang "-fprofile-instrument=clang" // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic // UNSUPPORTED: x86_64-pc-linux-gnu @@ -30,9 +31,18 @@ int main() { // CHECK-NEXT: [ 10 ] // CHECK-NEXT: [ 20 ] // CHECK-NEXT: ========== Data =========== -// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } // CHECK-NEXT: ======== Functions ======== // CHECK-NEXT: pgo1.c:__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} // CHECK-NEXT: test1 From 8bb22072914bbb830e2788d117aedd0e0bab66ff Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 
18 Jan 2024 23:15:55 -0600 Subject: [PATCH 012/114] Refactor visibility logic --- llvm/lib/ProfileData/InstrProf.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 511571a3eed9b..708ea63fd95e0 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -422,6 +422,16 @@ bool isGPUProfTarget(const Module &M) { return Triple.isAMDGPU() || Triple.isNVPTX(); } +void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) { + // If the target is a GPU, make the symbol protected so it can + // be read from the host device + if (isGPUProfTarget(M)) + FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); + // Hide the symbol so that we correctly get a copy for each executable. + else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); +} + GlobalVariable *createPGOFuncNameVar(Module &M, GlobalValue::LinkageTypes Linkage, StringRef PGOFuncName) { @@ -445,14 +455,7 @@ GlobalVariable *createPGOFuncNameVar(Module &M, new GlobalVariable(M, Value->getType(), true, Linkage, Value, getPGOFuncNameVarName(PGOFuncName, Linkage)); - // If the target is a GPU, make the symbol protected so it can - // be read from the host device - if (isGPUProfTarget(M)) - FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); - // Hide the symbol so that we correctly get a copy for each executable. - else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) - FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); - + setPGOFuncVisibility(M, FuncNameVar); return FuncNameVar; } From 9f13943f64cb16162e44902d54de53a9b1229179 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 23 Jan 2024 18:33:58 -0600 Subject: [PATCH 013/114] Add LLVM instrumentation support This PR formerly only supported -fprofile-instrument=clang. 
This commit adds support for -fprofile-instrument=llvm --- .../Instrumentation/PGOInstrumentation.cpp | 12 +++- openmp/libomptarget/test/offloading/pgo1.c | 72 +++++++++++++------ 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c20fc942eaf0d..bbc8da78fd7ba 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -862,6 +862,10 @@ static void instrumentOneFunc( auto Name = FuncInfo.FuncNameVar; auto CFGHash = ConstantInt::get(Type::getInt64Ty(M->getContext()), FuncInfo.FunctionHash); + // Make sure that pointer to global is passed in with zero addrspace + // This is relevant during GPU profiling + auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + Name, llvm::PointerType::getUnqual(M->getContext())); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); @@ -869,7 +873,7 @@ static void instrumentOneFunc( // i32 <index>) Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_cover), - {Name, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); + {NormalizedPtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -887,7 +891,8 @@ static void instrumentOneFunc( // i32 <index>) Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_timestamp), - {Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); + {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters), + Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; } @@ -901,7 +906,8 @@ static void instrumentOneFunc( Intrinsic::getDeclaration(M, PGOBlockCoverage ?
Intrinsic::instrprof_cover : Intrinsic::instrprof_increment), - {Name, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)}); + {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); } // Now instrument select instructions: diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c index 389be19b670d7..d95793b508dcf 100644 --- a/openmp/libomptarget/test/offloading/pgo1.c +++ b/openmp/libomptarget/test/offloading/pgo1.c @@ -1,6 +1,11 @@ // RUN: %libomptarget-compile-generic -fprofile-instr-generate \ // RUN: -Xclang "-fprofile-instrument=clang" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: --check-prefix="CLANG-PGO" +// RUN: %libomptarget-compile-generic -fprofile-generate \ +// RUN: -Xclang "-fprofile-instrument=llvm" +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: --check-prefix="LLVM-PGO" // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO @@ -26,24 +31,47 @@ int main() { } } -// CHECK: ======== Counters ========= -// CHECK-NEXT: [ 0 11 20 ] -// CHECK-NEXT: [ 10 ] -// CHECK-NEXT: [ 20 ] -// CHECK-NEXT: ========== Data =========== -// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CHECK-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CHECK-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CHECK-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CHECK-NEXT: ======== Functions ======== -// CHECK-NEXT: pgo1.c:__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// CHECK-NEXT: test1 
-// CHECK-NEXT: test2 +// CLANG-PGO: ======== Counters ========= +// CLANG-PGO-NEXT: [ 0 11 20 ] +// CLANG-PGO-NEXT: [ 10 ] +// CLANG-PGO-NEXT: [ 20 ] +// CLANG-PGO-NEXT: ========== Data =========== +// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// CLANG-PGO-NEXT: ======== Functions ======== +// CLANG-PGO-NEXT: pgo1.c: +// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} +// CLANG-PGO-NEXT: test1 +// CLANG-PGO-NEXT: test2 + +// LLVM-PGO: ======== Counters ========= +// LLVM-PGO-NEXT: [ 20 ] +// LLVM-PGO-NEXT: [ 10 ] +// LLVM-PGO-NEXT: [ 20 10 1 1 ] +// LLVM-PGO-NEXT: ========== Data =========== +// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} +// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } +// LLVM-PGO-NEXT: ======== Functions ======== +// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} +// 
LLVM-PGO-NEXT: test1 +// LLVM-PGO-NEXT: test2 From 0606f0dd1b32ef9ebe138bbc964b3921e22d95d1 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 14 Feb 2024 01:46:55 -0600 Subject: [PATCH 014/114] Use explicit addrspace instead of unqual --- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index e084dda879cbc..4c75a01222d30 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1103,7 +1103,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - FuncNameVar, llvm::PointerType::getUnqual(CGM.getLLVMContext())); + FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), Builder.getInt32(NumRegionCounters), diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index bbc8da78fd7ba..c63b3e4ecf786 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -865,7 +865,7 @@ static void instrumentOneFunc( // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - Name, llvm::PointerType::getUnqual(M->getContext())); + Name, llvm::PointerType::get(M->getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); From c1f9be321678766525141214aaab74636cafbc2c Mon Sep 17 00:00:00 2001 From: Ethan Luis 
McDonough Date: Thu, 15 Feb 2024 19:10:09 -0600 Subject: [PATCH 015/114] Remove redundant namespaces --- .../Instrumentation/PGOInstrumentation.cpp | 4 +-- .../common/src/GlobalHandler.cpp | 26 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c63b3e4ecf786..3058e577738fd 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -864,8 +864,8 @@ static void instrumentOneFunc( FuncInfo.FunctionHash); // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - Name, llvm::PointerType::get(M->getContext(), 0)); + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + Name, PointerType::get(M->getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index e5eb653d02228..ae270c60804d2 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -219,30 +219,30 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, } void GPUProfGlobals::dump() const { - llvm::outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() + outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() << "\n"; - llvm::outs() << "======== Counters =========\n"; + outs() << "======== Counters =========\n"; for (const auto &Count : Counts) { - llvm::outs() << "["; + outs() << "["; for (size_t i = 0; i < Count.size(); i++) { if (i == 0) - llvm::outs() 
<< " "; - llvm::outs() << Count[i] << " "; + outs() << " "; + outs() << Count[i] << " "; } - llvm::outs() << "]\n"; + outs() << "]\n"; } - llvm::outs() << "========== Data ===========\n"; + outs() << "========== Data ===========\n"; for (const auto &ProfData : Data) { - llvm::outs() << "{ "; + outs() << "{ "; #define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ - llvm::outs() << ProfData.Name << " "; + outs() << ProfData.Name << " "; #include "llvm/ProfileData/InstrProfData.inc" - llvm::outs() << "}\n"; + outs() << "}\n"; } - llvm::outs() << "======== Functions ========\n"; + outs() << "======== Functions ========\n"; std::string s; s.reserve(NamesData.size()); for (uint8_t Name : NamesData) { @@ -253,6 +253,6 @@ void GPUProfGlobals::dump() const { if (Error Err = Symtab.create(StringRef(s))) { consumeError(std::move(Err)); } - Symtab.dumpNames(llvm::outs()); - llvm::outs() << "===========================\n"; + Symtab.dumpNames(outs()); + outs() << "===========================\n"; } From 6a3ae407e69e7524f0f808329c534f8352ee1779 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 15 Feb 2024 19:15:15 -0600 Subject: [PATCH 016/114] Clang format --- .../libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index ae270c60804d2..1fce244892262 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -220,7 +220,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, void GPUProfGlobals::dump() const { outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() - << "\n"; + << "\n"; outs() << "======== Counters =========\n"; for (const auto &Count : Counts) { From 6866862d459e3c3fa65fae8ae639ddc3ff735252 Mon Sep 17 00:00:00 2001 
From: Ethan Luis McDonough Date: Fri, 16 Feb 2024 13:13:39 -0600 Subject: [PATCH 017/114] Use getAddrSpaceCast Replace getPointerBitCastOrAddrSpaceCast with getAddrSpaceCast and allow no-op getAddrSpaceCast calls when types are identical --- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- llvm/lib/IR/Constants.cpp | 4 ++++ llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 8f52018445d2b..baceeba8380dd 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1099,7 +1099,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + auto *NormalizedPtr = llvm::ConstantExpr::getAddrSpaceCast( FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a38b912164b13..2d89c5bbd4a4c 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2067,6 +2067,10 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy, Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy, bool OnlyIfReduced) { + // Skip cast if types are identical + if (C->getType() == DstTy) + return C; + assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) && "Invalid constantexpr addrspacecast!"); return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 3058e577738fd..c0be71aa4cc00 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ 
b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -864,7 +864,7 @@ static void instrumentOneFunc( FuncInfo.FunctionHash); // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + auto *NormalizedPtr = ConstantExpr::getAddrSpaceCast( Name, PointerType::get(M->getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); From 62a5ee1c75545571f81d9edd22e19e9ef7cff69f Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 27 Feb 2024 14:53:51 -0600 Subject: [PATCH 018/114] Revert "Use getAddrSpaceCast" This reverts commit 6866862d459e3c3fa65fae8ae639ddc3ff735252. --- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- llvm/lib/IR/Constants.cpp | 4 ---- llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index baceeba8380dd..8f52018445d2b 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1099,7 +1099,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = llvm::ConstantExpr::getAddrSpaceCast( + auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 2d89c5bbd4a4c..a38b912164b13 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2067,10 +2067,6 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy, Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy, bool OnlyIfReduced) { - // Skip cast if types are identical - 
if (C->getType() == DstTy) - return C; - assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) && "Invalid constantexpr addrspacecast!"); return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c0be71aa4cc00..3058e577738fd 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -864,7 +864,7 @@ static void instrumentOneFunc( FuncInfo.FunctionHash); // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = ConstantExpr::getAddrSpaceCast( + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( Name, PointerType::get(M->getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); From 052394fa28c923d130bf73a07b965a9751467302 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 27 Feb 2024 15:34:34 -0600 Subject: [PATCH 019/114] Revert "Use getAddrSpaceCast" This reverts commit 6866862d459e3c3fa65fae8ae639ddc3ff735252. 
--- clang/lib/CodeGen/CodeGenPGO.cpp | 2 +- llvm/lib/IR/Constants.cpp | 4 ---- llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index baceeba8380dd..8f52018445d2b 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1099,7 +1099,7 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = llvm::ConstantExpr::getAddrSpaceCast( + auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 2d89c5bbd4a4c..a38b912164b13 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -2067,10 +2067,6 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy, Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy, bool OnlyIfReduced) { - // Skip cast if types are identical - if (C->getType() == DstTy) - return C; - assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) && "Invalid constantexpr addrspacecast!"); return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index c0be71aa4cc00..3058e577738fd 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -864,7 +864,7 @@ static void instrumentOneFunc( FuncInfo.FunctionHash); // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr 
= ConstantExpr::getAddrSpaceCast( + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( Name, PointerType::get(M->getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); From 612d5a5f6966a77e82e5591f5aea475fbf886e55 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 1 Mar 2024 02:04:00 -0600 Subject: [PATCH 020/114] Write PGO TODO: Fix tests --- compiler-rt/lib/profile/InstrProfiling.h | 11 ++ compiler-rt/lib/profile/InstrProfilingFile.c | 148 +++++++++++++++--- .../common/include/GlobalHandler.h | 14 +- .../common/src/GlobalHandler.cpp | 57 +++++-- .../common/src/PluginInterface.cpp | 6 +- 5 files changed, 200 insertions(+), 36 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 0123908336918..937acbd417de4 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -275,6 +275,17 @@ void __llvm_profile_get_padding_sizes_for_counters( */ void __llvm_profile_set_dumped(); +/*! + * \brief Write custom target-specific profiling data to a separate file. + * Used by libomptarget for GPU PGO. + */ +int __llvm_write_custom_profile(const char *Target, + const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, + const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, + const char *NamesEnd); + /*! * This variable is defined in InstrProfilingRuntime.cpp as a hidden * symbol. Its main purpose is to enable profile runtime user to diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index f3b457d786e6b..4fc401bb9bebf 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -502,27 +502,15 @@ static FILE *getFileObject(const char *OutputName) { return fopen(OutputName, "ab"); } -/* Write profile data to file \c OutputName.
*/ -static int writeFile(const char *OutputName) { - int RetVal; - FILE *OutputFile; - - int MergeDone = 0; +/* Get file object and merge if applicable */ +static FILE *getMergeFileObject(const char *OutputName, int *MergeDone) { VPMergeHook = &lprofMergeValueProfData; if (doMerging()) - OutputFile = openFileForMerging(OutputName, &MergeDone); - else - OutputFile = getFileObject(OutputName); - - if (!OutputFile) - return -1; - - FreeHook = &free; - setupIOBuffer(); - ProfDataWriter fileWriter; - initFileWriter(&fileWriter, OutputFile); - RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone); + return openFileForMerging(OutputName, MergeDone); + return getFileObject(OutputName); +} +static void closeFileObject(FILE *OutputFile) { if (OutputFile == getProfileFile()) { fflush(OutputFile); if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) { @@ -531,7 +519,23 @@ static int writeFile(const char *OutputName) { } else { fclose(OutputFile); } +} + +/* Write profile data to file \c OutputName. */ +static int writeFile(const char *OutputName) { + int RetVal, MergeDone = 0; + FILE *OutputFile = getMergeFileObject(OutputName, &MergeDone); + + if (!OutputFile) + return -1; + + FreeHook = &free; + setupIOBuffer(); + ProfDataWriter fileWriter; + initFileWriter(&fileWriter, OutputFile); + RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone); + closeFileObject(OutputFile); return RetVal; } @@ -558,10 +562,16 @@ static int writeOrderFile(const char *OutputName) { #define LPROF_INIT_ONCE_ENV "__LLVM_PROFILE_RT_INIT_ONCE" +static void forceTruncateFile(const char *Filename) { + FILE *File = fopen(Filename, "w"); + if (!File) + return; + fclose(File); +} + static void truncateCurrentFile(void) { const char *Filename; char *FilenameBuf; - FILE *File; int Length; Length = getCurFilenameLength(); @@ -591,10 +601,7 @@ static void truncateCurrentFile(void) { return; /* Truncate the file. Later we'll reopen and append. 
*/ - File = fopen(Filename, "w"); - if (!File) - return; - fclose(File); + forceTruncateFile(Filename); } /* Write a partial profile to \p Filename, which is required to be backed by @@ -1271,4 +1278,99 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File, return 0; } +int __llvm_write_custom_profile(const char *Target, + const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, + const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, + const char *NamesEnd) { + int ReturnValue = 0, FilenameLength, TargetLength, MergeDone; + char *FilenameBuf, *TargetFilename; + const char *Filename; + + /* Save old profile data */ + FILE *oldFile = getProfileFile(); + + // Temporarily suspend getting SIGKILL when the parent exits. + int PDeathSig = lprofSuspendSigKill(); + + if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) { + PROF_NOTE("Profile data not written to file: %s.\n", "already written"); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return 0; + } + + /* Get current filename */ + FilenameLength = getCurFilenameLength(); + FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1); + Filename = getCurFilename(FilenameBuf, 0); + + /* Check the filename. */ + if (!Filename) { + PROF_ERR("Failed to write file : %s\n", "Filename not set"); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return -1; + } + + /* Allocate new space for our target-specific PGO filename */ + TargetLength = strlen(Target); + TargetFilename = + (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2); + + /* Prepend "TARGET." to current filename */ + memcpy(TargetFilename, Target, TargetLength); + TargetFilename[TargetLength] = '.'; + memcpy(TargetFilename, Target, TargetLength); + memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength); + TargetFilename[FilenameLength + 1 + TargetLength] = 0; + + /* Check if there is llvm/runtime version mismatch. 
*/ + if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) { + PROF_ERR("Runtime and instrumentation version mismatch : " + "expected %d, but get %d\n", + INSTR_PROF_RAW_VERSION, + (int)GET_VERSION(__llvm_profile_get_version())); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return -1; + } + + /* Clean old target file */ + forceTruncateFile(TargetFilename); + + /* Open target-specific PGO file */ + MergeDone = 0; + FILE *OutputFile = getMergeFileObject(TargetFilename, &MergeDone); + + if (!OutputFile) { + PROF_ERR("Failed to open file : %s\n", TargetFilename); + if (PDeathSig == 1) + lprofRestoreSigKill(); + return -1; + } + + FreeHook = &free; + setupIOBuffer(); + ProfDataWriter fileWriter; + initFileWriter(&fileWriter, OutputFile); + + /* Write custom data to the file */ + ReturnValue = lprofWriteDataImpl( + &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, + lprofGetVPDataReader(), NamesBegin, NamesEnd, MergeDone); + + closeFileObject(OutputFile); + + // Restore SIGKILL. 
+ if (PDeathSig == 1) + lprofRestoreSigKill(); + + /* Restore old profiling file */ + setProfileFile(oldFile); + + return ReturnValue; +} + #endif diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h index f5a15ca11bfcd..af0cd4dcdf5dc 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h @@ -63,14 +63,24 @@ struct __llvm_profile_data { #include "llvm/ProfileData/InstrProfData.inc" }; +extern "C" { +extern int __attribute__((weak)) +__llvm_write_custom_profile(const char *Target, + const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, + const char *CountersBegin, const char *CountersEnd, + const char *NamesBegin, const char *NamesEnd); +} + /// PGO profiling data extracted from a GPU device struct GPUProfGlobals { - SmallVector<uint8_t> NamesData; - SmallVector<SmallVector<int64_t>> Counts; + SmallVector<int64_t> Counts; SmallVector<__llvm_profile_data> Data; + SmallVector<uint8_t> NamesData; Triple TargetTriple; void dump() const; + Error write() const; }; /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index 1fce244892262..2f16b6e3c139e 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -205,7 +205,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data()); if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) return Err; - DeviceProfileData.Counts.push_back(std::move(Counts)); + DeviceProfileData.Counts.append(std::move(Counts)); } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) { // Read profiling data for this global variable __llvm_profile_data Data{}; @@ -223,15 +223,14 @@ void GPUProfGlobals::dump() const { << "\n"; outs() << "======== Counters =========\n"; - for (const auto &Count : Counts) { - outs() << "["; - for (size_t i = 0; i < Count.size(); i++) { - if (i == 0) - outs() << " "; - outs() << Count[i] << " "; - } - outs() << "]\n"; + for (size_t i = 0; i < Counts.size(); i++) { + if (i > 0 && i % 10 == 0) + outs() << "\n"; + else if (i != 0) + outs() << " "; + outs() << Counts[i]; } + outs() << "\n"; outs() << "========== Data ===========\n"; for (const auto &ProfData : Data) { @@ -256,3 +255,43 @@ void GPUProfGlobals::dump() const { Symtab.dumpNames(outs()); outs() << "===========================\n"; } + +Error GPUProfGlobals::write() const { + if (!__llvm_write_custom_profile) + return Plugin::error("Could not find symbol __llvm_write_custom_profile. " + "The compiler-rt profiling library must be linked for " + "GPU PGO to work."); + + size_t DataSize = Data.size() * sizeof(__llvm_profile_data), + CountsSize = Counts.size() * sizeof(int64_t); + __llvm_profile_data *DataBegin, *DataEnd; + char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd; + + // Initialize array of contiguous data. 
We need to make sure each section is + // contiguous so that the PGO library can compute deltas properly + SmallVector ContiguousData(NamesData.size() + DataSize + CountsSize); + + // Compute region pointers + DataBegin = (__llvm_profile_data *)(ContiguousData.data() + CountsSize); + DataEnd = + (__llvm_profile_data *)(ContiguousData.data() + CountsSize + DataSize); + CountersBegin = (char *)ContiguousData.data(); + CountersEnd = (char *)(ContiguousData.data() + CountsSize); + NamesBegin = (char *)(ContiguousData.data() + CountsSize + DataSize); + NamesEnd = (char *)(ContiguousData.data() + CountsSize + DataSize + + NamesData.size()); + + // Copy data to contiguous buffer + memcpy(DataBegin, Data.data(), DataSize); + memcpy(CountersBegin, Counts.data(), CountsSize); + memcpy(NamesBegin, NamesData.data(), NamesData.size()); + + // Invoke compiler-rt entrypoint + int result = __llvm_write_custom_profile(TargetTriple.str().c_str(), + DataBegin, DataEnd, CountersBegin, + CountersEnd, NamesBegin, NamesEnd); + if (result != 0) + return Plugin::error("Error writing GPU PGO data to file"); + + return Plugin::success(); +} diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 1ea93795ce8ce..d5e6b6128152d 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -837,8 +837,10 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { if (!ProfOrErr) return ProfOrErr.takeError(); - // TODO: write data to profiling file - ProfOrErr->dump(); + // Write data to profiling file + if (auto Err = ProfOrErr->write()) { + consumeError(std::move(Err)); + } } // Delete the memory manager before deinitializing the device. 
Otherwise, From b8c916305acf08c0bd2d51b81875be5e8fc59ff3 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 13 Mar 2024 20:05:32 -0500 Subject: [PATCH 021/114] Fix tests --- .../plugins-nextgen/common/src/PluginInterface.cpp | 3 +++ openmp/libomptarget/test/offloading/pgo1.c | 8 ++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index d5e6b6128152d..2359ad28a25b0 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -837,6 +837,9 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { if (!ProfOrErr) return ProfOrErr.takeError(); + // Dump out profdata + ProfOrErr->dump(); + // Write data to profiling file if (auto Err = ProfOrErr->write()) { consumeError(std::move(Err)); diff --git a/openmp/libomptarget/test/offloading/pgo1.c b/openmp/libomptarget/test/offloading/pgo1.c index d95793b508dcf..79e93d0f10827 100644 --- a/openmp/libomptarget/test/offloading/pgo1.c +++ b/openmp/libomptarget/test/offloading/pgo1.c @@ -32,9 +32,7 @@ int main() { } // CLANG-PGO: ======== Counters ========= -// CLANG-PGO-NEXT: [ 0 11 20 ] -// CLANG-PGO-NEXT: [ 10 ] -// CLANG-PGO-NEXT: [ 20 ] +// CLANG-PGO-NEXT: 0 11 20 10 20 // CLANG-PGO-NEXT: ========== Data =========== // CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} // CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} @@ -55,9 +53,7 @@ int main() { // CLANG-PGO-NEXT: test2 // LLVM-PGO: ======== Counters ========= -// LLVM-PGO-NEXT: [ 20 ] -// LLVM-PGO-NEXT: [ 10 ] -// LLVM-PGO-NEXT: [ 20 10 1 1 ] +// LLVM-PGO-NEXT: 20 10 20 10 1 1 // LLVM-PGO-NEXT: ========== Data =========== // LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} // LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} From 4568c4244d11010aacf9f1fe20bb1197008b057f Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough 
Date: Wed, 13 Mar 2024 21:40:20 -0500 Subject: [PATCH 022/114] Fix arguments --- compiler-rt/lib/profile/InstrProfilingFile.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 466bfe480543b..6570bc0d74caa 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1360,10 +1360,10 @@ int __llvm_write_custom_profile(const char *Target, initFileWriter(&fileWriter, OutputFile); /* Write custom data to the file */ - ReturnValue = lprofWriteDataImpl( - &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, - lprofGetVPDataReader(), NamesBegin, NamesEnd, MergeDone); - + ReturnValue = lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, + CountersBegin, CountersEnd, NULL, NULL, + lprofGetVPDataReader(), NamesBegin, NamesEnd, + NULL, NULL, NULL, NULL, MergeDone); closeFileObject(OutputFile); // Restore SIGKILL. 
From 1fc4cb9c01f251432f4a6748e69b1d8cf74cc4fb Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 18 Mar 2024 20:05:52 -0500 Subject: [PATCH 023/114] Add GPU prof flags --- clang/include/clang/Driver/Options.td | 6 +++ clang/lib/Driver/ToolChain.cpp | 69 +++++++++++++-------------- clang/lib/Driver/ToolChains/Clang.cpp | 39 +++++++++++++-- 3 files changed, 74 insertions(+), 40 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4b1fcf1db1ad0..aab445906fa34 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1716,6 +1716,9 @@ defm debug_info_for_profiling : BoolFOption<"debug-info-for-profiling", def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">; +def fprofile_instr_generate_gpu : Flag<["-"], "fprofile-instr-generate-gpu">, + Group, Visibility<[ClangOption, CLOption]>, + HelpText<"Generate instrumented GPU device code to collect execution counts into GPU_TARGET.default.profraw (overridden by LLVM_PROFILE_FILE env var)">; def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">, Group, Visibility<[ClangOption, CLOption]>, MetaVarName<"">, HelpText<"Generate instrumented code to collect execution counts into (overridden by LLVM_PROFILE_FILE env var)">; @@ -1744,6 +1747,9 @@ defm mcdc_coverage : BoolFOption<"coverage-mcdc", def fprofile_generate : Flag<["-"], "fprofile-generate">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">; +def fprofile_generate_gpu : Flag<["-"], "fprofile-generate-gpu">, + Group, Visibility<[ClangOption, CLOption]>, + HelpText<"Generate instrumented GPU device code to 
collect execution counts into GPU_TARGET.default.profraw (overridden by LLVM_PROFILE_FILE env var)">; def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">, Group, Visibility<[ClangOption, CLOption]>, MetaVarName<"">, diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 03450fc0f57b9..f4c8aafbbd0e6 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -184,10 +184,9 @@ bool ToolChain::defaultToIEEELongDouble() const { return PPC_LINUX_DEFAULT_IEEELONGDOUBLE && getTriple().isOSLinux(); } -static void getAArch64MultilibFlags(const Driver &D, - const llvm::Triple &Triple, - const llvm::opt::ArgList &Args, - Multilib::flags_list &Result) { +static void getAArch64MultilibFlags(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, + Multilib::flags_list &Result) { std::vector Features; tools::aarch64::getAArch64TargetFeatures(D, Triple, Args, Features, false); const auto UnifiedFeatures = tools::unifyTargetFeatures(Features); @@ -209,10 +208,9 @@ static void getAArch64MultilibFlags(const Driver &D, Result.push_back(llvm::join(MArch, "+")); } -static void getARMMultilibFlags(const Driver &D, - const llvm::Triple &Triple, - const llvm::opt::ArgList &Args, - Multilib::flags_list &Result) { +static void getARMMultilibFlags(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, + Multilib::flags_list &Result) { std::vector Features; llvm::ARM::FPUKind FPUKind = tools::arm::getARMTargetFeatures( D, Triple, Args, Features, false /*ForAs*/, true /*ForMultilib*/); @@ -303,7 +301,7 @@ ToolChain::getSanitizerArgs(const llvm::opt::ArgList &JobArgs) const { return SanArgs; } -const XRayArgs& ToolChain::getXRayArgs() const { +const XRayArgs &ToolChain::getXRayArgs() const { if (!XRayArguments) XRayArguments.reset(new XRayArgs(*this, Args)); return *XRayArguments; @@ -394,8 +392,7 @@ static const DriverSuffix *parseDriverSuffix(StringRef ProgName, size_t &Pos) { return 
DS; } -ParsedClangName -ToolChain::getTargetAndModeFromProgramName(StringRef PN) { +ParsedClangName ToolChain::getTargetAndModeFromProgramName(StringRef PN) { std::string ProgName = normalizeProgramName(PN); size_t SuffixPos; const DriverSuffix *DS = parseDriverSuffix(ProgName, SuffixPos); @@ -406,8 +403,8 @@ ToolChain::getTargetAndModeFromProgramName(StringRef PN) { size_t LastComponent = ProgName.rfind('-', SuffixPos); if (LastComponent == std::string::npos) return ParsedClangName(ProgName.substr(0, SuffixEnd), DS->ModeFlag); - std::string ModeSuffix = ProgName.substr(LastComponent + 1, - SuffixEnd - LastComponent - 1); + std::string ModeSuffix = + ProgName.substr(LastComponent + 1, SuffixEnd - LastComponent - 1); // Infer target from the prefix. StringRef Prefix(ProgName); @@ -465,9 +462,7 @@ Tool *ToolChain::getFlang() const { return Flang.get(); } -Tool *ToolChain::buildAssembler() const { - return new tools::ClangAs(*this); -} +Tool *ToolChain::buildAssembler() const { return new tools::ClangAs(*this); } Tool *ToolChain::buildLinker() const { llvm_unreachable("Linking is not supported by this toolchain"); @@ -826,10 +821,12 @@ bool ToolChain::needsProfileRT(const ArgList &Args) { return false; return Args.hasArg(options::OPT_fprofile_generate) || + Args.hasArg(options::OPT_fprofile_generate_gpu) || Args.hasArg(options::OPT_fprofile_generate_EQ) || Args.hasArg(options::OPT_fcs_profile_generate) || Args.hasArg(options::OPT_fcs_profile_generate_EQ) || Args.hasArg(options::OPT_fprofile_instr_generate) || + Args.hasArg(options::OPT_fprofile_instr_generate_gpu) || Args.hasArg(options::OPT_fprofile_instr_generate_EQ) || Args.hasArg(options::OPT_fcreate_profile) || Args.hasArg(options::OPT_forder_file_instrumentation); @@ -842,8 +839,10 @@ bool ToolChain::needsGCovInstrumentation(const llvm::opt::ArgList &Args) { } Tool *ToolChain::SelectTool(const JobAction &JA) const { - if (D.IsFlangMode() && getDriver().ShouldUseFlangCompiler(JA)) return getFlang(); - if 
(getDriver().ShouldUseClangCompiler(JA)) return getClang(); + if (D.IsFlangMode() && getDriver().ShouldUseFlangCompiler(JA)) + return getFlang(); + if (getDriver().ShouldUseClangCompiler(JA)) + return getClang(); Action::ActionClass AC = JA.getKind(); if (AC == Action::AssembleJobClass && useIntegratedAs() && !getTriple().isOSAIX()) @@ -865,7 +864,7 @@ std::string ToolChain::GetLinkerPath(bool *LinkerIsLLD) const { // Get -fuse-ld= first to prevent -Wunused-command-line-argument. -fuse-ld= is // considered as the linker flavor, e.g. "bfd", "gold", or "lld". - const Arg* A = Args.getLastArg(options::OPT_fuse_ld_EQ); + const Arg *A = Args.getLastArg(options::OPT_fuse_ld_EQ); StringRef UseLinker = A ? A->getValue() : CLANG_DEFAULT_LINKER; // --ld-path= takes precedence over -fuse-ld= and specifies the executable @@ -950,9 +949,7 @@ types::ID ToolChain::LookupTypeForExtension(StringRef Ext) const { return id; } -bool ToolChain::HasNativeLLVMSupport() const { - return false; -} +bool ToolChain::HasNativeLLVMSupport() const { return false; } bool ToolChain::isCrossCompiling() const { llvm::Triple HostTriple(LLVM_HOST_TRIPLE); @@ -964,7 +961,8 @@ bool ToolChain::isCrossCompiling() const { case llvm::Triple::thumb: case llvm::Triple::thumbeb: return getArch() != llvm::Triple::arm && getArch() != llvm::Triple::thumb && - getArch() != llvm::Triple::armeb && getArch() != llvm::Triple::thumbeb; + getArch() != llvm::Triple::armeb && + getArch() != llvm::Triple::thumbeb; default: return HostTriple.getArch() != getArch(); } @@ -1046,9 +1044,7 @@ std::string ToolChain::ComputeEffectiveClangTriple(const ArgList &Args, return ComputeLLVMTriple(Args, InputType); } -std::string ToolChain::computeSysRoot() const { - return D.SysRoot; -} +std::string ToolChain::computeSysRoot() const { return D.SysRoot; } void ToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { @@ -1072,12 +1068,12 @@ void ToolChain::addProfileRTLibs(const llvm::opt::ArgList 
&Args, CmdArgs.push_back(getCompilerRTArgString(Args, "profile")); } -ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType( - const ArgList &Args) const { +ToolChain::RuntimeLibType +ToolChain::GetRuntimeLibType(const ArgList &Args) const { if (runtimeLibType) return *runtimeLibType; - const Arg* A = Args.getLastArg(options::OPT_rtlib_EQ); + const Arg *A = Args.getLastArg(options::OPT_rtlib_EQ); StringRef LibName = A ? A->getValue() : CLANG_DEFAULT_RTLIB; // Only use "platform" in tests to override CLANG_DEFAULT_RTLIB! @@ -1098,8 +1094,8 @@ ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType( return *runtimeLibType; } -ToolChain::UnwindLibType ToolChain::GetUnwindLibType( - const ArgList &Args) const { +ToolChain::UnwindLibType +ToolChain::GetUnwindLibType(const ArgList &Args) const { if (unwindLibType) return *unwindLibType; @@ -1134,7 +1130,8 @@ ToolChain::UnwindLibType ToolChain::GetUnwindLibType( return *unwindLibType; } -ToolChain::CXXStdlibType ToolChain::GetCXXStdlibType(const ArgList &Args) const{ +ToolChain::CXXStdlibType +ToolChain::GetCXXStdlibType(const ArgList &Args) const { if (cxxStdlibType) return *cxxStdlibType; @@ -1290,7 +1287,7 @@ void ToolChain::AddCXXStdlibLibArgs(const ArgList &Args, void ToolChain::AddFilePathLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { for (const auto &LibPath : getFilePaths()) - if(LibPath.length() > 0) + if (LibPath.length() > 0) CmdArgs.push_back(Args.MakeArgString(StringRef("-L") + LibPath)); } @@ -1306,9 +1303,9 @@ bool ToolChain::isFastMathRuntimeAvailable(const ArgList &Args, if (!isOptimizationLevelFast(Args)) { // Check if -ffast-math or -funsafe-math. 
Arg *A = - Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, - options::OPT_funsafe_math_optimizations, - options::OPT_fno_unsafe_math_optimizations); + Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, + options::OPT_funsafe_math_optimizations, + options::OPT_fno_unsafe_math_optimizations); if (!A || A->getOption().getID() == options::OPT_fno_fast_math || A->getOption().getID() == options::OPT_fno_unsafe_math_optimizations) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 055884d275ce1..106a612135f93 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -578,6 +578,35 @@ static void addDashXForInput(const ArgList &Args, const InputInfo &Input, } } +static void addPGOFlagsGPU(const ToolChain &TC, const ArgList &Args, + ArgStringList &CmdArgs) { + const Driver &D = TC.getDriver(); + auto *ProfileClangArg = Args.getLastArg(options::OPT_fprofile_generate_gpu, + options::OPT_fno_profile_generate); + auto *ProfileLLVMArg = + Args.getLastArg(options::OPT_fprofile_instr_generate_gpu, + options::OPT_fno_profile_generate); + if (ProfileClangArg && + ProfileClangArg->getOption().matches(options::OPT_fno_profile_generate)) + ProfileClangArg = nullptr; + + if (ProfileLLVMArg && + ProfileLLVMArg->getOption().matches(options::OPT_fno_profile_generate)) + ProfileLLVMArg = nullptr; + + if (ProfileClangArg && ProfileLLVMArg) { + D.Diag(diag::err_drv_argument_not_allowed_with) + << ProfileClangArg->getSpelling() << ProfileLLVMArg->getSpelling(); + return; + } + + if (ProfileClangArg) + CmdArgs.push_back("-fprofile-instrument=clang"); + + if (ProfileLLVMArg) + CmdArgs.push_back("-fprofile-instrument=llvm"); +} + static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C, const JobAction &JA, const InputInfo &Output, const ArgList &Args, SanitizerArgs &SanArgs, @@ -6049,10 +6078,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, 
options::OPT_finstrument_functions_after_inlining, options::OPT_finstrument_function_entry_bare); - // NVPTX/AMDGCN doesn't support PGO or coverage. There's no runtime support - // for sampling, overhead of call arc collection is way too high and there's - // no way to collect the output. - if (!Triple.isNVPTX() && !Triple.isAMDGCN()) + // NVPTX/AMDGCN PGO is handled separately + // GPU targets don't have their own profiling libraries and are + // collected/handled by the host's profiling library + if (Triple.isNVPTX() || Triple.isAMDGCN()) + addPGOFlagsGPU(TC, Args, CmdArgs); + else addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs); Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ); From 849b244ea29ac15cae7ddaa973356cecfb0e4792 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 18 Mar 2024 21:31:32 -0500 Subject: [PATCH 024/114] Fix elf obj file --- .../plugins-nextgen/common/src/GlobalHandler.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index 88e4bee506ba8..bca66cff6558a 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -177,16 +177,19 @@ Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { GPUProfGlobals DeviceProfileData; - auto ELFObj = getELFObjectFile(Image); - if (!ELFObj) - return ELFObj.takeError(); + auto ObjFile = getELFObjectFile(Image); + if (!ObjFile) + return ObjFile.takeError(); + + std::unique_ptr ELFObj( + static_cast(ObjFile->release())); DeviceProfileData.TargetTriple = ELFObj->makeTriple(); // Iterate through elf symbols for (auto &Sym : ELFObj->symbols()) { auto NameOrErr = Sym.getName(); if (!NameOrErr) - return ELFObj.takeError(); + return NameOrErr.takeError(); // Check if given 
current global is a profiling global based // on name From 55bd8d21a6224e0872002b0d1d77361eb75a3419 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 19 Mar 2024 14:50:54 -0500 Subject: [PATCH 025/114] Add GPU use profile option --- clang/include/clang/Driver/Options.td | 5 +++++ clang/lib/Driver/ToolChains/Clang.cpp | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index aab445906fa34..b317d4e85b957 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1768,6 +1768,11 @@ def fprofile_use_EQ : Joined<["-"], "fprofile-use=">, Visibility<[ClangOption, CLOption]>, MetaVarName<"">, HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from /default.profdata. Otherwise, it reads from file .">; +def fprofile_use_gpu_EQ : Joined<["-"], "fprofile-use-gpu=">, + Group, + Visibility<[ClangOption, CLOption]>, + MetaVarName<"">, + HelpText<"Use instrumentation data for profile-guided optimization targeting GPU">; def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">, Group, Visibility<[ClangOption, CLOption]>, HelpText<"Disable generation of profile instrumentation.">; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 106a612135f93..1ea5501146950 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -586,6 +586,8 @@ static void addPGOFlagsGPU(const ToolChain &TC, const ArgList &Args, auto *ProfileLLVMArg = Args.getLastArg(options::OPT_fprofile_instr_generate_gpu, options::OPT_fno_profile_generate); + auto *ProfileUseArg = Args.getLastArg(options::OPT_fprofile_use_gpu_EQ, + options::OPT_fno_profile_instr_use); if (ProfileClangArg && ProfileClangArg->getOption().matches(options::OPT_fno_profile_generate)) ProfileClangArg = nullptr; @@ -594,17 +596,37 @@ static 
void addPGOFlagsGPU(const ToolChain &TC, const ArgList &Args, ProfileLLVMArg->getOption().matches(options::OPT_fno_profile_generate)) ProfileLLVMArg = nullptr; + if (ProfileUseArg && + ProfileUseArg->getOption().matches(options::OPT_fno_profile_generate)) + ProfileUseArg = nullptr; + if (ProfileClangArg && ProfileLLVMArg) { D.Diag(diag::err_drv_argument_not_allowed_with) << ProfileClangArg->getSpelling() << ProfileLLVMArg->getSpelling(); return; } + if (ProfileUseArg && ProfileClangArg) { + D.Diag(diag::err_drv_argument_not_allowed_with) + << ProfileClangArg->getSpelling() << ProfileUseArg->getSpelling(); + return; + } + + if (ProfileUseArg && ProfileLLVMArg) { + D.Diag(diag::err_drv_argument_not_allowed_with) + << ProfileLLVMArg->getSpelling() << ProfileUseArg->getSpelling(); + return; + } + if (ProfileClangArg) CmdArgs.push_back("-fprofile-instrument=clang"); if (ProfileLLVMArg) CmdArgs.push_back("-fprofile-instrument=llvm"); + + if (ProfileUseArg) + CmdArgs.push_back(Args.MakeArgString( + Twine("-fprofile-instrument-use-path=") + ProfileUseArg->getValue())); } static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C, From 4ebbb45baa24b52eb0f94ebaf16b6b9eb671420a Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 6 May 2024 23:13:58 -0500 Subject: [PATCH 026/114] Add more addrspace casts for GPU targets --- .../Transforms/Instrumentation/InstrProfiling.cpp | 11 ++++++++--- .../Instrumentation/PGOInstrumentation.cpp | 13 +++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index a6b1e0d488120..dd8c027c4bbf6 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -869,6 +869,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; auto *TLI 
= &GetTLI(*Ind->getFunction()); + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + DataVar, PointerType::getUnqual(M.getContext())); // To support value profiling calls within Windows exception handlers, funclet // information contained within operand bundles needs to be copied over to @@ -877,11 +879,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { SmallVector OpBundles; Ind->getOperandBundlesAsDefs(OpBundles); if (!IsMemOpSize) { - Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)}; + Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr, + Builder.getInt32(Index)}; Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args, OpBundles); } else { - Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)}; + Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr, + Builder.getInt32(Index)}; Call = Builder.CreateCall( getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp), Args, OpBundles); @@ -1575,7 +1579,8 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); ValuesVar->setAlignment(Align(8)); maybeSetComdat(ValuesVar, Fn, CntsVarName); - ValuesPtrExpr = ValuesVar; + ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + ValuesVar, PointerType::getUnqual(Fn->getContext())); } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index f662c89a378be..fbe969f4a9c16 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -955,12 +955,15 @@ static void instrumentOneFunc( ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); assert(ToProfile && "value profiling Value is of unexpected type"); + auto *NormalizedPtr = 
ConstantExpr::getPointerBitCastOrAddrSpaceCast( + Name, PointerType::get(M->getContext(), 0)); + SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash), - ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, + {NormalizedPtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); } } // IPVK_First <= Kind <= IPVK_Last @@ -1632,10 +1635,12 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { IRBuilder<> Builder(&SI); Type *Int64Ty = Builder.getInt64Ty(); auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty); + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), - {FuncNameVar, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), - Builder.getInt32(*CurCtrIdx), Step}); + {NormalizedPtr, Builder.getInt64(FuncHash), + Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); } From 7770b37a5a4c40bd45887f762bd7f1e652bc0ed2 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 7 May 2024 16:31:48 -0500 Subject: [PATCH 027/114] Fix params --- compiler-rt/lib/profile/InstrProfilingFile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 466bfe480543b..bc1d40a37a5ad 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1360,9 +1360,10 @@ int __llvm_write_custom_profile(const char *Target, initFileWriter(&fileWriter, OutputFile); /* Write custom data to the file */ - ReturnValue = lprofWriteDataImpl( - &fileWriter, DataBegin, 
DataEnd, CountersBegin, CountersEnd, NULL, NULL, - lprofGetVPDataReader(), NamesBegin, NamesEnd, MergeDone); + ReturnValue = + lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin, + CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL, + NULL, NULL, NULL, NamesBegin, NamesEnd, MergeDone); closeFileObject(OutputFile); From 619fb6918560f0b5d0b8137d392dfb27255a7d32 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 7 May 2024 17:20:39 -0500 Subject: [PATCH 028/114] Resolve merge conflict --- clang/lib/Driver/ToolChain.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index fce4168761a6d..e29f1ccf44b69 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1313,17 +1313,10 @@ bool ToolChain::isFastMathRuntimeAvailable(const ArgList &Args, // (to keep the linker options consistent with gcc and clang itself). if (Default && !isOptimizationLevelFast(Args)) { // Check if -ffast-math or -funsafe-math. 
-<<<<<<< HEAD - Arg *A = - Args.getLastArg(options::OPT_ffast_math, options::OPT_fno_fast_math, - options::OPT_funsafe_math_optimizations, - options::OPT_fno_unsafe_math_optimizations); -======= Arg *A = Args.getLastArg( options::OPT_ffast_math, options::OPT_fno_fast_math, options::OPT_funsafe_math_optimizations, options::OPT_fno_unsafe_math_optimizations, options::OPT_ffp_model_EQ); ->>>>>>> main if (!A || A->getOption().getID() == options::OPT_fno_fast_math || A->getOption().getID() == options::OPT_fno_unsafe_math_optimizations) From 3f08ae9d560dbaeba4c547186c85a8c34f3dee97 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Sat, 11 May 2024 02:31:28 -0500 Subject: [PATCH 029/114] Have test read from profraw instead of dump --- offload/test/lit.cfg | 2 +- offload/test/offloading/pgo1.c | 90 +++++++++++++++++----------------- 2 files changed, 45 insertions(+), 47 deletions(-) diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 069110dc69a6e..94a0bc8a2b43f 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -390,7 +390,7 @@ config.substitutions.append(("%clang", config.test_c_compiler)) if config.test_fortran_compiler: config.available_features.add('flang') config.substitutions.append(("%flang", config.test_fortran_compiler)) - +config.substitutions.append(("%target_triple", config.libomptarget_current_target)) config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path: config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path)) diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 79e93d0f10827..9fe231e7c6716 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,12 +1,15 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ -// RUN: -Xclang "-fprofile-instrument=clang" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ -// RUN: 
--check-prefix="CLANG-PGO" -// RUN: %libomptarget-compile-generic -fprofile-generate \ -// RUN: -Xclang "-fprofile-instrument=llvm" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: %libomptarget-compile-generic -fprofile-instr-generate-gpu +// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" +// RUN: %libomptarget-compile-generic -fprofile-generate-gpu +// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.clang.profraw | %fcheck-generic \ +// RUN: --check-prefix="CLANG-PGO" + // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO // UNSUPPORTED: aarch64-unknown-linux-gnu @@ -31,43 +34,38 @@ int main() { } } -// CLANG-PGO: ======== Counters ========= -// CLANG-PGO-NEXT: 0 11 20 10 20 -// CLANG-PGO-NEXT: ========== Data =========== -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: ======== Functions ======== -// CLANG-PGO-NEXT: pgo1.c: -// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// CLANG-PGO-NEXT: test1 -// CLANG-PGO-NEXT: test2 +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-PGO: Hash: 
{{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 4 +// LLVM-PGO: Function count: 20 +// LLVM-PGO: Block counts: [10, 20, 10] + +// LLVM-PGO-LABEL: test1: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Function count: 1 +// LLVM-PGO: Block counts: [] + +// LLVM-PGO-LABEL: test2: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Function count: 1 +// LLVM-PGO: Block counts: [] + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 3 +// CLANG-PGO: Function count: 0 +// CLANG-PGO: Block counts: [11, 20] + +// CLANG-PGO-LABEL: test1: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 10 +// CLANG-PGO: Block counts: [] -// LLVM-PGO: ======== Counters ========= -// LLVM-PGO-NEXT: 20 10 20 10 1 1 -// LLVM-PGO-NEXT: ========== Data =========== -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: ======== Functions ======== -// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// LLVM-PGO-NEXT: test1 -// LLVM-PGO-NEXT: test2 +// CLANG-PGO-LABEL: test2: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 20 +// CLANG-PGO: Block counts: [] From 09f2b39beaa9e325655d5569e5107827c1e7e955 Mon Sep 17 00:00:00 2001 
From: Ethan Luis McDonough Date: Sat, 11 May 2024 02:39:16 -0500 Subject: [PATCH 030/114] Remove debug dump --- offload/plugins-nextgen/common/src/PluginInterface.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index a410deed1654f..e7559f9e6cec8 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -840,9 +840,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { if (!ProfOrErr) return ProfOrErr.takeError(); - // Dump out profdata - ProfOrErr->dump(); - // Write data to profiling file if (auto Err = ProfOrErr->write()) { consumeError(std::move(Err)); From aa895a1788969a0d27692057a1457074e9772c78 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 18 Mar 2024 21:31:32 -0500 Subject: [PATCH 031/114] Fix elf obj file --- offload/plugins-nextgen/common/src/GlobalHandler.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index 80cdcaff75528..7717e19a5b677 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -177,16 +177,19 @@ Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { GPUProfGlobals DeviceProfileData; - auto ELFObj = getELFObjectFile(Image); - if (!ELFObj) - return ELFObj.takeError(); + auto ObjFile = getELFObjectFile(Image); + if (!ObjFile) + return ObjFile.takeError(); + + std::unique_ptr ELFObj( + static_cast(ObjFile->release())); DeviceProfileData.TargetTriple = ELFObj->makeTriple(); // Iterate through elf symbols for (auto &Sym : ELFObj->symbols()) { auto NameOrErr = Sym.getName(); if (!NameOrErr) - return ELFObj.takeError(); + return NameOrErr.takeError(); // Check if given current 
global is a profiling global based // on name From 2031e49c2b26864f2dab72e629eb6cbe34928a7a Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 6 May 2024 23:13:58 -0500 Subject: [PATCH 032/114] Add more addrspace casts for GPU targets --- .../Transforms/Instrumentation/InstrProfiling.cpp | 11 ++++++++--- .../Instrumentation/PGOInstrumentation.cpp | 13 +++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index a6b1e0d488120..dd8c027c4bbf6 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -869,6 +869,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; auto *TLI = &GetTLI(*Ind->getFunction()); + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + DataVar, PointerType::getUnqual(M.getContext())); // To support value profiling calls within Windows exception handlers, funclet // information contained within operand bundles needs to be copied over to @@ -877,11 +879,13 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { SmallVector OpBundles; Ind->getOperandBundlesAsDefs(OpBundles); if (!IsMemOpSize) { - Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)}; + Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr, + Builder.getInt32(Index)}; Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args, OpBundles); } else { - Value *Args[3] = {Ind->getTargetValue(), DataVar, Builder.getInt32(Index)}; + Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr, + Builder.getInt32(Index)}; Call = Builder.CreateCall( getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp), Args, OpBundles); @@ -1575,7 +1579,8 @@ void 
InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); ValuesVar->setAlignment(Align(8)); maybeSetComdat(ValuesVar, Fn, CntsVarName); - ValuesPtrExpr = ValuesVar; + ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + ValuesVar, PointerType::getUnqual(Fn->getContext())); } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 4b51396a8baa3..ee1657ba8400e 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1007,12 +1007,15 @@ static void instrumentOneFunc( ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); assert(ToProfile && "value profiling Value is of unexpected type"); + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + Name, PointerType::get(M->getContext(), 0)); + SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {FuncInfo.FuncNameVar, Builder.getInt64(FuncInfo.FunctionHash), - ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, + {NormalizedPtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); } } // IPVK_First <= Kind <= IPVK_Last @@ -1685,10 +1688,12 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { IRBuilder<> Builder(&SI); Type *Int64Ty = Builder.getInt64Ty(); auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty); + auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), - {FuncNameVar, Builder.getInt64(FuncHash), 
Builder.getInt32(TotalNumCtrs), - Builder.getInt32(*CurCtrIdx), Step}); + {NormalizedPtr, Builder.getInt64(FuncHash), + Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); } From be6524bb4f77de0add1e698f68115fd336f32238 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 13 May 2024 17:41:00 -0500 Subject: [PATCH 033/114] Have test read from profraw instead of dump --- offload/test/lit.cfg | 2 + offload/test/offloading/pgo1.c | 94 ++++++++++++++++------------------ 2 files changed, 46 insertions(+), 50 deletions(-) diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 069110dc69a6e..38e6a33b01faf 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -391,6 +391,8 @@ if config.test_fortran_compiler: config.available_features.add('flang') config.substitutions.append(("%flang", config.test_fortran_compiler)) +config.substitutions.append(("%target_triple", config.libomptarget_current_target)) + config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path: config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path)) diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 79e93d0f10827..d22d5340f5b3e 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,22 +1,21 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ -// RUN: -Xclang "-fprofile-instrument=clang" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ -// RUN: --check-prefix="CLANG-PGO" -// RUN: %libomptarget-compile-generic -fprofile-generate \ -// RUN: -Xclang "-fprofile-instrument=llvm" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=llvm" +// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: 
%target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" +// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=clang" +// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.clang.profraw | %fcheck-generic \ +// RUN: --check-prefix="CLANG-PGO" + // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO // UNSUPPORTED: aarch64-unknown-linux-gnu // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO // REQUIRES: pgo -#ifdef _OPENMP -#include -#endif - int test1(int a) { return a / 2; } int test2(int a) { return a * 2; } @@ -31,43 +30,38 @@ int main() { } } -// CLANG-PGO: ======== Counters ========= -// CLANG-PGO-NEXT: 0 11 20 10 20 -// CLANG-PGO-NEXT: ========== Data =========== -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: ======== Functions ======== -// CLANG-PGO-NEXT: pgo1.c: -// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// CLANG-PGO-NEXT: test1 -// CLANG-PGO-NEXT: test2 +// LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 4 +// LLVM-PGO: Function count: 20 +// LLVM-PGO: Block counts: [10, 20, 10] + +// LLVM-PGO-LABEL: test1: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: 
Counters: 1 +// LLVM-PGO: Function count: 1 +// LLVM-PGO: Block counts: [] + +// LLVM-PGO-LABEL: test2: +// LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-PGO: Counters: 1 +// LLVM-PGO: Function count: 1 +// LLVM-PGO: Block counts: [] + +// CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 3 +// CLANG-PGO: Function count: 0 +// CLANG-PGO: Block counts: [11, 20] + +// CLANG-PGO-LABEL: test1: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 10 +// CLANG-PGO: Block counts: [] -// LLVM-PGO: ======== Counters ========= -// LLVM-PGO-NEXT: 20 10 20 10 1 1 -// LLVM-PGO-NEXT: ========== Data =========== -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: ======== Functions ======== -// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// LLVM-PGO-NEXT: test1 -// LLVM-PGO-NEXT: test2 +// CLANG-PGO-LABEL: test2: +// CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-PGO: Counters: 1 +// CLANG-PGO: Function count: 20 +// CLANG-PGO: Block counts: [] From e266cc7190b4639c8273d49d39e78aa644bf032b Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 16 May 2024 23:21:20 -0500 Subject: [PATCH 034/114] Fix GPU PGO names --- clang/lib/Driver/ToolChains/Clang.cpp | 25 ++++++++++++++++++++++--- 
offload/test/offloading/pgo1.c | 4 ++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f0c2481145d0b..5a4dc1295360f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -585,13 +585,20 @@ static void addDashXForInput(const ArgList &Args, const InputInfo &Input, static void addPGOFlagsGPU(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { const Driver &D = TC.getDriver(); - auto *ProfileClangArg = Args.getLastArg(options::OPT_fprofile_generate_gpu, - options::OPT_fno_profile_generate); - auto *ProfileLLVMArg = + auto *ProfileClangArg = Args.getLastArg(options::OPT_fprofile_instr_generate_gpu, options::OPT_fno_profile_generate); + auto *ProfileLLVMArg = Args.getLastArg(options::OPT_fprofile_generate_gpu, + options::OPT_fno_profile_generate); auto *ProfileUseArg = Args.getLastArg(options::OPT_fprofile_use_gpu_EQ, options::OPT_fno_profile_instr_use); + + auto *HostLLVMArg = Args.getLastArgNoClaim(options::OPT_fprofile_generate, + options::OPT_fprofile_generate_EQ); + auto *HostClangArg = + Args.getLastArgNoClaim(options::OPT_fprofile_instr_generate, + options::OPT_fprofile_instr_generate_EQ); + if (ProfileClangArg && ProfileClangArg->getOption().matches(options::OPT_fno_profile_generate)) ProfileClangArg = nullptr; @@ -622,6 +629,18 @@ static void addPGOFlagsGPU(const ToolChain &TC, const ArgList &Args, return; } + if (HostLLVMArg && ProfileClangArg) { + D.Diag(diag::err_drv_argument_not_allowed_with) + << HostLLVMArg->getSpelling() << ProfileClangArg->getSpelling(); + return; + } + + if (HostClangArg && ProfileLLVMArg) { + D.Diag(diag::err_drv_argument_not_allowed_with) + << HostClangArg->getSpelling() << ProfileLLVMArg->getSpelling(); + return; + } + if (ProfileClangArg) CmdArgs.push_back("-fprofile-instrument=clang"); diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 
ec93cce2c8620..b9fc95c89791a 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,10 +1,10 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate-gpu +// RUN: %libomptarget-compile-generic -fprofile-generate-gpu // RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 // RUN: llvm-profdata show --all-functions --counts \ // RUN: %target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" -// RUN: %libomptarget-compile-generic -fprofile-generate-gpu +// RUN: %libomptarget-compile-generic -fprofile-instr-generate-gpu // RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 // RUN: llvm-profdata show --all-functions --counts \ // RUN: %target_triple.clang.profraw | %fcheck-generic \ From 2b8eb2935ec21bf0acc5c56f45837b5976560963 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 24 May 2024 19:59:33 -0500 Subject: [PATCH 035/114] Fix PGO test format --- offload/test/offloading/pgo1.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index d22d5340f5b3e..0e75c684ed926 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -33,20 +33,17 @@ int main() { // LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 4 -// LLVM-PGO: Function count: 20 -// LLVM-PGO: Block counts: [10, 20, 10] +// LLVM-PGO: Block counts: [20, 10, 20, 10] // LLVM-PGO-LABEL: test1: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 1 -// LLVM-PGO: Function count: 1 -// LLVM-PGO: Block counts: [] +// LLVM-PGO: Block counts: [1] // LLVM-PGO-LABEL: test2: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 1 -// LLVM-PGO: Function count: 1 -// LLVM-PGO: Block counts: [] +// LLVM-PGO: Block counts: [1] // CLANG-PGO-LABEL: 
__omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} From 67f3009173d815295f36e2b37e85add1347e3bf9 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 24 May 2024 20:45:04 -0500 Subject: [PATCH 036/114] Refactor profile writer --- compiler-rt/lib/profile/InstrProfilingFile.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index bc1d40a37a5ad..76238214c13aa 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1344,8 +1344,7 @@ int __llvm_write_custom_profile(const char *Target, forceTruncateFile(TargetFilename); /* Open target-specific PGO file */ - MergeDone = 0; - FILE *OutputFile = getMergeFileObject(TargetFilename, &MergeDone); + FILE *OutputFile = getFileObject(TargetFilename); if (!OutputFile) { PROF_ERR("Failed to open file : %s\n", TargetFilename); @@ -1356,15 +1355,11 @@ int __llvm_write_custom_profile(const char *Target, FreeHook = &free; setupIOBuffer(); - ProfDataWriter fileWriter; - initFileWriter(&fileWriter, OutputFile); - - /* Write custom data to the file */ - ReturnValue = - lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL, - NULL, NULL, NULL, NamesBegin, NamesEnd, MergeDone); + /* Write custom data */ + ReturnValue = __llvm_profile_write_buffer_internal( + OutputFile, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, + NamesBegin, NamesEnd); closeFileObject(OutputFile); // Restore SIGKILL. 
From e8ad1322c557f7b48e2b28fe3a34a696a1103bba Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 27 May 2024 18:29:18 -0500 Subject: [PATCH 037/114] Fix refactor bug --- compiler-rt/lib/profile/InstrProfilingFile.c | 52 ++++++++++---------- offload/test/offloading/pgo1.c | 6 ++- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 76238214c13aa..784cb9af6169d 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -505,14 +505,6 @@ static FILE *getFileObject(const char *OutputName) { return fopen(OutputName, "ab"); } -/* Get file object and merge if applicable */ -static FILE *getMergeFileObject(const char *OutputName, int *MergeDone) { - VPMergeHook = &lprofMergeValueProfData; - if (doMerging()) - return openFileForMerging(OutputName, MergeDone); - return getFileObject(OutputName); -} - static void closeFileObject(FILE *OutputFile) { if (OutputFile == getProfileFile()) { fflush(OutputFile); @@ -526,8 +518,15 @@ static void closeFileObject(FILE *OutputFile) { /* Write profile data to file \c OutputName. 
*/ static int writeFile(const char *OutputName) { - int RetVal, MergeDone = 0; - FILE *OutputFile = getMergeFileObject(OutputName, &MergeDone); + int RetVal; + FILE *OutputFile; + + int MergeDone = 0; + VPMergeHook = &lprofMergeValueProfData; + if (doMerging()) + OutputFile = openFileForMerging(OutputName, &MergeDone); + else + OutputFile = getFileObject(OutputName); if (!OutputFile) return -1; @@ -565,16 +564,10 @@ static int writeOrderFile(const char *OutputName) { #define LPROF_INIT_ONCE_ENV "__LLVM_PROFILE_RT_INIT_ONCE" -static void forceTruncateFile(const char *Filename) { - FILE *File = fopen(Filename, "w"); - if (!File) - return; - fclose(File); -} - static void truncateCurrentFile(void) { const char *Filename; char *FilenameBuf; + FILE *File; int Length; Length = getCurFilenameLength(); @@ -604,7 +597,10 @@ static void truncateCurrentFile(void) { return; /* Truncate the file. Later we'll reopen and append. */ - forceTruncateFile(Filename); + File = fopen(Filename, "w"); + if (!File) + return; + fclose(File); } /* Write a partial profile to \p Filename, which is required to be backed by @@ -1287,7 +1283,7 @@ int __llvm_write_custom_profile(const char *Target, const char *CountersBegin, const char *CountersEnd, const char *NamesBegin, const char *NamesEnd) { - int ReturnValue = 0, FilenameLength, TargetLength, MergeDone; + int ReturnValue = 0, FilenameLength, TargetLength; char *FilenameBuf, *TargetFilename; const char *Filename; @@ -1340,11 +1336,9 @@ int __llvm_write_custom_profile(const char *Target, return -1; } - /* Clean old target file */ - forceTruncateFile(TargetFilename); - - /* Open target-specific PGO file */ - FILE *OutputFile = getFileObject(TargetFilename); + /* Open and truncate target-specific PGO file */ + FILE *OutputFile = fopen(TargetFilename, "w"); + setProfileFile(OutputFile); if (!OutputFile) { PROF_ERR("Failed to open file : %s\n", TargetFilename); @@ -1357,9 +1351,13 @@ int __llvm_write_custom_profile(const char *Target, 
setupIOBuffer(); /* Write custom data */ - ReturnValue = __llvm_profile_write_buffer_internal( - OutputFile, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, - NamesBegin, NamesEnd); + ProfDataWriter fileWriter; + initFileWriter(&fileWriter, OutputFile); + + /* Write custom data to the file */ + ReturnValue = lprofWriteDataImpl( + &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, + lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0); closeFileObject(OutputFile); // Restore SIGKILL. diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 0e75c684ed926..d674711326580 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,10 +1,12 @@ -// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=llvm" +// RUN: %libomptarget-compile-generic -fprofile-generate \ +// RUN: -Xclang "-fprofile-instrument=llvm" // RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 // RUN: llvm-profdata show --all-functions --counts \ // RUN: %target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" -// RUN: %libomptarget-compile-generic -Xclang "-fprofile-instrument=clang" +// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ +// RUN: -Xclang "-fprofile-instrument=clang" // RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 // RUN: llvm-profdata show --all-functions --counts \ // RUN: %target_triple.clang.profraw | %fcheck-generic \ From 79bf08e0bea8ab32781f201cdfc096a59156f270 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 28 May 2024 00:45:47 -0500 Subject: [PATCH 038/114] Check for level in test case TODO: Actually ensure the right level is used in the profraw file when only GPU flags are supplied --- offload/test/offloading/pgo1.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 
5d1a41ccab0f2..a3a242c703d96 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -45,6 +45,16 @@ int main() { // LLVM-PGO: Counters: 1 // LLVM-PGO: Block counts: [1] +// LLVM-PGO-LABEL: Instrumentation level: +// LLVM-PGO-SAME: IR +// LLVM-PGO-SAME: entry_first = 0 +// LLVM-PGO-LABEL: Functions shown: +// LLVM-PGO-SAME: 3 +// LLVM-PGO-LABEL: Maximum function count: +// LLVM-PGO-SAME: 20 +// LLVM-PGO-LABEL: Maximum internal block count: +// LLVM-PGO-SAME: 20 + // CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // CLANG-PGO: Counters: 3 @@ -62,3 +72,12 @@ int main() { // CLANG-PGO: Counters: 1 // CLANG-PGO: Function count: 20 // CLANG-PGO: Block counts: [] + +// CLANG-PGO-LABEL: Instrumentation level: +// CLANG-PGO-SAME: Front-end +// CLANG-PGO-LABEL: Functions shown: +// CLANG-PGO-SAME: 3 +// CLANG-PGO-LABEL: Maximum function count: +// CLANG-PGO-SAME: 20 +// CLANG-PGO-LABEL: Maximum internal block count: +// CLANG-PGO-SAME: 20 From 4c9f814ce14aeb6766a93f5c1d15b847b98dc29f Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 28 May 2024 12:58:43 -0500 Subject: [PATCH 039/114] Make requested clang-format change --- offload/plugins-nextgen/common/include/GlobalHandler.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 017d7e994f07a..1d7b9f80f9dfd 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -64,12 +64,10 @@ struct __llvm_profile_data { }; extern "C" { -extern int __attribute__((weak)) -__llvm_write_custom_profile(const char *Target, - const __llvm_profile_data *DataBegin, - const __llvm_profile_data *DataEnd, - const char *CountersBegin, const char *CountersEnd, - const char *NamesBegin, const char *NamesEnd); +extern int 
__attribute__((weak)) __llvm_write_custom_profile( + const char *Target, const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, const char *NamesEnd); } /// PGO profiling data extracted from a GPU device From cfe166091ca91623d356d2dde41b64cefe98e472 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 30 May 2024 18:15:55 -0500 Subject: [PATCH 040/114] Check for version global on GPU --- compiler-rt/include/profile/InstrProfData.inc | 2 +- compiler-rt/lib/profile/InstrProfiling.h | 3 ++- .../lib/profile/InstrProfilingBuffer.c | 3 ++- compiler-rt/lib/profile/InstrProfilingFile.c | 14 +++++++++---- .../lib/profile/InstrProfilingInternal.h | 3 ++- .../lib/profile/InstrProfilingWriter.c | 20 +++++++++---------- .../llvm/ProfileData/InstrProfData.inc | 2 +- .../Instrumentation/PGOInstrumentation.cpp | 5 ++++- .../llvm-profdata/binary-ids-padding.test | 2 +- ...alformed-not-space-for-another-header.test | 2 +- .../malformed-num-counters-zero.test | 2 +- .../malformed-ptr-to-counter-array.test | 2 +- .../common/include/GlobalHandler.h | 13 ++++++++---- .../common/src/GlobalHandler.cpp | 16 ++++++++++++--- 14 files changed, 58 insertions(+), 31 deletions(-) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index e9866d94b762c..f0a260483429c 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -152,7 +152,7 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ #define INSTR_PROF_DATA_DEFINED #endif INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) -INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +INSTR_PROF_RAW_HEADER(uint64_t, Version, Version) INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) INSTR_PROF_RAW_HEADER(uint64_t, NumData, NumData) 
INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters) diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index ef1292a45bf01..34b7d85ad6684 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -305,7 +305,8 @@ int __llvm_write_custom_profile(const char *Target, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *NamesBegin, - const char *NamesEnd); + const char *NamesEnd, + const uint64_t *VersionOverride); /*! * This variable is defined in InstrProfilingRuntime.cpp as a hidden diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 1c451d7ec7563..b406e8db74f3f 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -252,5 +252,6 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd, /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL, - /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0); + /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0, + __llvm_profile_get_version()); } diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 784cb9af6169d..947a3ff89bc1f 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1282,7 +1282,8 @@ int __llvm_write_custom_profile(const char *Target, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *NamesBegin, - const char *NamesEnd) { + const char *NamesEnd, + const uint64_t *VersionOverride) { int ReturnValue = 0, FilenameLength, TargetLength; char *FilenameBuf, *TargetFilename; const char *Filename; @@ -1354,10 +1355,15 @@ int 
__llvm_write_custom_profile(const char *Target, ProfDataWriter fileWriter; initFileWriter(&fileWriter, OutputFile); + uint64_t Version = __llvm_profile_get_version(); + if (VersionOverride) + Version = *VersionOverride; + /* Write custom data to the file */ - ReturnValue = lprofWriteDataImpl( - &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL, - lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0); + ReturnValue = + lprofWriteDataImpl(&fileWriter, DataBegin, DataEnd, CountersBegin, + CountersEnd, NULL, NULL, lprofGetVPDataReader(), NULL, + NULL, NULL, NULL, NamesBegin, NamesEnd, 0, Version); closeFileObject(OutputFile); // Restore SIGKILL. diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index d5bd0e41fb129..2b9f687e7f886 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -160,7 +160,8 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, VPDataReaderType *VPDataReader, const char *NamesBegin, const char *NamesEnd, const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, const char *VNamesBegin, - const char *VNamesEnd, int SkipNameDataWrite); + const char *VNamesEnd, int SkipNameDataWrite, + uint64_t Version); /* Merge value profile data pointed to by SrcValueProfData into * in-memory profile counters pointed by to DstData. 
*/ diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 8816a71155511..bcd88b30d050d 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -254,21 +254,21 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); const char *VNamesBegin = __llvm_profile_begin_vtabnames(); const char *VNamesEnd = __llvm_profile_end_vtabnames(); + uint64_t Version = __llvm_profile_get_version(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, NamesBegin, NamesEnd, VTableBegin, VTableEnd, - VNamesBegin, VNamesEnd, SkipNameDataWrite); + VNamesBegin, VNamesEnd, SkipNameDataWrite, Version); } -COMPILER_RT_VISIBILITY int -lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, - const __llvm_profile_data *DataEnd, - const char *CountersBegin, const char *CountersEnd, - const char *BitmapBegin, const char *BitmapEnd, - VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, const VTableProfData *VTableBegin, - const VTableProfData *VTableEnd, const char *VNamesBegin, - const char *VNamesEnd, int SkipNameDataWrite) { +COMPILER_RT_VISIBILITY int lprofWriteDataImpl( + ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, const char *CountersBegin, + const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, + VPDataReaderType *VPDataReader, const char *NamesBegin, + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int SkipNameDataWrite, uint64_t Version) { /* Calculate size of sections. 
*/ const uint64_t DataSectionSize = __llvm_profile_get_data_size(DataBegin, DataEnd); diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index e9866d94b762c..f0a260483429c 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -152,7 +152,7 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ #define INSTR_PROF_DATA_DEFINED #endif INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) -INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +INSTR_PROF_RAW_HEADER(uint64_t, Version, Version) INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) INSTR_PROF_RAW_HEADER(uint64_t, NumData, NumData) INSTR_PROF_RAW_HEADER(uint64_t, PaddingBytesBeforeCounters, PaddingBytesBeforeCounters) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index ee1657ba8400e..f3c68fb17ce7c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -408,7 +408,10 @@ static GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS) { auto IRLevelVersionVariable = new GlobalVariable( M, IntTy64, true, GlobalValue::WeakAnyLinkage, Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName); - IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility); + if (isGPUProfTarget(M)) + IRLevelVersionVariable->setVisibility(GlobalValue::ProtectedVisibility); + else + IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility); Triple TT(M.getTargetTriple()); if (TT.supportsCOMDAT()) { IRLevelVersionVariable->setLinkage(GlobalValue::ExternalLinkage); diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index 292c582b45c52..f31aa15bfe6c9 
100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test +++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -1,7 +1,7 @@ // Header // // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) -// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version) // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index 705e5efaf5875..44be2980bb2f2 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -1,7 +1,7 @@ // Header // // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) -// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version) // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index 157c13b926a7e..9af9d65a6bdba 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -1,7 +1,7 @@ // Header // // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) -// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version) // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // 
INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index 83cf76f68fb63..49c5ae9b0931d 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -1,7 +1,7 @@ // Header // // INSTR_PROF_RAW_HEADER(uint64_t, Magic, __llvm_profile_get_magic()) -// INSTR_PROF_RAW_HEADER(uint64_t, Version, __llvm_profile_get_version()) +// INSTR_PROF_RAW_HEADER(uint64_t, Version, Version) // INSTR_PROF_RAW_HEADER(uint64_t, BinaryIdsSize, __llvm_write_binary_ids(NULL)) // INSTR_PROF_RAW_HEADER(uint64_t, DataSize, DataSize) // INSTR_PROF_RAW_HEADER(uint64_t, CountersSize, CountersSize) diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 1d7b9f80f9dfd..6daa8b1b85413 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -13,6 +13,7 @@ #ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H #define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H +#include #include #include "llvm/ADT/DenseMap.h" @@ -64,10 +65,13 @@ struct __llvm_profile_data { }; extern "C" { -extern int __attribute__((weak)) __llvm_write_custom_profile( - const char *Target, const __llvm_profile_data *DataBegin, - const __llvm_profile_data *DataEnd, const char *CountersBegin, - const char *CountersEnd, const char *NamesBegin, const char *NamesEnd); +extern int __attribute__((weak)) +__llvm_write_custom_profile(const char *Target, + const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, + const char *CountersBegin, const char *CountersEnd, + const char *NamesBegin, const char *NamesEnd, + const uint64_t 
*VersionOverride); } /// PGO profiling data extracted from a GPU device @@ -76,6 +80,7 @@ struct GPUProfGlobals { SmallVector<__llvm_profile_data> Data; SmallVector NamesData; Triple TargetTriple; + std::optional Version; void dump() const; Error write() const; diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index bca66cff6558a..93abd0a5cea36 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -16,6 +16,7 @@ #include "Shared/Utils.h" +#include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Error.h" #include @@ -214,6 +215,13 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) return Err; DeviceProfileData.Data.push_back(std::move(Data)); + } else if (*NameOrErr == INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)) { + uint64_t RawVersionData; + GlobalTy RawVersionGlobal(NameOrErr->str(), Sym.getSize(), + &RawVersionData); + if (auto Err = readGlobalFromDevice(Device, Image, RawVersionGlobal)) + return Err; + DeviceProfileData.Version = RawVersionData; } } return DeviceProfileData; @@ -267,6 +275,8 @@ Error GPUProfGlobals::write() const { CountsSize = Counts.size() * sizeof(int64_t); __llvm_profile_data *DataBegin, *DataEnd; char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd; + const uint64_t *VersionOverride = + Version.has_value() ? &Version.value() : nullptr; // Initialize array of contiguous data. 
We need to make sure each section is // contiguous so that the PGO library can compute deltas properly @@ -288,9 +298,9 @@ Error GPUProfGlobals::write() const { memcpy(NamesBegin, NamesData.data(), NamesData.size()); // Invoke compiler-rt entrypoint - int result = __llvm_write_custom_profile(TargetTriple.str().c_str(), - DataBegin, DataEnd, CountersBegin, - CountersEnd, NamesBegin, NamesEnd); + int result = __llvm_write_custom_profile( + TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin, + CountersEnd, NamesBegin, NamesEnd, VersionOverride); if (result != 0) return Plugin::error("Error writing GPU PGO data to file"); From 5bf437618c91c882543c97d34b468d74070218fa Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 31 May 2024 12:57:07 -0500 Subject: [PATCH 041/114] Add host/device combination test --- offload/test/offloading/{ => gpupgo}/pgo1.c | 14 ++-- offload/test/offloading/gpupgo/pgo2.c | 73 +++++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) rename offload/test/offloading/{ => gpupgo}/pgo1.c (83%) create mode 100644 offload/test/offloading/gpupgo/pgo2.c diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/gpupgo/pgo1.c similarity index 83% rename from offload/test/offloading/pgo1.c rename to offload/test/offloading/gpupgo/pgo1.c index a3a242c703d96..7c6e55f654642 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/gpupgo/pgo1.c @@ -1,14 +1,16 @@ // RUN: %libomptarget-compile-generic -fprofile-generate-gpu -// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 +// RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \ +// RUN: %libomptarget-run-generic 2>&1 // RUN: llvm-profdata show --all-functions --counts \ -// RUN: %target_triple.llvm.profraw | %fcheck-generic \ -// RUN: --check-prefix="LLVM-PGO" +// RUN: %target_triple.%basename_t.llvm.profraw | \ +// RUN: %fcheck-generic --check-prefix="LLVM-PGO" // RUN: %libomptarget-compile-generic 
-fprofile-instr-generate-gpu -// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 +// RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \ +// RUN: %libomptarget-run-generic 2>&1 // RUN: llvm-profdata show --all-functions --counts \ -// RUN: %target_triple.clang.profraw | %fcheck-generic \ -// RUN: --check-prefix="CLANG-PGO" +// RUN: %target_triple.%basename_t.clang.profraw | \ +// RUN: %fcheck-generic --check-prefix="CLANG-PGO" // UNSUPPORTED: x86_64-pc-linux-gnu // UNSUPPORTED: x86_64-pc-linux-gnu-LTO diff --git a/offload/test/offloading/gpupgo/pgo2.c b/offload/test/offloading/gpupgo/pgo2.c new file mode 100644 index 0000000000000..1819573d55024 --- /dev/null +++ b/offload/test/offloading/gpupgo/pgo2.c @@ -0,0 +1,73 @@ +// RUN: %libomptarget-compile-generic -fprofile-generate \ +// RUN: -fprofile-generate-gpu +// RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \ +// RUN: %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %basename_t.llvm.profraw | %fcheck-generic \ +// RUN: --check-prefix="LLVM-HOST" +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.%basename_t.llvm.profraw \ +// RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE" + +// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ +// RUN: -fprofile-instr-generate-gpu +// RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \ +// RUN: %libomptarget-run-generic 2>&1 +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %basename_t.clang.profraw | %fcheck-generic \ +// RUN: --check-prefix="CLANG-HOST" +// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %target_triple.%basename_t.clang.profraw | \ +// RUN: %fcheck-generic --check-prefix="CLANG-DEV" + +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// REQUIRES: pgo + +int main() { + int host_var = 0; + for 
(int i = 0; i < 20; i++) { + host_var += i; + } + + int device_var = 1; +#pragma omp target + for (int i = 0; i < 10; i++) { + device_var *= i; + } +} + +// LLVM-HOST-LABEL: main: +// LLVM-HOST: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-HOST: Counters: 3 +// LLVM-HOST: Block counts: [20, 1, 0] + +// LLVM-HOST-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-HOST: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-HOST: Counters: 2 +// LLVM-HOST: Block counts: [0, 0] + +// LLVM-DEVICE-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// LLVM-DEVICE: Hash: {{0[xX][0-9a-fA-F]+}} +// LLVM-DEVICE: Counters: 3 +// LLVM-DEVICE: Block counts: [10, 1, 1] + +// CLANG-HOST-LABEL: main: +// CLANG-HOST: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-HOST: Counters: 2 +// CLANG-HOST: Function count: 1 +// CLANG-HOST: Block counts: [20] + +// CLANG-HOST-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-HOST: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-HOST: Counters: 2 +// CLANG-HOST: Function count: 0 +// CLANG-HOST: Block counts: [0] + +// CLANG-DEV-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: +// CLANG-DEV: Hash: {{0[xX][0-9a-fA-F]+}} +// CLANG-DEV: Counters: 2 +// CLANG-DEV: Function count: 0 +// CLANG-DEV: Block counts: [11] From 253013792cb7137b11893e701497e8f62143123a Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 31 May 2024 16:59:22 -0500 Subject: [PATCH 042/114] Add PGO dump debug option --- offload/include/Shared/Environment.h | 1 + offload/plugins-nextgen/common/src/PluginInterface.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index d141146b6bd5a..86f6d1c6ea2d3 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t { FunctionTracing = 1U << 1, CommonIssues = 1U << 2, AllocationTracker = 1U << 3, + PGODump = 1U << 4, 
}; struct DeviceEnvironmentTy { diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index a479235e2c36c..a68875ea7748e 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -839,6 +839,10 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { if (!ProfOrErr) return ProfOrErr.takeError(); + if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) == + uint32_t(DeviceDebugKind::PGODump)) + ProfOrErr->dump(); + // Write data to profiling file if (auto Err = ProfOrErr->write()) { consumeError(std::move(Err)); From 344e357de657f54c068be969dcfc3ea33f2f026e Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 31 May 2024 20:29:20 -0500 Subject: [PATCH 043/114] Tighten PGO test requirements Require compiler-rt to be an enabled runtime --- offload/test/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt index 32df1e47afaeb..41ab339147791 100644 --- a/offload/test/CMakeLists.txt +++ b/offload/test/CMakeLists.txt @@ -12,10 +12,10 @@ else() set(LIBOMPTARGET_DEBUG False) endif() -if (OPENMP_STANDALONE_BUILD) - set(LIBOMPTARGET_TEST_GPU_PGO False) -else() +if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES) set(LIBOMPTARGET_TEST_GPU_PGO True) +else() + set(LIBOMPTARGET_TEST_GPU_PGO False) endif() # Replace the space from user's input with ";" in case that CMake add escape From 2f751420b9ad2ffc7c9fac4a645724b45cdae59a Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 31 May 2024 20:29:20 -0500 Subject: [PATCH 044/114] Tighten PGO test requirements Require compiler-rt to be an enabled runtime --- offload/test/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt index 
32df1e47afaeb..41ab339147791 100644 --- a/offload/test/CMakeLists.txt +++ b/offload/test/CMakeLists.txt @@ -12,10 +12,10 @@ else() set(LIBOMPTARGET_DEBUG False) endif() -if (OPENMP_STANDALONE_BUILD) - set(LIBOMPTARGET_TEST_GPU_PGO False) -else() +if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES) set(LIBOMPTARGET_TEST_GPU_PGO True) +else() + set(LIBOMPTARGET_TEST_GPU_PGO False) endif() # Replace the space from user's input with ";" in case that CMake add escape From 79ceacb6559a3f6ecf3fd7ec1abf768ddeb97d13 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 31 May 2024 20:29:20 -0500 Subject: [PATCH 045/114] Tighten PGO test requirements Require compiler-rt to be an enabled runtime --- offload/test/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt index 32df1e47afaeb..41ab339147791 100644 --- a/offload/test/CMakeLists.txt +++ b/offload/test/CMakeLists.txt @@ -12,10 +12,10 @@ else() set(LIBOMPTARGET_DEBUG False) endif() -if (OPENMP_STANDALONE_BUILD) - set(LIBOMPTARGET_TEST_GPU_PGO False) -else() +if (NOT OPENMP_STANDALONE_BUILD AND "compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES) set(LIBOMPTARGET_TEST_GPU_PGO True) +else() + set(LIBOMPTARGET_TEST_GPU_PGO False) endif() # Replace the space from user's input with ";" in case that CMake add escape From ff0dd62cf1b236f1373fd3b70ec2875c3719ca04 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 31 May 2024 22:07:20 -0500 Subject: [PATCH 046/114] Add note about PGO debug flag --- openmp/docs/design/Runtimes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index f8a8cb87e83e6..7fc697a838e22 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1493,3 +1493,4 @@ debugging features are supported. * Enable debugging assertions in the device. 
``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` * Enable device malloc statistics (amdgpu only). ``0x8`` + * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10`` From 0b9cc35d686f03fb8f835b2be2c4e16b630bd426 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 3 Jun 2024 22:15:46 -0500 Subject: [PATCH 047/114] Fix clang format --- .../plugins-nextgen/common/include/GlobalHandler.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 6daa8b1b85413..1b38ce65216dd 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -65,13 +65,11 @@ struct __llvm_profile_data { }; extern "C" { -extern int __attribute__((weak)) -__llvm_write_custom_profile(const char *Target, - const __llvm_profile_data *DataBegin, - const __llvm_profile_data *DataEnd, - const char *CountersBegin, const char *CountersEnd, - const char *NamesBegin, const char *NamesEnd, - const uint64_t *VersionOverride); +extern int __attribute__((weak)) __llvm_write_custom_profile( + const char *Target, const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, const char *NamesEnd, + const uint64_t *VersionOverride); } /// PGO profiling data extracted from a GPU device From 488cb4a349fdfbd73d0a78ddb2c17522c46145ba Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 26 Jun 2024 18:18:31 -0500 Subject: [PATCH 048/114] Apply requested formatting changes --- clang/lib/CodeGen/CodeGenPGO.cpp | 11 +++++----- llvm/lib/ProfileData/InstrProf.cpp | 4 ++-- .../Instrumentation/InstrProfiling.cpp | 10 ++++----- .../Instrumentation/PGOInstrumentation.cpp | 21 ++++++++++--------- offload/DeviceRTL/src/Profiling.cpp | 6 ++++-- 5 files changed, 28 insertions(+), 
24 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index a7ce0b8f6a35f..3edfbdd679c61 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1199,12 +1199,13 @@ void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S, // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); + auto *NormalizedFuncNameVarPtr = + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, llvm::PointerType::get(CGM.getLLVMContext(), 0)); - llvm::Value *Args[] = {NormalizedPtr, Builder.getInt64(FunctionHash), - Builder.getInt32(NumRegionCounters), - Builder.getInt32(Counter), StepV}; + llvm::Value *Args[] = { + NormalizedFuncNameVarPtr, Builder.getInt64(FunctionHash), + Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV}; if (llvm::EnableSingleByteCoverage) Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover), diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 1284efd4b5f4d..6742435c9d065 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -433,8 +433,8 @@ std::string getPGOFuncNameVarName(StringRef FuncName, } bool isGPUProfTarget(const Module &M) { - const auto &Triple = llvm::Triple(M.getTargetTriple()); - return Triple.isAMDGPU() || Triple.isNVPTX(); + const auto &T = Triple(M.getTargetTriple()); + return T.isAMDGPU() || T.isNVPTX(); } void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) { diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index dd8c027c4bbf6..05cef1236f087 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ 
b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -869,8 +869,8 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; auto *TLI = &GetTLI(*Ind->getFunction()); - auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( - DataVar, PointerType::getUnqual(M.getContext())); + auto *NormalizedDataVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + DataVar, PointerType::get(M.getContext(), 0)); // To support value profiling calls within Windows exception handlers, funclet // information contained within operand bundles needs to be copied over to @@ -879,12 +879,12 @@ void InstrLowerer::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { SmallVector OpBundles; Ind->getOperandBundlesAsDefs(OpBundles); if (!IsMemOpSize) { - Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr, + Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr, Builder.getInt32(Index)}; Call = Builder.CreateCall(getOrInsertValueProfilingCall(M, *TLI), Args, OpBundles); } else { - Value *Args[3] = {Ind->getTargetValue(), NormalizedPtr, + Value *Args[3] = {Ind->getTargetValue(), NormalizedDataVarPtr, Builder.getInt32(Index)}; Call = Builder.CreateCall( getOrInsertValueProfilingCall(M, *TLI, ValueProfilingCallType::MemOp), @@ -1580,7 +1580,7 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { ValuesVar->setAlignment(Align(8)); maybeSetComdat(ValuesVar, Fn, CntsVarName); ValuesPtrExpr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( - ValuesVar, PointerType::getUnqual(Fn->getContext())); + ValuesVar, PointerType::get(Fn->getContext(), 0)); } uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index ee1657ba8400e..f8f34ea25597f 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ 
b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -884,7 +884,7 @@ static void instrumentOneFunc( FuncInfo.FunctionHash); // Make sure that pointer to global is passed in with zero addrspace // This is relevant during GPU profiling - auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( Name, PointerType::get(M->getContext(), 0)); if (PGOFunctionEntryCoverage) { auto &EntryBB = F.getEntryBlock(); @@ -893,7 +893,7 @@ static void instrumentOneFunc( // i32 ) Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_cover), - {NormalizedPtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); + {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -948,7 +948,7 @@ static void instrumentOneFunc( // i32 ) Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_timestamp), - {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters), + {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; } @@ -963,7 +963,7 @@ static void instrumentOneFunc( Intrinsic::getDeclaration(M, PGOBlockCoverage ? 
Intrinsic::instrprof_cover : Intrinsic::instrprof_increment), - {NormalizedPtr, CFGHash, Builder.getInt32(NumCounters), + {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)}); } @@ -1007,15 +1007,15 @@ static void instrumentOneFunc( ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); assert(ToProfile && "value profiling Value is of unexpected type"); - auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( + auto *NormalizedNamePtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( Name, PointerType::get(M->getContext(), 0)); SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {NormalizedPtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, - Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, + {NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash), + ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); } } // IPVK_First <= Kind <= IPVK_Last @@ -1688,11 +1688,12 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { IRBuilder<> Builder(&SI); Type *Int64Ty = Builder.getInt64Ty(); auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty); - auto *NormalizedPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( - FuncNameVar, PointerType::get(M->getContext(), 0)); + auto *NormalizedFuncNameVarPtr = + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), - {NormalizedPtr, Builder.getInt64(FuncHash), + {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); } diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp index 799477f5e47d2..639c62ceff7a6 100644 --- a/offload/DeviceRTL/src/Profiling.cpp +++ 
b/offload/DeviceRTL/src/Profiling.cpp @@ -12,8 +12,10 @@ extern "C" { -void __llvm_profile_register_function(void *ptr) {} -void __llvm_profile_register_names_function(void *ptr, long int i) {} +// Provides empty implementations for certain functions in compiler-rt +// that are emitted by the PGO instrumentation. +void __llvm_profile_register_function(void *Ptr) {} +void __llvm_profile_register_names_function(void *Ptr, long int I) {} } #pragma omp end declare target From b90c01583f1893802aba0180b07a448584585365 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Wed, 26 Jun 2024 18:29:59 -0500 Subject: [PATCH 049/114] Add memop function shim to DeviceRTL This comes up sometimes when using LLVM IR level instrumentation. --- offload/DeviceRTL/include/Profiling.h | 1 + offload/DeviceRTL/src/Profiling.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h index 9efc1554c176b..d994752254121 100644 --- a/offload/DeviceRTL/include/Profiling.h +++ b/offload/DeviceRTL/include/Profiling.h @@ -15,6 +15,7 @@ extern "C" { void __llvm_profile_register_function(void *Ptr); void __llvm_profile_register_names_function(void *Ptr, long int I); +void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); } #endif diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp index 639c62ceff7a6..bb3caaadcc03d 100644 --- a/offload/DeviceRTL/src/Profiling.cpp +++ b/offload/DeviceRTL/src/Profiling.cpp @@ -16,6 +16,7 @@ extern "C" { // that are emitted by the PGO instrumentation. 
void __llvm_profile_register_function(void *Ptr) {} void __llvm_profile_register_names_function(void *Ptr, long int I) {} +void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} } #pragma omp end declare target From c68c6e2fa98a1fe608b88ed38f7db68eae804c5b Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 27 Jun 2024 02:04:27 -0500 Subject: [PATCH 050/114] Make requested changes --- compiler-rt/lib/profile/InstrProfiling.h | 2 +- compiler-rt/lib/profile/InstrProfilingFile.c | 1 - offload/plugins-nextgen/common/src/PluginInterface.cpp | 5 ++--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index ef1292a45bf01..eda3e9a673c1a 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -298,7 +298,7 @@ void __llvm_profile_set_dumped(); /*! * \brief Write custom target-specific profiling data to a seperate file. - * Used by libomptarget for GPU PGO. + * Used by offload PGO. */ int __llvm_write_custom_profile(const char *Target, const __llvm_profile_data *DataBegin, diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 784cb9af6169d..93436ecbabb40 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1321,7 +1321,6 @@ int __llvm_write_custom_profile(const char *Target, /* Prepend "TARGET." 
to current filename */ memcpy(TargetFilename, Target, TargetLength); TargetFilename[TargetLength] = '.'; - memcpy(TargetFilename, Target, TargetLength); memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength); TargetFilename[FilenameLength + 1 + TargetLength] = 0; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index c4e1e63777de8..445f4ad942bd4 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -843,9 +843,8 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { ProfOrErr->dump(); // Write data to profiling file - if (auto Err = ProfOrErr->write()) { - consumeError(std::move(Err)); - } + if (auto Err = ProfOrErr->write()) + return Err; } // Delete the memory manager before deinitializing the device. Otherwise, From ca52c58c7fde412897cf6b10b9bbb321812f193d Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Thu, 27 Jun 2024 02:26:20 -0500 Subject: [PATCH 051/114] Only dump counters if PGODump flag is set --- offload/include/Shared/Environment.h | 1 + offload/plugins-nextgen/common/src/PluginInterface.cpp | 4 +++- openmp/docs/design/Runtimes.rst | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index d141146b6bd5a..86f6d1c6ea2d3 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t { FunctionTracing = 1U << 1, CommonIssues = 1U << 2, AllocationTracker = 1U << 3, + PGODump = 1U << 4, }; struct DeviceEnvironmentTy { diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 445f4ad942bd4..35fb04863d874 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ 
-840,7 +840,9 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return ProfOrErr.takeError(); // Dump out profdata - ProfOrErr->dump(); + if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) == + uint32_t(DeviceDebugKind::PGODump)) + ProfOrErr->dump(); // Write data to profiling file if (auto Err = ProfOrErr->write()) diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index f8a8cb87e83e6..7fc697a838e22 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1493,3 +1493,4 @@ debugging features are supported. * Enable debugging assertions in the device. ``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` * Enable device malloc statistics (amdgpu only). ``0x8`` + * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10`` From ee4431a1b57469c7679f54f124ca5f3dd7f0433b Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 9 Aug 2024 20:21:38 -0500 Subject: [PATCH 052/114] Update requirements --- offload/test/offloading/pgo1.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index d674711326580..fbf6337374a99 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -12,10 +12,7 @@ // RUN: %target_triple.clang.profraw | %fcheck-generic \ // RUN: --check-prefix="CLANG-PGO" -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// REQUIRES: gpu // REQUIRES: pgo int test1(int a) { return a / 2; } From f9a24e35dfce2b18d0c4acefdaa0e71561bb875d Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 9 Aug 2024 20:30:58 -0500 Subject: [PATCH 053/114] Update test requirements --- offload/test/offloading/gpupgo/pgo1.c | 5 +---- offload/test/offloading/gpupgo/pgo2.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git 
a/offload/test/offloading/gpupgo/pgo1.c b/offload/test/offloading/gpupgo/pgo1.c index 7c6e55f654642..f5d8aee7908be 100644 --- a/offload/test/offloading/gpupgo/pgo1.c +++ b/offload/test/offloading/gpupgo/pgo1.c @@ -12,10 +12,7 @@ // RUN: %target_triple.%basename_t.clang.profraw | \ // RUN: %fcheck-generic --check-prefix="CLANG-PGO" -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// REQUIRES: gpu // REQUIRES: pgo int test1(int a) { return a / 2; } diff --git a/offload/test/offloading/gpupgo/pgo2.c b/offload/test/offloading/gpupgo/pgo2.c index 1819573d55024..b5d0f2120754a 100644 --- a/offload/test/offloading/gpupgo/pgo2.c +++ b/offload/test/offloading/gpupgo/pgo2.c @@ -20,10 +20,7 @@ // RUN: %target_triple.%basename_t.clang.profraw | \ // RUN: %fcheck-generic --check-prefix="CLANG-DEV" -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// REQUIRES: gpu // REQUIRES: pgo int main() { From fb699b6bca72d42359a304bcbba88f3564ae9ac9 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Sat, 10 Aug 2024 00:54:36 -0500 Subject: [PATCH 054/114] Merge changes --- offload/plugins-nextgen/common/src/GlobalHandler.cpp | 2 +- offload/test/offloading/pgo1.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index bca66cff6558a..d7bfbba01c8ef 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -193,7 +193,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, // Check if given current global is a profiling global based // on name - if (NameOrErr->equals(getInstrProfNamesVarName())) { + if (*NameOrErr == 
getInstrProfNamesVarName()) { // Read in profiled function names DeviceProfileData.NamesData = SmallVector(Sym.getSize(), 0); GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(), diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index fbf6337374a99..3270ce8f15e7d 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -32,17 +32,17 @@ int main() { // LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 4 -// LLVM-PGO: Block counts: [20, 10, 20, 10] +// LLVM-PGO: Block counts: [20, 10, 2, 1] // LLVM-PGO-LABEL: test1: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 1 -// LLVM-PGO: Block counts: [1] +// LLVM-PGO: Block counts: [10] // LLVM-PGO-LABEL: test2: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 1 -// LLVM-PGO: Block counts: [1] +// LLVM-PGO: Block counts: [20] // CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} From 5a671f685921b5cc02ced87a410645e8ad1b5c98 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 13:55:13 -0400 Subject: [PATCH 055/114] [KernelInfo] Implement new LLVM IR pass for GPU code analysis This patch implements an LLVM IR pass, named kernel-info, that reports various statistics for codes compiled for GPUs. The ultimate goal of these statistics is to help identify bad code patterns and ways to mitigate them. The pass operates at the LLVM IR level so that it can, in theory, support any LLVM-based compiler for programming languages supporting GPUs. It has been tested so far with LLVM IR generated by Clang for OpenMP offload codes targeting NVIDIA GPUs and AMD GPUs. By default, the pass is disabled. For convenience, `-kernel-info-end-lto` inserts it at the end of LTO, and options like `-Rpass=kernel-info` enable its remarks. 
Example opt and clang command lines appear in comments in `llvm/include/llvm/Analysis/KernelInfo.h`. Remarks include summary statistics (e.g., total size of static allocas) and individual occurrences (e.g., source location of each alloca). Examples of its output appear in tests in `llvm/test/Analysis/KernelInfo`. --- llvm/include/llvm/Analysis/KernelInfo.h | 148 ++++ llvm/include/llvm/Target/TargetMachine.h | 3 + llvm/lib/Analysis/CMakeLists.txt | 1 + llvm/lib/Analysis/KernelInfo.cpp | 350 ++++++++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 2 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 + llvm/lib/Target/TargetMachine.cpp | 5 + llvm/test/Analysis/KernelInfo/addrspace0.ll | 152 ++++ llvm/test/Analysis/KernelInfo/allocas.ll | 78 ++ llvm/test/Analysis/KernelInfo/calls.ll | 112 +++ .../kernel-info-after-lto/amdgpu.ll | 47 + .../KernelInfo/kernel-info-after-lto/nvptx.ll | 47 + .../KernelInfo/launch-bounds/amdgpu.ll | 40 + .../KernelInfo/launch-bounds/nvptx.ll | 36 + llvm/test/Analysis/KernelInfo/linkage.ll | 51 ++ .../test/Analysis/KernelInfo/openmp/README.md | 40 + .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 217 +++++ llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 811 ++++++++++++++++++ 20 files changed, 2161 insertions(+) create mode 100644 llvm/include/llvm/Analysis/KernelInfo.h create mode 100644 llvm/lib/Analysis/KernelInfo.cpp create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0.ll create mode 100644 llvm/test/Analysis/KernelInfo/allocas.ll create mode 100644 llvm/test/Analysis/KernelInfo/calls.ll create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll create mode 100644 llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll create mode 100644 llvm/test/Analysis/KernelInfo/linkage.ll create 
mode 100644 llvm/test/Analysis/KernelInfo/openmp/README.md create mode 100644 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/openmp/nvptx.ll diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h new file mode 100644 index 0000000000000..5495bb2fd4d92 --- /dev/null +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -0,0 +1,148 @@ +//=- KernelInfo.h - Kernel Analysis -------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter +// classes used to extract function properties from a GPU kernel. +// +// To analyze a C program as it appears to an LLVM GPU backend at the end of +// LTO: +// +// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ +// -Rpass=kernel-info -mllvm -kernel-info-end-lto +// +// To analyze specified LLVM IR, perhaps previously generated by something like +// 'clang -save-temps -g -fopenmp --offload-arch=native test.c': +// +// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ +// -pass-remarks=kernel-info -passes=kernel-info +// +// kernel-info can also be inserted into a specified LLVM pass pipeline using +// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline: +// +// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ +// -Rpass=kernel-info -mllvm -kernel-info-end-lto \ +// -Xoffload-linker --lto-newpm-passes='lto' +// +// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ +// -Rpass=kernel-info \ +// -Xoffload-linker --lto-newpm-passes='lto,module(kernel-info)' +// +// $ opt -disable-output 
test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ +// -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto' +// +// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ +// -pass-remarks=kernel-info -passes='lto,module(kernel-info)' +// ===---------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_KERNELINFO_H +#define LLVM_ANALYSIS_KERNELINFO_H + +#include "llvm/Analysis/OptimizationRemarkEmitter.h" + +namespace llvm { +class DominatorTree; +class Function; + +/// Data structure holding function info for kernels. +class KernelInfo { + void updateForBB(const BasicBlock &BB, int64_t Direction, + OptimizationRemarkEmitter &ORE); + +public: + static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); + + bool operator==(const KernelInfo &FPI) const { + return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0; + } + + bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); } + + /// If false, nothing was recorded here because the supplied function didn't + /// appear in a module compiled for a GPU. + bool IsValid = false; + + /// Whether the function has external linkage and is not a kernel function. + bool ExternalNotKernel = false; + + /// OpenMP Launch bounds. + ///@{ + std::optional OmpTargetNumTeams; + std::optional OmpTargetThreadLimit; + ///@} + + /// AMDGPU launch bounds. + ///@{ + std::optional AmdgpuMaxNumWorkgroupsX; + std::optional AmdgpuMaxNumWorkgroupsY; + std::optional AmdgpuMaxNumWorkgroupsZ; + std::optional AmdgpuFlatWorkGroupSizeMin; + std::optional AmdgpuFlatWorkGroupSizeMax; + std::optional AmdgpuWavesPerEuMin; + std::optional AmdgpuWavesPerEuMax; + ///@} + + /// NVPTX launch bounds. + ///@{ + std::optional Maxclusterrank; + std::optional Maxntidx; + ///@} + + /// The number of alloca instructions inside the function, the number of those + /// with allocation sizes that cannot be determined at compile time, and the + /// sum of the sizes that can be. 
+ /// + /// With the current implementation for at least some GPU archs, + /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in + /// case the implementation changes. + int64_t Allocas = 0; + int64_t AllocasDyn = 0; + int64_t AllocasStaticSizeSum = 0; + + /// Number of direct/indirect calls (anything derived from CallBase). + int64_t DirectCalls = 0; + int64_t IndirectCalls = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + /// Number of calls of type InvokeInst. + int64_t Invokes = 0; + + /// Number of addrspace(0) memory accesses (via load, store, etc.). + int64_t AddrspaceZeroAccesses = 0; +}; + +/// Analysis class for KernelInfo. +class KernelInfoAnalysis : public AnalysisInfoMixin { +public: + static AnalysisKey Key; + + using Result = const KernelInfo; + + KernelInfo run(Function &F, FunctionAnalysisManager &FAM) { + return KernelInfo::getKernelInfo(F, FAM); + } +}; + +/// Printer pass for KernelInfoAnalysis. +/// +/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled. 
+class KernelInfoPrinter : public PassInfoMixin { +public: + explicit KernelInfoPrinter() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) { + AM.getResult(F); + return PreservedAnalyses::all(); + } + + static bool isRequired() { return true; } +}; +} // namespace llvm +#endif // LLVM_ANALYSIS_KERNELINFO_H diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index c3e9d41315f61..5c338a8fcd0cf 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -18,6 +18,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" #include "llvm/Target/CGPassBuilderOption.h" @@ -27,6 +28,8 @@ #include #include +extern llvm::cl::opt KernelInfoEndLTO; + namespace llvm { class AAManager; diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index 2cb3547ec4047..02e76af8d903d 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -78,6 +78,7 @@ add_llvm_component_library(LLVMAnalysis InstructionPrecedenceTracking.cpp InstructionSimplify.cpp InteractiveModelRunner.cpp + KernelInfo.cpp LazyBranchProbabilityInfo.cpp LazyBlockFrequencyInfo.cpp LazyCallGraph.cpp diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp new file mode 100644 index 0000000000000..9df3b5b32afcb --- /dev/null +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -0,0 +1,350 @@ +//===- KernelInfo.cpp - Kernel Analysis -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter +// classes used to extract function properties from a kernel. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/KernelInfo.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" + +using namespace llvm; + +#define DEBUG_TYPE "kernel-info" + +static bool isKernelFunction(Function &F) { + // TODO: Is this general enough? Consider languages beyond OpenMP. + return F.hasFnAttribute("kernel"); +} + +static void identifyFunction(OptimizationRemark &R, const Function &F) { + if (auto *SubProgram = F.getSubprogram()) { + if (SubProgram->isArtificial()) + R << "artificial "; + } + R << "function '" << F.getName() << "'"; +} + +static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, + const AllocaInst &Alloca, + TypeSize::ScalarTy StaticSize) { + ORE.emit([&] { + StringRef Name; + DebugLoc Loc; + bool Artificial = false; + auto DVRs = findDVRDeclares(&const_cast(Alloca)); + if (!DVRs.empty()) { + const DbgVariableRecord &DVR = **DVRs.begin(); + Name = DVR.getVariable()->getName(); + Loc = DVR.getDebugLoc(); + Artificial = DVR.Variable->isArtificial(); + } + OptimizationRemark R(DEBUG_TYPE, "Alloca", DiagnosticLocation(Loc), + Alloca.getParent()); + R << "in "; + identifyFunction(R, Caller); + R << ", "; + if (Artificial) + R << "artificial "; + if (Name.empty()) { + R << "unnamed alloca "; + if (DVRs.empty()) + R << "(missing debug metadata) "; + } else { + R << "alloca '" << Name << "' "; + } + R 
<< "with "; + if (StaticSize) + R << "static size of " << itostr(StaticSize) << " bytes"; + else + R << "dynamic size"; + return R; + }); +} + +static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, + const CallBase &Call, StringRef CallKind, + StringRef RemarkKind) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); + R << "in "; + identifyFunction(R, Caller); + R << ", " << CallKind; + if (const Function *Callee = + dyn_cast_or_null(Call.getCalledOperand())) { + R << ", callee is"; + StringRef Name = Callee->getName(); + if (auto *SubProgram = Callee->getSubprogram()) { + if (SubProgram->isArtificial()) + R << " artificial"; + } + if (!Name.empty()) + R << " '" << Name << "'"; + else + R << " with unknown name"; + } + return R; + }); +} + +static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, + const Function &Caller, + const Instruction &Inst) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst); + R << "in "; + identifyFunction(R, Caller); + if (const IntrinsicInst *II = dyn_cast(&Inst)) { + R << ", '" << II->getCalledFunction()->getName() << "' call"; + } else { + R << ", '" << Inst.getOpcodeName() << "' instruction"; + } + if (Inst.hasName()) + R << " ('%" << Inst.getName() << "')"; + R << " accesses memory in addrspace(0)"; + return R; + }); +} + +void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, + OptimizationRemarkEmitter &ORE) { + assert(Direction == 1 || Direction == -1); + const Function &F = *BB.getParent(); + const Module &M = *F.getParent(); + const DataLayout &DL = M.getDataLayout(); + for (const Instruction &I : BB.instructionsWithoutDebug()) { + if (const AllocaInst *Alloca = dyn_cast(&I)) { + Allocas += Direction; + TypeSize::ScalarTy StaticSize = 0; + if (std::optional Size = Alloca->getAllocationSize(DL)) { + StaticSize = Size->getFixedValue(); + assert(StaticSize <= std::numeric_limits::max()); + AllocasStaticSizeSum += Direction * 
StaticSize; + } else { + AllocasDyn += Direction; + } + remarkAlloca(ORE, F, *Alloca, StaticSize); + } else if (const CallBase *Call = dyn_cast(&I)) { + std::string CallKind; + std::string RemarkKind; + if (Call->isIndirectCall()) { + IndirectCalls += Direction; + CallKind += "indirect"; + RemarkKind += "Indirect"; + } else { + DirectCalls += Direction; + CallKind += "direct"; + RemarkKind += "Direct"; + } + if (isa(Call)) { + Invokes += Direction; + CallKind += " invoke"; + RemarkKind += "Invoke"; + } else { + CallKind += " call"; + RemarkKind += "Call"; + } + if (!Call->isIndirectCall()) { + if (const Function *Callee = Call->getCalledFunction()) { + if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) { + DirectCallsToDefinedFunctions += Direction; + CallKind += " to defined function"; + RemarkKind += "ToDefinedFunction"; + } + } + } + remarkCall(ORE, F, *Call, CallKind, RemarkKind); + if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { + if (MI->getDestAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } else if (const AnyMemTransferInst *MT = + dyn_cast(MI)) { + if (MT->getSourceAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } + } + } else if (const LoadInst *Load = dyn_cast(&I)) { + if (Load->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } else if (const StoreInst *Store = dyn_cast(&I)) { + if (Store->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } else if (const AtomicRMWInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + } + } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { + if (At->getPointerAddressSpace() == 0) { + AddrspaceZeroAccesses += Direction; + remarkAddrspaceZeroAccess(ORE, F, I); + 
} + } + } +} + +static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, int64_t Value) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, Name, &F); + R << "in "; + identifyFunction(R, F); + R << ", " << Name << " = " << itostr(Value); + return R; + }); +} + +static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, std::optional Value) { + if (!Value) + return; + remarkProperty(ORE, F, Name, Value.value()); +} + +static std::vector> +parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) { + std::vector> Result(NumFields); + Attribute A = F.getFnAttribute(Name); + if (!A.isStringAttribute()) + return Result; + StringRef Rest = A.getValueAsString(); + for (unsigned I = 0; I < NumFields; ++I) { + StringRef Field; + std::tie(Field, Rest) = Rest.split(','); + if (Field.empty()) + break; + int64_t Val; + if (Field.getAsInteger(0, Val)) { + F.getContext().emitError("cannot parse integer in attribute '" + Name + + "': " + Field); + break; + } + Result[I] = Val; + } + if (!Rest.empty()) + F.getContext().emitError("too many fields in attribute " + Name); + return Result; +} + +static std::optional parseFnAttrAsInteger(Function &F, + StringRef Name) { + return parseFnAttrAsIntegerFields(F, Name, 1)[0]; +} + +// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp. Can we +// share? 
+static MDNode *getNVPTXMDNode(Function &F, StringRef Name) { + Module &M = *F.getParent(); + NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); + if (!MD) + return nullptr; + for (auto *Op : MD->operands()) { + if (Op->getNumOperands() != 3) + continue; + auto *KernelOp = dyn_cast(Op->getOperand(0)); + if (!KernelOp || KernelOp->getValue() != &F) + continue; + auto *Prop = dyn_cast(Op->getOperand(1)); + if (!Prop || Prop->getString() != Name) + continue; + return Op; + } + return nullptr; +} + +static std::optional parseNVPTXMDNodeAsInteger(Function &F, + StringRef Name) { + std::optional Result; + if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) { + auto *Op = cast(ExistingOp->getOperand(2)); + Result = cast(Op->getValue())->getZExtValue(); + } + return Result; +} + +KernelInfo KernelInfo::getKernelInfo(Function &F, + FunctionAnalysisManager &FAM) { + KernelInfo KI; + // Only analyze modules for GPUs. + // TODO: This would be more maintainable if there were an isGPU. + const std::string &TT = F.getParent()->getTargetTriple(); + llvm::Triple T(TT); + if (!T.isAMDGPU() && !T.isNVPTX()) + return KI; + KI.IsValid = true; + + // Record function properties. 
+ KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); + KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); + KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit"); + auto AmdgpuMaxNumWorkgroups = + parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3); + KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0]; + KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1]; + KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2]; + auto AmdgpuFlatWorkGroupSize = + parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2); + KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0]; + KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1]; + auto AmdgpuWavesPerEu = + parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2); + KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0]; + KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1]; + KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank"); + KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx"); + + const DominatorTree &DT = FAM.getResult(F); + auto &ORE = FAM.getResult(F); + for (const auto &BB : F) + if (DT.isReachableFromEntry(&BB)) + KI.updateForBB(BB, +1, ORE); + +#define REMARK_PROPERTY(PROP_NAME) \ + remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) + REMARK_PROPERTY(ExternalNotKernel); + REMARK_PROPERTY(OmpTargetNumTeams); + REMARK_PROPERTY(OmpTargetThreadLimit); + REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX); + REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY); + REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ); + REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin); + REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax); + REMARK_PROPERTY(AmdgpuWavesPerEuMin); + REMARK_PROPERTY(AmdgpuWavesPerEuMax); + REMARK_PROPERTY(Maxclusterrank); + REMARK_PROPERTY(Maxntidx); + REMARK_PROPERTY(Allocas); + REMARK_PROPERTY(AllocasStaticSizeSum); + REMARK_PROPERTY(AllocasDyn); + REMARK_PROPERTY(DirectCalls); + REMARK_PROPERTY(IndirectCalls); + REMARK_PROPERTY(DirectCallsToDefinedFunctions); 
+ REMARK_PROPERTY(Invokes); + REMARK_PROPERTY(AddrspaceZeroAccesses); +#undef REMARK_PROPERTY + + return KI; +} + +AnalysisKey KernelInfoAnalysis::Key; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 46f43f3de4705..61677f02783cc 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -44,6 +44,7 @@ #include "llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/InstCount.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Lint.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 0cec9fbd7cd05..dcfa732f410b3 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -278,6 +278,7 @@ FUNCTION_ANALYSIS( MachineFunctionAnalysis(static_cast(TM))) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) +FUNCTION_ANALYSIS("kernel-info", KernelInfoAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis()) @@ -374,6 +375,7 @@ FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass()); FUNCTION_PASS("kcfi", KCFIPass()) +FUNCTION_PASS("kernel-info", KernelInfoPrinter()) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("lint", LintPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0523fee5bcf9f..3b2ed9fe4236c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -40,6 +40,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include 
"llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" @@ -772,6 +773,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return onlyAllocateVGPRs; return nullptr; }); + + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { + if (KernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter()); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + }); } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 097e29527eed9..8d77c8e53f7a6 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -22,6 +22,7 @@ #include "NVPTXTargetTransformInfo.h" #include "TargetInfo/NVPTXTargetInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/KernelInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -238,6 +239,15 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { FPM.addPass(NVVMIntrRangePass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); + + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { + if (KernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter()); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } + }); } TargetTransformInfo diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index c0985f3be91a5..b235fd8f6f49a 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -26,6 
+26,11 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +cl::opt<bool> KernelInfoEndLTO( + "kernel-info-end-lto", + cl::desc("add the kernel-info pass at the end of the full LTO pipeline"), + cl::init(false), cl::Hidden); + //--------------------------------------------------------------------------- // TargetMachine Class // diff --git a/llvm/test/Analysis/KernelInfo/addrspace0.ll b/llvm/test/Analysis/KernelInfo/addrspace0.ll new file mode 100644 index 0000000000000..4c472396443f5 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/addrspace0.ll @@ -0,0 +1,152 @@ +; Check info on addrspace(0) memory accesses. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines --implicit-check-not='addrspace(0)' %s + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define void @f() !dbg !3 { +entry: + ; load + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0) + %0 = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0) + %load = load i32, ptr null, align 4, !dbg !6 + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in addrspace(0) + %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 + %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 + %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 + + ; store + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + store i32 0, ptr null, align 4, !dbg !7 + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + store i32 0, ptr addrspace(0) null, align 4, !dbg !7 + store i32 0, ptr addrspace(1) null, align 4, !dbg !7 + store i32 0, ptr addrspace(8) null, align 4, !dbg !7 + + ; atomicrmw + ; 
CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 + atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 + + ; cmpxchg + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 + + ; llvm.memcpy + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + 
call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.inline + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 + call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 + + ; llvm.memcpy.element.unordered.atomic + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr 
addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 + call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 + + ; llvm.memmove + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p1.i64(ptr 
addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 + call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 + + ; llvm.memmove.element.unordered.atomic + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 + call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 + + ; llvm.memset + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in 
addrspace(0) + call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.inline + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 + + ; llvm.memset.element.unordered.atomic + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12 + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12 + + ret void +} +; CHECK: remark: test.c:2:0: in function 'f', AddrspaceZeroAccesses = 36 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git", isOptimized: false, 
runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !5) +!4 = !DISubroutineType(types: !5) +!5 = !{} +!6 = !DILocation(line: 3, column: 11, scope: !3) +!7 = !DILocation(line: 4, column: 6, scope: !3) +!8 = !DILocation(line: 5, column: 1, scope: !3) +!9 = !DILocation(line: 6, column: 2, scope: !3) +!10 = !DILocation(line: 7, column: 3, scope: !3) +!11 = !DILocation(line: 8, column: 4, scope: !3) +!12 = !DILocation(line: 9, column: 5, scope: !3) diff --git a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll new file mode 100644 index 0000000000000..048d53799c33e --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -0,0 +1,78 @@ +; Check info on allocas. 
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define void @h() !dbg !3 { +entry: + ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca 'dyn_ptr' with static size of 8 bytes + %dyn_ptr.addr = alloca ptr, align 8 + ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !7, metadata !DIExpression()), !dbg !11 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !12, metadata !DIExpression()), !dbg !15 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !16, metadata !DIExpression()), !dbg !20 + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 3 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 20 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 0 + +define void @g() !dbg !21 { +entry: + ; CHECK: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes + %i = alloca i32, align 4 + ; CHECK: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !23, metadata !DIExpression()), !dbg !24 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !25, metadata !DIExpression()), !dbg !26 + ret void +} +; CHECK: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK: remark: test.c:3:0: in function 'g', AllocasDyn = 0 + +; Function Attrs: nocallback nofree nosync nounwind 
speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 + +; uselistorder directives +uselistorder ptr @llvm.dbg.declare, { 4, 3, 2, 1, 0 } + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !4, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !6) +!4 = distinct !DISubroutineType(types: !5) +!5 = !{null} +!6 = !{} +!7 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !3, type: !8, flags: DIFlagArtificial) +!8 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !9) +!9 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !10) +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!11 = !DILocation(line: 0, scope: !3) +!12 = !DILocalVariable(name: "i", scope: !13, file: !2, line: 14, type: !14) +!13 = distinct !DILexicalBlock(scope: !3, file: !2, line: 13, column: 3) +!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!15 = !DILocation(line: 14, column: 9, scope: !13) +!16 = !DILocalVariable(name: "a", scope: !13, file: !2, line: 15, type: !17) +!17 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, size: 64, elements: !18) +!18 = !{!19} +!19 = !DISubrange(count: 2) +!20 = !DILocation(line: 15, column: 9, scope: !13) +!21 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !22, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !6) +!22 = !DISubroutineType(types: !5) +!23 = 
!DILocalVariable(name: "i", scope: !21, file: !2, line: 4, type: !14) +!24 = !DILocation(line: 4, column: 7, scope: !21) +!25 = !DILocalVariable(name: "a", scope: !21, file: !2, line: 5, type: !17) +!26 = !DILocation(line: 5, column: 7, scope: !21) diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll new file mode 100644 index 0000000000000..6101a71254898 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -0,0 +1,112 @@ +; Check info on calls. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +declare void @personality() + +define void @h() personality ptr @personality !dbg !100 { +entry: + ; CHECK: remark: test.c:16:5: in artificial function 'h', direct call, callee is 'f' + call void @f(), !dbg !102 + ; CHECK: remark: test.c:17:5: in artificial function 'h', direct call to defined function, callee is 'g' + call void @g(), !dbg !104 + ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' + call void @h(), !dbg !105 + %0 = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call + call void %0(), !dbg !106 + ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 +fcont: + ; CHECK: remark: test.c:21:5: in artificial function 'h', direct invoke to defined function, callee is 'g' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !108 +gcont: + ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 +hcont: + ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke + 
invoke void %0() to label %end unwind label %cleanup, !dbg !110 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 6 +; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 4 + +declare void @f() + +define void @g() personality ptr @personality !dbg !200 { +entry: + ; CHECK: remark: test.c:6:3: in function 'g', direct call, callee is 'f' + call void @f(), !dbg !202 + ; CHECK: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' + call void @g(), !dbg !203 + ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' + call void @h(), !dbg !204 + %0 = load ptr, ptr null, align 8 + ; CHECK: remark: test.c:9:3: in function 'g', indirect call + call void %0(), !dbg !205 + ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' + invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 +fcont: + ; CHECK: remark: test.c:11:3: in function 'g', direct invoke to defined function, callee is 'g' + invoke void @g() to label %gcont unwind label %cleanup, !dbg !207 +gcont: + ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' + invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 +hcont: + ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke + invoke void %0() to label %end unwind label %cleanup, !dbg !209 +cleanup: + %ll = landingpad { ptr, i32 } + cleanup + br label %end +end: + ret void +} +; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 6 +; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 4 +; CHECK: 
remark: test.c:3:0: in function 'g', Invokes = 4 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} + +!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!101 = distinct !DISubroutineType(types: !3) +!102 = !DILocation(line: 16, column: 5, scope: !103) +!103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3) +!104 = !DILocation(line: 17, column: 5, scope: !103) +!105 = !DILocation(line: 18, column: 5, scope: !103) +!106 = !DILocation(line: 19, column: 5, scope: !103) +!107 = !DILocation(line: 20, column: 5, scope: !103) +!108 = !DILocation(line: 21, column: 5, scope: !103) +!109 = !DILocation(line: 22, column: 5, scope: !103) +!110 = !DILocation(line: 23, column: 5, scope: !103) + +!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!201 = !DISubroutineType(types: !3) +!202 = !DILocation(line: 6, column: 3, scope: !200) +!203 = !DILocation(line: 7, column: 3, scope: !200) +!204 = !DILocation(line: 8, column: 3, scope: !200) +!205 = !DILocation(line: 9, column: 3, scope: !200) +!206 = !DILocation(line: 10, column: 3, scope: !200) +!207 = !DILocation(line: 11, column: 3, scope: !200) +!208 = !DILocation(line: 12, column: 3, scope: !200) +!209 = !DILocation(line: 13, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll new file mode 
100644 index 0000000000000..7d190ece46e16 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll @@ -0,0 +1,47 @@ +; Check that -kernel-info-end-lto enables kernel-info in the AMD GPU target +; backend. + +; REQUIRES: amdgpu-registered-target + +; -kernel-info-end-lto inserts kernel-info into LTO pipeline. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; Omitting -kernel-info-end-lto disables kernel-info. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +; Omitting LTO disables kernel-info. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='default<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, 
spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll new file mode 100644 index 0000000000000..4e790123c313a --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll @@ -0,0 +1,47 @@ +; Check that -kernel-info-end-lto enables kernel-info in the NVPTX target +; backend. + +; REQUIRES: nvptx-registered-target + +; -kernel-info-end-lto inserts kernel-info into LTO pipeline. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; Omitting -kernel-info-end-lto disables kernel-info. +; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='lto<O2>' 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +; Omitting LTO disables kernel-info. 
+; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: -passes='default<O2>' -kernel-info-end-lto 2>&1 | \ +; RUN: FileCheck -allow-empty -check-prefixes=NONE %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll new file mode 100644 index 0000000000000..0c98f4ad45950 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -0,0 +1,40 @@ +; Check info on launch bounds for AMD GPU. 
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsX = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsY = 201 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsZ = 202 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMin = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMax = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMin = 220 +; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMax = 221 +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" + "amdgpu-max-num-workgroups"="200,201,202" + "amdgpu-flat-work-group-size"="210,211" + "amdgpu-waves-per-eu"="220,221" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, 
type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll new file mode 100644 index 0000000000000..c7339f90e3ca9 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -0,0 +1,36 @@ +; Check info on launch bounds for NVPTX. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210 +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" + "omp_target_thread_limit"="101" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!6, !7, !8} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = !{ptr @test, !"maxclusterrank", i32 200} +!7 = !{ptr @test, !"maxntidx", i32 210} +!8 = distinct !{ptr null, !"kernel", i32 1} diff --git 
a/llvm/test/Analysis/KernelInfo/linkage.ll b/llvm/test/Analysis/KernelInfo/linkage.ll new file mode 100644 index 0000000000000..43154d2379825 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/linkage.ll @@ -0,0 +1,51 @@ +; Check info on linkage. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK: remark: test.c:3:0: in function 'f', ExternalNotKernel = 1 +define external void @f() !dbg !10 { +entry: + ret void +} + +; CHECK: remark: test.c:13:0: in artificial function 'g', ExternalNotKernel = 1 +define void @g() !dbg !20 { +entry: + ret void +} + +; CHECK: remark: test.c:23:0: in function 'h', ExternalNotKernel = 0 +define external void @h() #0 !dbg !30 { +entry: + ret void +} + +; CHECK: remark: test.c:33:0: in artificial function 'i', ExternalNotKernel = 0 +define weak void @i() !dbg !40 { +entry: + ret void +} + +attributes #0 = { "kernel" } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{null} +!4 = !{} +!10 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 3, type: !11, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!11 = !DISubroutineType(types: !3) +!20 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 13, type: !21, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!21 = distinct !DISubroutineType(types: !3) +!30 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 23, type: !31, scopeLine: 
23, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!31 = distinct !DISubroutineType(types: !3) +!40 = distinct !DISubprogram(name: "i", scope: !2, file: !2, line: 33, type: !41, scopeLine: 33, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!41 = distinct !DISubroutineType(types: !3) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md new file mode 100644 index 0000000000000..0d13950e198ed --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -0,0 +1,40 @@ +The tests in this directory check that basic KernelInfoAnalysis functionality +behaves reasonably for LLVM IR produced by Clang OpenMP codegen. + +So that these tests are straightforward to maintain and faithfully represent +Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them. Other tests +more exhaustively check KernelInfoAnalysis features using reduced LLVM IR. + +The LLVM IR in each test file `$TEST` can be regenerated as follows in the case +that Clang OpenMP codegen changes or it becomes desirable to adjust the source +OpenMP program below. First, remove the existing LLVM IR from `$TEST`. 
Then, +where `$TARGET` (e.g., `nvptx64-nvidia-cuda` or `amdgcn-amd-amdhsa`) depends on +`$TEST`: + +``` +$ cd /tmp +$ cat test.c +#pragma omp declare target +void f(); +void g() { + int i; + int a[2]; + f(); + g(); +} +#pragma omp end declare target + +void h(int i) { + #pragma omp target map(tofrom:i) + { + int i; + int a[2]; + f(); + g(); + } +} + +$ clang -g -fopenmp -fopenmp-targets=$TARGET -save-temps -c test.c +$ llvm-dis test-openmp-$TARGET.bc +$ cat test-openmp-$TARGET.ll >> $TEST +``` diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll new file mode 100644 index 0000000000000..ee5f65b8e5ab7 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -0,0 +1,217 @@ +; See ./README.md for how to maintain the LLVM IR in this test. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; For some builds, we see a warning like: +; +; opt: WARNING: failed to create target machine for 'amdgcn-amd-amdhsa': unable to get target for 'amdgcn-amd-amdhsa', see --version and --triple. +; +; But there should be no other remarks here. 
+; CHECK-NOT: remark: + +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', 
IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; 
CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NOT: {{.}} + + +; ModuleID = 'test-openmp-amdgcn-amd-amdhsa.bc' +source_filename = "test.c" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } + +@__omp_rtl_debug_kind = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_71f35_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr addrspace(1) constant 
%struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 +@__omp_offloading_fd02_71f35_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_71f35_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_dynamic_environment to ptr) } +@__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !16 { +entry: + %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) + %i = alloca i32, align 4, addrspace(5) + %a = alloca [2 x i32], align 4, addrspace(5) + %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr + %i.ascast = addrspacecast ptr addrspace(5) %i to ptr + %a.ascast = addrspacecast ptr addrspace(5) %a to ptr + store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 + %0 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_kernel_environment to ptr), ptr %dyn_ptr), !dbg !26 + %exec_user_code = icmp eq i32 %0, -1, !dbg !26 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 + +user_code.entry: ; preds = %entry + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !27, metadata !DIExpression()), !dbg !30 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !31, metadata !DIExpression()), !dbg !35 + call void @f() #5, !dbg !36 + call void @g() #5, !dbg !37 + call void 
@__kmpc_target_deinit(), !dbg !38 + ret void, !dbg !39 + +worker.exit: ; preds = %entry + ret void, !dbg !26 +} + +declare i32 @__kmpc_target_init(ptr, ptr) + +; Function Attrs: convergent +declare void @f(...) #1 + +declare void @__kmpc_target_deinit() + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_71f35_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { +entry: + %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) + %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr + store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 + %0 = load ptr, ptr %dyn_ptr.addr.ascast, align 8, !dbg !43 + call void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr %0) #6, !dbg !43 + ret void, !dbg !43 +} + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !44 { +entry: + %i = alloca i32, align 4, addrspace(5) + %a = alloca [2 x i32], align 4, addrspace(5) + %i.ascast = addrspacecast ptr addrspace(5) %i to ptr + %a.ascast = addrspacecast ptr addrspace(5) %a to ptr + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !47, metadata !DIExpression()), !dbg !48 + tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !49, metadata !DIExpression()), !dbg !50 + call void @f() #5, !dbg !51 + call void @g() #5, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #4 + +attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #5 = { convergent } +attributes #6 = { nounwind } + +!llvm.dbg.cu = !{!0} +!omp_offload.info = !{!2} +!nvvm.annotations = !{!3} +!llvm.module.flags = !{!4, !5, !6, !7, !8, !9, !10, !11, !12} +!llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} +!opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: 
false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "eff61a7cf33c8dd1bd6933250fc90157") +!2 = !{i32 0, i32 64770, i32 466741, !"h", i32 12, i32 0, i32 0} +!3 = !{ptr @__omp_offloading_fd02_71f35_h_l12, !"kernel", i32 1} +!4 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!5 = !{i32 7, !"Dwarf Version", i32 5} +!6 = !{i32 2, !"Debug Info Version", i32 3} +!7 = !{i32 1, !"wchar_size", i32 4} +!8 = !{i32 7, !"openmp", i32 51} +!9 = !{i32 7, !"openmp-device", i32 51} +!10 = !{i32 8, !"PIC Level", i32 2} +!11 = !{i32 7, !"frame-pointer", i32 2} +!12 = !{i32 4, !"amdgpu_hostcall", i32 1} +!13 = !{!"clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)"} +!14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} +!15 = !{i32 2, i32 0} +!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!17 = !DIFile(filename: "test.c", directory: "/tmp") +!18 = !DISubroutineType(types: !19) +!19 = !{null, !20} +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!23 = !{} +!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !16, type: !20, flags: DIFlagArtificial) +!25 = !DILocation(line: 0, scope: !16) +!26 = !DILocation(line: 13, column: 3, scope: !16) +!27 = !DILocalVariable(name: "i", scope: !28, file: !17, line: 14, type: !29) +!28 = distinct !DILexicalBlock(scope: !16, file: !17, line: 13, column: 3) +!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!30 = !DILocation(line: 14, column: 9, scope: !28) +!31 = 
!DILocalVariable(name: "a", scope: !28, file: !17, line: 15, type: !32) +!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) +!33 = !{!34} +!34 = !DISubrange(count: 2) +!35 = !DILocation(line: 15, column: 9, scope: !28) +!36 = !DILocation(line: 16, column: 5, scope: !28) +!37 = !DILocation(line: 17, column: 5, scope: !28) +!38 = !DILocation(line: 18, column: 3, scope: !28) +!39 = !DILocation(line: 18, column: 3, scope: !16) +!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) +!42 = !DILocation(line: 0, scope: !40) +!43 = !DILocation(line: 12, column: 1, scope: !40) +!44 = distinct !DISubprogram(name: "g", scope: !17, file: !17, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !23) +!45 = !DISubroutineType(types: !46) +!46 = !{null} +!47 = !DILocalVariable(name: "i", scope: !44, file: !17, line: 4, type: !29) +!48 = !DILocation(line: 4, column: 7, scope: !44) +!49 = !DILocalVariable(name: "a", scope: !44, file: !17, line: 5, type: !32) +!50 = !DILocation(line: 5, column: 7, scope: !44) +!51 = !DILocation(line: 6, column: 3, scope: !44) +!52 = !DILocation(line: 7, column: 3, scope: !44) +!53 = !DILocation(line: 8, column: 1, scope: !44) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll new file mode 100644 index 0000000000000..41d068b03548b --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -0,0 +1,811 @@ +; See ./README.md for how to maintain the LLVM IR in this test. 
+ +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines %s + +; For some builds, we see a warning like: +; +; opt: WARNING: failed to create target machine for 'nvptx64-nvidia-cuda': unable to get target for 'nvptx64-nvidia-cuda', see --version and --triple. +; +; But there should be no other remarks here. +; CHECK-NOT: remark: + +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Maxntidx = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 
20 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 + +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
AddrspaceZeroAccesses = 2 + +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NOT: remark: {{.*: in function 'g',.*}} + +; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't +; want to maintain a list of their allocas, calls, etc. in this test. 
+ + +; ModuleID = 'test-openmp-nvptx64-nvidia-cuda.bc' +source_filename = "test.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, ptr } +%struct.DynamicEnvironmentTy = type { i16 } +%struct.KernelEnvironmentTy = type { %struct.ConfigurationEnvironmentTy, ptr, ptr } +%struct.ConfigurationEnvironmentTy = type { i8, i8, i8, i32, i32, i32, i32, i32, i32 } +%struct.DeviceMemoryPoolTy = type { ptr, i64 } +%struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 } +%struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 } +%"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } +%"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } +%"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } +%printf_args = type { ptr, i32, ptr, ptr, ptr } +%printf_args.7 = type { ptr, i32, ptr, ptr } + +@__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +@__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +@0 = private unnamed_addr constant [59 x i8] c";test.c;__omp_offloading_10305_5c00dd_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 58, ptr @0 }, align 8 +@__omp_offloading_10305_5c00dd_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_10305_5c00dd_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_10305_5c00dd_h_l12_dynamic_environment } +@llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr 
@__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" +@__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 +@__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 +@__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +@__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +@__omp_rtl_device_environment = weak protected addrspace(4) global %struct.DeviceEnvironmentTy undef, align 8 +@.str = private unnamed_addr constant [40 x i8] c"%s:%u: %s: Assertion %s (`%s`) failed.\0A\00", align 1 +@.str1 = private unnamed_addr constant [35 x i8] c"%s:%u: %s: Assertion `%s` failed.\0A\00", align 1 +@.str15 = private unnamed_addr constant [43 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Kernel.cpp\00", align 1 +@__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy = private unnamed_addr constant [36 x i8] c"void genericStateMachine(IdentTy *)\00", align 1 +@.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 +@IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 +@.str1127 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 +@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 +@_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 +@_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous 
namespace)::SharedMemorySmartStackTy" undef, align 16 +@.str544 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str847 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 +@.str948 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str1049 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1150 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1251 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1352 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 +@__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 +@.str1553 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 +@__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 +@_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 +@_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define internal void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr 
noalias noundef %dyn_ptr) #0 !dbg !17 { +entry: + %dyn_ptr.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + %a = alloca [2 x i32], align 4 + store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 + %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_10305_5c00dd_h_l12_kernel_environment, ptr %dyn_ptr), !dbg !26 + %exec_user_code = icmp eq i32 %0, -1, !dbg !26 + br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 + +user_code.entry: ; preds = %entry + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !27, metadata !DIExpression()), !dbg !30 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !31, metadata !DIExpression()), !dbg !35 + call void @f() #16, !dbg !36 + call void @g() #16, !dbg !37 + call void @__kmpc_target_deinit(), !dbg !38 + ret void, !dbg !39 + +worker.exit: ; preds = %entry + ret void, !dbg !26 +} + +; Function Attrs: convergent +declare void @f(...) 
#1 + +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected void @__omp_offloading_10305_5c00dd_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { +entry: + %dyn_ptr.addr = alloca ptr, align 8 + store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 + tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 + %0 = load ptr, ptr %dyn_ptr.addr, align 8, !dbg !43 + call void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr %0) #17, !dbg !43 + ret void, !dbg !43 +} + +; Function Attrs: convergent noinline nounwind optnone +define hidden void @g() #3 !dbg !44 { +entry: + %i = alloca i32, align 4 + %a = alloca [2 x i32], align 4 + tail call void @llvm.dbg.declare(metadata ptr %i, metadata !47, metadata !DIExpression()), !dbg !48 + tail call void @llvm.dbg.declare(metadata ptr %a, metadata !49, metadata !DIExpression()), !dbg !50 + call void @f() #16, !dbg !51 + call void @g() #16, !dbg !52 + ret void, !dbg !53 +} + +; Function Attrs: convergent mustprogress nounwind +define internal noundef i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %KernelEnvironment, ptr nofree noundef nonnull align 8 dereferenceable(16) %KernelLaunchEnvironment) #4 { +entry: + %WorkFn.i = alloca ptr, align 8 + %ExecMode = getelementptr inbounds i8, ptr %KernelEnvironment, i64 2 + %0 = load i8, ptr %ExecMode, align 2, !tbaa !54 + %1 = and i8 %0, 2 + %tobool.not = icmp eq i8 %1, 0 + %2 = load i8, ptr %KernelEnvironment, align 8, !tbaa !60 + %tobool3.not = icmp ne i8 %2, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i.i = icmp eq i32 %3, 0 + br i1 %cmp.i.i.i, label %if.then.i, label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge + +if.then.i: ; preds = %if.then + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + 
%idxprom.i.i = zext nneg i32 %3 to i64 + %arrayidx.i.i = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i + %4 = addrspacecast ptr %arrayidx.i.i to ptr addrspace(3) + store i8 0, ptr addrspace(3) %4, align 1, !tbaa !62 + store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to 
ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 + br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit + +_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge: ; preds = %if.then + %idxprom.i.i.c = zext i32 %3 to i64 + %arrayidx.i.i.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i.c + %5 = addrspacecast ptr %arrayidx.i.i.c to ptr addrspace(3) + store i8 0, ptr addrspace(3) %5, align 1, !tbaa !62 + br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit + +_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit: ; preds = %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge, %if.then.i + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 + br label %if.end + +if.else: ; preds = %entry + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %sub.i.i.i7 = add i32 %6, -1 + %and.i.i.i8 = and i32 %sub.i.i.i7, -32 + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i.i9 = icmp eq i32 %7, %and.i.i.i8 + br i1 %cmp.i.i.i9, label %if.then.i11, label %if.end.critedge + +if.then.i11: ; preds = %if.else + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %idxprom.i.i13 = zext i32 %7 to i64 + %arrayidx.i.i14 = getelementptr inbounds 
[1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13 + %8 = addrspacecast ptr %arrayidx.i.i14 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %8, align 1, !tbaa !62 + store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) 
addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 + br label %if.end + +if.end.critedge: ; preds = %if.else + %idxprom.i.i13.c = zext i32 %7 to i64 + %arrayidx.i.i14.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13.c + %9 = addrspacecast ptr %arrayidx.i.i14.c to ptr addrspace(3) + store i8 0, ptr addrspace(3) %9, align 1, !tbaa !62 + br label %if.end + +if.end: ; preds = %if.end.critedge, %if.then.i11, %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit + br i1 %tobool.not, label %if.end9, label %if.then7 + +if.then7: ; preds = %if.end + %10 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 + %11 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 + %and.i.i.i21 = and i32 %10, 1 + %and.i.i = and i32 %and.i.i.i21, %11 + %tobool.i.i = icmp ne i32 %and.i.i, 0 + %.pre67.i.i.i = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !80 + %cmp.i.i.i22 = icmp ne i32 %.pre67.i.i.i, 0 + %or.cond.not.i.i.i = select i1 %tobool.i.i, i1 %cmp.i.i.i22, i1 false + br i1 %or.cond.not.i.i.i, label %if.then.i.i.i, label %if.else.i.i.i + +if.then.i.i.i: ; preds = %if.then7 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str847, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) 
@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else.i.i.i: ; preds = %if.then7 + %cmp5.i.i.i = icmp eq i32 %.pre67.i.i.i, 0 + tail call void @llvm.assume(i1 noundef %cmp5.i.i.i) #21 + %12 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !81 + br i1 %tobool.i.i, label %land.lhs.true7.i.i.i, label %if.else11.i.i.i + +land.lhs.true7.i.i.i: ; preds = %if.else.i.i.i + %cmp9.i.i.i = icmp eq i32 %12, 0 + br i1 %cmp9.i.i.i, label %if.else11.i.i.i, label %if.then10.i.i.i + +if.then10.i.i.i: ; preds = %land.lhs.true7.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str948, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else11.i.i.i: ; preds = %land.lhs.true7.i.i.i, %if.else.i.i.i + %13 = phi i32 [ 0, %land.lhs.true7.i.i.i ], [ %12, %if.else.i.i.i ] + %cmp14.i.i.i = icmp eq i32 %13, 0 + tail call void @llvm.assume(i1 noundef %cmp14.i.i.i) #21 + %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !82 + br i1 %tobool.i.i, label %land.lhs.true17.i.i.i, label %if.else21.i.i.i + +land.lhs.true17.i.i.i: ; preds = %if.else11.i.i.i + %cmp19.i.i.i = icmp eq i32 %14, 0 + br i1 %cmp19.i.i.i, label %if.else21.i.i.i, label %if.then20.i.i.i + +if.then20.i.i.i: ; preds = %land.lhs.true17.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1049, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 195, ptr nofree noundef nonnull 
dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else21.i.i.i: ; preds = %land.lhs.true17.i.i.i, %if.else11.i.i.i + %15 = phi i32 [ 0, %land.lhs.true17.i.i.i ], [ %14, %if.else11.i.i.i ] + %cmp24.i.i.i = icmp eq i32 %15, 0 + tail call void @llvm.assume(i1 noundef %cmp24.i.i.i) #21 + %16 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !83 + br i1 %tobool.i.i, label %land.lhs.true27.i.i.i, label %if.else31.i.i.i + +land.lhs.true27.i.i.i: ; preds = %if.else21.i.i.i + %cmp29.i.i.i = icmp eq i32 %16, 1 + br i1 %cmp29.i.i.i, label %if.else31.i.i.i, label %if.then30.i.i.i + +if.then30.i.i.i: ; preds = %land.lhs.true27.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1150, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else31.i.i.i: ; preds = %land.lhs.true27.i.i.i, %if.else21.i.i.i + %17 = phi i32 [ 1, %land.lhs.true27.i.i.i ], [ %16, %if.else21.i.i.i ] + %cmp34.i.i.i = icmp eq i32 %17, 1 + tail call void @llvm.assume(i1 noundef %cmp34.i.i.i) #21 + %18 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !84 + br i1 %tobool.i.i, label %land.lhs.true37.i.i.i, label %if.else.critedge.i.critedge.critedge.critedge + +land.lhs.true37.i.i.i: ; preds = %if.else31.i.i.i + %cmp39.i.i.i = icmp eq i32 %18, 1 + br i1 %cmp39.i.i.i, label %if.else41.i.i.i, label %if.then40.i.i.i + +if.then40.i.i.i: ; preds = %land.lhs.true37.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull 
dereferenceable(33) @.str1251, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else41.i.i.i: ; preds = %land.lhs.true37.i.i.i + %cmp44.i.i.i = icmp eq i32 1, 1 + tail call void @llvm.assume(i1 noundef %cmp44.i.i.i) #21 + br i1 %tobool.i.i, label %land.lhs.true47.i.i.i, label %if.else.critedge.i.critedge + +land.lhs.true47.i.i.i: ; preds = %if.else41.i.i.i + %19 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !85 + %cmp49.i.i.i = icmp eq i32 %19, 1 + br i1 %cmp49.i.i.i, label %if.else51.i.i.i, label %if.then50.i.i.i + +if.then50.i.i.i: ; preds = %land.lhs.true47.i.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1352, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + unreachable + +if.else51.i.i.i: ; preds = %land.lhs.true47.i.i.i + br i1 %tobool.i.i, label %land.lhs.true.i.i, label %if.else.critedge.i.critedge + +land.lhs.true.i.i: ; preds = %if.else51.i.i.i + %20 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 + %cmp.i.i = icmp eq i32 %20, 1 + br i1 %cmp.i.i, label %land.lhs.true8.i.i, label %if.then.i.i + +if.then.i.i: ; preds = %land.lhs.true.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 222, ptr nofree noundef nonnull 
dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + unreachable + +land.lhs.true8.i.i: ; preds = %land.lhs.true.i.i + %21 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 + %cmp10.i.i = icmp eq i32 %21, 0 + br i1 %cmp10.i.i, label %land.lhs.true.i24, label %if.then11.i.i + +if.then11.i.i: ; preds = %land.lhs.true8.i.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1553, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + unreachable + +land.lhs.true.i24: ; preds = %land.lhs.true8.i.i + %22 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %tobool.i25.i.not = icmp eq i32 %22, 0 + br i1 %tobool.i25.i.not, label %if.then.i25, label %_ZN4ompx5state18assumeInitialStateEb.exit + +if.then.i25: ; preds = %land.lhs.true.i24 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 + unreachable + +if.else.critedge.i.critedge.critedge.critedge: ; preds = %if.else31.i.i.i + %cmp44.i.i.i.c = icmp eq i32 %18, 1 + tail call void @llvm.assume(i1 noundef %cmp44.i.i.i.c) #21 + br label %if.else.critedge.i.critedge + +if.else.critedge.i.critedge: ; preds = %if.else41.i.i.i, %if.else.critedge.i.critedge.critedge.critedge, %if.else51.i.i.i + %.pre.i = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %23 = icmp ne i32 %.pre.i, 0 + br label %_ZN4ompx5state18assumeInitialStateEb.exit + 
+_ZN4ompx5state18assumeInitialStateEb.exit: ; preds = %land.lhs.true.i24, %if.else.critedge.i.critedge + %cmp8.i = phi i1 [ %23, %if.else.critedge.i.critedge ], [ true, %land.lhs.true.i24 ] + tail call void @llvm.assume(i1 noundef %cmp8.i) #21 + tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 + br label %cleanup + +if.end9: ; preds = %if.end + %24 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %sub.i.i = add i32 %24, -1 + %and.i.i26 = and i32 %sub.i.i, -32 + %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i27 = icmp eq i32 %25, %and.i.i26 + br i1 %cmp.i.i27, label %cleanup, label %if.end12 + +if.end12: ; preds = %if.end9 + %sub.i = add i32 %24, -32 + %cmp = icmp ult i32 %25, %sub.i + %or.cond33 = and i1 %tobool3.not, %cmp + br i1 %or.cond33, label %do.body.i.preheader, label %cleanup + +do.body.i.preheader: ; preds = %if.end12 + %26 = load i32, ptr @__omp_rtl_debug_kind, align 4 + %27 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8 + %and.i.i29 = and i32 %26, 1 + %and.i = and i32 %and.i.i29, %27 + %tobool.i = icmp ne i32 %and.i, 0 + br label %do.body.i + +do.body.i: ; preds = %do.body.i.preheader, %if.end9.i + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn.i) #22 + store ptr null, ptr %WorkFn.i, align 8, !tbaa !76 + tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 + %call1.i = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn.i) #22 + %28 = load ptr, ptr %WorkFn.i, align 8, !tbaa !76 + %tobool.not.not.i = icmp eq ptr %28, null + br i1 %tobool.not.not.i, label %_ZL19genericStateMachineP7IdentTy.exit, label %if.end.i + +if.end.i: ; preds = %do.body.i + br i1 %call1.i, label %if.then3.i, label %if.end9.i + +if.then3.i: ; preds = %if.end.i + %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %tobool.i30 = icmp ne i32 %29, 0 + 
%or.cond = select i1 %tobool.i, i1 %tobool.i30, i1 false + br i1 %or.cond, label %if.then6.i, label %if.else.i + +if.then6.i: ; preds = %if.then3.i + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 58, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 + unreachable + +if.else.i: ; preds = %if.then3.i + %tobool.i31.not = icmp eq i32 %29, 0 + tail call void @llvm.assume(i1 noundef %tobool.i31.not) #21 + tail call void %28(i32 noundef 0, i32 noundef %25) #23 + tail call void @__kmpc_kernel_end_parallel() #24 + br label %if.end9.i + +if.end9.i: ; preds = %if.else.i, %if.end.i + tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 + br label %do.body.i, !llvm.loop !86 + +_ZL19genericStateMachineP7IdentTy.exit: ; preds = %do.body.i + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 + br label %cleanup + +cleanup: ; preds = %if.end12, %_ZL19genericStateMachineP7IdentTy.exit, %if.end9, %_ZN4ompx5state18assumeInitialStateEb.exit + %retval.0 = phi i32 [ -1, %_ZN4ompx5state18assumeInitialStateEb.exit ], [ -1, %if.end9 ], [ %25, %_ZL19genericStateMachineP7IdentTy.exit ], [ %25, %if.end12 ] + ret i32 %retval.0 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5 + +; Function Attrs: convergent mustprogress noinline norecurse nounwind +define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %Ordering) local_unnamed_addr #6 { +entry: + tail call void @llvm.nvvm.barrier0() #25 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5 + 
+; Function Attrs: convergent mustprogress noreturn nounwind +define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %expr, ptr noundef %msg, ptr nofree noundef nonnull dereferenceable(69) %file, i32 noundef %line, ptr nofree noundef nonnull dereferenceable(20) %function) unnamed_addr #7 { +entry: + %tmp = alloca %printf_args, align 8 + %tmp1 = alloca %printf_args.7, align 8 + %tobool.not = icmp eq ptr %msg, null + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %entry + store ptr %file, ptr %tmp, align 8 + %0 = getelementptr inbounds i8, ptr %tmp, i64 8 + store i32 %line, ptr %0, align 8 + %1 = getelementptr inbounds i8, ptr %tmp, i64 16 + store ptr %function, ptr %1, align 8 + br label %if.end + +if.else: ; preds = %entry + store ptr %file, ptr %tmp1, align 8 + %2 = getelementptr inbounds i8, ptr %tmp1, i64 8 + store i32 %line, ptr %2, align 8 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %.sink12 = phi i64 [ 16, %if.else ], [ 24, %if.then ] + %tmp1.sink11 = phi ptr [ %tmp1, %if.else ], [ %tmp, %if.then ] + %function.sink = phi ptr [ %function, %if.else ], [ %msg, %if.then ] + %.sink9 = phi i64 [ 24, %if.else ], [ 32, %if.then ] + %.str1.sink = phi ptr [ @.str1, %if.else ], [ @.str, %if.then ] + %3 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink12 + store ptr %function.sink, ptr %3, align 8 + %4 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink9 + store ptr %expr, ptr %4, align 8 + %call.i.i = call noundef i32 @vprintf(ptr noundef nonnull %.str1.sink, ptr noundef nonnull %tmp1.sink11) #24 + call void @llvm.trap() #26 + unreachable +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) +declare void @llvm.assume(i1 noundef) #8 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #9 + +; Function Attrs: convergent 
nocallback nounwind +declare void @llvm.nvvm.barrier.sync(i32) #10 + +; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) +define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) local_unnamed_addr #11 { +entry: + %0 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 + store ptr %0, ptr %WorkFn, align 8, !tbaa !76 + %tobool.not = icmp eq ptr %0, null + br i1 %tobool.not, label %return, label %if.end + +if.end: ; preds = %entry + %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 + %2 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !61 + %tobool.not.i = icmp eq i32 %2, 0 + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %4 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %tobool.i.not.i.i = icmp eq i32 %4, 0 + %mul.neg.i.i.i = select i1 %tobool.i.not.i.i, i32 -32, i32 0 + %sub.i.i.i = add i32 %mul.neg.i.i.i, %3 + %cond.i = select i1 %tobool.not.i, i32 %sub.i.i.i, i32 %2 + %cmp = icmp ult i32 %1, %cond.i + br label %return + +return: ; preds = %if.end, %entry + %retval.0 = phi i1 [ %cmp, %if.end ], [ false, %entry ] + ret i1 %retval.0 +} + +; Function Attrs: convergent mustprogress noinline nounwind +define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #12 { +entry: + %0 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 + %1 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 + %and.i.i = and i32 %0, 1 + %and.i = and i32 %and.i.i, %1 + %tobool.i = icmp ne i32 %and.i, 0 + %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + 
%tobool.i1 = icmp ne i32 %2, 0 + %or.cond = select i1 %tobool.i, i1 %tobool.i1, i1 false + br i1 %or.cond, label %if.then, label %if.else + +if.then: ; preds = %entry + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 297, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + unreachable + +if.else: ; preds = %entry + %tobool.i2.not = icmp eq i32 %2, 0 + tail call void @llvm.assume(i1 noundef %tobool.i2.not) #21 + %3 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !61 + %tobool.not.i.i = icmp eq i32 %3, 0 + %4 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8 + %tobool.not.i = icmp ne i32 %4, 0 + %or.cond.not.i = select i1 %tobool.not.i.i, i1 %tobool.not.i, i1 false + br i1 %or.cond.not.i, label %lor.rhs.i, label %_ZN4ompx5state19resetStateForThreadEj.exit + +lor.rhs.i: ; preds = %if.else + %5 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 + %6 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + %idxprom.i = zext i32 %5 to i64 + %arrayidx.i = getelementptr inbounds ptr, ptr %6, i64 %idxprom.i + %7 = load ptr, ptr %arrayidx.i, align 8, !tbaa !76 + %tobool1.not.i = icmp eq ptr %7, null + br i1 %tobool1.not.i, label %_ZN4ompx5state19resetStateForThreadEj.exit, label %if.end4.i, !prof !88 + +if.end4.i: ; preds = %lor.rhs.i + %PreviousThreadState7.i = getelementptr inbounds i8, ptr %7, i64 32 + %8 = load ptr, ptr %PreviousThreadState7.i, align 8, !tbaa !89 + tail call void @free(ptr noundef nonnull dereferenceable(40) %7) #28 + %9 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 + %arrayidx11.i = getelementptr inbounds ptr, ptr %9, i64 %idxprom.i + store ptr 
%8, ptr %arrayidx11.i, align 8, !tbaa !76 + %.pre = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + br label %_ZN4ompx5state19resetStateForThreadEj.exit + +_ZN4ompx5state19resetStateForThreadEj.exit: ; preds = %if.else, %lor.rhs.i, %if.end4.i + %10 = phi i32 [ 0, %if.else ], [ 0, %lor.rhs.i ], [ %.pre, %if.end4.i ] + %tobool.i6 = icmp ne i32 %10, 0 + %or.cond8 = select i1 %tobool.i, i1 %tobool.i6, i1 false + br i1 %or.cond8, label %if.then7, label %if.else8 + +if.then7: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 300, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + unreachable + +if.else8: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit + %tobool.i7.not = icmp eq i32 %10, 0 + tail call void @llvm.assume(i1 noundef %tobool.i7.not) #21 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #9 + +; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) +declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #13 + +; Function Attrs: convergent +declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #14 + +; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write) +declare void @llvm.trap() #15 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #10 + +; Function Attrs: convergent mustprogress nounwind +define internal void @__kmpc_target_deinit() #4 { +entry: + %WorkFn = alloca ptr, align 8 + %0 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 + %tobool.i.not = icmp eq i32 %0, 0 + br i1 %tobool.i.not, label %if.end, label 
%cleanup + +if.end: ; preds = %entry + %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 + %sub.i.i = add i32 %1, -1 + %and.i.i = and i32 %sub.i.i, -32 + %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %cmp.i.i = icmp eq i32 %2, %and.i.i + br i1 %cmp.i.i, label %if.then3, label %if.else + +if.then3: ; preds = %if.end + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 + br label %cleanup + +if.else: ; preds = %if.end + %3 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 + %4 = load i8, ptr %3, align 8, !tbaa !91 + %tobool6.not = icmp eq i8 %4, 0 + br i1 %tobool6.not, label %if.then7, label %cleanup + +if.then7: ; preds = %if.else + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn) #29 + store ptr null, ptr %WorkFn, align 8, !tbaa !76 + %call8 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) #22 + %5 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 + %6 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 + %and.i.i1 = and i32 %5, 1 + %and.i = and i32 %and.i.i1, %6 + %tobool.i2.not = icmp eq i32 %and.i, 0 + %7 = load ptr, ptr %WorkFn, align 8 + %cmp = icmp eq ptr %7, null + %or.cond = select i1 %tobool.i2.not, i1 true, i1 %cmp + br i1 %or.cond, label %if.else11, label %if.then10 + +if.then10: ; preds = %if.then7 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 150, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 + unreachable + +if.else11: ; preds = %if.then7 + tail call void @llvm.assume(i1 noundef %cmp) #21 + call void 
@llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn) #22 + br label %cleanup + +cleanup: ; preds = %if.else11, %if.else, %if.then3, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #5 + +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } +attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #6 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #7 = { convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes 
#8 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #10 = { convergent nocallback nounwind } +attributes #11 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #12 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #13 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #14 = { convergent "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #15 = { cold noreturn nounwind memory(inaccessiblemem: write) } +attributes #16 = { convergent } +attributes #17 = { nounwind } +attributes #18 = { "llvm.assume"="ompx_no_call_asm" } +attributes #19 = { convergent nounwind "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #20 = { noreturn nounwind "llvm.assume"="ompx_no_call_asm" } +attributes #21 = { memory(write) "llvm.assume"="ompx_no_call_asm" } +attributes #22 = { nounwind "llvm.assume"="ompx_no_call_asm" } +attributes #23 = { convergent nounwind } +attributes #24 = { convergent nounwind "llvm.assume"="ompx_no_call_asm" } +attributes #25 = { 
"llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #26 = { noreturn "llvm.assume"="ompx_no_call_asm" } +attributes #27 = { nofree willreturn "llvm.assume"="ompx_no_call_asm" } +attributes #28 = { convergent nounwind willreturn "llvm.assume"="ompx_no_call_asm" } +attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} +!llvm.dbg.cu = !{!10} +!nvvm.annotations = !{!12, !13} +!omp_offload.info = !{!14} +!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} + +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} +!1 = !{i32 7, !"Dwarf Version", i32 2} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"wchar_size", i32 4} +!4 = !{i32 7, !"openmp", i32 51} +!5 = !{i32 7, !"openmp-device", i32 51} +!6 = !{i32 8, !"PIC Level", i32 2} +!7 = !{i32 7, !"frame-pointer", i32 2} +!8 = !{i32 1, !"ThinLTO", i32 0} +!9 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!10 = distinct !DICompileUnit(language: DW_LANG_C11, file: !11, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!11 = !DIFile(filename: "test.c", directory: "/tmp") +!12 = !{ptr @__omp_offloading_10305_5c00dd_h_l12_debug__, !"maxntidx", i32 128} +!13 = !{ptr @__omp_offloading_10305_5c00dd_h_l12, !"kernel", i32 1} +!14 = !{i32 0, i32 66309, i32 6029533, !"h", i32 12, i32 0, i32 0} +!15 = !{!"clang version 19.0.0git"} +!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!17 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12_debug__", scope: !11, file: !11, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) +!18 = !DISubroutineType(types: !19) +!19 = !{null, !20} +!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) +!21 = 
!DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!23 = !{} +!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !17, type: !20, flags: DIFlagArtificial) +!25 = !DILocation(line: 0, scope: !17) +!26 = !DILocation(line: 13, column: 3, scope: !17) +!27 = !DILocalVariable(name: "i", scope: !28, file: !11, line: 14, type: !29) +!28 = distinct !DILexicalBlock(scope: !17, file: !11, line: 13, column: 3) +!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!30 = !DILocation(line: 14, column: 9, scope: !28) +!31 = !DILocalVariable(name: "a", scope: !28, file: !11, line: 15, type: !32) +!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) +!33 = !{!34} +!34 = !DISubrange(count: 2) +!35 = !DILocation(line: 15, column: 9, scope: !28) +!36 = !DILocation(line: 16, column: 5, scope: !28) +!37 = !DILocation(line: 17, column: 5, scope: !28) +!38 = !DILocation(line: 18, column: 3, scope: !28) +!39 = !DILocation(line: 18, column: 3, scope: !17) +!40 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12", scope: !11, file: !11, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) +!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) +!42 = !DILocation(line: 0, scope: !40) +!43 = !DILocation(line: 12, column: 1, scope: !40) +!44 = distinct !DISubprogram(name: "g", scope: !11, file: !11, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !10, retainedNodes: !23) +!45 = !DISubroutineType(types: !46) +!46 = !{null} +!47 = !DILocalVariable(name: "i", scope: !44, file: !11, line: 4, type: !29) +!48 = !DILocation(line: 4, column: 7, scope: !44) +!49 = !DILocalVariable(name: "a", scope: !44, file: !11, line: 5, type: !32) +!50 = !DILocation(line: 5, column: 7, scope: !44) +!51 = 
!DILocation(line: 6, column: 3, scope: !44) +!52 = !DILocation(line: 7, column: 3, scope: !44) +!53 = !DILocation(line: 8, column: 1, scope: !44) +!54 = !{!55, !58, i64 2} +!55 = !{!"_ZTS26ConfigurationEnvironmentTy", !56, i64 0, !56, i64 1, !58, i64 2, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} +!56 = !{!"omnipotent char", !57, i64 0} +!57 = !{!"Simple C++ TBAA"} +!58 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !56, i64 0} +!59 = !{!"int", !56, i64 0} +!60 = !{!55, !56, i64 0} +!61 = !{!59, !59, i64 0} +!62 = !{!56, !56, i64 0} +!63 = !{!64, !59, i64 0} +!64 = !{!"_ZTSN4ompx5state11TeamStateTyE", !65, i64 0, !59, i64 28, !59, i64 32, !66, i64 40} +!65 = !{!"_ZTSN4ompx5state10ICVStateTyE", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} +!66 = !{!"any pointer", !56, i64 0} +!67 = !{!64, !59, i64 4} +!68 = !{!64, !59, i64 8} +!69 = !{!64, !59, i64 12} +!70 = !{!64, !59, i64 16} +!71 = !{!64, !59, i64 20} +!72 = !{!64, !59, i64 24} +!73 = !{!64, !59, i64 28} +!74 = !{!64, !59, i64 32} +!75 = !{!64, !66, i64 40} +!76 = !{!66, !66, i64 0} +!77 = !{!78, !59, i64 0} +!78 = !{!"_ZTS19DeviceEnvironmentTy", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !79, i64 16, !79, i64 24, !79, i64 32, !79, i64 40} +!79 = !{!"long", !56, i64 0} +!80 = !{!65, !59, i64 0} +!81 = !{!65, !59, i64 4} +!82 = !{!65, !59, i64 8} +!83 = !{!65, !59, i64 16} +!84 = !{!65, !59, i64 20} +!85 = !{!65, !59, i64 24} +!86 = distinct !{!86, !87} +!87 = !{!"llvm.loop.mustprogress"} +!88 = !{!"branch_weights", i32 2000, i32 1} +!89 = !{!90, !66, i64 32} +!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !65, i64 0, !66, i64 32} +!91 = !{!92, !56, i64 0} +!92 = !{!"_ZTS19KernelEnvironmentTy", !55, i64 0, !66, i64 32, !66, i64 40} From a7656de882610df9a7f1e60c65ce214cef70a32a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 12 Aug 2024 17:40:35 -0400 Subject: [PATCH 056/114] Move docs to KernelInfo.rst --- llvm/docs/KernelInfo.rst | 61 +++++++++++++++++++++++++ llvm/include/llvm/Analysis/KernelInfo.h | 29 +----------- 2 files changed, 62 insertions(+), 28 deletions(-) create mode 100644 llvm/docs/KernelInfo.rst diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst new file mode 100644 index 0000000000000..397b32602bce2 --- /dev/null +++ b/llvm/docs/KernelInfo.rst @@ -0,0 +1,61 @@ +========== +KernelInfo +========== + +.. contents:: + :local: + +Introduction +============ + +This LLVM IR pass reports various statistics for codes compiled for GPUs. The +goal of these statistics is to help identify bad code patterns and ways to +mitigate them. The pass operates at the LLVM IR level so that it can, in +theory, support any LLVM-based compiler for programming languages supporting +GPUs. + +By default, the pass is disabled. For convenience, the command-line option +``-kernel-info-end-lto`` inserts it at the end of LTO, and options like +``-Rpass=kernel-info`` enable its remarks. Example ``opt`` and ``clang`` +command lines appear in the next section. + +Remarks include summary statistics (e.g., total size of static allocas) and +individual occurrences (e.g., source location of each alloca). Examples of the +output appear in tests in `llvm/test/Analysis/KernelInfo`. + +Example Command Lines +===================== + +To analyze a C program as it appears to an LLVM GPU backend at the end of LTO: + +.. code-block:: shell + + $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info -mllvm -kernel-info-end-lto + +To analyze specified LLVM IR, perhaps previously generated by something like +``clang -save-temps -g -fopenmp --offload-arch=native test.c``: + +.. 
code-block:: shell + + $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ + -pass-remarks=kernel-info -passes=kernel-info + +kernel-info can also be inserted into a specified LLVM pass pipeline using +``-kernel-info-end-lto``, or it can be positioned explicitly in that pipeline: + +.. code-block:: shell + + $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info -mllvm -kernel-info-end-lto \ + -Xoffload-linker --lto-newpm-passes='lto<O2>' + + $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info \ + -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)' + + $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ + -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>' + + $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ + -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)' diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 5495bb2fd4d92..96cd5f68af646 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -9,34 +9,7 @@ // This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter // classes used to extract function properties from a GPU kernel. 
// -// To analyze a C program as it appears to an LLVM GPU backend at the end of -// LTO: -// -// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -// -Rpass=kernel-info -mllvm -kernel-info-end-lto -// -// To analyze specified LLVM IR, perhaps previously generated by something like -// 'clang -save-temps -g -fopenmp --offload-arch=native test.c': -// -// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -// -pass-remarks=kernel-info -passes=kernel-info -// -// kernel-info can also be inserted into a specified LLVM pass pipeline using -// -kernel-info-end-lto, or it can be positioned explicitly in that pipeline: -// -// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -// -Rpass=kernel-info -mllvm -kernel-info-end-lto \ -// -Xoffload-linker --lto-newpm-passes='lto<O2>' -// -// $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -// -Rpass=kernel-info \ -// -Xoffload-linker --lto-newpm-passes='lto<O2>,module(kernel-info)' -// -// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -// -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto<O2>' -// -// $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -// -pass-remarks=kernel-info -passes='lto<O2>,module(kernel-info)' +// See llvm/docs/KernelInfo.rst. // ===---------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_KERNELINFO_H From d92856ec609d4bdf7642b8186cf0458dadd80f4a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 12 Aug 2024 17:41:02 -0400 Subject: [PATCH 057/114] Move conditional outside registration call --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 +++++----- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3b2ed9fe4236c..93d1d6b1b80b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -774,14 +774,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return nullptr; }); - PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { - if (KernelInfoEndLTO) { + if (KernelInfoEndLTO) { + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(KernelInfoPrinter()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - } - }); + }); + } } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8d77c8e53f7a6..1a4a9781db333 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -240,14 +240,14 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); - PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { - if (KernelInfoEndLTO) { + if (KernelInfoEndLTO) { + PB.registerFullLinkTimeOptimizationLastEPCallback( + [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(KernelInfoPrinter()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - } - }); + }); + } } TargetTransformInfo From 
5727284c17e1a0eadfbcbc544d06e0dca0a4384b Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 12 Aug 2024 16:50:43 -0500 Subject: [PATCH 058/114] Merge changes --- offload/plugins-nextgen/common/src/GlobalHandler.cpp | 2 +- offload/test/offloading/gpupgo/pgo1.c | 8 ++++---- offload/test/offloading/gpupgo/pgo2.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index 93abd0a5cea36..0627b7f4a7f5b 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -194,7 +194,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, // Check if given current global is a profiling global based // on name - if (NameOrErr->equals(getInstrProfNamesVarName())) { + if (*NameOrErr == getInstrProfNamesVarName()) { // Read in profiled function names DeviceProfileData.NamesData = SmallVector(Sym.getSize(), 0); GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(), diff --git a/offload/test/offloading/gpupgo/pgo1.c b/offload/test/offloading/gpupgo/pgo1.c index f5d8aee7908be..7196663fcfc90 100644 --- a/offload/test/offloading/gpupgo/pgo1.c +++ b/offload/test/offloading/gpupgo/pgo1.c @@ -32,17 +32,17 @@ int main() { // LLVM-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 4 -// LLVM-PGO: Block counts: [20, 10, 20, 10] +// LLVM-PGO: Block counts: [20, 10, 2, 1] // LLVM-PGO-LABEL: test1: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 1 -// LLVM-PGO: Block counts: [1] +// LLVM-PGO: Block counts: [10] // LLVM-PGO-LABEL: test2: // LLVM-PGO: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-PGO: Counters: 1 -// LLVM-PGO: Block counts: [1] +// LLVM-PGO: Block counts: [20] // LLVM-PGO-LABEL: Instrumentation level: // LLVM-PGO-SAME: IR @@ -52,7 +52,7 @@ int main() { // LLVM-PGO-LABEL: Maximum 
function count: // LLVM-PGO-SAME: 20 // LLVM-PGO-LABEL: Maximum internal block count: -// LLVM-PGO-SAME: 20 +// LLVM-PGO-SAME: 10 // CLANG-PGO-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // CLANG-PGO: Hash: {{0[xX][0-9a-fA-F]+}} diff --git a/offload/test/offloading/gpupgo/pgo2.c b/offload/test/offloading/gpupgo/pgo2.c index b5d0f2120754a..7f5c9ab744907 100644 --- a/offload/test/offloading/gpupgo/pgo2.c +++ b/offload/test/offloading/gpupgo/pgo2.c @@ -49,7 +49,7 @@ int main() { // LLVM-DEVICE-LABEL: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}}: // LLVM-DEVICE: Hash: {{0[xX][0-9a-fA-F]+}} // LLVM-DEVICE: Counters: 3 -// LLVM-DEVICE: Block counts: [10, 1, 1] +// LLVM-DEVICE: Block counts: [10, 2, 1] // CLANG-HOST-LABEL: main: // CLANG-HOST: Hash: {{0[xX][0-9a-fA-F]+}} From 6ac3f419b94e5c5ecd4e7a33b16e1f7e89fa1790 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 18:03:51 -0400 Subject: [PATCH 059/114] Use llvm::SmallString --- llvm/lib/Analysis/KernelInfo.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 9df3b5b32afcb..caeada91c31af 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/KernelInfo.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/IR/DebugInfo.h" @@ -139,8 +140,8 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } remarkAlloca(ORE, F, *Alloca, StaticSize); } else if (const CallBase *Call = dyn_cast(&I)) { - std::string CallKind; - std::string RemarkKind; + SmallString<40> CallKind; + SmallString<40> RemarkKind; if (Call->isIndirectCall()) { IndirectCalls += Direction; CallKind += "indirect"; From 6367ad7ea65d7ef1da51b4fe8cf6e50af90b1f36 Mon 
Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 19:22:45 -0400 Subject: [PATCH 060/114] Use TTI.getFlatAddressSpace for addrspace(0) We have to be more careful about targets in the test suite now because `getFlatAddressSpace` returns garbage for unsupported targets. Should we change the remarks to say flat addrspace instead of addrspace(0)? --- llvm/include/llvm/Analysis/KernelInfo.h | 4 +++- llvm/lib/Analysis/KernelInfo.cpp | 18 ++++++++++-------- .../Inputs/test.ll} | 9 --------- .../Analysis/KernelInfo/addrspace0/amdgpu.ll | 12 ++++++++++++ .../Analysis/KernelInfo/addrspace0/nvptx.ll | 12 ++++++++++++ llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 10 +++------- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 10 +++------- 7 files changed, 43 insertions(+), 32 deletions(-) rename llvm/test/Analysis/KernelInfo/{addrspace0.ll => addrspace0/Inputs/test.ll} (97%) create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 96cd5f68af646..c4a18d47723ab 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -16,6 +16,7 @@ #define LLVM_ANALYSIS_KERNELINFO_H #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" namespace llvm { class DominatorTree; @@ -24,7 +25,8 @@ class Function; /// Data structure holding function info for kernels. 
class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo &TTI); public: static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index caeada91c31af..de08bd49aacfc 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -122,7 +122,8 @@ static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, } void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE) { + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo &TTI) { assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); @@ -170,34 +171,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { - if (MI->getDestAddressSpace() == 0) { + if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { - if (MT->getSourceAddressSpace() == 0) { + if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { - if (Load->getPointerAddressSpace() == 0) { + if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } else if (const StoreInst *Store = dyn_cast(&I)) { - if (Store->getPointerAddressSpace() == 0) { + if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) 
{ - if (At->getPointerAddressSpace() == 0) { + if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { - if (At->getPointerAddressSpace() == 0) { + if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { AddrspaceZeroAccesses += Direction; remarkAddrspaceZeroAccess(ORE, F, I); } @@ -286,6 +287,7 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, KernelInfo KernelInfo::getKernelInfo(Function &F, FunctionAnalysisManager &FAM) { + const TargetTransformInfo &TTI = FAM.getResult(F); KernelInfo KI; // Only analyze modules for GPUs. // TODO: This would be more maintainable if there were an isGPU. @@ -319,7 +321,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, auto &ORE = FAM.getResult(F); for (const auto &BB : F) if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE); + KI.updateForBB(BB, +1, ORE, TTI); #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) diff --git a/llvm/test/Analysis/KernelInfo/addrspace0.ll b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll similarity index 97% rename from llvm/test/Analysis/KernelInfo/addrspace0.ll rename to llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll index 4c472396443f5..79d3cd2562e90 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0.ll +++ b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll @@ -1,12 +1,3 @@ -; Check info on addrspace(0) memory accesses. 
- -; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ -; RUN: -disable-output %s 2>&1 | \ -; RUN: FileCheck -match-full-lines --implicit-check-not='addrspace(0)' %s - -target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - define void @f() !dbg !3 { entry: ; load diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll b/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll new file mode 100644 index 0000000000000..b7a26d6cb47ba --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll @@ -0,0 +1,12 @@ +; Check info on addrspace(0) memory accesses when the target is amdgpu. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). + +; REQUIRES: amdgpu-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll b/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll new file mode 100644 index 0000000000000..43bb985744e0c --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll @@ -0,0 +1,12 @@ +; Check info on addrspace(0) memory accesses when the target is nvptx. +; +; The target matters because kernel-info calls +; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). 
+ +; REQUIRES: nvptx-registered-target + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ +; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index ee5f65b8e5ab7..d417f8b866f73 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -1,16 +1,12 @@ ; See ./README.md for how to maintain the LLVM IR in this test. +; REQUIRES: amdgpu-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s -; For some builds, we see a warning like: -; -; opt: WARNING: failed to create target machine for 'amdgcn-amd-amdhsa': unable to get target for 'amdgcn-amd-amdhsa', see --version and --triple. -; -; But there should be no other remarks here. -; CHECK-NOT: remark: - +; CHECK-NOT: remark: ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 41d068b03548b..1222267a8fe57 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -1,16 +1,12 @@ ; See ./README.md for how to maintain the LLVM IR in this test. 
+; REQUIRES: nvptx-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s -; For some builds, we see a warning like: -; -; opt: WARNING: failed to create target machine for 'nvptx64-nvidia-cuda': unable to get target for 'nvptx64-nvidia-cuda', see --version and --triple. -; -; But there should be no other remarks here. -; CHECK-NOT: remark: - +; CHECK-NOT: remark: ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes From 78446bbb9e1caed303288a2962dd7c78a8779c06 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 19:31:58 -0400 Subject: [PATCH 061/114] Avoid repetition between amdgpu and nvptx tests --- .../kernel-info-after-lto/Inputs/test.ll | 22 ++++++++++ .../kernel-info-after-lto/amdgpu.ll | 43 ++++--------------- .../KernelInfo/kernel-info-after-lto/nvptx.ll | 43 ++++--------------- 3 files changed, 40 insertions(+), 68 deletions(-) create mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll new file mode 100644 index 0000000000000..b85e3c581867c --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll @@ -0,0 +1,22 @@ +; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; NONE-NOT: remark: +define void @test() #0 !dbg !5 { +entry: + ret void +} + +attributes #0 = { + "omp_target_num_teams"="100" +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} 
+!nvvm.annotations = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} +!4 = !DISubroutineType(types: !3) +!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll index 7d190ece46e16..6d6e83e8d317f 100644 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll @@ -4,44 +4,19 @@ ; REQUIRES: amdgpu-registered-target ; -kernel-info-end-lto inserts kernel-info into LTO pipeline. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %s +; RUN: FileCheck -match-full-lines %S/Inputs/test.ll ; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll ; Omitting LTO disables kernel-info. 
-; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s - -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" -target triple = "amdgcn-amd-amdhsa" - -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; NONE-NOT: remark: -define void @test() #0 !dbg !5 { -entry: - ret void -} - -attributes #0 = { - "omp_target_num_teams"="100" -} - -!llvm.module.flags = !{!0} -!llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!2 = !DIFile(filename: "test.c", directory: "/tmp") -!3 = !{} -!4 = !DISubroutineType(types: !3) -!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) -!6 = !{ptr @test, !"maxclusterrank", i32 200} -!7 = !{ptr @test, !"maxntidx", i32 210} -!8 = distinct !{ptr null, !"kernel", i32 1} +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll index 4e790123c313a..1e427daed671e 100644 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll @@ -4,44 +4,19 @@ ; 
REQUIRES: nvptx-registered-target ; -kernel-info-end-lto inserts kernel-info into LTO pipeline. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %s +; RUN: FileCheck -match-full-lines %S/Inputs/test.ll ; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll ; Omitting LTO disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %s \ +; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ +; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %s - -target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; NONE-NOT: remark: -define void @test() #0 !dbg !5 { -entry: - ret void -} - -attributes #0 = { - "omp_target_num_teams"="100" -} - -!llvm.module.flags = !{!0} -!llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!2 = !DIFile(filename: "test.c", directory: "/tmp") -!3 = !{} -!4 = !DISubroutineType(types: !3) -!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: 
DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) -!6 = !{ptr @test, !"maxclusterrank", i32 200} -!7 = !{ptr @test, !"maxntidx", i32 210} -!8 = distinct !{ptr null, !"kernel", i32 1} +; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll From fede524269915edb51b7d6680a7280a79ca0f710 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 Aug 2024 19:39:14 -0400 Subject: [PATCH 062/114] Use named values in tests --- .../Analysis/KernelInfo/addrspace0/Inputs/test.ll | 2 +- llvm/test/Analysis/KernelInfo/calls.ll | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll index 79d3cd2562e90..0821fde8e25b1 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll @@ -1,6 +1,6 @@ define void @f() !dbg !3 { entry: - ; load + ; load: check remarks for both unnamed and named values. 
; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0) %0 = load i32, ptr null, align 4, !dbg !6 ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0) diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index 6101a71254898..25b8e3d880303 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -17,9 +17,9 @@ entry: call void @g(), !dbg !104 ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !105 - %0 = load ptr, ptr null, align 8 + %fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call - call void %0(), !dbg !106 + call void %fnPtr(), !dbg !106 ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 fcont: @@ -30,7 +30,7 @@ gcont: invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke - invoke void %0() to label %end unwind label %cleanup, !dbg !110 + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: %ll = landingpad { ptr, i32 } cleanup @@ -53,9 +53,9 @@ entry: call void @g(), !dbg !203 ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !204 - %0 = load ptr, ptr null, align 8 + %fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:9:3: in function 'g', indirect call - call void %0(), !dbg !205 + call void %fnPtr(), !dbg !205 ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 fcont: @@ -66,7 +66,7 @@ gcont: invoke void @h() to label %hcont unwind label %cleanup, !dbg 
!208 hcont: ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke - invoke void %0() to label %end unwind label %cleanup, !dbg !209 + invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 cleanup: %ll = landingpad { ptr, i32 } cleanup From 4c30b8a767c8e5fcaa4c6e8979d5515b9f4656f1 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 12:03:06 -0400 Subject: [PATCH 063/114] Say flat address space instead of addrspace(0) --- llvm/include/llvm/Analysis/KernelInfo.h | 4 +- llvm/lib/Analysis/KernelInfo.cpp | 32 ++++---- .../Inputs/test.ll | 74 +++++++++---------- .../{addrspace0 => flat-addrspace}/amdgpu.ll | 6 +- .../{addrspace0 => flat-addrspace}/nvptx.ll | 6 +- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 12 +-- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 12 +-- 7 files changed, 73 insertions(+), 73 deletions(-) rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/Inputs/test.ll (82%) rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/amdgpu.ll (53%) rename llvm/test/Analysis/KernelInfo/{addrspace0 => flat-addrspace}/nvptx.ll (54%) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index c4a18d47723ab..66dd95046dd97 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -89,8 +89,8 @@ class KernelInfo { /// Number of calls of type InvokeInst. int64_t Invokes = 0; - /// Number of addrspace(0) memory accesses (via load, store, etc.). - int64_t AddrspaceZeroAccesses = 0; + /// Number of flat addrspace memory accesses (via load, store, etc.). + int64_t FlatAddrspaceAccesses = 0; }; /// Analysis class for KernelInfo. 
diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index de08bd49aacfc..4eccc8807106b 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -102,11 +102,11 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, }); } -static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, +static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, const Function &Caller, const Instruction &Inst) { ORE.emit([&] { - OptimizationRemark R(DEBUG_TYPE, "AddrspaceZeroAccess", &Inst); + OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst); R << "in "; identifyFunction(R, Caller); if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst)) { @@ -116,7 +116,7 @@ static void remarkAddrspaceZeroAccess(OptimizationRemarkEmitter &ORE, } if (Inst.hasName()) R << " ('%" << Inst.getName() << "')"; - R << " accesses memory in addrspace(0)"; + R << " accesses memory in flat address space"; return R; }); } @@ -172,35 +172,35 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Call)) { if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast<AnyMemTransferInst>(MI)) { if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast<LoadInst>(&I)) { if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const StoreInst *Store
= dyn_cast<StoreInst>(&I)) { if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast<AtomicRMWInst>(&I)) { if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast<AtomicCmpXchgInst>(&I)) { if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { - AddrspaceZeroAccesses += Direction; - remarkAddrspaceZeroAccess(ORE, F, I); + FlatAddrspaceAccesses += Direction; + remarkFlatAddrspaceAccess(ORE, F, I); } } } @@ -344,7 +344,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, REMARK_PROPERTY(IndirectCalls); REMARK_PROPERTY(DirectCallsToDefinedFunctions); REMARK_PROPERTY(Invokes); - REMARK_PROPERTY(AddrspaceZeroAccesses); + REMARK_PROPERTY(FlatAddrspaceAccesses); #undef REMARK_PROPERTY return KI; diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll similarity index 82% rename from llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll index 0821fde8e25b1..07c884792f45c 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -1,129 +1,129 @@ define void @f() !dbg !3 { entry: ; load: check remarks for both unnamed and named values.
- ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in flat address space %0 = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in addrspace(0) + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space %load = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in addrspace(0) + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 ; store - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space store i32 0, ptr null, align 4, !dbg !7 - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space store i32 0, ptr addrspace(0) null, align 4, !dbg !7 store i32 0, ptr addrspace(1) null, align 4, !dbg !7 store i32 0, ptr addrspace(8) null, align 4, !dbg !7 ; atomicrmw - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction 
accesses memory in flat address space atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 ; cmpxchg - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in addrspace(0) + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 ; llvm.memcpy - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: 
test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) align 4 null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 ; llvm.memcpy.inline - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.inline.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: 
test.c:7:3: in function 'f', 'llvm.memcpy.inline.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.inline.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.inline.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.inline.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !10 call void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !10 ; llvm.memcpy.element.unordered.atomic - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space call void @llvm.memcpy.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 - ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:7:3: in function 'f', 'llvm.memcpy.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space call void @llvm.memcpy.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 
null, i64 10, i32 4), !dbg !10 call void @llvm.memcpy.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !10 ; llvm.memmove - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p1.i64(ptr align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p1.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p1.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i1 false), !dbg !11 call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) align 4 
null, ptr addrspace(4) align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p0.i64(ptr align 4 null, ptr align 4 null, i64 10, i1 false), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.p0.p0.i64' call accesses memory in flat address space call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i1 false), !dbg !11 ; llvm.memmove.element.unordered.atomic - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p0.i64' call accesses memory in flat address space call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr addrspace(0) align 4 null, ptr addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p0.p1.i64' call accesses memory in flat address space call void @llvm.memmove.element.unordered.atomic.p0.p1.i64(ptr addrspace(0) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 - ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:8:4: in function 'f', 'llvm.memmove.element.unordered.atomic.p1.p0.i64' call accesses memory in flat address space call void @llvm.memmove.element.unordered.atomic.p1.p0.i64(ptr addrspace(1) align 4 null, ptr 
addrspace(0) align 4 null, i64 10, i32 4), !dbg !11 call void @llvm.memmove.element.unordered.atomic.p1.p1.i64(ptr addrspace(1) align 4 null, ptr addrspace(1) align 4 null, i64 10, i32 4), !dbg !11 ; llvm.memset - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space call void @llvm.memset.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.p0.i64' call accesses memory in flat address space call void @llvm.memset.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 ; llvm.memset.inline - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space call void @llvm.memset.inline.p0.i64(ptr align 4 null, i8 0, i64 10, i1 false), !dbg !12 - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.inline.p0.i64' call accesses memory in flat address space call void @llvm.memset.inline.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.inline.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i1 false), !dbg !12 call void @llvm.memset.inline.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i1 false), !dbg !12 ; llvm.memset.element.unordered.atomic - ; CHECK: remark: test.c:9:5: in function 'f', 
'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 null, i8 0, i64 10, i32 4), !dbg !12 - ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in addrspace(0) + ; CHECK: remark: test.c:9:5: in function 'f', 'llvm.memset.element.unordered.atomic.p0.i64' call accesses memory in flat address space call void @llvm.memset.element.unordered.atomic.p0.i64(ptr addrspace(0) align 4 null, i8 0, i64 10, i32 4), !dbg !12 call void @llvm.memset.element.unordered.atomic.p1.i64(ptr addrspace(1) align 4 null, i8 0, i64 10, i32 4), !dbg !12 call void @llvm.memset.element.unordered.atomic.p3.i64(ptr addrspace(3) align 4 null, i8 0, i64 10, i32 4), !dbg !12 ret void } -; CHECK: remark: test.c:2:0: in function 'f', AddrspaceZeroAccesses = 36 +; CHECK: remark: test.c:2:0: in function 'f', FlatAddrspaceAccesses = 36 !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2} diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll similarity index 53% rename from llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll index b7a26d6cb47ba..7447dcf51cc89 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll @@ -1,12 +1,12 @@ -; Check info on addrspace(0) memory accesses when the target is amdgpu. +; Check info on flat address space memory accesses when the target is amdgpu. ; ; The target matters because kernel-info calls -; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. 
; REQUIRES: amdgpu-registered-target ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -mtriple="amdgcn-amd-amdhsa" \ ; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ -; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ ; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll similarity index 54% rename from llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll index 43bb985744e0c..02321c19e022d 100644 --- a/llvm/test/Analysis/KernelInfo/addrspace0/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll @@ -1,12 +1,12 @@ -; Check info on addrspace(0) memory accesses when the target is nvptx. +; Check info on flat address space memory accesses when the target is nvptx. ; ; The target matters because kernel-info calls -; TargetTransformInfo::getFlatAddressSpace to select addrspace(0). +; TargetTransformInfo::getFlatAddressSpace to select the flat address space. 
; REQUIRES: nvptx-registered-target ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -mtriple="nvptx64-nvidia-cuda" \ ; RUN: -disable-output %S/Inputs/test.ll 2>&1 | \ -; RUN: FileCheck -match-full-lines -implicit-check-not='addrspace(0)' \ +; RUN: FileCheck -match-full-lines -implicit-check-not='flat address space' \ ; RUN: %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index d417f8b866f73..56ee35810ef26 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -26,11 +26,11 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes =
0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: <unknown>:0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 @@ -40,7 +40,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes @@ -54,7 +54,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in
function 'g', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 1222267a8fe57..ee76ecdf5d795 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -25,11 +25,11 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function 
'[[OFF_FUNC]]_debug__', AddrspaceZeroAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in addrspace(0) -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in addrspace(0) +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 @@ -39,7 +39,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AddrspaceZeroAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes @@ -53,7 +53,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 ; CHECK-NEXT: 
remark: test.c:3:0: in function 'g', Invokes = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AddrspaceZeroAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: remark: {{.*: in function 'g',.*}} ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't From 33f0d4dd276eda64f495cdf66411bc77d20517c6 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 12:23:42 -0400 Subject: [PATCH 064/114] Cache the flat address space --- llvm/include/llvm/Analysis/KernelInfo.h | 8 +++++--- llvm/lib/Analysis/KernelInfo.cpp | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 66dd95046dd97..3cf5bec58cf55 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -25,8 +25,7 @@ class Function; /// Data structure holding function info for kernels. class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE, - const TargetTransformInfo &TTI); + OptimizationRemarkEmitter &ORE); public: static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); @@ -89,7 +88,10 @@ class KernelInfo { /// Number of calls of type InvokeInst. int64_t Invokes = 0; - /// Number of flat addrspace memory accesses (via load, store, etc.). + /// Target-specific flat address space. + unsigned FlatAddrspace; + + /// Number of flat address space memory accesses (via load, store, etc.). 
int64_t FlatAddrspaceAccesses = 0; }; diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 4eccc8807106b..b5b9145641550 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -122,8 +122,7 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, } void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE, - const TargetTransformInfo &TTI) { + OptimizationRemarkEmitter &ORE) { assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); @@ -171,34 +170,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { - if (MI->getDestAddressSpace() == TTI.getFlatAddressSpace()) { + if (MI->getDestAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { - if (MT->getSourceAddressSpace() == TTI.getFlatAddressSpace()) { + if (MT->getSourceAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { - if (Load->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (Load->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const StoreInst *Store = dyn_cast(&I)) { - if (Store->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (Store->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) { - if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (At->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; 
remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { - if (At->getPointerAddressSpace() == TTI.getFlatAddressSpace()) { + if (At->getPointerAddressSpace() == FlatAddrspace) { FlatAddrspaceAccesses += Direction; remarkFlatAddrspaceAccess(ORE, F, I); } @@ -287,7 +286,6 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, KernelInfo KernelInfo::getKernelInfo(Function &F, FunctionAnalysisManager &FAM) { - const TargetTransformInfo &TTI = FAM.getResult(F); KernelInfo KI; // Only analyze modules for GPUs. // TODO: This would be more maintainable if there were an isGPU. @@ -297,6 +295,8 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, return KI; KI.IsValid = true; + KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); + // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); @@ -321,7 +321,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, auto &ORE = FAM.getResult(F); for (const auto &BB : F) if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE, TTI); + KI.updateForBB(BB, +1, ORE); #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) From a2a512c5bfbea1bbe14f4db2574631b0703106ea Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 13:18:20 -0400 Subject: [PATCH 065/114] Link KernelInfo.rst from Passes.rst --- llvm/docs/Passes.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index 49f633e98d16f..939aeabd599b9 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -5,6 +5,11 @@ LLVM's Analysis and Transform Passes .. contents:: :local: +.. toctree:: + :hidden: + + KernelInfo + Introduction ============ .. 
warning:: This document is not updated frequently, and the list of passes @@ -148,6 +153,12 @@ This pass collects the count of all instructions and reports them. Bookkeeping for "interesting" users of expressions computed from induction variables. +``kernel-info``: GPU Kernel Info +-------------------------------- + +Reports various statistics for codes compiled for GPUs. This pass is +:doc:`documented separately <KernelInfo>`. + ``lazy-value-info``: Lazy Value Information Analysis ---------------------------------------------------- From de04ac4fee83f24bad8510f055cc7b303cf76939 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 Aug 2024 13:48:32 -0400 Subject: [PATCH 066/114] Don't filter out cpus -kernel-info-end-lto doesn't insert kernel-info for cpu modules. If the user explicitly specifies the pass for a cpu module, then it will run now. --- llvm/include/llvm/Analysis/KernelInfo.h | 4 ---- llvm/lib/Analysis/KernelInfo.cpp | 8 -------- 2 files changed, 12 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 3cf5bec58cf55..951c58cfc0218 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -36,10 +36,6 @@ class KernelInfo { bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); } - /// If false, nothing was recorded here because the supplied function didn't - /// appear in a module compiled for a GPU. - bool IsValid = false; - /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index b5b9145641550..b29c3c3fecd16 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -287,14 +287,6 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, KernelInfo KernelInfo::getKernelInfo(Function &F, FunctionAnalysisManager &FAM) { KernelInfo KI; - // Only analyze modules for GPUs. 
- // TODO: This would be more maintainable if there were an isGPU. - const std::string &TT = F.getParent()->getTargetTriple(); - llvm::Triple T(TT); - if (!T.isAMDGPU() && !T.isNVPTX()) - return KI; - KI.IsValid = true; - KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); // Record function properties. From ec5d2bd00ed0c9305a0820d56f69f1be25ebdd6b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 12:19:16 -0400 Subject: [PATCH 067/114] Include less in header --- llvm/include/llvm/Analysis/KernelInfo.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 951c58cfc0218..c3bc0849efa0f 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -15,12 +15,11 @@ #ifndef LLVM_ANALYSIS_KERNELINFO_H #define LLVM_ANALYSIS_KERNELINFO_H -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/PassManager.h" namespace llvm { -class DominatorTree; -class Function; +class BasicBlock; +class OptimizationRemarkEmitter; /// Data structure holding function info for kernels. class KernelInfo { From c06b9052e6f18e2f290f54eb1ca2583aa3bbeee0 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 12:19:42 -0400 Subject: [PATCH 068/114] Removed unused comparison operators They wouldn't have worked reliably anyway given uninitialized padding in the struct. 
--- llvm/include/llvm/Analysis/KernelInfo.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index c3bc0849efa0f..6d4edfb3525cc 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -29,12 +29,6 @@ class KernelInfo { public: static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); - bool operator==(const KernelInfo &FPI) const { - return std::memcmp(this, &FPI, sizeof(KernelInfo)) == 0; - } - - bool operator!=(const KernelInfo &FPI) const { return !(*this == FPI); } - /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; From d83d22a1079eb66487b084905af114ec384a8319 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 12:19:52 -0400 Subject: [PATCH 069/114] Remove redundant null check --- llvm/lib/Analysis/KernelInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index b29c3c3fecd16..c039d495ee6ed 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -161,7 +161,7 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, } if (!Call->isIndirectCall()) { if (const Function *Callee = Call->getCalledFunction()) { - if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) { + if (!Callee->isIntrinsic() && !Callee->isDeclaration()) { DirectCallsToDefinedFunctions += Direction; CallKind += " to defined function"; RemarkKind += "ToDefinedFunction"; From 1649cf8d3af43fd4bdcb5bf6335fffb52f9d92af Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 15:41:36 -0400 Subject: [PATCH 070/114] Move KernelInfo to KernelInfo.cpp, remove KernelInfoAnalysis For now, analysis results will not be used beyond emitting remarks. If that changes, we can revert. 
--- llvm/include/llvm/Analysis/KernelInfo.h | 90 +------------------ llvm/lib/Analysis/KernelInfo.cpp | 73 ++++++++++++++- llvm/lib/Passes/PassRegistry.def | 1 - .../test/Analysis/KernelInfo/openmp/README.md | 4 +- 4 files changed, 75 insertions(+), 93 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 6d4edfb3525cc..c5c33fac34655 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter -// classes used to extract function properties from a GPU kernel. +// This file defines the KernelInfoPrinter class used to emit remarks about +// function properties from a GPU kernel. // // See llvm/docs/KernelInfo.rst. // ===---------------------------------------------------------------------===// @@ -18,95 +18,11 @@ #include "llvm/IR/PassManager.h" namespace llvm { -class BasicBlock; -class OptimizationRemarkEmitter; - -/// Data structure holding function info for kernels. -class KernelInfo { - void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); - -public: - static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); - - /// Whether the function has external linkage and is not a kernel function. - bool ExternalNotKernel = false; - - /// OpenMP Launch bounds. - ///@{ - std::optional OmpTargetNumTeams; - std::optional OmpTargetThreadLimit; - ///@} - - /// AMDGPU launch bounds. - ///@{ - std::optional AmdgpuMaxNumWorkgroupsX; - std::optional AmdgpuMaxNumWorkgroupsY; - std::optional AmdgpuMaxNumWorkgroupsZ; - std::optional AmdgpuFlatWorkGroupSizeMin; - std::optional AmdgpuFlatWorkGroupSizeMax; - std::optional AmdgpuWavesPerEuMin; - std::optional AmdgpuWavesPerEuMax; - ///@} - - /// NVPTX launch bounds. 
- ///@{ - std::optional Maxclusterrank; - std::optional Maxntidx; - ///@} - - /// The number of alloca instructions inside the function, the number of those - /// with allocation sizes that cannot be determined at compile time, and the - /// sum of the sizes that can be. - /// - /// With the current implementation for at least some GPU archs, - /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in - /// case the implementation changes. - int64_t Allocas = 0; - int64_t AllocasDyn = 0; - int64_t AllocasStaticSizeSum = 0; - - /// Number of direct/indirect calls (anything derived from CallBase). - int64_t DirectCalls = 0; - int64_t IndirectCalls = 0; - - /// Number of direct calls made from this function to other functions - /// defined in this module. - int64_t DirectCallsToDefinedFunctions = 0; - - /// Number of calls of type InvokeInst. - int64_t Invokes = 0; - - /// Target-specific flat address space. - unsigned FlatAddrspace; - - /// Number of flat address space memory accesses (via load, store, etc.). - int64_t FlatAddrspaceAccesses = 0; -}; - -/// Analysis class for KernelInfo. -class KernelInfoAnalysis : public AnalysisInfoMixin { -public: - static AnalysisKey Key; - - using Result = const KernelInfo; - - KernelInfo run(Function &F, FunctionAnalysisManager &FAM) { - return KernelInfo::getKernelInfo(F, FAM); - } -}; - -/// Printer pass for KernelInfoAnalysis. -/// -/// It just calls KernelInfoAnalysis, which prints remarks if they are enabled. 
class KernelInfoPrinter : public PassInfoMixin { public: explicit KernelInfoPrinter() {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) { - AM.getResult(F); - return PreservedAnalyses::all(); - } + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); static bool isRequired() { return true; } }; diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index c039d495ee6ed..a628f370c802e 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file defines the KernelInfo, KernelInfoAnalysis, and KernelInfoPrinter -// classes used to extract function properties from a kernel. +// This file defines the KernelInfoPrinter class used to emit remarks about +// function properties from a GPU kernel. // //===----------------------------------------------------------------------===// @@ -27,6 +27,69 @@ using namespace llvm; #define DEBUG_TYPE "kernel-info" +/// Data structure holding function info for kernels. +class KernelInfo { + void updateForBB(const BasicBlock &BB, int64_t Direction, + OptimizationRemarkEmitter &ORE); + +public: + static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); + + /// Whether the function has external linkage and is not a kernel function. + bool ExternalNotKernel = false; + + /// OpenMP Launch bounds. + ///@{ + std::optional OmpTargetNumTeams; + std::optional OmpTargetThreadLimit; + ///@} + + /// AMDGPU launch bounds. + ///@{ + std::optional AmdgpuMaxNumWorkgroupsX; + std::optional AmdgpuMaxNumWorkgroupsY; + std::optional AmdgpuMaxNumWorkgroupsZ; + std::optional AmdgpuFlatWorkGroupSizeMin; + std::optional AmdgpuFlatWorkGroupSizeMax; + std::optional AmdgpuWavesPerEuMin; + std::optional AmdgpuWavesPerEuMax; + ///@} + + /// NVPTX launch bounds. 
+ ///@{ + std::optional Maxclusterrank; + std::optional Maxntidx; + ///@} + + /// The number of alloca instructions inside the function, the number of those + /// with allocation sizes that cannot be determined at compile time, and the + /// sum of the sizes that can be. + /// + /// With the current implementation for at least some GPU archs, + /// AllocasDyn > 0 might not be possible, but we report AllocasDyn anyway in + /// case the implementation changes. + int64_t Allocas = 0; + int64_t AllocasDyn = 0; + int64_t AllocasStaticSizeSum = 0; + + /// Number of direct/indirect calls (anything derived from CallBase). + int64_t DirectCalls = 0; + int64_t IndirectCalls = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + /// Number of calls of type InvokeInst. + int64_t Invokes = 0; + + /// Target-specific flat address space. + unsigned FlatAddrspace; + + /// Number of flat address space memory accesses (via load, store, etc.). + int64_t FlatAddrspaceAccesses = 0; +}; + static bool isKernelFunction(Function &F) { // TODO: Is this general enough? Consider languages beyond OpenMP. 
return F.hasFnAttribute("kernel"); @@ -342,4 +405,8 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, return KI; } -AnalysisKey KernelInfoAnalysis::Key; +PreservedAnalyses KernelInfoPrinter::run(Function &F, + FunctionAnalysisManager &AM) { + KernelInfo::getKernelInfo(F, AM); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index dcfa732f410b3..391cca0da2ea1 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -278,7 +278,6 @@ FUNCTION_ANALYSIS( MachineFunctionAnalysis(static_cast(TM))) FUNCTION_ANALYSIS("gc-function", GCFunctionAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) -FUNCTION_ANALYSIS("kernel-info", KernelInfoAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis()) diff --git a/llvm/test/Analysis/KernelInfo/openmp/README.md b/llvm/test/Analysis/KernelInfo/openmp/README.md index 0d13950e198ed..5471b2e1b220d 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/README.md +++ b/llvm/test/Analysis/KernelInfo/openmp/README.md @@ -1,9 +1,9 @@ -The tests in this directory check that basic KernelInfoAnalysis functionality +The tests in this directory check that basic KernelInfoPrinter functionality behaves reasonably for LLVM IR produced by Clang OpenMP codegen. So that these tests are straightforward to maintain and faithfully represent Clang OpenMP codegen, do not tweak or reduce the LLVM IR in them. Other tests -more exhaustively check KernelInfoAnalysis features using reduced LLVM IR. +more exhaustively check KernelInfoPrinter features using reduced LLVM IR. The LLVM IR in each test file `$TEST` can be regenerated as follows in the case that Clang OpenMP codegen changes or it becomes desirable to adjust the source From 1a3c0aef034087e235fda909c69cc9e75b0bb874 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 16 Aug 2024 15:42:20 -0400 Subject: [PATCH 071/114] Use printAsOperand not getName to identify instruction --- llvm/lib/Analysis/KernelInfo.cpp | 8 ++++++-- .../Analysis/KernelInfo/flat-addrspace/Inputs/test.ll | 10 +++++----- llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 2 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index a628f370c802e..41acde725b471 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -177,8 +177,12 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, } else { R << ", '" << Inst.getOpcodeName() << "' instruction"; } - if (Inst.hasName()) - R << " ('%" << Inst.getName() << "')"; + if (!Inst.getType()->isVoidTy()) { + std::string Name; + raw_string_ostream OS(Name); + Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << " ('" << Name << "')"; + } R << " accesses memory in flat address space"; return R; }); diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll index 07c884792f45c..b54c3a18f3e70 100644 --- a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -1,7 +1,7 @@ define void @f() !dbg !3 { entry: ; load: check remarks for both unnamed and named values. 
- ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space %0 = load i32, ptr null, align 4, !dbg !6 ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space %load = load i32, ptr null, align 4, !dbg !6 @@ -19,17 +19,17 @@ entry: store i32 0, ptr addrspace(8) null, align 4, !dbg !7 ; atomicrmw - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 ; cmpxchg - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 diff --git 
a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 56ee35810ef26..82f6f243264bc 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index ee76ecdf5d795..eb2cba596be22 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 
direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 From ea89a81b0ebf30fa331f3bcd0dbfced21478846d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 15:42:28 -0400 Subject: [PATCH 072/114] Use printAsOperand to report indirect callee --- llvm/lib/Analysis/KernelInfo.cpp | 21 +++++++++++---------- llvm/test/Analysis/KernelInfo/calls.ll | 8 ++++---- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 41acde725b471..9768fe90b1433 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -147,20 +147,21 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); R << "in "; identifyFunction(R, Caller); - R << ", " << CallKind; - if (const Function *Callee = - dyn_cast_or_null(Call.getCalledOperand())) { - R << ", callee is"; - StringRef Name = Callee->getName(); - if (auto *SubProgram = Callee->getSubprogram()) { + R << ", " << CallKind << ", callee is"; + Value *Callee = Call.getCalledOperand(); + std::string Name; + if (const Function *FnCallee = dyn_cast(Callee)) { + if (auto *SubProgram = FnCallee->getSubprogram()) { if (SubProgram->isArtificial()) R << " artificial"; } - if (!Name.empty()) - R << " '" << Name << "'"; - else - R << " with unknown name"; + Name = FnCallee->getName(); } + if (Name.empty()) { + raw_string_ostream OS(Name); + Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + } + R << " '" << Name << "'"; return R; }); } diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index 25b8e3d880303..d00ab2b74d398 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ 
b/llvm/test/Analysis/KernelInfo/calls.ll @@ -18,7 +18,7 @@ entry: ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !105 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call + ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !106 ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 @@ -29,7 +29,7 @@ gcont: ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: - ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke + ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: %ll = landingpad { ptr, i32 } @@ -54,7 +54,7 @@ entry: ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !204 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:9:3: in function 'g', indirect call + ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !205 ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 @@ -65,7 +65,7 @@ gcont: ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 hcont: - ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke + ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind 
label %cleanup, !dbg !209 cleanup: %ll = landingpad { ptr, i32 } From 8da602b92369af0d9a4f794b1956bd15ecac0263 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 16:36:36 -0400 Subject: [PATCH 073/114] Report inline assembly calls --- llvm/lib/Analysis/KernelInfo.cpp | 8 +++++++ llvm/test/Analysis/KernelInfo/calls.ll | 24 +++++++++++++++---- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 3 +++ llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 3 +++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 9768fe90b1433..034194e27f9fb 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -80,6 +80,9 @@ class KernelInfo { /// defined in this module. int64_t DirectCallsToDefinedFunctions = 0; + /// Number of direct calls to inline assembly. + int64_t InlineAssemblyCalls = 0; + /// Number of calls of type InvokeInst. int64_t Invokes = 0; @@ -234,6 +237,10 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, CallKind += " to defined function"; RemarkKind += "ToDefinedFunction"; } + } else if (Call->isInlineAsm()) { + InlineAssemblyCalls += Direction; + CallKind += " to inline assembly"; + RemarkKind += "ToInlineAssembly"; } } remarkCall(ORE, F, *Call, CallKind, RemarkKind); @@ -403,6 +410,7 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, REMARK_PROPERTY(DirectCalls); REMARK_PROPERTY(IndirectCalls); REMARK_PROPERTY(DirectCallsToDefinedFunctions); + REMARK_PROPERTY(InlineAssemblyCalls); REMARK_PROPERTY(Invokes); REMARK_PROPERTY(FlatAddrspaceAccesses); #undef REMARK_PROPERTY diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index d00ab2b74d398..2a2672c70b85c 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -17,6 +17,8 @@ entry: call void @g(), !dbg !104 ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to 
defined function, callee is artificial 'h' call void @h(), !dbg !105 + ; CHECK: remark: test.c:24:5: in artificial function 'h', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !111 %fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !106 @@ -29,6 +31,9 @@ gcont: ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: + ; CHECK: remark: test.c:25:5: in artificial function 'h', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112 +asmcont: ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: @@ -38,10 +43,11 @@ cleanup: end: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 6 +; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 8 ; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2 ; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 5 declare void @f() @@ -53,6 +59,8 @@ entry: call void @g(), !dbg !203 ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' call void @h(), !dbg !204 + ; CHECK: remark: test.c:14:3: in function 'g', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + call void asm sideeffect "eieio", ""(), !dbg !210 
%fnPtr = load ptr, ptr null, align 8 ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !205 @@ -65,6 +73,9 @@ gcont: ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 hcont: + ; CHECK: remark: test.c:15:3: in function 'g', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211 +asmcont: ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 cleanup: @@ -74,10 +85,11 @@ cleanup: end: ret void } -; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 6 +; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 8 ; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2 ; CHECK: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:3:0: in function 'g', Invokes = 4 +; CHECK: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g', Invokes = 5 !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -99,6 +111,8 @@ end: !108 = !DILocation(line: 21, column: 5, scope: !103) !109 = !DILocation(line: 22, column: 5, scope: !103) !110 = !DILocation(line: 23, column: 5, scope: !103) +!111 = !DILocation(line: 24, column: 5, scope: !103) +!112 = !DILocation(line: 25, column: 5, scope: !103) !200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !201 = !DISubroutineType(types: !3) @@ -110,3 +124,5 @@ end: !207 = !DILocation(line: 11, column: 3, scope: !200) !208 = !DILocation(line: 12, column: 3, scope: !200) !209 = !DILocation(line: 13, column: 3, scope: !200) +!210 = !DILocation(line: 14, column: 3, 
scope: !200) +!211 = !DILocation(line: 15, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 82f6f243264bc..be3b357cc4530 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -25,6 +25,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 @@ -39,6 +40,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 @@ -53,6 +55,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', 
FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index eb2cba596be22..2dbd04b2536c4 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -24,6 +24,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCalls = 4 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', DirectCallsToDefinedFunctions = 3 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 @@ -38,6 +39,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCalls = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 @@ -52,6 +54,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCalls = 2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', IndirectCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', DirectCallsToDefinedFunctions = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: 
remark: {{.*: in function 'g',.*}} From 45114fd9d85d614f2f3bc18543fb6779cab1053d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 16:41:34 -0400 Subject: [PATCH 074/114] Use llvm::SmallString --- llvm/lib/Analysis/KernelInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 034194e27f9fb..96a77c96dc1f3 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -152,7 +152,7 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, identifyFunction(R, Caller); R << ", " << CallKind << ", callee is"; Value *Callee = Call.getCalledOperand(); - std::string Name; + SmallString<100> Name; // might be function name or asm expression if (const Function *FnCallee = dyn_cast(Callee)) { if (auto *SubProgram = FnCallee->getSubprogram()) { if (SubProgram->isArtificial()) @@ -161,7 +161,7 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, Name = FnCallee->getName(); } if (Name.empty()) { - raw_string_ostream OS(Name); + raw_svector_ostream OS(Name); Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); } R << " '" << Name << "'"; From eea139c63cde6f900962c5e999ffce79568b4391 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 16 Aug 2024 16:47:20 -0400 Subject: [PATCH 075/114] Use llvm::SmallString --- llvm/lib/Analysis/KernelInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 96a77c96dc1f3..ff71323516238 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -182,8 +182,8 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, R << ", '" << Inst.getOpcodeName() << "' instruction"; } if (!Inst.getType()->isVoidTy()) { - std::string Name; - raw_string_ostream OS(Name); + SmallString<20> Name; + raw_svector_ostream OS(Name); Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); R << " ('" << Name << "')"; } From 8bf6e4e4bb262e0866d3e2098bb1a16c7293e2be Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 16 Aug 2024 17:17:07 -0400 Subject: [PATCH 076/114] getKernelInfo -> emitKernelInfo because return is unused --- llvm/lib/Analysis/KernelInfo.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index ff71323516238..282dc092bfd62 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -33,7 +33,7 @@ class KernelInfo { OptimizationRemarkEmitter &ORE); public: - static KernelInfo getKernelInfo(Function &F, FunctionAnalysisManager &FAM); + static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM); /// Whether the function has external linkage and is not a kernel function. 
bool ExternalNotKernel = false; @@ -359,8 +359,7 @@ static std::optional parseNVPTXMDNodeAsInteger(Function &F, return Result; } -KernelInfo KernelInfo::getKernelInfo(Function &F, - FunctionAnalysisManager &FAM) { +void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { KernelInfo KI; KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); @@ -415,11 +414,11 @@ KernelInfo KernelInfo::getKernelInfo(Function &F, REMARK_PROPERTY(FlatAddrspaceAccesses); #undef REMARK_PROPERTY - return KI; + return; } PreservedAnalyses KernelInfoPrinter::run(Function &F, FunctionAnalysisManager &AM) { - KernelInfo::getKernelInfo(F, AM); + KernelInfo::emitKernelInfo(F, AM); return PreservedAnalyses::all(); } From 62d494d9a9f13e5b58d71f083a6cb9f67f19579b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 13 Sep 2024 17:30:23 -0400 Subject: [PATCH 077/114] Clean up launch bounds * For amdgpu, use AMDGPUSubtarget functions to query values. Thus, we end up with logical values that don't appear explicitly in the IR, and we ignore some impossible values that do appear explicitly. * For nvptx, use NVPTXUtilities.h functions to query values. Thus, drop KernelInfo.cpp's implementation of NVVM annotation parsing. Also, add support for a few more launch bounds. * Move target-specific collection of launch bounds to target-specific classes (GCNSubtarget and NVPTXSubtarget). While making the above changes, I struggled to find the right headers to enable keeping the implementation in KernelInfo.cpp, and one reviewer wanted to see some reorganization along these lines anyway.
--- llvm/include/llvm/Analysis/KernelInfo.h | 8 +- .../llvm/CodeGen/TargetSubtargetInfo.h | 5 + llvm/lib/Analysis/KernelInfo.cpp | 127 ++++-------------- llvm/lib/Passes/PassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 16 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 16 +++ llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 4 +- .../KernelInfo/launch-bounds/amdgpu.ll | 67 +++++++-- .../KernelInfo/launch-bounds/nvptx.ll | 10 +- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 19 +++ 13 files changed, 166 insertions(+), 120 deletions(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index c5c33fac34655..6633c28858a2f 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -18,9 +18,15 @@ #include "llvm/IR/PassManager.h" namespace llvm { + +class TargetMachine; + class KernelInfoPrinter : public PassInfoMixin { +private: + TargetMachine *TM; + public: - explicit KernelInfoPrinter() {} + explicit KernelInfoPrinter(TargetMachine *TM) : TM(TM) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index b4b018f080914..5d75510e91513 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -338,6 +338,11 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// the pass, with architecture specific overrides providing the information /// where they are implemented. virtual bool supportsInitUndef() const { return false; } + + /// For \p F, call \p Body with the name and value of each launch bound. 
+ virtual void forEachLaunchBound( + const Function &F, + std::function Body) const {} }; } // end namespace llvm diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 282dc092bfd62..6d0efdfec8344 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -22,6 +23,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/PassBuilder.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -33,7 +35,8 @@ class KernelInfo { OptimizationRemarkEmitter &ORE); public: - static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM); + static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM); /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; @@ -44,23 +47,6 @@ class KernelInfo { std::optional OmpTargetThreadLimit; ///@} - /// AMDGPU launch bounds. - ///@{ - std::optional AmdgpuMaxNumWorkgroupsX; - std::optional AmdgpuMaxNumWorkgroupsY; - std::optional AmdgpuMaxNumWorkgroupsZ; - std::optional AmdgpuFlatWorkGroupSizeMin; - std::optional AmdgpuFlatWorkGroupSizeMax; - std::optional AmdgpuWavesPerEuMin; - std::optional AmdgpuWavesPerEuMax; - ///@} - - /// NVPTX launch bounds. - ///@{ - std::optional Maxclusterrank; - std::optional Maxntidx; - ///@} - /// The number of alloca instructions inside the function, the number of those /// with allocation sizes that cannot be determined at compile time, and the /// sum of the sizes that can be. 
@@ -298,68 +284,23 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, remarkProperty(ORE, F, Name, Value.value()); } -static std::vector> -parseFnAttrAsIntegerFields(Function &F, StringRef Name, unsigned NumFields) { - std::vector> Result(NumFields); - Attribute A = F.getFnAttribute(Name); - if (!A.isStringAttribute()) - return Result; - StringRef Rest = A.getValueAsString(); - for (unsigned I = 0; I < NumFields; ++I) { - StringRef Field; - std::tie(Field, Rest) = Rest.split(','); - if (Field.empty()) - break; - int64_t Val; - if (Field.getAsInteger(0, Val)) { - F.getContext().emitError("cannot parse integer in attribute '" + Name + - "': " + Field); - break; - } - Result[I] = Val; - } - if (!Rest.empty()) - F.getContext().emitError("too many fields in attribute " + Name); - return Result; -} - static std::optional parseFnAttrAsInteger(Function &F, StringRef Name) { - return parseFnAttrAsIntegerFields(F, Name, 1)[0]; -} - -// TODO: This nearly duplicates the same function in OMPIRBuilder.cpp. Can we -// share? 
-static MDNode *getNVPTXMDNode(Function &F, StringRef Name) { - Module &M = *F.getParent(); - NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"); - if (!MD) - return nullptr; - for (auto *Op : MD->operands()) { - if (Op->getNumOperands() != 3) - continue; - auto *KernelOp = dyn_cast(Op->getOperand(0)); - if (!KernelOp || KernelOp->getValue() != &F) - continue; - auto *Prop = dyn_cast(Op->getOperand(1)); - if (!Prop || Prop->getString() != Name) - continue; - return Op; - } - return nullptr; -} - -static std::optional parseNVPTXMDNodeAsInteger(Function &F, - StringRef Name) { - std::optional Result; - if (MDNode *ExistingOp = getNVPTXMDNode(F, Name)) { - auto *Op = cast(ExistingOp->getOperand(2)); - Result = cast(Op->getValue())->getZExtValue(); + Attribute A = F.getFnAttribute(Name); + if (!A.isStringAttribute()) + return std::nullopt; + StringRef Field = A.getValueAsString(); + int64_t Val; + if (Field.getAsInteger(0, Val)) { + F.getContext().emitError("cannot parse integer in attribute '" + Name + + "': " + Field); + return std::nullopt; } - return Result; + return Val; } -void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { +void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, + TargetMachine *TM) { KernelInfo KI; KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); @@ -367,21 +308,6 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit"); - auto AmdgpuMaxNumWorkgroups = - parseFnAttrAsIntegerFields(F, "amdgpu-max-num-workgroups", 3); - KI.AmdgpuMaxNumWorkgroupsX = AmdgpuMaxNumWorkgroups[0]; - KI.AmdgpuMaxNumWorkgroupsY = AmdgpuMaxNumWorkgroups[1]; - KI.AmdgpuMaxNumWorkgroupsZ = AmdgpuMaxNumWorkgroups[2]; - auto AmdgpuFlatWorkGroupSize = - 
parseFnAttrAsIntegerFields(F, "amdgpu-flat-work-group-size", 2); - KI.AmdgpuFlatWorkGroupSizeMin = AmdgpuFlatWorkGroupSize[0]; - KI.AmdgpuFlatWorkGroupSizeMax = AmdgpuFlatWorkGroupSize[1]; - auto AmdgpuWavesPerEu = - parseFnAttrAsIntegerFields(F, "amdgpu-waves-per-eu", 2); - KI.AmdgpuWavesPerEuMin = AmdgpuWavesPerEu[0]; - KI.AmdgpuWavesPerEuMax = AmdgpuWavesPerEu[1]; - KI.Maxclusterrank = parseNVPTXMDNodeAsInteger(F, "maxclusterrank"); - KI.Maxntidx = parseNVPTXMDNodeAsInteger(F, "maxntidx"); const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); @@ -394,15 +320,16 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { REMARK_PROPERTY(ExternalNotKernel); REMARK_PROPERTY(OmpTargetNumTeams); REMARK_PROPERTY(OmpTargetThreadLimit); - REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsX); - REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsY); - REMARK_PROPERTY(AmdgpuMaxNumWorkgroupsZ); - REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMin); - REMARK_PROPERTY(AmdgpuFlatWorkGroupSizeMax); - REMARK_PROPERTY(AmdgpuWavesPerEuMin); - REMARK_PROPERTY(AmdgpuWavesPerEuMax); - REMARK_PROPERTY(Maxclusterrank); - REMARK_PROPERTY(Maxntidx); + // TM might be nullptr if support for the target was not built. For example, + // we currently have some KernelInfo tests where the choice of target isn't + // important, so they arbitrarily choose a target triple. Those tests are + // expected to run successfully even if support for that target was not built. 
+ if (TM) { + TM->getSubtargetImpl(F)->forEachLaunchBound( + F, [&](StringRef Name, unsigned Value) { + remarkProperty(ORE, F, Name, Value); + }); + } REMARK_PROPERTY(Allocas); REMARK_PROPERTY(AllocasStaticSizeSum); REMARK_PROPERTY(AllocasDyn); @@ -419,6 +346,6 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM) { PreservedAnalyses KernelInfoPrinter::run(Function &F, FunctionAnalysisManager &AM) { - KernelInfo::emitKernelInfo(F, AM); + KernelInfo::emitKernelInfo(F, AM, TM); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 6b3ccfccf3ae0..10b0b3f57c289 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -382,7 +382,7 @@ FUNCTION_PASS("irce", IRCEPass()) FUNCTION_PASS("jump-threading", JumpThreadingPass()) FUNCTION_PASS("jump-table-to-switch", JumpTableToSwitchPass()); FUNCTION_PASS("kcfi", KCFIPass()) -FUNCTION_PASS("kernel-info", KernelInfoPrinter()) +FUNCTION_PASS("kernel-info", KernelInfoPrinter(TM)) FUNCTION_PASS("lcssa", LCSSAPass()) FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass()) FUNCTION_PASS("lint", LintPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 555302b290da2..c0e3df93264c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -790,9 +790,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (KernelInfoEndLTO) { PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(KernelInfoPrinter()); + FPM.addPass(KernelInfoPrinter(this)); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 
52c24a5c25ec2..f1eb5fcb2c06f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -711,6 +711,22 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { return NSAThreshold; } +void GCNSubtarget::forEachLaunchBound( + const Function &F, + std::function Body) const { + auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F); + Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); + Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); + Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); + auto AmdgpuFlatWorkGroupSize = getFlatWorkGroupSizes(F); + Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); + Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second); + auto AmdgpuWavesPerEU = getWavesPerEU(F); + Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); + Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); + // TODO: Any others we should add? +} + GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST) : ST(ST) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7b74eab96c567..a514945a5e6f5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1587,6 +1587,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // the nop. 
return true; } + + virtual void forEachLaunchBound( + const Function &F, + std::function Body) const override; }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 420065585b384..fccb3de453734 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -12,6 +12,7 @@ #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" +#include "NVPTXUtilities.h" using namespace llvm; @@ -69,3 +70,18 @@ bool NVPTXSubtarget::hasImageHandles() const { bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } + +void NVPTXSubtarget::forEachLaunchBound( + const Function &F, + std::function Body) const { + unsigned Val; + if (getMaxClusterRank(F, Val)) + Body("Maxclusterrank", Val); + if (auto Val = getMaxNTIDx(F)) + Body("Maxntidx", *Val); + if (auto Val = getMaxNTIDy(F)) + Body("Maxntidy", *Val); + if (auto Val = getMaxNTIDz(F)) + Body("Maxntidz", *Val); + // TODO: Any others we should add? 
+} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 457f10f1d64a2..6cc8b6764cf8e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -119,6 +119,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + + virtual void forEachLaunchBound( + const Function &F, + std::function Body) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 777d1215214ec..8fd3dacbab87e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -242,9 +242,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { if (KernelInfoEndLTO) { PB.registerFullLinkTimeOptimizationLastEPCallback( - [](ModulePassManager &PM, OptimizationLevel Level) { + [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(KernelInfoPrinter()); + FPM.addPass(KernelInfoPrinter(this)); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll index 0c98f4ad45950..472d7c0286b01 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -1,5 +1,7 @@ ; Check info on launch bounds for AMD GPU. 
+; REQUIRES: amdgpu-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s @@ -7,16 +9,44 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsX = 200 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsY = 201 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuMaxNumWorkgroupsZ = 202 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMin = 210 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuFlatWorkGroupSizeMax = 211 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMin = 220 -; CHECK: remark: test.c:10:0: in artificial function 'test', AmdgpuWavesPerEuMax = 221 -define void @test() #0 !dbg !5 { +; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetThreadLimit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsX = 200 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsY = 201 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsZ = 202 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMin = 210 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMax = 211 +; CHECK: remark: test.c:10:0: in 
artificial function 'all', AmdgpuWavesPerEUMin = 2 +; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuWavesPerEUMax = 9 +define void @all() #0 !dbg !5 { +entry: + ret void +} + +; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetNumTeams = {{.*}} +; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetThreadLimit = {{.*}} +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsZ = 0 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMin = 4 +; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMax = 10 +define void @none() !dbg !6 { +entry: + ret void +} + +; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetNumTeams = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetThreadLimit = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsX = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsY = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsZ = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMin = 4 +; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMax = 10 +define void @bogus() #1 !dbg !7 { entry: ret void } @@ -26,7 +56,18 @@ attributes #0 = { "omp_target_thread_limit"="101" "amdgpu-max-num-workgroups"="200,201,202" "amdgpu-flat-work-group-size"="210,211" - "amdgpu-waves-per-eu"="220,221" + "amdgpu-waves-per-eu"="2,9" +} + +; 
We choose values that are small enough to parse successfully but that are +; impossibly large. For values that are validated, we check that they are +; overridden with realistic values. +attributes #1 = { + "omp_target_num_teams"="987654321" + "omp_target_thread_limit"="987654321" + "amdgpu-max-num-workgroups"="987654321,987654321,987654321" + "amdgpu-flat-work-group-size"="987654321,987654321" + "amdgpu-waves-per-eu"="987654321,987654321" } !llvm.module.flags = !{!0} @@ -37,4 +78,6 @@ attributes #0 = { !2 = !DIFile(filename: "test.c", directory: "/tmp") !3 = !{} !4 = !DISubroutineType(types: !3) -!5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!5 = distinct !DISubprogram(name: "all", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!6 = distinct !DISubprogram(name: "none", scope: !2, file: !2, line: 11, type: !4, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) +!7 = distinct !DISubprogram(name: "bogus", scope: !2, file: !2, line: 12, type: !4, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll index c7339f90e3ca9..d9a024f38652e 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -1,5 +1,7 @@ ; Check info on launch bounds for NVPTX. 
+; REQUIRES: nvptx-registered-target + ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ ; RUN: FileCheck -match-full-lines %s @@ -11,6 +13,8 @@ target triple = "nvptx64-nvidia-cuda" ; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 ; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200 ; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidy = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidz = 212 define void @test() #0 !dbg !5 { entry: ret void @@ -23,7 +27,7 @@ attributes #0 = { !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} -!nvvm.annotations = !{!6, !7, !8} +!nvvm.annotations = !{!6, !7, !8, !9, !10} !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) @@ -33,4 +37,6 @@ attributes #0 = { !5 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 10, type: !4, scopeLine: 10, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !3) !6 = !{ptr @test, !"maxclusterrank", i32 200} !7 = !{ptr @test, !"maxntidx", i32 210} -!8 = distinct !{ptr null, !"kernel", i32 1} +!8 = !{ptr @test, !"maxntidy", i32 211} +!9 = !{ptr @test, !"maxntidz", i32 212} +!10 = distinct !{ptr null, !"kernel", i32 1} diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index be3b357cc4530..d21dde10f979a 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -17,8 +17,13 @@ ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 
'__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsZ = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMin = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMax = 10 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 @@ -34,6 +39,13 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
AmdgpuMaxNumWorkgroupsZ = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMin = 4 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMax = 10 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 @@ -49,6 +61,13 @@ ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsX = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsY = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsZ = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMin = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMax = 1024 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMin = 4 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMax = 10 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 From 94d90d17e156f6a8e89cf3155bde2138a65c4f42 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 16 Sep 2024 16:21:32 -0400 Subject: [PATCH 078/114] Adjust forEachLaunchBound param * std::function -> llvm::function_ref * unsigned -> int64_t --- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 ++++--- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 7 ++++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index 485aa7e13fe69..d301304a47275 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -343,7 +343,7 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// For \p F, call \p Body with the name and value of each launch bound. virtual void forEachLaunchBound( const Function &F, - std::function Body) const {} + llvm::function_ref Body) const {} }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index f1eb5fcb2c06f..1ec7a6f64bbf5 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -713,7 +713,7 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { void GCNSubtarget::forEachLaunchBound( const Function &F, - std::function Body) const { + llvm::function_ref Body) const { auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F); Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7fb7fcd496ade..0df0a3e8ecca6 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1594,9 +1594,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return false; } - virtual void forEachLaunchBound( - 
const Function &F, - std::function Body) const override; + virtual void + forEachLaunchBound(const Function &F, + llvm::function_ref + Body) const override; }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index fccb3de453734..ab68f54f0473c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -73,7 +73,7 @@ bool NVPTXSubtarget::allowFP16Math() const { void NVPTXSubtarget::forEachLaunchBound( const Function &F, - std::function Body) const { + llvm::function_ref Body) const { unsigned Val; if (getMaxClusterRank(F, Val)) Body("Maxclusterrank", Val); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 6cc8b6764cf8e..710faf0665054 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -120,9 +120,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); - virtual void forEachLaunchBound( - const Function &F, - std::function Body) const override; + virtual void + forEachLaunchBound(const Function &F, + llvm::function_ref + Body) const override; }; } // End llvm namespace From 762a217705f0ffd90723e2d8d9d54f1c39975c2a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 16 Sep 2024 16:21:58 -0400 Subject: [PATCH 079/114] Reuse Function::getFnAttributeAsParsedInteger --- llvm/lib/Analysis/KernelInfo.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 6d0efdfec8344..85d923a97740d 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -286,17 +286,9 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, static std::optional parseFnAttrAsInteger(Function &F, StringRef Name) { - Attribute A = F.getFnAttribute(Name); - if (!A.isStringAttribute()) + if (!F.hasFnAttribute(Name)) return std::nullopt; - StringRef Field = A.getValueAsString(); - int64_t Val; - if (Field.getAsInteger(0, Val)) { - F.getContext().emitError("cannot parse integer in attribute '" + Name + - "': " + Field); - return std::nullopt; - } - return Val; + return F.getFnAttributeAsParsedInteger(Name); } void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, From df66a3d2c28339f2f3d6cc515a550894e5a05bef Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 16 Sep 2024 16:22:09 -0400 Subject: [PATCH 080/114] Move forEachLaunchBound to TargetTransformInfo --- .../include/llvm/Analysis/TargetTransformInfo.h | 15 +++++++++++++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 4 ++++ llvm/include/llvm/CodeGen/TargetSubtargetInfo.h | 5 ----- llvm/lib/Analysis/KernelInfo.cpp | 17 +++++------------ llvm/lib/Analysis/TargetTransformInfo.cpp | 6 ++++++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 16 ++++++++++++++++ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 16 ---------------- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 ----- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 16 ---------------- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 5 ----- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 15 +++++++++++++++ .../lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 3 +++ 13 files changed, 67 insertions(+), 59 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index b2124c6106198..e55aed11e53c9 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1788,6 +1788,11 @@ class TargetTransformInfo { /// @} + /// For \p F, call \p Body with the name and value of each launch bound. + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const; + private: /// The abstract base class used to type erase specific TTI /// implementations. 
@@ -2179,6 +2184,9 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const = 0; }; template @@ -2952,6 +2960,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + void + forEachLaunchBound(const Function &F, + llvm::function_ref + Body) const override { + return Impl.forEachLaunchBound(F, Body); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 90eef93a2a54d..684aa44cb945e 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -973,6 +973,10 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const {} + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index d301304a47275..bfaa6450779ae 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -339,11 +339,6 @@ class TargetSubtargetInfo : public MCSubtargetInfo { // Conservatively assume such instructions exist by default. return true; } - - /// For \p F, call \p Body with the name and value of each launch bound. 
- virtual void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const {} }; } // end namespace llvm diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 85d923a97740d..a71d8b3acd09f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" @@ -294,7 +293,8 @@ static std::optional parseFnAttrAsInteger(Function &F, void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, TargetMachine *TM) { KernelInfo KI; - KI.FlatAddrspace = FAM.getResult(F).getFlatAddressSpace(); + TargetTransformInfo &TheTTI = FAM.getResult(F); + KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); @@ -312,16 +312,9 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, REMARK_PROPERTY(ExternalNotKernel); REMARK_PROPERTY(OmpTargetNumTeams); REMARK_PROPERTY(OmpTargetThreadLimit); - // TM might be nullptr if support for the target was not built. For example, - // we currently have some KernelInfo tests where the choice of target isn't - // important, so they arbitrarily choose a target triple. Those tests are - // expected to run successfully even if support for that target was not built. 
- if (TM) { - TM->getSubtargetImpl(F)->forEachLaunchBound( - F, [&](StringRef Name, unsigned Value) { - remarkProperty(ORE, F, Name, Value); - }); - } + TheTTI.forEachLaunchBound(F, [&](StringRef Name, unsigned Value) { + remarkProperty(ORE, F, Name, Value); + }); REMARK_PROPERTY(Allocas); REMARK_PROPERTY(AllocasStaticSizeSum); REMARK_PROPERTY(AllocasDyn); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 2c26493bd3f1c..cf48fa7614173 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1348,6 +1348,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +void TargetTransformInfo::forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const { + return TTIImpl->forEachLaunchBound(F, Body); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 4cf7733a260ff..fe362f40cf56f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1390,3 +1390,19 @@ unsigned GCNTTIImpl::getPrefetchDistance() const { bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } + +void GCNTTIImpl::forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const { + auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); + Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); + Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); + auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); + Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); + 
Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second); + auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); + Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); + Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); + // TODO: Any others we should add? +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 01df2e6caaba1..529170888f2e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -266,6 +266,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const override; + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 1ec7a6f64bbf5..52c24a5c25ec2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -711,22 +711,6 @@ unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { return NSAThreshold; } -void GCNSubtarget::forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const { - auto AmdgpuMaxNumWorkgroups = getMaxNumWorkGroups(F); - Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); - Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); - Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); - auto AmdgpuFlatWorkGroupSize = getFlatWorkGroupSizes(F); - Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); - Body("AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second); - auto AmdgpuWavesPerEU = getWavesPerEU(F); - Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); - Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); - // TODO: Any others we should add? 
-} - GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST) : ST(ST) { diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0df0a3e8ecca6..a4ae8a1be3225 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1593,11 +1593,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // to the same register. return false; } - - virtual void - forEachLaunchBound(const Function &F, - llvm::function_ref - Body) const override; }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index ab68f54f0473c..420065585b384 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -12,7 +12,6 @@ #include "NVPTXSubtarget.h" #include "NVPTXTargetMachine.h" -#include "NVPTXUtilities.h" using namespace llvm; @@ -70,18 +69,3 @@ bool NVPTXSubtarget::hasImageHandles() const { bool NVPTXSubtarget::allowFP16Math() const { return hasFP16Math() && NoF16Math == false; } - -void NVPTXSubtarget::forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const { - unsigned Val; - if (getMaxClusterRank(F, Val)) - Body("Maxclusterrank", Val); - if (auto Val = getMaxNTIDx(F)) - Body("Maxntidx", *Val); - if (auto Val = getMaxNTIDy(F)) - Body("Maxntidy", *Val); - if (auto Val = getMaxNTIDz(F)) - Body("Maxntidz", *Val); - // TODO: Any others we should add? 
-} diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 710faf0665054..457f10f1d64a2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -119,11 +119,6 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); - - virtual void - forEachLaunchBound(const Function &F, - llvm::function_ref - Body) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 9a8ea8f87896a..50cc2c8e22d4f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -442,3 +442,18 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) { BaseT::getPeelingPreferences(L, SE, PP); } + +void NVPTXTTIImpl::forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const { + unsigned Val; + if (getMaxClusterRank(F, Val)) + Body("Maxclusterrank", Val); + if (auto Val = getMaxNTIDx(F)) + Body("Maxntidx", *Val); + if (auto Val = getMaxNTIDy(F)) + Body("Maxntidy", *Val); + if (auto Val = getMaxNTIDz(F)) + Body("Maxntidz", *Val); + // TODO: Any others we should add? +} diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 4160f5f6bfae7..2d794f1d80050 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -124,6 +124,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase { return true; } } + void forEachLaunchBound( + const Function &F, + llvm::function_ref Body) const; }; } // end namespace llvm From 3f63d532fa99a59a3be58e31d09943b143b1c889 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Thu, 26 Sep 2024 14:03:13 -0400 Subject: [PATCH 081/114] forEachLaunchBound -> collectLaunchBounds Return the launch bounds instead of passing them to a callback. --- .../llvm/Analysis/TargetTransformInfo.h | 21 +++++++------- .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-- llvm/lib/Analysis/KernelInfo.cpp | 28 ++++++------------- llvm/lib/Analysis/TargetTransformInfo.cpp | 6 ++-- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 18 ++++++------ .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6 ++-- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 12 ++++---- .../Target/NVPTX/NVPTXTargetTransformInfo.h | 6 ++-- 8 files changed, 45 insertions(+), 56 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 27798ca4747e6..106fef4ef820b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1794,10 +1794,10 @@ class TargetTransformInfo { /// @} - /// For \p F, call \p Body with the name and value of each launch bound. - void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const; + /// Collect launch bounds for \p F into \p LB. 
+ void + collectLaunchBounds(const Function &F, + SmallVectorImpl> &LB) const; private: /// The abstract base class used to type erase specific TTI @@ -2191,9 +2191,9 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; - virtual void forEachLaunchBound( + virtual void collectLaunchBounds( const Function &F, - llvm::function_ref Body) const = 0; + SmallVectorImpl> &LB) const = 0; }; template @@ -2973,11 +2973,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getMaxNumArgs(); } - void - forEachLaunchBound(const Function &F, - llvm::function_ref - Body) const override { - return Impl.forEachLaunchBound(F, Body); + void collectLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const override { + Impl.collectLaunchBounds(F, LB); } }; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7832c2f2c2803..1e05fa7200fe7 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -979,9 +979,9 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } - void forEachLaunchBound( + void collectLaunchBounds( const Function &F, - llvm::function_ref Body) const {} + SmallVectorImpl> &LB) const {} protected: // Obtain the minimum required size to hold the value (without the sign) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index a71d8b3acd09f..826340ca8401d 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -40,11 +40,8 @@ class KernelInfo { /// Whether the function has external linkage and is not a kernel function. bool ExternalNotKernel = false; - /// OpenMP Launch bounds. 
- ///@{ - std::optional OmpTargetNumTeams; - std::optional OmpTargetThreadLimit; - ///@} + /// Launch bounds. + SmallVector> LaunchBounds; /// The number of alloca instructions inside the function, the number of those /// with allocation sizes that cannot be determined at compile time, and the @@ -276,13 +273,6 @@ static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, }); } -static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, - StringRef Name, std::optional Value) { - if (!Value) - return; - remarkProperty(ORE, F, Name, Value.value()); -} - static std::optional parseFnAttrAsInteger(Function &F, StringRef Name) { if (!F.hasFnAttribute(Name)) @@ -298,8 +288,11 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); - KI.OmpTargetNumTeams = parseFnAttrAsInteger(F, "omp_target_num_teams"); - KI.OmpTargetThreadLimit = parseFnAttrAsInteger(F, "omp_target_thread_limit"); + if (auto Val = parseFnAttrAsInteger(F, "omp_target_num_teams")) + KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val}); + if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit")) + KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val}); + TheTTI.collectLaunchBounds(F, KI.LaunchBounds); const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); @@ -310,11 +303,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) REMARK_PROPERTY(ExternalNotKernel); - REMARK_PROPERTY(OmpTargetNumTeams); - REMARK_PROPERTY(OmpTargetThreadLimit); - TheTTI.forEachLaunchBound(F, [&](StringRef Name, unsigned Value) { - remarkProperty(ORE, F, Name, Value); - }); + for (auto LB : KI.LaunchBounds) + remarkProperty(ORE, F, LB.first, LB.second); REMARK_PROPERTY(Allocas); REMARK_PROPERTY(AllocasStaticSizeSum); 
REMARK_PROPERTY(AllocasDyn); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 946754735efcb..6c24ec34d80b2 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1354,10 +1354,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } -void TargetTransformInfo::forEachLaunchBound( +void TargetTransformInfo::collectLaunchBounds( const Function &F, - llvm::function_ref Body) const { - return TTIImpl->forEachLaunchBound(F, Body); + SmallVectorImpl> &LB) const { + return TTIImpl->collectLaunchBounds(F, LB); } TargetTransformInfo::Concept::~Concept() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index fe362f40cf56f..6094e5a42f4bf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1391,18 +1391,18 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } -void GCNTTIImpl::forEachLaunchBound( +void GCNTTIImpl::collectLaunchBounds( const Function &F, - llvm::function_ref Body) const { + SmallVectorImpl> &LB) const { auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); - Body("AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]); - Body("AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]); - Body("AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]); + LB.push_back({"AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]}); + LB.push_back({"AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]}); + LB.push_back({"AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]}); auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); - Body("AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first); - Body("AmdgpuFlatWorkGroupSizeMax", 
AmdgpuFlatWorkGroupSize.second); + LB.push_back({"AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first}); + LB.push_back({"AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second}); auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); - Body("AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first); - Body("AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second); + LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first}); + LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second}); // TODO: Any others we should add? } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 34944e6c478aa..4b30ac71ccd33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -265,9 +265,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. bool shouldPrefetchAddressSpace(unsigned AS) const override; - void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const; + void + collectLaunchBounds(const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 1a99a1cf91144..4752cf01dd205 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -443,16 +443,16 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } -void NVPTXTTIImpl::forEachLaunchBound( +void NVPTXTTIImpl::collectLaunchBounds( const Function &F, - llvm::function_ref Body) const { + SmallVectorImpl> &LB) const { if (auto Val = getMaxClusterRank(F)) - Body("Maxclusterrank", *Val); + LB.push_back({"Maxclusterrank", *Val}); if (auto Val = getMaxNTIDx(F)) - Body("Maxntidx", *Val); + LB.push_back({"Maxntidx", *Val}); if (auto Val = getMaxNTIDy(F)) - 
Body("Maxntidy", *Val); + LB.push_back({"Maxntidy", *Val}); if (auto Val = getMaxNTIDz(F)) - Body("Maxntidz", *Val); + LB.push_back({"Maxntidz", *Val}); // TODO: Any others we should add? } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 7e7d1dd588855..07c14e88cc786 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -123,9 +123,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase { return true; } } - void forEachLaunchBound( - const Function &F, - llvm::function_ref Body) const; + void + collectLaunchBounds(const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm From f5d9f550cd711bd86c0d7b7bf70b7f5ecd9772e0 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 27 Sep 2024 19:37:45 -0500 Subject: [PATCH 082/114] Rebase updates --- compiler-rt/lib/profile/InstrProfilingFile.c | 1 - .../common/src/PluginInterface.cpp | 5 +- offload/test/offloading/pgo1.c | 74 ------------------- 3 files changed, 2 insertions(+), 78 deletions(-) delete mode 100644 offload/test/offloading/pgo1.c diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index a912de79da03a..c40942e37cb07 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1359,7 +1359,6 @@ int __llvm_write_custom_profile(const char *Target, /* Prepend "TARGET." 
to current filename */ memcpy(TargetFilename, Target, TargetLength); TargetFilename[TargetLength] = '.'; - memcpy(TargetFilename, Target, TargetLength); memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength); TargetFilename[FilenameLength + 1 + TargetLength] = 0; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 0153da6cbf677..dc6f50cab9d58 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -859,9 +859,8 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { ProfOrErr->dump(); // Write data to profiling file - if (auto Err = ProfOrErr->write()) { - consumeError(std::move(Err)); - } + if (auto Err = ProfOrErr->write()) + return Err; } // Delete the memory manager before deinitializing the device. Otherwise, diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c deleted file mode 100644 index c0d698323adf0..0000000000000 --- a/offload/test/offloading/pgo1.c +++ /dev/null @@ -1,74 +0,0 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ -// RUN: -Xclang "-fprofile-instrument=clang" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ -// RUN: --check-prefix="CLANG-PGO" -// RUN: %libomptarget-compile-generic -fprofile-generate \ -// RUN: -Xclang "-fprofile-instrument=llvm" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ -// RUN: --check-prefix="LLVM-PGO" - -// REQUIRES: gpu -// REQUIRES: pgo - -#ifdef _OPENMP -#include -#endif - -int test1(int a) { return a / 2; } -int test2(int a) { return a * 2; } - -int main() { - int m = 2; -#pragma omp target - for (int i = 0; i < 10; i++) { - m = test1(m); - for (int j = 0; j < 2; j++) { - m = test2(m); - } - } -} - -// CLANG-PGO: ======== Counters ========= -// CLANG-PGO-NEXT: [ 0 11 20 ] -// CLANG-PGO-NEXT: [ 10 ] -// CLANG-PGO-NEXT: [ 20 ] -// CLANG-PGO-NEXT: ========== Data =========== -// 
CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// CLANG-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// CLANG-PGO-NEXT: ======== Functions ======== -// CLANG-PGO-NEXT: pgo1.c: -// CLANG-PGO-SAME: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// CLANG-PGO-NEXT: test1 -// CLANG-PGO-NEXT: test2 - -// LLVM-PGO: ======== Counters ========= -// LLVM-PGO-NEXT: [ 20 10 2 1 ] -// LLVM-PGO-NEXT: [ 10 ] -// LLVM-PGO-NEXT: [ 20 ] -// LLVM-PGO-NEXT: ========== Data =========== -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: { {{[0-9]*}} {{[0-9]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{0x[0-9a-fA-F]*}} {{0x[0-9a-fA-F]*}} -// LLVM-PGO-SAME: {{[0-9]*}} {{[0-9]*}} {{[0-9]*}} } -// LLVM-PGO-NEXT: ======== Functions ======== -// LLVM-PGO-NEXT: __omp_offloading_{{[_0-9a-zA-Z]*}}_main_{{[_0-9a-zA-Z]*}} -// LLVM-PGO-NEXT: test1 -// LLVM-PGO-NEXT: test2 From e246227bc5ca2ce485798cc49d7e25d736b663ac Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 27 Sep 2024 17:50:43 -0400 Subject: [PATCH 083/114] Hack offload tests to find built llvm-profdata --- offload/test/lit.cfg | 2 ++ offload/test/lit.site.cfg.in | 1 + 2 files changed, 3 insertions(+) diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 5f5c96b35c734..bb2b3cd2e8e61 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -420,3 +420,5 @@ config.substitutions.append(("%flags", config.test_flags)) config.substitutions.append(("%not", config.libomptarget_not)) config.substitutions.append(("%offload-device-info", config.offload_device_info)) +config.substitutions.append(("llvm-profdata", + config.bin_dir + "/../../bin/llvm-profdata")) diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index a1cb5acc38a40..c7713910fd39d 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,5 +1,6 @@ @AUTO_GEN_COMMENT@ +config.bin_dir = "@CMAKE_BINARY_DIR@" config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" From feeaa3780cf725f0da1404b99b3f8634dbce75de Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Sat, 28 Sep 2024 12:50:23 -0400 Subject: [PATCH 084/114] Remove redundant private --- llvm/include/llvm/Analysis/KernelInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/Analysis/KernelInfo.h b/llvm/include/llvm/Analysis/KernelInfo.h index 6633c28858a2f..75d92c202212b 100644 --- a/llvm/include/llvm/Analysis/KernelInfo.h +++ b/llvm/include/llvm/Analysis/KernelInfo.h @@ -22,7 +22,6 @@ namespace llvm { class TargetMachine; class KernelInfoPrinter : public PassInfoMixin { -private: TargetMachine *TM; public: From d2847b0e4fde9d948ca5b5067cdbe0648ea98ff9 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 30 Sep 2024 17:55:01 -0400 Subject: [PATCH 085/114] Extend kernel-info to emit PGO-based FLOP count --- llvm/docs/KernelInfo.rst | 47 ++++++ llvm/lib/Analysis/KernelInfo.cpp | 91 +++++++++--- llvm/test/Analysis/KernelInfo/flop-pgo.ll | 138 ++++++++++++++++++ .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 3 + llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 3 + 5 files changed, 264 insertions(+), 18 deletions(-) create mode 100644 llvm/test/Analysis/KernelInfo/flop-pgo.ll diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index 397b32602bce2..a73ffa9917f41 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -59,3 +59,50 @@ kernel-info can also be inserted into a specified LLVM pass pipeline using $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -pass-remarks=kernel-info -passes='lto,module(kernel-info)' + +PGO +=== + +Using LLVM's PGO implementation for GPUs, profile data can augment the info +reported by kernel-info. In particular, an estimate of the number of floating +point operations executed can be reported. + +For example, the following computes 2\ :sup:`4`\ , so we expect 4 fmul +instructions to be executed at run time: + +.. 
code-block:: shell + + $ cat test.c + #include + #include + __attribute__((noinline)) + double test(double x, int n) { + double res = 1; + for (int i = 0; i < n; ++i) + res *= x; + return res; + } + int main(int argc, char *argv[]) { + double x = atof(argv[1]); + unsigned n = atoi(argv[2]); + #pragma omp target map(tofrom:x) + x = test(x, n); + printf("%f\n", x); + return 0; + } + + $ clang -O1 -g -fopenmp --offload-arch=native test.c -o test \ + -fprofile-generate -fprofile-generate-gpu + + $ LLVM_PROFILE_FILE=test.profraw ./test 2 4 + 16.000000 + + $ llvm-profdata merge -output=test.profdata *.profraw + + $ clang -O1 -g -fopenmp --offload-arch=native test.c -foffload-lto \ + -Rpass=kernel-info -mllvm -kernel-info-end-lto \ + -fprofile-use-gpu=test.profdata | \ + grep -i "test.c:.*float" + test.c:13:0: in artificial function '__omp_offloading_35_1369040_main_l13', FloatingPointOpProfileCount = 0 + test.c:7:9: in function 'test.internalized', 'fmul' instruction ('%9') is a floating point op where the block profile count is 4 + test.c:4:0: in function 'test.internalized', FloatingPointOpProfileCount = 4 diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 826340ca8401d..e1be4cdfcdc62 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; /// Data structure holding function info for kernels. class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); + BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE); public: static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, @@ -73,6 +73,11 @@ class KernelInfo { /// Number of flat address space memory accesses (via load, store, etc.). int64_t FlatAddrspaceAccesses = 0; + + /// Estimate of the number of floating point operations typically executed + /// based on any available profile data. If no profile data is available, the + /// count is zero. 
+ uint64_t FloatingPointOpProfileCount = 0; }; static bool isKernelFunction(Function &F) { @@ -80,6 +85,20 @@ static bool isKernelFunction(Function &F) { return F.hasFnAttribute("kernel"); } +// For the purposes of KernelInfo::FloatingPointOpProfileCount, should this be +// considered a floating point operation? +// +// TODO: Does this correctly identify floating point operations we care about? +// For example, we skip phi and load even when they return floating point +// values. Should different operations have different weights? +static bool isFloatingPointOperation(const Instruction &I) { + if (const AtomicRMWInst *At = dyn_cast(&I)) + return At->isFloatingPointOperation(); + if (!I.getType()->isFPOrFPVectorTy()) + return false; + return I.isBinaryOp() || I.isUnaryOp(); +} + static void identifyFunction(OptimizationRemark &R, const Function &F) { if (auto *SubProgram = F.getSubprogram()) { if (SubProgram->isArtificial()) @@ -88,6 +107,19 @@ static void identifyFunction(OptimizationRemark &R, const Function &F) { R << "function '" << F.getName() << "'"; } +static void identifyInstruction(OptimizationRemark &R, const Instruction &I) { + if (const IntrinsicInst *II = dyn_cast(&I)) + R << "'" << II->getCalledFunction()->getName() << "' call"; + else + R << "'" << I.getOpcodeName() << "' instruction"; + if (!I.getType()->isVoidTy()) { + SmallString<20> Name; + raw_svector_ostream OS(Name); + I.printAsOperand(OS, /*PrintType=*/false, I.getModule()); + R << " ('" << Name << "')"; + } +} + static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, const AllocaInst &Alloca, TypeSize::ScalarTy StaticSize) { @@ -153,33 +185,45 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, const Function &Caller, - const Instruction &Inst) { + const Instruction &I) { ORE.emit([&] { - OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &Inst); + 
OptimizationRemark R(DEBUG_TYPE, "FlatAddrspaceAccess", &I); R << "in "; identifyFunction(R, Caller); - if (const IntrinsicInst *II = dyn_cast(&Inst)) { - R << ", '" << II->getCalledFunction()->getName() << "' call"; - } else { - R << ", '" << Inst.getOpcodeName() << "' instruction"; - } - if (!Inst.getType()->isVoidTy()) { - SmallString<20> Name; - raw_svector_ostream OS(Name); - Inst.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); - R << " ('" << Name << "')"; - } + R << ", "; + identifyInstruction(R, I); R << " accesses memory in flat address space"; return R; }); } +static void remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, + const Function &Caller, + const Instruction &I, + uint64_t BlockProfileCount) { + ORE.emit([&] { + OptimizationRemark R(DEBUG_TYPE, "FloatingPointOp", &I); + R << "in "; + identifyFunction(R, Caller); + R << ", "; + identifyInstruction(R, I); + R << " is a floating point op where the block profile count is " + << utostr(BlockProfileCount); + return R; + }); +} + void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, + BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE) { assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); const DataLayout &DL = M.getDataLayout(); + uint64_t BlockProfileCount = 0; + // TODO: Is AllowSynthetic what we want? 
+ if (auto Val = BFI.getBlockProfileCount(&BB, /*AllowSynthetic=*/true)) + BlockProfileCount = *Val; for (const Instruction &I : BB.instructionsWithoutDebug()) { if (const AllocaInst *Alloca = dyn_cast(&I)) { Allocas += Direction; @@ -259,16 +303,25 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, remarkFlatAddrspaceAccess(ORE, F, I); } } + if (isFloatingPointOperation(I)) { + FloatingPointOpProfileCount += Direction * BlockProfileCount; + remarkFloatingPointOp(ORE, F, I, BlockProfileCount); + } } } -static void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, - StringRef Name, int64_t Value) { +static std::string toString(bool Val) { return itostr(Val); } +static std::string toString(int64_t Val) { return itostr(Val); } +static std::string toString(uint64_t Val) { return utostr(Val); } + +template +void remarkProperty(OptimizationRemarkEmitter &ORE, const Function &F, + StringRef Name, T Val) { ORE.emit([&] { OptimizationRemark R(DEBUG_TYPE, Name, &F); R << "in "; identifyFunction(R, F); - R << ", " << Name << " = " << itostr(Value); + R << ", " << Name << " = " << toString(Val); return R; }); } @@ -284,6 +337,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, TargetMachine *TM) { KernelInfo KI; TargetTransformInfo &TheTTI = FAM.getResult(F); + BlockFrequencyInfo &BFI = FAM.getResult(F); KI.FlatAddrspace = TheTTI.getFlatAddressSpace(); // Record function properties. 
@@ -298,7 +352,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, auto &ORE = FAM.getResult(F); for (const auto &BB : F) if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE); + KI.updateForBB(BB, +1, BFI, ORE); #define REMARK_PROPERTY(PROP_NAME) \ remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) @@ -314,6 +368,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, REMARK_PROPERTY(InlineAssemblyCalls); REMARK_PROPERTY(Invokes); REMARK_PROPERTY(FlatAddrspaceAccesses); + REMARK_PROPERTY(FloatingPointOpProfileCount); #undef REMARK_PROPERTY return; diff --git a/llvm/test/Analysis/KernelInfo/flop-pgo.ll b/llvm/test/Analysis/KernelInfo/flop-pgo.ll new file mode 100644 index 0000000000000..007e88ba4593a --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/flop-pgo.ll @@ -0,0 +1,138 @@ +; Check info on floating point operations. + +; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ +; RUN: -disable-output %s 2>&1 | \ +; RUN: FileCheck -match-full-lines -implicit-check-not='floating point' %s + +target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Check function with profile data but no floating point operations. +define double @noFlop() !dbg !100 !prof !102 { + ret double 0.000000e+00, !dbg !105 +} +; CHECK: remark: test.c:2:0: in function 'noFlop', FloatingPointOpProfileCount = 0 + +; Check function with floating point operations but no profile data. +define double @noProf() !dbg !200 { + ; CHECK: remark: test.c:3:9: in function 'noProf', 'fadd' instruction ('%fadd') is a floating point op where the block profile count is 0 + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !204 + ret double 0.000000e+00, !dbg !205 +} +; CHECK: remark: test.c:3:0: in function 'noProf', FloatingPointOpProfileCount = 0 + +; Check function with floating point operations and profile data. 
+define double @f() !dbg !300 !prof !302 { + ; Check floating point operation in entry block, which has a count of 1 per + ; entry into the function. + ; + ; Also, check case of basic block with exactly 1 floating point operation. + %alloca = alloca double, align 8, addrspace(1), !dbg !398 + ; CHECK: remark: test.c:5:9: in function 'f', 'fadd' instruction ('%fadd') is a floating point op where the block profile count is 2 + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !305 + br label %.none, !dbg !398 + + ; Check floating point operation in ret block. + ; + ; branch_weights gives this block a count of 1 per entry into the function. +.ret: ; preds = %.many + ; CHECK: remark: test.c:6:9: in function 'f', 'fsub' instruction ('%fsub') is a floating point op where the block profile count is 2 + %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !306 + ; CHECK: remark: test.c:7:9: in function 'f', 'fmul' instruction ('%fmul') is a floating point op where the block profile count is 2 + %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !307 + ret double 0.000000e+00, !dbg !398 + + ; Check case of 0 floating point operations in a basic block. +.none: ; preds = %0 + br label %.many, !dbg !398 + + ; Check case of many floating point operations in a basic block. + ; + ; branch_weights gives this block a count of 3 per entry into the function. +.many: ; preds = %.none, %.many + ; These are not considered floating point ops even though they return floating + ; point values. + %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !398 + %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !398 + + ; Check simple floating point ops not already checked above, and check an + ; unnamed value. 
+ ; + ; CHECK: remark: test.c:8:9: in function 'f', 'fdiv' instruction ('%1') is a floating point op where the block profile count is 6 + %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !308 + ; CHECK: remark: test.c:9:9: in function 'f', 'fneg' instruction ('%fneg') is a floating point op where the block profile count is 6 + %fneg = fneg double 0.000000e+00, !dbg !309 + + ; Check atomicrmw. + ; + ; CHECK: remark: test.c:10:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !310 + ; CHECK: remark: test.c:11:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !311 + ; CHECK: remark: test.c:12:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !312 + ; CHECK: remark: test.c:13:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !313 + ; atomicrmw that is not a floating point op. + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !398 + + ; Check floating point types besides double scalar. 
+ ; + ; CHECK: remark: test.c:14:9: in function 'f', 'fadd' instruction ('%float') is a floating point op where the block profile count is 6 + %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !314 + ; CHECK: remark: test.c:15:9: in function 'f', 'fadd' instruction ('%half') is a floating point op where the block profile count is 6 + %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !315 + ; CHECK: remark: test.c:16:9: in function 'f', 'fadd' instruction ('%bfloat') is a floating point op where the block profile count is 6 + %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !316 + ; CHECK: remark: test.c:17:9: in function 'f', 'fadd' instruction ('%fp128') is a floating point op where the block profile count is 6 + %fp128 = fadd fp128 0xL0, 0xL0, !dbg !317 + ; CHECK: remark: test.c:18:9: in function 'f', 'fadd' instruction ('%vector') is a floating point op where the block profile count is 6 + %vector = fadd <2 x double> , , !dbg !318 + + br i1 false, label %.ret, label %.many, !prof !399, !dbg !398 +} +; CHECK: remark: test.c:4:0: in function 'f', FloatingPointOpProfileCount = 72 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C11, file: !2, producer: "clang version 20.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!2 = !DIFile(filename: "test.c", directory: "/tmp") +!3 = !{} + +!100 = distinct !DISubprogram(name: "noFlop", scope: !2, file: !2, line: 2, type: !101, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!101 = !DISubroutineType(types: !3) +!102 = !{!"function_entry_count", i64 5} +!103 = distinct !DILexicalBlock(scope: !104, file: !2, line: 2, column: 3) +!104 = distinct !DILexicalBlock(scope: !100, file: !2, line: 2, column: 3) +!105 = !DILocation(line: 2, column: 9, scope: !103) + 
+!200 = distinct !DISubprogram(name: "noProf", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!201 = !DISubroutineType(types: !3) +!202 = distinct !DILexicalBlock(scope: !203, file: !2, line: 3, column: 3) +!203 = distinct !DILexicalBlock(scope: !200, file: !2, line: 3, column: 3) +!204 = !DILocation(line: 3, column: 9, scope: !202) +!205 = !DILocation(line: 4, column: 9, scope: !202) + +!300 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 4, type: !301, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!301 = !DISubroutineType(types: !3) +!302 = !{!"function_entry_count", i64 2} +!303 = distinct !DILexicalBlock(scope: !304, file: !2, line: 6, column: 3) +!304 = distinct !DILexicalBlock(scope: !300, file: !2, line: 6, column: 3) +!305 = !DILocation(line: 5, column: 9, scope: !303) +!306 = !DILocation(line: 6, column: 9, scope: !303) +!307 = !DILocation(line: 7, column: 9, scope: !303) +!308 = !DILocation(line: 8, column: 9, scope: !303) +!309 = !DILocation(line: 9, column: 9, scope: !303) +!310 = !DILocation(line: 10, column: 9, scope: !303) +!311 = !DILocation(line: 11, column: 9, scope: !303) +!312 = !DILocation(line: 12, column: 9, scope: !303) +!313 = !DILocation(line: 13, column: 9, scope: !303) +!314 = !DILocation(line: 14, column: 9, scope: !303) +!315 = !DILocation(line: 15, column: 9, scope: !303) +!316 = !DILocation(line: 16, column: 9, scope: !303) +!317 = !DILocation(line: 17, column: 9, scope: !303) +!318 = !DILocation(line: 18, column: 9, scope: !303) +!398 = !DILocation(line: 999, column: 999, scope: !303) +!399 = !{!"branch_weights", i32 127, i32 257} diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index d21dde10f979a..9c6efa8e10185 100644 --- 
a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -33,6 +33,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FloatingPointOpProfileCount = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space @@ -55,6 +56,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FloatingPointOpProfileCount = 0 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes @@ -77,6 +79,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FloatingPointOpProfileCount = 0 ; CHECK-NOT: {{.}} diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 2dbd04b2536c4..582b62f647bad 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -27,6 +27,7 @@ ; 
CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FloatingPointOpProfileCount = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space @@ -42,6 +43,7 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FloatingPointOpProfileCount = 0 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes @@ -57,6 +59,7 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', FloatingPointOpProfileCount = 0 ; CHECK-NOT: remark: {{.*: in function 'g',.*}} ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't From e04b933331c855097fdd34c25ae2998fb527c2be Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 3 Oct 2024 16:38:58 -0400 Subject: [PATCH 086/114] Improve some kernel-info instruction remarks - Point out floating point types. - Point out when profile data is unavailable. 
- Otherwise, make remarks less verbose. --- llvm/docs/KernelInfo.rst | 4 +- llvm/lib/Analysis/KernelInfo.cpp | 50 ++++++++++++------- .../KernelInfo/flat-addrspace/Inputs/test.ll | 18 +++---- llvm/test/Analysis/KernelInfo/flop-pgo.ll | 30 +++++------ .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 6 +-- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 6 +-- 6 files changed, 63 insertions(+), 51 deletions(-) diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index a73ffa9917f41..4a373e1143729 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -102,7 +102,7 @@ instructions to be executed at run time: $ clang -O1 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -Rpass=kernel-info -mllvm -kernel-info-end-lto \ -fprofile-use-gpu=test.profdata | \ - grep -i "test.c:.*float" + grep "test.c:.*Floating\|double" test.c:13:0: in artificial function '__omp_offloading_35_1369040_main_l13', FloatingPointOpProfileCount = 0 - test.c:7:9: in function 'test.internalized', 'fmul' instruction ('%9') is a floating point op where the block profile count is 4 + test.c:7:9: in function 'test.internalized', double 'fmul' ('%9') executed 4 times test.c:4:0: in function 'test.internalized', FloatingPointOpProfileCount = 4 diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index e1be4cdfcdc62..3d127a3b7da6a 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -86,17 +86,24 @@ static bool isKernelFunction(Function &F) { } // For the purposes of KernelInfo::FloatingPointOpProfileCount, should this be -// considered a floating point operation? +// considered a floating point operation? If so, return the floating point +// type. Otherwise, return nullptr. // // TODO: Does this correctly identify floating point operations we care about? // For example, we skip phi and load even when they return floating point // values. Should different operations have different weights? 
-static bool isFloatingPointOperation(const Instruction &I) { - if (const AtomicRMWInst *At = dyn_cast(&I)) - return At->isFloatingPointOperation(); - if (!I.getType()->isFPOrFPVectorTy()) - return false; - return I.isBinaryOp() || I.isUnaryOp(); +static Type *getFloatingPointOpType(const Instruction &I) { + if (const AtomicRMWInst *At = dyn_cast(&I)) { + if (At->isFloatingPointOperation()) + return At->getType(); + return nullptr; + } + if (!I.isBinaryOp() && !I.isUnaryOp()) + return nullptr; + Type *Ty = I.getType(); + if (Ty->isFPOrFPVectorTy()) + return Ty; + return nullptr; } static void identifyFunction(OptimizationRemark &R, const Function &F) { @@ -111,7 +118,7 @@ static void identifyInstruction(OptimizationRemark &R, const Instruction &I) { if (const IntrinsicInst *II = dyn_cast(&I)) R << "'" << II->getCalledFunction()->getName() << "' call"; else - R << "'" << I.getOpcodeName() << "' instruction"; + R << "'" << I.getOpcodeName() << "'"; if (!I.getType()->isVoidTy()) { SmallString<20> Name; raw_svector_ostream OS(Name); @@ -198,17 +205,23 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, } static void remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, - const Function &Caller, - const Instruction &I, - uint64_t BlockProfileCount) { + const Function &Caller, const Instruction &I, + Type *Ty, + std::optional BlockProfileCount) { ORE.emit([&] { OptimizationRemark R(DEBUG_TYPE, "FloatingPointOp", &I); R << "in "; identifyFunction(R, Caller); R << ", "; + SmallString<10> TyName; + raw_svector_ostream OS(TyName); + Ty->print(OS); + R << TyName << " "; identifyInstruction(R, I); - R << " is a floating point op where the block profile count is " - << utostr(BlockProfileCount); + if (BlockProfileCount) + R << " executed " << utostr(*BlockProfileCount) << " times"; + else + R << " has no profile data"; return R; }); } @@ -220,10 +233,9 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, const Function &F = 
*BB.getParent(); const Module &M = *F.getParent(); const DataLayout &DL = M.getDataLayout(); - uint64_t BlockProfileCount = 0; // TODO: Is AllowSynthetic what we want? - if (auto Val = BFI.getBlockProfileCount(&BB, /*AllowSynthetic=*/true)) - BlockProfileCount = *Val; + std::optional<uint64_t> BlockProfileCount = + BFI.getBlockProfileCount(&BB, /*AllowSynthetic=*/true); for (const Instruction &I : BB.instructionsWithoutDebug()) { if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) { Allocas += Direction; @@ -303,9 +315,9 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, remarkFlatAddrspaceAccess(ORE, F, I); } } - if (isFloatingPointOperation(I)) { - FloatingPointOpProfileCount += Direction * BlockProfileCount; - remarkFloatingPointOp(ORE, F, I, BlockProfileCount); + if (Type *Ty = getFloatingPointOpType(I)) { + FloatingPointOpProfileCount += Direction * BlockProfileCount.value_or(0); + remarkFloatingPointOp(ORE, F, I, Ty, BlockProfileCount); } } } diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll index b54c3a18f3e70..a5d7fd783ec48 100644 --- a/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/flat-addrspace/Inputs/test.ll @@ -1,35 +1,35 @@ define void @f() !dbg !3 { entry: ; load: check remarks for both unnamed and named values. 
- ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%0') accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' ('%0') accesses memory in flat address space %0 = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load') accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' ('%load') accesses memory in flat address space %load = load i32, ptr null, align 4, !dbg !6 - ; CHECK: remark: test.c:3:11: in function 'f', 'load' instruction ('%load0') accesses memory in flat address space + ; CHECK: remark: test.c:3:11: in function 'f', 'load' ('%load0') accesses memory in flat address space %load0 = load i32, ptr addrspace(0) null, align 4, !dbg !6 %load1 = load i32, ptr addrspace(1) null, align 4, !dbg !6 %load2 = load i32, ptr addrspace(2) null, align 4, !dbg !6 ; store - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + ; CHECK: remark: test.c:4:6: in function 'f', 'store' accesses memory in flat address space store i32 0, ptr null, align 4, !dbg !7 - ; CHECK: remark: test.c:4:6: in function 'f', 'store' instruction accesses memory in flat address space + ; CHECK: remark: test.c:4:6: in function 'f', 'store' accesses memory in flat address space store i32 0, ptr addrspace(0) null, align 4, !dbg !7 store i32 0, ptr addrspace(1) null, align 4, !dbg !7 store i32 0, ptr addrspace(8) null, align 4, !dbg !7 ; atomicrmw - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' ('%[[#]]') accesses memory in flat address space atomicrmw xchg ptr null, i32 10 seq_cst, !dbg !8 - ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:5:1: in function 'f', 'atomicrmw' 
('%[[#]]') accesses memory in flat address space atomicrmw add ptr addrspace(0) null, i32 10 seq_cst, !dbg !8 atomicrmw xchg ptr addrspace(1) null, i32 10 seq_cst, !dbg !8 atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !8 ; cmpxchg - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' ('%[[#]]') accesses memory in flat address space cmpxchg ptr null, i32 0, i32 1 acq_rel monotonic, !dbg !9 - ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' instruction ('%[[#]]') accesses memory in flat address space + ; CHECK: remark: test.c:6:2: in function 'f', 'cmpxchg' ('%[[#]]') accesses memory in flat address space cmpxchg ptr addrspace(0) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(1) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 cmpxchg ptr addrspace(934) null, i32 0, i32 1 acq_rel monotonic, !dbg !9 diff --git a/llvm/test/Analysis/KernelInfo/flop-pgo.ll b/llvm/test/Analysis/KernelInfo/flop-pgo.ll index 007e88ba4593a..18811a0dfa00b 100644 --- a/llvm/test/Analysis/KernelInfo/flop-pgo.ll +++ b/llvm/test/Analysis/KernelInfo/flop-pgo.ll @@ -15,7 +15,7 @@ define double @noFlop() !dbg !100 !prof !102 { ; Check function with floating point operations but no profile data. define double @noProf() !dbg !200 { - ; CHECK: remark: test.c:3:9: in function 'noProf', 'fadd' instruction ('%fadd') is a floating point op where the block profile count is 0 + ; CHECK: remark: test.c:3:9: in function 'noProf', double 'fadd' ('%fadd') has no profile data %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !204 ret double 0.000000e+00, !dbg !205 } @@ -28,7 +28,7 @@ define double @f() !dbg !300 !prof !302 { ; ; Also, check case of basic block with exactly 1 floating point operation. 
%alloca = alloca double, align 8, addrspace(1), !dbg !398 - ; CHECK: remark: test.c:5:9: in function 'f', 'fadd' instruction ('%fadd') is a floating point op where the block profile count is 2 + ; CHECK: remark: test.c:5:9: in function 'f', double 'fadd' ('%fadd') executed 2 times %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !305 br label %.none, !dbg !398 @@ -36,9 +36,9 @@ define double @f() !dbg !300 !prof !302 { ; ; branch_weights gives this block a count of 1 per entry into the function. .ret: ; preds = %.many - ; CHECK: remark: test.c:6:9: in function 'f', 'fsub' instruction ('%fsub') is a floating point op where the block profile count is 2 + ; CHECK: remark: test.c:6:9: in function 'f', double 'fsub' ('%fsub') executed 2 times %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !306 - ; CHECK: remark: test.c:7:9: in function 'f', 'fmul' instruction ('%fmul') is a floating point op where the block profile count is 2 + ; CHECK: remark: test.c:7:9: in function 'f', double 'fmul' ('%fmul') executed 2 times %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !307 ret double 0.000000e+00, !dbg !398 @@ -58,35 +58,35 @@ define double @f() !dbg !300 !prof !302 { ; Check simple floating point ops not already checked above, and check an ; unnamed value. ; - ; CHECK: remark: test.c:8:9: in function 'f', 'fdiv' instruction ('%1') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:8:9: in function 'f', double 'fdiv' ('%1') executed 6 times %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !308 - ; CHECK: remark: test.c:9:9: in function 'f', 'fneg' instruction ('%fneg') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:9:9: in function 'f', double 'fneg' ('%fneg') executed 6 times %fneg = fneg double 0.000000e+00, !dbg !309 ; Check atomicrmw. 
; - ; CHECK: remark: test.c:10:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:10:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !310 - ; CHECK: remark: test.c:11:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:11:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !311 - ; CHECK: remark: test.c:12:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:12:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !312 - ; CHECK: remark: test.c:13:9: in function 'f', 'atomicrmw' instruction ('%[[#]]') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:13:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !313 ; atomicrmw that is not a floating point op. atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !398 ; Check floating point types besides double scalar. 
; - ; CHECK: remark: test.c:14:9: in function 'f', 'fadd' instruction ('%float') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:14:9: in function 'f', float 'fadd' ('%float') executed 6 times %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !314 - ; CHECK: remark: test.c:15:9: in function 'f', 'fadd' instruction ('%half') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:15:9: in function 'f', half 'fadd' ('%half') executed 6 times %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !315 - ; CHECK: remark: test.c:16:9: in function 'f', 'fadd' instruction ('%bfloat') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:16:9: in function 'f', bfloat 'fadd' ('%bfloat') executed 6 times %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !316 - ; CHECK: remark: test.c:17:9: in function 'f', 'fadd' instruction ('%fp128') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:17:9: in function 'f', fp128 'fadd' ('%fp128') executed 6 times %fp128 = fadd fp128 0xL0, 0xL0, !dbg !317 - ; CHECK: remark: test.c:18:9: in function 'f', 'fadd' instruction ('%vector') is a floating point op where the block profile count is 6 + ; CHECK: remark: test.c:18:9: in function 'f', <2 x double> 'fadd' ('%vector') executed 6 times %vector = fadd <2 x double> , , !dbg !318 br i1 false, label %.ret, label %.many, !prof !399, !dbg !398 diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 9c6efa8e10185..2bdab392c0fb7 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' 
with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -36,8 +36,8 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FloatingPointOpProfileCount = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsX = 0 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll 
index 582b62f647bad..d1d9370e2d33b 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -10,7 +10,7 @@ ; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' @@ -30,8 +30,8 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FloatingPointOpProfileCount = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space +; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' accesses memory in flat address space +; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' ('%[[#]]') accesses memory in flat address space ; 
CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 From 116f1c9f14961b287d3393f9706badbcd63c515d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 11 Oct 2024 14:51:24 -0400 Subject: [PATCH 087/114] Remove todos, as requested --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 - llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index f730738494219..880497908df27 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1424,5 +1424,4 @@ void GCNTTIImpl::collectLaunchBounds( auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first}); LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second}); - // TODO: Any others we should add? } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 47510912fd1f7..3004620b40cbe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -455,5 +455,4 @@ void NVPTXTTIImpl::collectLaunchBounds( LB.push_back({"Maxntidy", *Val}); if (auto Val = getMaxNTIDz(F)) LB.push_back({"Maxntidz", *Val}); - // TODO: Any others we should add? } From 2094465ae367d35a5cc05bdc2e1703d806491976 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 15:22:56 -0400 Subject: [PATCH 088/114] Combine registerFullLinkTimeOptimizationLastEPCallback calls --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 65771e145ff11..1264749059359 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -825,6 +825,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPULowerModuleLDSPass(*this)); if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0) PM.addPass(AMDGPUAttributorPass(*this)); + if (KernelInfoEndLTO) { + FunctionPassManager FPM; + FPM.addPass(KernelInfoPrinter(this)); + PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + } }); PB.registerRegClassFilterParsingCallback( @@ -836,14 +841,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return nullptr; }); - if (KernelInfoEndLTO) { - PB.registerFullLinkTimeOptimizationLastEPCallback( - [this](ModulePassManager &PM, OptimizationLevel Level) { - FunctionPassManager FPM; - FPM.addPass(KernelInfoPrinter(this)); - PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - }); - } } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { From 39bce7c5a18c94ff8085d169322d0683e266a792 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 15:28:34 -0400 Subject: [PATCH 089/114] collectLaunchBounds -> collectKernelLaunchBounds --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 14 +++++++------- .../llvm/Analysis/TargetTransformInfoImpl.h | 2 +- llvm/lib/Analysis/KernelInfo.cpp | 2 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++-- .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6 +++--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 6 +++--- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 29d96b4333ef2..e6ceb19ef045e 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1815,10 +1815,10 @@ class TargetTransformInfo { /// @} - /// Collect launch bounds for \p F into \p LB. - void - collectLaunchBounds(const Function &F, - SmallVectorImpl> &LB) const; + /// Collect kernel launch bounds for \p F into \p LB. 
+ void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; private: /// The abstract base class used to type erase specific TTI @@ -2220,7 +2220,7 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; - virtual void collectLaunchBounds( + virtual void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const = 0; }; @@ -3020,10 +3020,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getMaxNumArgs(); } - void collectLaunchBounds( + void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const override { - Impl.collectLaunchBounds(F, LB); + Impl.collectKernelLaunchBounds(F, LB); } }; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 89317ac2ec32d..1e6cb7841ccdf 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1001,7 +1001,7 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } - void collectLaunchBounds( + void collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const {} diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 826340ca8401d..49e001c85b08f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -292,7 +292,7 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val}); if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit")) KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val}); - TheTTI.collectLaunchBounds(F, KI.LaunchBounds); + TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8f05a19644d31..24fea39e44f05 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1372,10 +1372,10 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const { return TTIImpl->isVectorShiftByScalarCheap(Ty); } -void TargetTransformInfo::collectLaunchBounds( +void TargetTransformInfo::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { - return TTIImpl->collectLaunchBounds(F, LB); + return TTIImpl->collectKernelLaunchBounds(F, LB); } TargetTransformInfo::Concept::~Concept() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 880497908df27..98dbb064ac79f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1411,7 +1411,7 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { return AMDGPU::isFlatGlobalAddrSpace(AS); } -void GCNTTIImpl::collectLaunchBounds( +void GCNTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 60bc829f5242a..0081748253c92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -269,9 +269,9 @@ class GCNTTIImpl final : public BasicTTIImplBase { /// \return if target want to issue a prefetch in address space \p AS. 
bool shouldPrefetchAddressSpace(unsigned AS) const override; - void - collectLaunchBounds(const Function &F, - SmallVectorImpl> &LB) const; + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 3004620b40cbe..c590d4a2c7d06 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -444,7 +444,7 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } -void NVPTXTTIImpl::collectLaunchBounds( +void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { if (auto Val = getMaxClusterRank(F)) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 07c14e88cc786..5a8db82616ce1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -123,9 +123,9 @@ class NVPTXTTIImpl : public BasicTTIImplBase { return true; } } - void - collectLaunchBounds(const Function &F, - SmallVectorImpl> &LB) const; + void collectKernelLaunchBounds( + const Function &F, + SmallVectorImpl> &LB) const; }; } // end namespace llvm From 14345cf13bd071efa4bbff695351846560647d5d Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 16:02:04 -0400 Subject: [PATCH 090/114] Spell kernel-info properties like their IR attributes --- llvm/lib/Analysis/KernelInfo.cpp | 8 +-- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 20 +++---- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 8 +-- .../kernel-info-after-lto/Inputs/test.ll | 2 +- .../KernelInfo/launch-bounds/amdgpu.ll | 54 +++++++++---------- .../KernelInfo/launch-bounds/nvptx.ll | 12 ++--- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 44 +++++++-------- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 4 +- 8 files changed, 76 insertions(+), 76 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 49e001c85b08f..3aca4c59105ce 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -288,10 +288,10 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, // Record function properties. KI.ExternalNotKernel = F.hasExternalLinkage() && !isKernelFunction(F); - if (auto Val = parseFnAttrAsInteger(F, "omp_target_num_teams")) - KI.LaunchBounds.push_back({"OmpTargetNumTeams", *Val}); - if (auto Val = parseFnAttrAsInteger(F, "omp_target_thread_limit")) - KI.LaunchBounds.push_back({"OmpTargetThreadLimit", *Val}); + for (StringRef Name : {"omp_target_num_teams", "omp_target_thread_limit"}) { + if (auto Val = parseFnAttrAsInteger(F, Name)) + KI.LaunchBounds.push_back({Name, *Val}); + } TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); const DominatorTree &DT = FAM.getResult(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 98dbb064ac79f..6d92dccad076f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1414,14 +1414,14 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { void GCNTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { - 
auto AmdgpuMaxNumWorkgroups = ST->getMaxNumWorkGroups(F); - LB.push_back({"AmdgpuMaxNumWorkgroupsX", AmdgpuMaxNumWorkgroups[0]}); - LB.push_back({"AmdgpuMaxNumWorkgroupsY", AmdgpuMaxNumWorkgroups[1]}); - LB.push_back({"AmdgpuMaxNumWorkgroupsZ", AmdgpuMaxNumWorkgroups[2]}); - auto AmdgpuFlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); - LB.push_back({"AmdgpuFlatWorkGroupSizeMin", AmdgpuFlatWorkGroupSize.first}); - LB.push_back({"AmdgpuFlatWorkGroupSizeMax", AmdgpuFlatWorkGroupSize.second}); - auto AmdgpuWavesPerEU = ST->getWavesPerEU(F); - LB.push_back({"AmdgpuWavesPerEUMin", AmdgpuWavesPerEU.first}); - LB.push_back({"AmdgpuWavesPerEUMax", AmdgpuWavesPerEU.second}); + auto MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]}); + LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]}); + LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]}); + auto FlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); + LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first}); + LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second}); + auto WavesPerEU = ST->getWavesPerEU(F); + LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); + LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index c590d4a2c7d06..d230a66449063 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -448,11 +448,11 @@ void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { if (auto Val = getMaxClusterRank(F)) - LB.push_back({"Maxclusterrank", *Val}); + LB.push_back({"maxclusterrank", *Val}); if (auto Val = getMaxNTIDx(F)) - LB.push_back({"Maxntidx", *Val}); + LB.push_back({"maxntidx", *Val}); if (auto Val = getMaxNTIDy(F)) - LB.push_back({"Maxntidy", 
*Val}); + LB.push_back({"maxntidy", *Val}); if (auto Val = getMaxNTIDz(F)) - LB.push_back({"Maxntidz", *Val}); + LB.push_back({"maxntidz", *Val}); } diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll index b85e3c581867c..461544e44d538 100644 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll +++ b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll @@ -1,4 +1,4 @@ -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 ; NONE-NOT: remark: define void @test() #0 !dbg !5 { entry: diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll index 472d7c0286b01..d37dceec003f9 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -9,43 +9,43 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" -; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetNumTeams = 100 -; CHECK: remark: test.c:10:0: in artificial function 'all', OmpTargetThreadLimit = 101 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsX = 200 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsY = 201 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuMaxNumWorkgroupsZ = 202 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMin = 210 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuFlatWorkGroupSizeMax = 211 -; CHECK: remark: test.c:10:0: in 
artificial function 'all', AmdgpuWavesPerEUMin = 2 -; CHECK: remark: test.c:10:0: in artificial function 'all', AmdgpuWavesPerEUMax = 9 +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'all', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[0] = 200 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[1] = 201 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-max-num-workgroups[2] = 202 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[0] = 210 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-flat-work-group-size[1] = 211 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[0] = 2 +; CHECK: remark: test.c:10:0: in artificial function 'all', amdgpu-waves-per-eu[1] = 9 define void @all() #0 !dbg !5 { entry: ret void } -; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetNumTeams = {{.*}} -; CHECK-NOT: remark: test.c:11:0: in function 'none', OmpTargetThreadLimit = {{.*}} -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMin = 4 -; CHECK: remark: test.c:11:0: in function 'none', AmdgpuWavesPerEUMax = 10 +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_num_teams = {{.*}} +; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_thread_limit = {{.*}} +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] 
= 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[1] = 10 define void @none() !dbg !6 { entry: ret void } -; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetNumTeams = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', OmpTargetThreadLimit = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsX = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsY = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuMaxNumWorkgroupsZ = 987654321 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMin = 4 -; CHECK: remark: test.c:12:0: in function 'bogus', AmdgpuWavesPerEUMax = 10 +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_num_teams = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', omp_target_thread_limit = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[0] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[1] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-max-num-workgroups[2] = 987654321 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[0] = 1 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-flat-work-group-size[1] = 1024 +; CHECK: remark: test.c:12:0: in function 'bogus', 
amdgpu-waves-per-eu[0] = 4 +; CHECK: remark: test.c:12:0: in function 'bogus', amdgpu-waves-per-eu[1] = 10 define void @bogus() #1 !dbg !7 { entry: ret void diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll index d9a024f38652e..7a055c7152ec8 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/nvptx.ll @@ -9,12 +9,12 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetNumTeams = 100 -; CHECK: remark: test.c:10:0: in artificial function 'test', OmpTargetThreadLimit = 101 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxclusterrank = 200 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidx = 210 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidy = 211 -; CHECK: remark: test.c:10:0: in artificial function 'test', Maxntidz = 212 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_num_teams = 100 +; CHECK: remark: test.c:10:0: in artificial function 'test', omp_target_thread_limit = 101 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxclusterrank = 200 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidx = 210 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidy = 211 +; CHECK: remark: test.c:10:0: in artificial function 'test', maxntidz = 212 define void @test() #0 !dbg !5 { entry: ret void diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index d21dde10f979a..17ded0b6d3753 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -16,14 +16,14 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; 
CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 256 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuFlatWorkGroupSizeMax = 256 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMin = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AmdgpuWavesPerEUMax = 10 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 256 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 1 +; CHECK-NEXT: remark: test.c:13:0: in artificial function 
'[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 @@ -39,13 +39,13 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMin = 4 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AmdgpuWavesPerEUMax = 10 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 @@ -61,13 +61,13 @@ ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsX = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsY = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuMaxNumWorkgroupsZ = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMin = 1 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuFlatWorkGroupSizeMax = 1024 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMin = 4 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', AmdgpuWavesPerEUMax = 10 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 
2 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasStaticSizeSum = 12 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', AllocasDyn = 0 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 2dbd04b2536c4..68c416acd6388 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -16,8 +16,8 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', OmpTargetThreadLimit = 128 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Maxntidx = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 128 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', maxntidx = 128 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 From ad393d25109d16b9ce8bdb718eb1b7d3b02b1319 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Fri, 11 Oct 2024 18:53:41 -0400 Subject: [PATCH 091/114] Replace -kernel-info-end-lto with -no-kernel-info-end-lto --- llvm/docs/KernelInfo.rst | 22 ++++++++++--------- llvm/include/llvm/Target/TargetMachine.h | 2 +- llvm/lib/Analysis/KernelInfo.cpp | 4 +++- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- llvm/lib/Target/TargetMachine.cpp | 6 ++--- .../Inputs/test.ll | 0 .../KernelInfo/enable-kernel-info/amdgpu.ll | 18 +++++++++++++++ .../KernelInfo/enable-kernel-info/nvptx.ll | 18 +++++++++++++++ .../kernel-info-after-lto/amdgpu.ll | 22 ------------------- .../KernelInfo/kernel-info-after-lto/nvptx.ll | 22 ------------------- 11 files changed, 57 insertions(+), 61 deletions(-) rename llvm/test/Analysis/KernelInfo/{kernel-info-after-lto => enable-kernel-info}/Inputs/test.ll (100%) create mode 100644 llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll create mode 100644 llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll delete mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll delete mode 100644 llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index 397b32602bce2..dac642f1ffc65 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -14,8 +14,7 @@ mitigate them. The pass operates at the LLVM IR level so that it can, in theory, support any LLVM-based compiler for programming languages supporting GPUs. -By default, the pass is disabled. For convenience, the command-line option -``-kernel-info-end-lto`` inserts it at the end of LTO, and options like +By default, the pass runs at the end of LTO, and options like ``-Rpass=kernel-info`` enable its remarks. Example ``opt`` and ``clang`` command lines appear in the next section. @@ -31,7 +30,7 @@ To analyze a C program as it appears to an LLVM GPU backend at the end of LTO: .. 
code-block:: shell $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info -mllvm -kernel-info-end-lto + -Rpass=kernel-info To analyze specified LLVM IR, perhaps previously generated by something like ``clang -save-temps -g -fopenmp --offload-arch=native test.c``: @@ -41,21 +40,24 @@ To analyze specified LLVM IR, perhaps previously generated by something like $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ -pass-remarks=kernel-info -passes=kernel-info -kernel-info can also be inserted into a specified LLVM pass pipeline using -``-kernel-info-end-lto``, or it can be positioned explicitly in that pipeline: +When specifying an LLVM pass pipeline on the command line, ``kernel-info`` still +runs at the end of LTO by default. ``-no-kernel-info-end-lto`` disables that +behavior so you can position ``kernel-info`` explicitly: .. code-block:: shell $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info -mllvm -kernel-info-end-lto \ + -Rpass=kernel-info \ -Xoffload-linker --lto-newpm-passes='lto' $ clang -O2 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info \ - -Xoffload-linker --lto-newpm-passes='lto,module(kernel-info)' + -Rpass=kernel-info -mllvm -no-kernel-info-end-lto \ + -Xoffload-linker --lto-newpm-passes='module(kernel-info),lto' $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ - -pass-remarks=kernel-info -kernel-info-end-lto -passes='lto' + -pass-remarks=kernel-info \ + -passes='lto' $ opt -disable-output test-openmp-nvptx64-nvidia-cuda-sm_70.bc \ - -pass-remarks=kernel-info -passes='lto,module(kernel-info)' + -pass-remarks=kernel-info -no-kernel-info-end-lto \ + -passes='module(kernel-info),lto' diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 5c338a8fcd0cf..f34f4c3528dfe 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -28,7 
+28,7 @@ #include #include -extern llvm::cl::opt KernelInfoEndLTO; +extern llvm::cl::opt NoKernelInfoEndLTO; namespace llvm { diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 3aca4c59105ce..81085c8c6beba 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -321,6 +321,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, PreservedAnalyses KernelInfoPrinter::run(Function &F, FunctionAnalysisManager &AM) { - KernelInfo::emitKernelInfo(F, AM, TM); + // Skip it if remarks are not enabled as it will do nothing useful. + if (F.getContext().getDiagHandlerPtr()->isPassedOptRemarkEnabled(DEBUG_TYPE)) + KernelInfo::emitKernelInfo(F, AM, TM); return PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1264749059359..936de58633b87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -825,7 +825,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPULowerModuleLDSPass(*this)); if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0) PM.addPass(AMDGPUAttributorPass(*this)); - if (KernelInfoEndLTO) { + if (!NoKernelInfoEndLTO) { FunctionPassManager FPM; FPM.addPass(KernelInfoPrinter(this)); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 3955d173b48f2..db1ea2b38bb54 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -240,7 +240,7 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); - if (KernelInfoEndLTO) { + if (!NoKernelInfoEndLTO) { PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, 
OptimizationLevel Level) { FunctionPassManager FPM; diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index b235fd8f6f49a..07e2a44c21cdd 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -26,9 +26,9 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; -cl::opt KernelInfoEndLTO( - "kernel-info-end-lto", - cl::desc("add the kernel-info pass at the end of the full LTO pipeline"), +cl::opt NoKernelInfoEndLTO( + "no-kernel-info-end-lto", + cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), cl::init(false), cl::Hidden); //--------------------------------------------------------------------------- diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll similarity index 100% rename from llvm/test/Analysis/KernelInfo/kernel-info-after-lto/Inputs/test.ll rename to llvm/test/Analysis/KernelInfo/enable-kernel-info/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll new file mode 100644 index 0000000000000..e969eabfe7cd8 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the AMD GPU target backend. + +; REQUIRES: amdgpu-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="amdgcn-amd-amdhsa" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. 
+; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll new file mode 100644 index 0000000000000..65249b4d92e34 --- /dev/null +++ b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll @@ -0,0 +1,18 @@ +; Check when kernel-info is enabled in the NVPTX target backend. + +; REQUIRES: nvptx-registered-target + +; DEFINE: %{opt} = opt -disable-output %S/Inputs/test.ll \ +; DEFINE: -mtriple="nvptx64-nvidia-cuda" 2>&1 +; DEFINE: %{fcheck-on} = FileCheck -match-full-lines %S/Inputs/test.ll +; DEFINE: %{fcheck-off} = FileCheck -allow-empty -check-prefixes=NONE \ +; DEFINE: %S/Inputs/test.ll + +; By default, kernel-info is in the LTO pipeline. To see output, the LTO +; pipeline must run, -no-kernel-info-end-lto must not be specified, and remarks +; must be enabled. +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info | %{fcheck-on} +; RUN: %{opt} -passes='default' -pass-remarks=kernel-info | %{fcheck-off} +; RUN: %{opt} -passes='lto' -pass-remarks=kernel-info \ +; RUN: -no-kernel-info-end-lto | %{fcheck-off} +; RUN: %{opt} -passes='lto' | %{fcheck-off} diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll deleted file mode 100644 index 6d6e83e8d317f..0000000000000 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/amdgpu.ll +++ /dev/null @@ -1,22 +0,0 @@ -; Check that -kernel-info-end-lto enables kernel-info in the AMD GPU target -; backend. - -; REQUIRES: amdgpu-registered-target - -; -kernel-info-end-lto inserts kernel-info into LTO pipeline. 
-; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="amdgcn-amd-amdhsa" \ -; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %S/Inputs/test.ll - -; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="amdgcn-amd-amdhsa" \ -; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll - -; Omitting LTO disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="amdgcn-amd-amdhsa" \ -; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll diff --git a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll b/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll deleted file mode 100644 index 1e427daed671e..0000000000000 --- a/llvm/test/Analysis/KernelInfo/kernel-info-after-lto/nvptx.ll +++ /dev/null @@ -1,22 +0,0 @@ -; Check that -kernel-info-end-lto enables kernel-info in the NVPTX target -; backend. - -; REQUIRES: nvptx-registered-target - -; -kernel-info-end-lto inserts kernel-info into LTO pipeline. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="nvptx64-nvidia-cuda" \ -; RUN: -passes='lto' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -match-full-lines %S/Inputs/test.ll - -; Omitting -kernel-info-end-lto disables kernel-info. -; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="nvptx64-nvidia-cuda" \ -; RUN: -passes='lto' 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll - -; Omitting LTO disables kernel-info. 
-; RUN: opt -pass-remarks=kernel-info -disable-output %S/Inputs/test.ll \ -; RUN: -mtriple="nvptx64-nvidia-cuda" \ -; RUN: -passes='default' -kernel-info-end-lto 2>&1 | \ -; RUN: FileCheck -allow-empty -check-prefixes=NONE %S/Inputs/test.ll From d3beccfe9eb34636cd0015b867d6fcda8fa6ea26 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 11 Oct 2024 19:39:37 -0400 Subject: [PATCH 092/114] Apply clang-format --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 936de58633b87..bdf13d33c11c1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -840,7 +840,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { return onlyAllocateVGPRs; return nullptr; }); - } int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { From 5a4b873d99545a5865a8577ea8f48f0aac4623d5 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Mon, 14 Oct 2024 17:57:50 -0400 Subject: [PATCH 093/114] Avoid auto, as requested --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 7 ++++--- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6d92dccad076f..254fb72c0fb71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1414,14 +1414,15 @@ bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { void GCNTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) const { - auto MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); + SmallVector MaxNumWorkgroups = ST->getMaxNumWorkGroups(F); LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]}); LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]}); LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]}); - auto FlatWorkGroupSize = ST->getFlatWorkGroupSizes(F); + std::pair FlatWorkGroupSize = + ST->getFlatWorkGroupSizes(F); LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first}); LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second}); - auto WavesPerEU = ST->getWavesPerEU(F); + std::pair WavesPerEU = ST->getWavesPerEU(F); LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first}); LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second}); } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index d230a66449063..f0229c202c283 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -447,12 +447,13 @@ void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, void NVPTXTTIImpl::collectKernelLaunchBounds( const Function &F, SmallVectorImpl> &LB) 
const { - if (auto Val = getMaxClusterRank(F)) + std::optional Val; + if ((Val = getMaxClusterRank(F))) LB.push_back({"maxclusterrank", *Val}); - if (auto Val = getMaxNTIDx(F)) + if ((Val = getMaxNTIDx(F))) LB.push_back({"maxntidx", *Val}); - if (auto Val = getMaxNTIDy(F)) + if ((Val = getMaxNTIDy(F))) LB.push_back({"maxntidy", *Val}); - if (auto Val = getMaxNTIDz(F)) + if ((Val = getMaxNTIDz(F))) LB.push_back({"maxntidz", *Val}); } From 571181b4e1a5a5dd2f08841e9c637a933ab4451e Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 14 Oct 2024 17:58:03 -0400 Subject: [PATCH 094/114] For function name, use debug info or keep @ --- llvm/lib/Analysis/KernelInfo.cpp | 42 ++++++----- llvm/test/Analysis/KernelInfo/calls.ll | 75 +++++++++++-------- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 8 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 8 +- 4 files changed, 73 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 81085c8c6beba..8c25b3b901047 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -80,12 +80,27 @@ static bool isKernelFunction(Function &F) { return F.hasFnAttribute("kernel"); } -static void identifyFunction(OptimizationRemark &R, const Function &F) { - if (auto *SubProgram = F.getSubprogram()) { - if (SubProgram->isArtificial()) - R << "artificial "; +static void identifyCallee(OptimizationRemark &R, const Module *M, + const Value *V, StringRef Kind = "") { + SmallString<100> Name; // might be function name or asm expression + if (const Function *F = dyn_cast(V)) { + if (auto *SubProgram = F->getSubprogram()) { + if (SubProgram->isArtificial()) + R << "artificial "; + Name = SubProgram->getName(); + } } - R << "function '" << F.getName() << "'"; + if (Name.empty()) { + raw_svector_ostream OS(Name); + V->printAsOperand(OS, /*PrintType=*/false, M); + } + if (!Kind.empty()) + R << Kind << " "; + R << "'" << Name << "'"; +} + +static void 
identifyFunction(OptimizationRemark &R, const Function &F) { + identifyCallee(R, F.getParent(), &F, "function"); } static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, @@ -132,21 +147,8 @@ static void remarkCall(OptimizationRemarkEmitter &ORE, const Function &Caller, OptimizationRemark R(DEBUG_TYPE, RemarkKind, &Call); R << "in "; identifyFunction(R, Caller); - R << ", " << CallKind << ", callee is"; - Value *Callee = Call.getCalledOperand(); - SmallString<100> Name; // might be function name or asm expression - if (const Function *FnCallee = dyn_cast(Callee)) { - if (auto *SubProgram = FnCallee->getSubprogram()) { - if (SubProgram->isArtificial()) - R << " artificial"; - } - Name = FnCallee->getName(); - } - if (Name.empty()) { - raw_svector_ostream OS(Name); - Callee->printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); - } - R << " '" << Name << "'"; + R << ", " << CallKind << ", callee is "; + identifyCallee(R, Caller.getParent(), Call.getCalledOperand()); return R; }); } diff --git a/llvm/test/Analysis/KernelInfo/calls.ll b/llvm/test/Analysis/KernelInfo/calls.ll index 2a2672c70b85c..6a2a5c426b78b 100644 --- a/llvm/test/Analysis/KernelInfo/calls.ll +++ b/llvm/test/Analysis/KernelInfo/calls.ll @@ -11,30 +11,30 @@ declare void @personality() define void @h() personality ptr @personality !dbg !100 { entry: - ; CHECK: remark: test.c:16:5: in artificial function 'h', direct call, callee is 'f' + ; CHECK: remark: test.c:16:5: in artificial function 'h_dbg', direct call, callee is '@f' call void @f(), !dbg !102 - ; CHECK: remark: test.c:17:5: in artificial function 'h', direct call to defined function, callee is 'g' + ; CHECK: remark: test.c:17:5: in artificial function 'h_dbg', direct call to defined function, callee is 'g_dbg' call void @g(), !dbg !104 - ; CHECK: remark: test.c:18:5: in artificial function 'h', direct call to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:18:5: in artificial function 'h_dbg', 
direct call to defined function, callee is artificial 'h_dbg' call void @h(), !dbg !105 - ; CHECK: remark: test.c:24:5: in artificial function 'h', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:24:5: in artificial function 'h_dbg', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' call void asm sideeffect "eieio", ""(), !dbg !111 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:19:5: in artificial function 'h', indirect call, callee is '%fnPtr' + ; CHECK: remark: test.c:19:5: in artificial function 'h_dbg', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !106 - ; CHECK: remark: test.c:20:5: in artificial function 'h', direct invoke, callee is 'f' + ; CHECK: remark: test.c:20:5: in artificial function 'h_dbg', direct invoke, callee is '@f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !107 fcont: - ; CHECK: remark: test.c:21:5: in artificial function 'h', direct invoke to defined function, callee is 'g' + ; CHECK: remark: test.c:21:5: in artificial function 'h_dbg', direct invoke to defined function, callee is 'g_dbg' invoke void @g() to label %gcont unwind label %cleanup, !dbg !108 gcont: - ; CHECK: remark: test.c:22:5: in artificial function 'h', direct invoke to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:22:5: in artificial function 'h_dbg', direct invoke to defined function, callee is artificial 'h_dbg' invoke void @h() to label %hcont unwind label %cleanup, !dbg !109 hcont: - ; CHECK: remark: test.c:25:5: in artificial function 'h', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:25:5: in artificial function 'h_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !112 asmcont: - ; CHECK: remark: test.c:23:5: in artificial function 'h', indirect invoke, callee is 
'%fnPtr' + ; CHECK: remark: test.c:23:5: in artificial function 'h_dbg', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !110 cleanup: %ll = landingpad { ptr, i32 } @@ -43,40 +43,40 @@ cleanup: end: ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCalls = 8 -; CHECK: remark: test.c:13:0: in artificial function 'h', IndirectCalls = 2 -; CHECK: remark: test.c:13:0: in artificial function 'h', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:13:0: in artificial function 'h', InlineAssemblyCalls = 2 -; CHECK: remark: test.c:13:0: in artificial function 'h', Invokes = 5 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCalls = 8 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:13:0: in artificial function 'h_dbg', Invokes = 5 declare void @f() define void @g() personality ptr @personality !dbg !200 { entry: - ; CHECK: remark: test.c:6:3: in function 'g', direct call, callee is 'f' + ; CHECK: remark: test.c:6:3: in function 'g_dbg', direct call, callee is '@f' call void @f(), !dbg !202 - ; CHECK: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' + ; CHECK: remark: test.c:7:3: in function 'g_dbg', direct call to defined function, callee is 'g_dbg' call void @g(), !dbg !203 - ; CHECK: remark: test.c:8:3: in function 'g', direct call to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:8:3: in function 'g_dbg', direct call to defined function, callee is artificial 'h_dbg' call void @h(), !dbg !204 - ; CHECK: remark: test.c:14:3: in function 'g', direct call to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:14:3: in function 'g_dbg', direct call 
to inline assembly, callee is 'asm sideeffect "eieio", ""' call void asm sideeffect "eieio", ""(), !dbg !210 %fnPtr = load ptr, ptr null, align 8 - ; CHECK: remark: test.c:9:3: in function 'g', indirect call, callee is '%fnPtr' + ; CHECK: remark: test.c:9:3: in function 'g_dbg', indirect call, callee is '%fnPtr' call void %fnPtr(), !dbg !205 - ; CHECK: remark: test.c:10:3: in function 'g', direct invoke, callee is 'f' + ; CHECK: remark: test.c:10:3: in function 'g_dbg', direct invoke, callee is '@f' invoke void @f() to label %fcont unwind label %cleanup, !dbg !206 fcont: - ; CHECK: remark: test.c:11:3: in function 'g', direct invoke to defined function, callee is 'g' + ; CHECK: remark: test.c:11:3: in function 'g_dbg', direct invoke to defined function, callee is 'g_dbg' invoke void @g() to label %gcont unwind label %cleanup, !dbg !207 gcont: - ; CHECK: remark: test.c:12:3: in function 'g', direct invoke to defined function, callee is artificial 'h' + ; CHECK: remark: test.c:12:3: in function 'g_dbg', direct invoke to defined function, callee is artificial 'h_dbg' invoke void @h() to label %hcont unwind label %cleanup, !dbg !208 hcont: - ; CHECK: remark: test.c:15:3: in function 'g', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' + ; CHECK: remark: test.c:15:3: in function 'g_dbg', direct invoke to inline assembly, callee is 'asm sideeffect "eieio", ""' invoke void asm sideeffect "eieio", ""() to label %asmcont unwind label %cleanup, !dbg !211 asmcont: - ; CHECK: remark: test.c:13:3: in function 'g', indirect invoke, callee is '%fnPtr' + ; CHECK: remark: test.c:13:3: in function 'g_dbg', indirect invoke, callee is '%fnPtr' invoke void %fnPtr() to label %end unwind label %cleanup, !dbg !209 cleanup: %ll = landingpad { ptr, i32 } @@ -85,11 +85,22 @@ cleanup: end: ret void } -; CHECK: remark: test.c:3:0: in function 'g', DirectCalls = 8 -; CHECK: remark: test.c:3:0: in function 'g', IndirectCalls = 2 -; CHECK: remark: test.c:3:0: in function 
'g', DirectCallsToDefinedFunctions = 4 -; CHECK: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 2 -; CHECK: remark: test.c:3:0: in function 'g', Invokes = 5 +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCalls = 8 +; CHECK: remark: test.c:3:0: in function 'g_dbg', IndirectCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', DirectCallsToDefinedFunctions = 4 +; CHECK: remark: test.c:3:0: in function 'g_dbg', InlineAssemblyCalls = 2 +; CHECK: remark: test.c:3:0: in function 'g_dbg', Invokes = 5 + +define void @i() { + ; CHECK: remark: :0:0: in function '@i', direct call, callee is '@f' + call void @f() + ret void +} +; CHECK: remark: :0:0: in function '@i', DirectCalls = 1 +; CHECK: remark: :0:0: in function '@i', IndirectCalls = 0 +; CHECK: remark: :0:0: in function '@i', DirectCallsToDefinedFunctions = 0 +; CHECK: remark: :0:0: in function '@i', InlineAssemblyCalls = 0 +; CHECK: remark: :0:0: in function '@i', Invokes = 0 !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -100,7 +111,7 @@ end: !3 = !{null} !4 = !{} -!100 = distinct !DISubprogram(name: "h", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) +!100 = distinct !DISubprogram(name: "h_dbg", scope: !2, file: !2, line: 13, type: !101, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !1, retainedNodes: !4) !101 = distinct !DISubroutineType(types: !3) !102 = !DILocation(line: 16, column: 5, scope: !103) !103 = distinct !DILexicalBlock(scope: !100, file: !2, line: 13, column: 3) @@ -114,7 +125,7 @@ end: !111 = !DILocation(line: 24, column: 5, scope: !103) !112 = !DILocation(line: 25, column: 5, scope: !103) -!200 = distinct !DISubprogram(name: "g", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) +!200 = distinct 
!DISubprogram(name: "g_dbg", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !4) !201 = !DISubroutineType(types: !3) !202 = !DILocation(line: 6, column: 3, scope: !200) !203 = !DILocation(line: 7, column: 3, scope: !200) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 17ded0b6d3753..10bfa164e2386 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -11,10 +11,10 @@ ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_init' -; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' -; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', 
omp_target_thread_limit = 256 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 @@ -58,7 +58,7 @@ ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 0 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 68c416acd6388..0d55cbbe79135 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -11,10 +11,10 @@ ; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space -; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_init' -; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' +; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:17:5: in artificial function 
'[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' -; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '__kmpc_target_deinit' +; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 128 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', maxntidx = 128 @@ -45,7 +45,7 @@ ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is 'f' +; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Allocas = 2 From a5ce5477d12f0ca1c6020cde5c51a96887945b17 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 16 Oct 2024 12:40:30 -0400 Subject: [PATCH 095/114] Use anonymous namespace --- llvm/lib/Analysis/KernelInfo.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 8c25b3b901047..2f52e819036cb 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -28,6 +28,8 @@ using namespace llvm; #define DEBUG_TYPE "kernel-info" +namespace { + /// Data structure holding function info for kernels. 
class KernelInfo { void updateForBB(const BasicBlock &BB, int64_t Direction, @@ -75,6 +77,8 @@ class KernelInfo { int64_t FlatAddrspaceAccesses = 0; }; +} // end anonymous namespace + static bool isKernelFunction(Function &F) { // TODO: Is this general enough? Consider languages beyond OpenMP. return F.hasFnAttribute("kernel"); From 4d60911942d3aa876599e4f9ad2a0e23f5b92bc3 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 16 Oct 2024 12:55:22 -0400 Subject: [PATCH 096/114] Remove currently unused capabilities, as requested They were originally copied from FunctionPropertiesAnalysis.cpp. --- llvm/lib/Analysis/KernelInfo.cpp | 38 ++++++++++++++------------------ 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 2f52e819036cb..3658f54923e3f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -32,8 +32,7 @@ namespace { /// Data structure holding function info for kernels. 
class KernelInfo { - void updateForBB(const BasicBlock &BB, int64_t Direction, - OptimizationRemarkEmitter &ORE); + void updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE); public: static void emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, @@ -180,38 +179,37 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, }); } -void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, +void KernelInfo::updateForBB(const BasicBlock &BB, OptimizationRemarkEmitter &ORE) { - assert(Direction == 1 || Direction == -1); const Function &F = *BB.getParent(); const Module &M = *F.getParent(); const DataLayout &DL = M.getDataLayout(); for (const Instruction &I : BB.instructionsWithoutDebug()) { if (const AllocaInst *Alloca = dyn_cast(&I)) { - Allocas += Direction; + ++Allocas; TypeSize::ScalarTy StaticSize = 0; if (std::optional Size = Alloca->getAllocationSize(DL)) { StaticSize = Size->getFixedValue(); assert(StaticSize <= std::numeric_limits::max()); - AllocasStaticSizeSum += Direction * StaticSize; + AllocasStaticSizeSum += StaticSize; } else { - AllocasDyn += Direction; + ++AllocasDyn; } remarkAlloca(ORE, F, *Alloca, StaticSize); } else if (const CallBase *Call = dyn_cast(&I)) { SmallString<40> CallKind; SmallString<40> RemarkKind; if (Call->isIndirectCall()) { - IndirectCalls += Direction; + ++IndirectCalls; CallKind += "indirect"; RemarkKind += "Indirect"; } else { - DirectCalls += Direction; + ++DirectCalls; CallKind += "direct"; RemarkKind += "Direct"; } if (isa(Call)) { - Invokes += Direction; + ++Invokes; CallKind += " invoke"; RemarkKind += "Invoke"; } else { @@ -221,12 +219,12 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, if (!Call->isIndirectCall()) { if (const Function *Callee = Call->getCalledFunction()) { if (!Callee->isIntrinsic() && !Callee->isDeclaration()) { - DirectCallsToDefinedFunctions += Direction; + ++DirectCallsToDefinedFunctions; CallKind += " to defined function"; RemarkKind 
+= "ToDefinedFunction"; } } else if (Call->isInlineAsm()) { - InlineAssemblyCalls += Direction; + ++InlineAssemblyCalls; CallKind += " to inline assembly"; RemarkKind += "ToInlineAssembly"; } @@ -234,34 +232,34 @@ void KernelInfo::updateForBB(const BasicBlock &BB, int64_t Direction, remarkCall(ORE, F, *Call, CallKind, RemarkKind); if (const AnyMemIntrinsic *MI = dyn_cast(Call)) { if (MI->getDestAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } else if (const AnyMemTransferInst *MT = dyn_cast(MI)) { if (MT->getSourceAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } } } else if (const LoadInst *Load = dyn_cast(&I)) { if (Load->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const StoreInst *Store = dyn_cast(&I)) { if (Store->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { - FlatAddrspaceAccesses += Direction; + ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } } @@ -300,11 +298,9 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, } TheTTI.collectKernelLaunchBounds(F, KI.LaunchBounds); - const DominatorTree &DT = FAM.getResult(F); auto &ORE = FAM.getResult(F); for (const auto &BB : F) - if (DT.isReachableFromEntry(&BB)) - KI.updateForBB(BB, +1, ORE); + KI.updateForBB(BB, ORE); #define REMARK_PROPERTY(PROP_NAME) \ 
remarkProperty(ORE, F, #PROP_NAME, KI.PROP_NAME) From 0c30e7ceeb36294f4523da2590101314ca1c662d Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 16 Oct 2024 13:11:12 -0400 Subject: [PATCH 097/114] Rename test files without LLVM IR to .test --- .../KernelInfo/enable-kernel-info/{amdgpu.ll => amdgpu.test} | 0 .../KernelInfo/enable-kernel-info/{nvptx.ll => nvptx.test} | 0 .../Analysis/KernelInfo/flat-addrspace/{amdgpu.ll => amdgpu.test} | 0 .../Analysis/KernelInfo/flat-addrspace/{nvptx.ll => nvptx.test} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename llvm/test/Analysis/KernelInfo/enable-kernel-info/{amdgpu.ll => amdgpu.test} (100%) rename llvm/test/Analysis/KernelInfo/enable-kernel-info/{nvptx.ll => nvptx.test} (100%) rename llvm/test/Analysis/KernelInfo/flat-addrspace/{amdgpu.ll => amdgpu.test} (100%) rename llvm/test/Analysis/KernelInfo/flat-addrspace/{nvptx.ll => nvptx.test} (100%) diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test similarity index 100% rename from llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.ll rename to llvm/test/Analysis/KernelInfo/enable-kernel-info/amdgpu.test diff --git a/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll b/llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test similarity index 100% rename from llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.ll rename to llvm/test/Analysis/KernelInfo/enable-kernel-info/nvptx.test diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test similarity index 100% rename from llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/amdgpu.test diff --git a/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll b/llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test similarity index 100% rename from 
llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.ll rename to llvm/test/Analysis/KernelInfo/flat-addrspace/nvptx.test From f5a6fbd408b111570e8d9b2e37655704b36a9ca3 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 17 Oct 2024 13:26:06 -0400 Subject: [PATCH 098/114] Regenerate OpenMP tests from current clang See llvm/test/Analysis/KernelInfo/openmp/README.md. --- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 125 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 1141 ++++++++--------- 2 files changed, 627 insertions(+), 639 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 10bfa164e2386..c2caf8267cae7 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -16,13 +16,12 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 256 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 256 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 1 
+; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 @@ -39,12 +38,13 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 1024 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 4 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', 
amdgpu-waves-per-eu[1] = 10 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 @@ -95,82 +95,75 @@ target triple = "amdgcn-amd-amdhsa" @__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 -@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_71f35_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_6f0c0_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 -@__omp_offloading_fd02_71f35_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_71f35_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_dynamic_environment to ptr) } +@__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_6f0c0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment to ptr) } @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 
500 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !16 { -entry: - %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) - %i = alloca i32, align 4, addrspace(5) - %a = alloca [2 x i32], align 4, addrspace(5) - %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr - %i.ascast = addrspacecast ptr addrspace(5) %i to ptr - %a.ascast = addrspacecast ptr addrspace(5) %a to ptr - store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 - %0 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_71f35_h_l12_kernel_environment to ptr), ptr %dyn_ptr), !dbg !26 - %exec_user_code = icmp eq i32 %0, -1, !dbg !26 - br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 - -user_code.entry: ; preds = %entry - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !27, metadata !DIExpression()), !dbg !30 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !31, metadata !DIExpression()), !dbg !35 - call void @f() #5, !dbg !36 - call void @g() #5, !dbg !37 +define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = alloca i32, align 4, addrspace(5) + %4 = alloca [2 x i32], align 4, addrspace(5) + %5 = addrspacecast ptr addrspace(5) %2 to ptr + %6 = addrspacecast ptr addrspace(5) %3 to ptr + %7 = addrspacecast ptr addrspace(5) %4 to ptr + store ptr %0, ptr %5, align 8 + #dbg_declare(ptr addrspace(5) %2, !24, !DIExpression(), !25) + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_6f0c0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 + %9 = icmp eq i32 %8, -1, !dbg !26 + br i1 %9, label %10, label 
%11, !dbg !26 + +10: ; preds = %1 + #dbg_declare(ptr addrspace(5) %3, !27, !DIExpression(), !30) + #dbg_declare(ptr addrspace(5) %4, !31, !DIExpression(), !35) + call void @f() #4, !dbg !36 + call void @g() #4, !dbg !37 call void @__kmpc_target_deinit(), !dbg !38 ret void, !dbg !39 -worker.exit: ; preds = %entry +11: ; preds = %1 ret void, !dbg !26 } +; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_6f0c0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { + %2 = alloca ptr, align 8, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %2 to ptr + store ptr %0, ptr %3, align 8 + #dbg_declare(ptr addrspace(5) %2, !41, !DIExpression(), !42) + %4 = load ptr, ptr %3, align 8, !dbg !43 + call void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr %4) #5, !dbg !43 + ret void, !dbg !43 +} + declare i32 @__kmpc_target_init(ptr, ptr) ; Function Attrs: convergent -declare void @f(...) #1 +declare void @f(...) #2 declare void @__kmpc_target_deinit() -; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_71f35_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { -entry: - %dyn_ptr.addr = alloca ptr, align 8, addrspace(5) - %dyn_ptr.addr.ascast = addrspacecast ptr addrspace(5) %dyn_ptr.addr to ptr - store ptr %dyn_ptr, ptr %dyn_ptr.addr.ascast, align 8 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 - %0 = load ptr, ptr %dyn_ptr.addr.ascast, align 8, !dbg !43 - call void @__omp_offloading_fd02_71f35_h_l12_debug__(ptr %0) #6, !dbg !43 - ret void, !dbg !43 -} - ; Function Attrs: convergent noinline nounwind optnone define hidden void @g() #3 !dbg !44 { -entry: - %i = alloca i32, align 4, addrspace(5) - %a = alloca [2 x i32], align 4, addrspace(5) - %i.ascast = addrspacecast ptr addrspace(5) %i to ptr - %a.ascast = addrspacecast ptr 
addrspace(5) %a to ptr - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %i, metadata !47, metadata !DIExpression()), !dbg !48 - tail call void @llvm.dbg.declare(metadata ptr addrspace(5) %a, metadata !49, metadata !DIExpression()), !dbg !50 - call void @f() #5, !dbg !51 - call void @g() #5, !dbg !52 + %1 = alloca i32, align 4, addrspace(5) + %2 = alloca [2 x i32], align 4, addrspace(5) + %3 = addrspacecast ptr addrspace(5) %1 to ptr + %4 = addrspacecast ptr addrspace(5) %2 to ptr + #dbg_declare(ptr addrspace(5) %1, !47, !DIExpression(), !48) + #dbg_declare(ptr addrspace(5) %2, !49, !DIExpression(), !50) + call void @f() #4, !dbg !51 + call void @g() #4, !dbg !52 ret void, !dbg !53 } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #4 - -attributes #0 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" 
"uniform-work-group-size"="true" } +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="256" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -attributes #5 = { convergent } -attributes #6 = { nounwind } +attributes #4 = { convergent } +attributes #5 = { nounwind } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!2} @@ -179,10 +172,10 @@ attributes #6 = { nounwind } !llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} !opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, 
!15, !15, !15, !15, !15, !15, !15, !15} -!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "eff61a7cf33c8dd1bd6933250fc90157") -!2 = !{i32 0, i32 64770, i32 466741, !"h", i32 12, i32 0, i32 0} -!3 = !{ptr @__omp_offloading_fd02_71f35_h_l12, !"kernel", i32 1} +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "854099697e49b3ca7d3b3c08503e6fef") +!2 = !{i32 0, i32 64770, i32 454848, !"h", i32 12, i32 0, i32 0} +!3 = !{ptr @__omp_offloading_fd02_6f0c0_h_l12, !"kernel", i32 1} !4 = !{i32 1, !"amdhsa_code_object_version", i32 500} !5 = !{i32 7, !"Dwarf Version", i32 5} !6 = !{i32 2, !"Debug Info Version", i32 3} @@ -192,10 +185,10 @@ attributes #6 = { nounwind } !10 = !{i32 8, !"PIC Level", i32 2} !11 = !{i32 7, !"frame-pointer", i32 2} !12 = !{i32 4, !"amdgpu_hostcall", i32 1} -!13 = !{!"clang version 19.0.0git (/tmp/llvm/clang 5a5e94265d423fa9eb39dc1b855511195f8dc0fe)"} +!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} !14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} !15 = !{i32 2, i32 0} -!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: 
!0, retainedNodes: !23) +!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !17 = !DIFile(filename: "test.c", directory: "/tmp") !18 = !DISubroutineType(types: !19) !19 = !{null, !20} @@ -219,7 +212,7 @@ attributes #6 = { nounwind } !37 = !DILocation(line: 17, column: 5, scope: !28) !38 = !DILocation(line: 18, column: 3, scope: !28) !39 = !DILocation(line: 18, column: 3, scope: !16) -!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_71f35_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) !42 = !DILocation(line: 0, scope: !40) !43 = !DILocation(line: 12, column: 1, scope: !40) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 0d55cbbe79135..e717599aab687 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -16,8 +16,6 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in 
artificial function '[[OFF_FUNC]]_debug__', omp_target_thread_limit = 128 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', maxntidx = 128 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Allocas = 3 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasStaticSizeSum = 20 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', AllocasDyn = 0 @@ -33,6 +31,8 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 128 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', maxntidx = 128 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Allocas = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasStaticSizeSum = 8 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', AllocasDyn = 0 @@ -83,10 +83,10 @@ target triple = "nvptx64-nvidia-cuda" @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 -@0 = private unnamed_addr constant [59 x i8] c";test.c;__omp_offloading_10305_5c00dd_h_l12_debug__;13;3;;\00", align 1 -@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 58, ptr @0 }, align 8 -@__omp_offloading_10305_5c00dd_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_10305_5c00dd_h_l12_kernel_environment = weak_odr protected constant 
%struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_10305_5c00dd_h_l12_dynamic_environment } +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_10d1d6_h_l12_debug__;13;3;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 +@__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_10d1d6_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment } @llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" @__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 @__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 @@ -101,371 +101,372 @@ target triple = "nvptx64-nvidia-cuda" @.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 @__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 @IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 -@.str1127 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@.str1124 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 @.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 
@__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 @_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 -@.str544 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 -@.str847 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@.str541 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str844 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 -@.str948 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 -@.str1049 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 -@.str1150 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 -@.str1251 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 -@.str1352 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str945 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str1046 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1147 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1248 = private unnamed_addr constant [33 x i8] c"RunSchedVar == 
Other.RunSchedVar\00", align 1 +@.str1349 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 @.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 -@.str1553 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str1550 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 @.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 +@_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN4ompx5state9TeamStateE = internal local_unnamed_addr addrspace(3) global %"struct.ompx::state::TeamStateTy" undef, align 8 @_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr noalias noundef %dyn_ptr) #0 !dbg !17 { -entry: - %dyn_ptr.addr = alloca ptr, align 8 - %i = alloca i32, align 4 - %a = alloca [2 x i32], align 4 - store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 - tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !24, metadata !DIExpression()), !dbg !25 - %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_10305_5c00dd_h_l12_kernel_environment, ptr %dyn_ptr), !dbg !26 - %exec_user_code = icmp eq i32 %0, -1, !dbg !26 - br i1 %exec_user_code, label %user_code.entry, label %worker.exit, !dbg !26 - -user_code.entry: ; preds = %entry - tail call void @llvm.dbg.declare(metadata ptr %i, metadata !27, metadata 
!DIExpression()), !dbg !30 - tail call void @llvm.dbg.declare(metadata ptr %a, metadata !31, metadata !DIExpression()), !dbg !35 - call void @f() #16, !dbg !36 - call void @g() #16, !dbg !37 - call void @__kmpc_target_deinit(), !dbg !38 - ret void, !dbg !39 - -worker.exit: ; preds = %entry - ret void, !dbg !26 +define internal void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { + %2 = alloca ptr, align 8 + %3 = alloca i32, align 4 + %4 = alloca [2 x i32], align 4 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !26, !DIExpression(), !27) + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_10d1d6_h_l12_kernel_environment, ptr %0), !dbg !28 + %6 = icmp eq i32 %5, -1, !dbg !28 + br i1 %6, label %7, label %8, !dbg !28 + +7: ; preds = %1 + #dbg_declare(ptr %3, !29, !DIExpression(), !32) + #dbg_declare(ptr %4, !33, !DIExpression(), !37) + call void @f() #16, !dbg !38 + call void @g() #16, !dbg !39 + call void @__kmpc_target_deinit(), !dbg !40 + ret void, !dbg !41 + +8: ; preds = %1 + ret void, !dbg !28 } -; Function Attrs: convergent -declare void @f(...) 
#1 - ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected void @__omp_offloading_10305_5c00dd_h_l12(ptr noalias noundef %dyn_ptr) #2 !dbg !40 { -entry: - %dyn_ptr.addr = alloca ptr, align 8 - store ptr %dyn_ptr, ptr %dyn_ptr.addr, align 8 - tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !41, metadata !DIExpression()), !dbg !42 - %0 = load ptr, ptr %dyn_ptr.addr, align 8, !dbg !43 - call void @__omp_offloading_10305_5c00dd_h_l12_debug__(ptr %0) #17, !dbg !43 - ret void, !dbg !43 +define weak_odr protected void @__omp_offloading_fd02_10d1d6_h_l12(ptr noalias noundef %0) #1 !dbg !42 { + %2 = alloca ptr, align 8 + store ptr %0, ptr %2, align 8 + #dbg_declare(ptr %2, !43, !DIExpression(), !44) + %3 = load ptr, ptr %2, align 8, !dbg !45 + call void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr %3) #17, !dbg !45 + ret void, !dbg !45 } +; Function Attrs: convergent +declare void @f(...) #2 + ; Function Attrs: convergent noinline nounwind optnone -define hidden void @g() #3 !dbg !44 { -entry: - %i = alloca i32, align 4 - %a = alloca [2 x i32], align 4 - tail call void @llvm.dbg.declare(metadata ptr %i, metadata !47, metadata !DIExpression()), !dbg !48 - tail call void @llvm.dbg.declare(metadata ptr %a, metadata !49, metadata !DIExpression()), !dbg !50 - call void @f() #16, !dbg !51 - call void @g() #16, !dbg !52 - ret void, !dbg !53 +define hidden void @g() #3 !dbg !46 { + %1 = alloca i32, align 4 + %2 = alloca [2 x i32], align 4 + #dbg_declare(ptr %1, !49, !DIExpression(), !50) + #dbg_declare(ptr %2, !51, !DIExpression(), !52) + call void @f() #16, !dbg !53 + call void @g() #16, !dbg !54 + ret void, !dbg !55 } ; Function Attrs: convergent mustprogress nounwind -define internal noundef i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %KernelEnvironment, ptr nofree noundef nonnull align 8 dereferenceable(16) %KernelLaunchEnvironment) #4 { -entry: - %WorkFn.i = alloca 
ptr, align 8 - %ExecMode = getelementptr inbounds i8, ptr %KernelEnvironment, i64 2 - %0 = load i8, ptr %ExecMode, align 2, !tbaa !54 - %1 = and i8 %0, 2 - %tobool.not = icmp eq i8 %1, 0 - %2 = load i8, ptr %KernelEnvironment, align 8, !tbaa !60 - %tobool3.not = icmp ne i8 %2, 0 - br i1 %tobool.not, label %if.else, label %if.then - -if.then: ; preds = %entry - %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i.i = icmp eq i32 %3, 0 - br i1 %cmp.i.i.i, label %if.then.i, label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge - -if.then.i: ; preds = %if.then - store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %idxprom.i.i = zext nneg i32 %3 to i64 - %arrayidx.i.i = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i - %4 = addrspacecast ptr %arrayidx.i.i to ptr addrspace(3) - store i8 0, ptr addrspace(3) %4, align 1, !tbaa !62 - store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, 
ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 - store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 - br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit - -_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge: ; preds = %if.then - %idxprom.i.i.c = zext i32 %3 to i64 - %arrayidx.i.i.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i.c - %5 = addrspacecast ptr %arrayidx.i.i.c to ptr addrspace(3) - store i8 0, ptr addrspace(3) %5, align 1, !tbaa !62 - br label %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit - -_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit: ; preds = 
%_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit.critedge, %if.then.i +define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree noundef nonnull align 8 dereferenceable(48) %0, ptr nofree noundef nonnull align 8 dereferenceable(16) %1) #4 { + %3 = alloca ptr, align 8 + %4 = getelementptr inbounds nuw i8, ptr %0, i64 2 + %5 = load i8, ptr %4, align 2, !tbaa !56 + %6 = and i8 %5, 2 + %7 = icmp eq i8 %6, 0 + %8 = load i8, ptr %0, align 8, !tbaa !62 + %9 = icmp ne i8 %8, 0 + br i1 %7, label %21, label %10 + +10: ; preds = %2 + %11 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %12 = icmp eq i32 %11, 0 + br i1 %12, label %13, label %14 + +13: ; preds = %10 + store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !64 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 
28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + br label %18 + +14: ; preds = %10 + %15 = zext nneg i32 %11 to i64 + %16 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %15 + %17 = addrspacecast ptr %16 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %17, align 1, !tbaa !64 + br label %18 + +18: ; preds = %14, %13 + br i1 %12, label %19, label %20 + +19: ; preds = %18 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + br label %20 + +20: ; preds = %18, %19 tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 - br label %if.end - -if.else: ; preds = %entry - %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %sub.i.i.i7 = add i32 %6, -1 - %and.i.i.i8 = and i32 %sub.i.i.i7, -32 - %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i.i9 = icmp eq i32 %7, %and.i.i.i8 - br i1 %cmp.i.i.i9, label %if.then.i11, label %if.end.critedge - -if.then.i11: ; preds = %if.else - store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %idxprom.i.i13 = zext i32 %7 to i64 - %arrayidx.i.i14 = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) 
@_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13 - %8 = addrspacecast ptr %arrayidx.i.i14 to ptr addrspace(3) - store i8 0, ptr addrspace(3) %8, align 1, !tbaa !62 - store i32 0, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !63 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !67 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !68 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 12) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !72 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) 
@_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !75 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - store ptr %KernelEnvironment, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 - store ptr %KernelLaunchEnvironment, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !76 - br label %if.end - -if.end.critedge: ; preds = %if.else - %idxprom.i.i13.c = zext i32 %7 to i64 - %arrayidx.i.i14.c = getelementptr inbounds [1024 x i8], ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %idxprom.i.i13.c - %9 = addrspacecast ptr %arrayidx.i.i14.c to ptr addrspace(3) - store i8 0, ptr addrspace(3) %9, align 1, !tbaa !62 - br label %if.end - -if.end: ; preds = %if.end.critedge, %if.then.i11, %_ZN4ompx5state4initEbR19KernelEnvironmentTyR25KernelLaunchEnvironmentTy.exit - br i1 %tobool.not, label %if.end9, label %if.then7 - -if.then7: ; preds = %if.end - %10 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 - %11 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 - %and.i.i.i21 = and i32 %10, 1 - %and.i.i = and i32 %and.i.i.i21, %11 - %tobool.i.i = icmp ne i32 %and.i.i, 0 - %.pre67.i.i.i = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !80 - %cmp.i.i.i22 = icmp ne i32 %.pre67.i.i.i, 0 - %or.cond.not.i.i.i = select i1 %tobool.i.i, i1 %cmp.i.i.i22, i1 false - br i1 %or.cond.not.i.i.i, label %if.then.i.i.i, label %if.else.i.i.i - -if.then.i.i.i: ; preds = %if.then7 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str847, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + br label %37 + +21: ; preds = %2 + %22 = tail call range(i32 
1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %23 = add nsw i32 %22, -1 + %24 = and i32 %23, -32 + %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 + %26 = icmp eq i32 %25, %24 + br i1 %26, label %27, label %31 + +27: ; preds = %21 + store i32 0, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %28 = zext nneg i32 %25 to i64 + %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 + %30 = addrspacecast ptr %29 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %30, align 1, !tbaa !64 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr 
addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + br label %35 + +31: ; preds = %21 + %32 = zext nneg i32 %25 to i64 + %33 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %32 + %34 = addrspacecast ptr %33 to ptr addrspace(3) + store i8 0, ptr addrspace(3) %34, align 1, !tbaa !64 + br label %35 + +35: ; preds = %31, %27 + br i1 %26, label %36, label %37 + +36: ; preds = %35 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + br label %37 + +37: ; preds = %36, %35, %20 + br i1 %7, label %100, label %38 + +38: ; preds = %37 + %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 + %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %41 = and i32 %39, 1 + %42 = and i32 %41, %40 + %43 = icmp ne i32 %42, 0 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !79 + %45 = icmp ne i32 %44, 0 + %46 = select i1 %43, i1 %45, i1 false + br i1 %46, label %47, label %48 + +47: ; preds = %38 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str844, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else.i.i.i: ; preds = %if.then7 - %cmp5.i.i.i = icmp eq i32 %.pre67.i.i.i, 0 - tail call void @llvm.assume(i1 noundef %cmp5.i.i.i) #21 - %12 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr 
addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !81 - br i1 %tobool.i.i, label %land.lhs.true7.i.i.i, label %if.else11.i.i.i +48: ; preds = %38 + %49 = icmp eq i32 %44, 0 + tail call void @llvm.assume(i1 noundef %49) #21 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !80 + br i1 %43, label %51, label %54 -land.lhs.true7.i.i.i: ; preds = %if.else.i.i.i - %cmp9.i.i.i = icmp eq i32 %12, 0 - br i1 %cmp9.i.i.i, label %if.else11.i.i.i, label %if.then10.i.i.i +51: ; preds = %48 + %52 = icmp eq i32 %50, 0 + br i1 %52, label %54, label %53 -if.then10.i.i.i: ; preds = %land.lhs.true7.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str948, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +53: ; preds = %51 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str945, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else11.i.i.i: ; preds = %land.lhs.true7.i.i.i, %if.else.i.i.i - %13 = phi i32 [ 0, %land.lhs.true7.i.i.i ], [ %12, %if.else.i.i.i ] - %cmp14.i.i.i = icmp eq i32 %13, 0 - tail call void @llvm.assume(i1 noundef %cmp14.i.i.i) #21 - %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !82 - br i1 %tobool.i.i, label %land.lhs.true17.i.i.i, label %if.else21.i.i.i +54: ; preds = %51, 
%48 + %55 = phi i32 [ 0, %51 ], [ %50, %48 ] + %56 = icmp eq i32 %55, 0 + tail call void @llvm.assume(i1 noundef %56) #21 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !81 + br i1 %43, label %58, label %61 -land.lhs.true17.i.i.i: ; preds = %if.else11.i.i.i - %cmp19.i.i.i = icmp eq i32 %14, 0 - br i1 %cmp19.i.i.i, label %if.else21.i.i.i, label %if.then20.i.i.i +58: ; preds = %54 + %59 = icmp eq i32 %57, 0 + br i1 %59, label %61, label %60 -if.then20.i.i.i: ; preds = %land.lhs.true17.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1049, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +60: ; preds = %58 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1046, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else21.i.i.i: ; preds = %land.lhs.true17.i.i.i, %if.else11.i.i.i - %15 = phi i32 [ 0, %land.lhs.true17.i.i.i ], [ %14, %if.else11.i.i.i ] - %cmp24.i.i.i = icmp eq i32 %15, 0 - tail call void @llvm.assume(i1 noundef %cmp24.i.i.i) #21 - %16 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !83 - br i1 %tobool.i.i, label %land.lhs.true27.i.i.i, label %if.else31.i.i.i +61: ; preds = %58, %54 + %62 = phi i32 [ 0, %58 ], [ %57, %54 ] + %63 = icmp eq i32 %62, 0 + tail call void @llvm.assume(i1 noundef %63) #21 + %64 = load i32, ptr 
addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !82 + br i1 %43, label %65, label %68 -land.lhs.true27.i.i.i: ; preds = %if.else21.i.i.i - %cmp29.i.i.i = icmp eq i32 %16, 1 - br i1 %cmp29.i.i.i, label %if.else31.i.i.i, label %if.then30.i.i.i +65: ; preds = %61 + %66 = icmp eq i32 %64, 1 + br i1 %66, label %68, label %67 -if.then30.i.i.i: ; preds = %land.lhs.true27.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1150, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +67: ; preds = %65 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1147, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else31.i.i.i: ; preds = %land.lhs.true27.i.i.i, %if.else21.i.i.i - %17 = phi i32 [ 1, %land.lhs.true27.i.i.i ], [ %16, %if.else21.i.i.i ] - %cmp34.i.i.i = icmp eq i32 %17, 1 - tail call void @llvm.assume(i1 noundef %cmp34.i.i.i) #21 - %18 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !84 - br i1 %tobool.i.i, label %land.lhs.true37.i.i.i, label %if.else.critedge.i.critedge.critedge.critedge +68: ; preds = %65, %61 + %69 = phi i32 [ 1, %65 ], [ %64, %61 ] + %70 = icmp eq i32 %69, 1 + tail call void @llvm.assume(i1 noundef %70) #21 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) 
@_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !83 + br i1 %43, label %72, label %93 -land.lhs.true37.i.i.i: ; preds = %if.else31.i.i.i - %cmp39.i.i.i = icmp eq i32 %18, 1 - br i1 %cmp39.i.i.i, label %if.else41.i.i.i, label %if.then40.i.i.i +72: ; preds = %68 + %73 = icmp eq i32 %71, 1 + br i1 %73, label %75, label %74 -if.then40.i.i.i: ; preds = %land.lhs.true37.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1251, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +74: ; preds = %72 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1248, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else41.i.i.i: ; preds = %land.lhs.true37.i.i.i - %cmp44.i.i.i = icmp eq i32 1, 1 - tail call void @llvm.assume(i1 noundef %cmp44.i.i.i) #21 - br i1 %tobool.i.i, label %land.lhs.true47.i.i.i, label %if.else.critedge.i.critedge +75: ; preds = %72 + %76 = icmp eq i32 1, 1 + tail call void @llvm.assume(i1 noundef %76) #21 + br i1 %43, label %77, label %95 -land.lhs.true47.i.i.i: ; preds = %if.else41.i.i.i - %19 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !85 - %cmp49.i.i.i = icmp eq i32 %19, 1 - br i1 %cmp49.i.i.i, label %if.else51.i.i.i, label %if.then50.i.i.i +77: ; preds = %75 + %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), 
align 8, !tbaa !84 + %79 = icmp eq i32 %78, 1 + br i1 %79, label %81, label %80 -if.then50.i.i.i: ; preds = %land.lhs.true47.i.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1352, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 +80: ; preds = %77 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1349, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable -if.else51.i.i.i: ; preds = %land.lhs.true47.i.i.i - br i1 %tobool.i.i, label %land.lhs.true.i.i, label %if.else.critedge.i.critedge +81: ; preds = %77 + %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + %83 = icmp eq i32 %82, 1 + br i1 %83, label %85, label %84 -land.lhs.true.i.i: ; preds = %if.else51.i.i.i - %20 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !73 - %cmp.i.i = icmp eq i32 %20, 1 - br i1 %cmp.i.i, label %land.lhs.true8.i.i, label %if.then.i.i - -if.then.i.i: ; preds = %land.lhs.true.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 +84: ; preds = %81 + tail call fastcc void @__assert_fail_internal(ptr 
nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable -land.lhs.true8.i.i: ; preds = %land.lhs.true.i.i - %21 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !74 - %cmp10.i.i = icmp eq i32 %21, 0 - br i1 %cmp10.i.i, label %land.lhs.true.i24, label %if.then11.i.i +85: ; preds = %81 + %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + %87 = icmp eq i32 %86, 0 + br i1 %87, label %89, label %88 -if.then11.i.i: ; preds = %land.lhs.true8.i.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1553, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 +88: ; preds = %85 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1550, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable -land.lhs.true.i24: ; preds = %land.lhs.true8.i.i - %22 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %tobool.i25.i.not = icmp eq i32 %22, 0 - br i1 %tobool.i25.i.not, label %if.then.i25, label %_ZN4ompx5state18assumeInitialStateEb.exit +89: ; preds = %85 + %90 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %91 = icmp eq i32 %90, 0 + 
br i1 %91, label %92, label %98 -if.then.i25: ; preds = %land.lhs.true.i24 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(69) @.str544, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 +92: ; preds = %89 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 unreachable -if.else.critedge.i.critedge.critedge.critedge: ; preds = %if.else31.i.i.i - %cmp44.i.i.i.c = icmp eq i32 %18, 1 - tail call void @llvm.assume(i1 noundef %cmp44.i.i.i.c) #21 - br label %if.else.critedge.i.critedge +93: ; preds = %68 + %94 = icmp eq i32 %71, 1 + tail call void @llvm.assume(i1 noundef %94) #21 + br label %95 -if.else.critedge.i.critedge: ; preds = %if.else41.i.i.i, %if.else.critedge.i.critedge.critedge.critedge, %if.else51.i.i.i - %.pre.i = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %23 = icmp ne i32 %.pre.i, 0 - br label %_ZN4ompx5state18assumeInitialStateEb.exit +95: ; preds = %75, %93 + %96 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %97 = icmp ne i32 %96, 0 + br label %98 -_ZN4ompx5state18assumeInitialStateEb.exit: ; preds = %land.lhs.true.i24, %if.else.critedge.i.critedge - %cmp8.i = phi i1 [ %23, %if.else.critedge.i.critedge ], [ true, %land.lhs.true.i24 ] - tail call void @llvm.assume(i1 noundef %cmp8.i) #21 +98: ; preds = %89, %95 + %99 = phi i1 [ %97, %95 ], [ true, %89 ] + tail call void @llvm.assume(i1 noundef %99) #21 tail call void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 poison) #19 - br label %cleanup - -if.end9: ; preds = %if.end - %24 = tail call i32 
@llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %sub.i.i = add i32 %24, -1 - %and.i.i26 = and i32 %sub.i.i, -32 - %25 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i27 = icmp eq i32 %25, %and.i.i26 - br i1 %cmp.i.i27, label %cleanup, label %if.end12 - -if.end12: ; preds = %if.end9 - %sub.i = add i32 %24, -32 - %cmp = icmp ult i32 %25, %sub.i - %or.cond33 = and i1 %tobool3.not, %cmp - br i1 %or.cond33, label %do.body.i.preheader, label %cleanup - -do.body.i.preheader: ; preds = %if.end12 - %26 = load i32, ptr @__omp_rtl_debug_kind, align 4 - %27 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8 - %and.i.i29 = and i32 %26, 1 - %and.i = and i32 %and.i.i29, %27 - %tobool.i = icmp ne i32 %and.i, 0 - br label %do.body.i - -do.body.i: ; preds = %do.body.i.preheader, %if.end9.i - call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn.i) #22 - store ptr null, ptr %WorkFn.i, align 8, !tbaa !76 + br label %130 + +100: ; preds = %37 + %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %102 = add nsw i32 %101, -1 + %103 = and i32 %102, -32 + %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %105 = icmp eq i32 %104, %103 + br i1 %105, label %130, label %106 + +106: ; preds = %100 + %107 = add nsw i32 %101, -32 + %108 = icmp ult i32 %104, %107 + %109 = select i1 %9, i1 %108, i1 false + br i1 %109, label %110, label %130 + +110: ; preds = %106 + %111 = load i32, ptr @__omp_rtl_debug_kind, align 4 + %112 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8 + %113 = and i32 %111, 1 + %114 = and i32 %113, %112 + %115 = icmp ne i32 %114, 0 + br label %116 + +116: ; preds = %110, %128 + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #22 + store ptr null, ptr %3, align 8, !tbaa !74 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 - %call1.i = call zeroext 
i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn.i) #22 - %28 = load ptr, ptr %WorkFn.i, align 8, !tbaa !76 - %tobool.not.not.i = icmp eq ptr %28, null - br i1 %tobool.not.not.i, label %_ZL19genericStateMachineP7IdentTy.exit, label %if.end.i - -if.end.i: ; preds = %do.body.i - br i1 %call1.i, label %if.then3.i, label %if.end9.i - -if.then3.i: ; preds = %if.end.i - %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - %tobool.i30 = icmp ne i32 %29, 0 - %or.cond = select i1 %tobool.i, i1 %tobool.i30, i1 false - br i1 %or.cond, label %if.then6.i, label %if.else.i - -if.then6.i: ; preds = %if.then3.i - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 58, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 + %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #22 + %118 = load ptr, ptr %3, align 8, !tbaa !74 + %119 = icmp eq ptr %118, null + br i1 %119, label %129, label %120 + +120: ; preds = %116 + br i1 %117, label %121, label %128 + +121: ; preds = %120 + %122 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %123 = icmp ne i32 %122, 0 + %124 = select i1 %115, i1 %123, i1 false + br i1 %124, label %125, label %126 + +125: ; preds = %121 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(67) @.str15, i32 noundef 60, ptr nofree noundef nonnull dereferenceable(36) @__PRETTY_FUNCTION__._ZL19genericStateMachineP7IdentTy) #20 unreachable -if.else.i: ; preds = %if.then3.i - %tobool.i31.not = icmp eq i32 %29, 0 - tail call void @llvm.assume(i1 noundef %tobool.i31.not) #21 - tail call void %28(i32 noundef 0, i32 noundef 
%25) #23 +126: ; preds = %121 + %127 = icmp eq i32 %122, 0 + tail call void @llvm.assume(i1 noundef %127) #21 + tail call void %118(i32 noundef 0, i32 noundef %104) #23 tail call void @__kmpc_kernel_end_parallel() #24 - br label %if.end9.i + br label %128 -if.end9.i: ; preds = %if.else.i, %if.end.i +128: ; preds = %126, %120 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 - br label %do.body.i, !llvm.loop !86 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 + br label %116, !llvm.loop !86 -_ZL19genericStateMachineP7IdentTy.exit: ; preds = %do.body.i - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn.i) #22 - br label %cleanup +129: ; preds = %116 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 + br label %130 -cleanup: ; preds = %if.end12, %_ZL19genericStateMachineP7IdentTy.exit, %if.end9, %_ZN4ompx5state18assumeInitialStateEb.exit - %retval.0 = phi i32 [ -1, %_ZN4ompx5state18assumeInitialStateEb.exit ], [ -1, %if.end9 ], [ %25, %_ZL19genericStateMachineP7IdentTy.exit ], [ %25, %if.end12 ] - ret i32 %retval.0 +130: ; preds = %106, %129, %100, %98 + %131 = phi i32 [ -1, %98 ], [ -1, %100 ], [ %104, %129 ], [ %104, %106 ] + ret i32 %131 } ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #5 +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #6 + ; Function Attrs: convergent mustprogress noinline norecurse nounwind -define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %Ordering) local_unnamed_addr #6 { -entry: +define internal void @_ZN4ompx11synchronize14threadsAlignedENS_6atomic10OrderingTyE(i32 %0) local_unnamed_addr #7 { tail call void @llvm.nvvm.barrier0() #25 
ret void } @@ -473,338 +474,332 @@ entry: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #5 -; Function Attrs: convergent mustprogress noreturn nounwind -define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %expr, ptr noundef %msg, ptr nofree noundef nonnull dereferenceable(69) %file, i32 noundef %line, ptr nofree noundef nonnull dereferenceable(20) %function) unnamed_addr #7 { -entry: - %tmp = alloca %printf_args, align 8 - %tmp1 = alloca %printf_args.7, align 8 - %tobool.not = icmp eq ptr %msg, null - br i1 %tobool.not, label %if.else, label %if.then - -if.then: ; preds = %entry - store ptr %file, ptr %tmp, align 8 - %0 = getelementptr inbounds i8, ptr %tmp, i64 8 - store i32 %line, ptr %0, align 8 - %1 = getelementptr inbounds i8, ptr %tmp, i64 16 - store ptr %function, ptr %1, align 8 - br label %if.end - -if.else: ; preds = %entry - store ptr %file, ptr %tmp1, align 8 - %2 = getelementptr inbounds i8, ptr %tmp1, i64 8 - store i32 %line, ptr %2, align 8 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %.sink12 = phi i64 [ 16, %if.else ], [ 24, %if.then ] - %tmp1.sink11 = phi ptr [ %tmp1, %if.else ], [ %tmp, %if.then ] - %function.sink = phi ptr [ %function, %if.else ], [ %msg, %if.then ] - %.sink9 = phi i64 [ 24, %if.else ], [ 32, %if.then ] - %.str1.sink = phi ptr [ @.str1, %if.else ], [ @.str, %if.then ] - %3 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink12 - store ptr %function.sink, ptr %3, align 8 - %4 = getelementptr inbounds i8, ptr %tmp1.sink11, i64 %.sink9 - store ptr %expr, ptr %4, align 8 - %call.i.i = call noundef i32 @vprintf(ptr noundef nonnull %.str1.sink, ptr noundef nonnull %tmp1.sink11) #24 +; Function Attrs: cold convergent mustprogress noreturn nounwind +define internal fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(8) %0, ptr noundef %1, ptr nofree 
noundef nonnull dereferenceable(66) %2, i32 noundef range(i32 60, 905) %3, ptr nofree noundef nonnull dereferenceable(20) %4) unnamed_addr #8 { + %6 = alloca %printf_args, align 8 + %7 = alloca %printf_args.7, align 8 + %8 = icmp eq ptr %1, null + br i1 %8, label %12, label %9 + +9: ; preds = %5 + store ptr %2, ptr %6, align 8 + %10 = getelementptr inbounds nuw i8, ptr %6, i64 8 + store i32 %3, ptr %10, align 8 + %11 = getelementptr inbounds nuw i8, ptr %6, i64 16 + store ptr %4, ptr %11, align 8 + br label %14 + +12: ; preds = %5 + store ptr %2, ptr %7, align 8 + %13 = getelementptr inbounds nuw i8, ptr %7, i64 8 + store i32 %3, ptr %13, align 8 + br label %14 + +14: ; preds = %12, %9 + %15 = phi i64 [ 16, %12 ], [ 24, %9 ] + %16 = phi ptr [ %7, %12 ], [ %6, %9 ] + %17 = phi ptr [ %4, %12 ], [ %1, %9 ] + %18 = phi i64 [ 24, %12 ], [ 32, %9 ] + %19 = phi ptr [ @.str1, %12 ], [ @.str, %9 ] + %20 = getelementptr inbounds nuw i8, ptr %16, i64 %15 + store ptr %17, ptr %20, align 8 + %21 = getelementptr inbounds nuw i8, ptr %16, i64 %18 + store ptr %0, ptr %21, align 8 + %22 = call i32 @vprintf(ptr noundef nonnull %19, ptr noundef nonnull %16) #22 call void @llvm.trap() #26 unreachable } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) -declare void @llvm.assume(i1 noundef) #8 +declare void @llvm.assume(i1 noundef) #9 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #9 +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #10 ; Function Attrs: convergent nocallback nounwind -declare void @llvm.nvvm.barrier.sync(i32) #10 +declare void @llvm.nvvm.barrier.sync(i32) #11 ; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) -define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull 
writeonly align 8 dereferenceable(8) %WorkFn) local_unnamed_addr #11 { -entry: - %0 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 - store ptr %0, ptr %WorkFn, align 8, !tbaa !76 - %tobool.not = icmp eq ptr %0, null - br i1 %tobool.not, label %return, label %if.end - -if.end: ; preds = %entry - %1 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 - %2 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !61 - %tobool.not.i = icmp eq i32 %2, 0 - %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %4 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - %tobool.i.not.i.i = icmp eq i32 %4, 0 - %mul.neg.i.i.i = select i1 %tobool.i.not.i.i, i32 -32, i32 0 - %sub.i.i.i = add i32 %mul.neg.i.i.i, %3 - %cond.i = select i1 %tobool.not.i, i32 %sub.i.i.i, i32 %2 - %cmp = icmp ult i32 %1, %cond.i - br label %return - -return: ; preds = %if.end, %entry - %retval.0 = phi i1 [ %cmp, %if.end ], [ false, %entry ] - ret i1 %retval.0 +define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %0) local_unnamed_addr #12 { + %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr %2, ptr %0, align 8, !tbaa !74 + %3 = icmp eq ptr %2, null + br i1 %3, label %15, label %4 + +4: ; preds = %1 + %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 + %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr 
addrspace(3)), align 4, !tbaa !63 + %7 = icmp eq i32 %6, 0 + %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %10 = icmp eq i32 %9, 0 + %11 = select i1 %10, i32 -32, i32 0 + %12 = add nsw i32 %11, %8 + %13 = select i1 %7, i32 %12, i32 %6 + %14 = icmp ult i32 %5, %13 + br label %15 + +15: ; preds = %4, %1 + %16 = phi i1 [ %14, %4 ], [ false, %1 ] + ret i1 %16 } ; Function Attrs: convergent mustprogress noinline nounwind -define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #12 { -entry: - %0 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 - %1 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 - %and.i.i = and i32 %0, 1 - %and.i = and i32 %and.i.i, %1 - %tobool.i = icmp ne i32 %and.i, 0 - %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - %tobool.i1 = icmp ne i32 %2, 0 - %or.cond = select i1 %tobool.i, i1 %tobool.i1, i1 false - br i1 %or.cond, label %if.then, label %if.else - -if.then: ; preds = %entry - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 297, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 +define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { + %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 + %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %3 = and i32 %1, 1 + %4 = and i32 %3, %2 + %5 = icmp ne i32 %4, 0 + %6 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + %7 = icmp ne i32 %6, 0 + %8 = select i1 %5, i1 %7, i1 false + br i1 %8, label %9, label %10 + +9: ; preds = %0 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) 
@.str1124, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable -if.else: ; preds = %entry - %tobool.i2.not = icmp eq i32 %2, 0 - tail call void @llvm.assume(i1 noundef %tobool.i2.not) #21 - %3 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !61 - %tobool.not.i.i = icmp eq i32 %3, 0 - %4 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8 - %tobool.not.i = icmp ne i32 %4, 0 - %or.cond.not.i = select i1 %tobool.not.i.i, i1 %tobool.not.i, i1 false - br i1 %or.cond.not.i, label %lor.rhs.i, label %_ZN4ompx5state19resetStateForThreadEj.exit - -lor.rhs.i: ; preds = %if.else - %5 = tail call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27 - %6 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - %idxprom.i = zext i32 %5 to i64 - %arrayidx.i = getelementptr inbounds ptr, ptr %6, i64 %idxprom.i - %7 = load ptr, ptr %arrayidx.i, align 8, !tbaa !76 - %tobool1.not.i = icmp eq ptr %7, null - br i1 %tobool1.not.i, label %_ZN4ompx5state19resetStateForThreadEj.exit, label %if.end4.i, !prof !88 - -if.end4.i: ; preds = %lor.rhs.i - %PreviousThreadState7.i = getelementptr inbounds i8, ptr %7, i64 32 - %8 = load ptr, ptr %PreviousThreadState7.i, align 8, !tbaa !89 - tail call void @free(ptr noundef nonnull dereferenceable(40) %7) #28 - %9 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !76 - %arrayidx11.i = getelementptr inbounds ptr, ptr %9, i64 %idxprom.i - store ptr %8, ptr %arrayidx11.i, align 8, !tbaa !76 - %.pre = load i32, ptr addrspace(3) @IsSPMDMode, align 4 - br label %_ZN4ompx5state19resetStateForThreadEj.exit - -_ZN4ompx5state19resetStateForThreadEj.exit: ; preds = %if.else, %lor.rhs.i, %if.end4.i - %10 = phi i32 [ 0, %if.else ], [ 0, %lor.rhs.i ], [ %.pre, %if.end4.i ] - %tobool.i6 = icmp ne 
i32 %10, 0 - %or.cond8 = select i1 %tobool.i, i1 %tobool.i6, i1 false - br i1 %or.cond8, label %if.then7, label %if.else8 - -if.then7: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(75) @.str1127, i32 noundef 300, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 +10: ; preds = %0 + %11 = icmp eq i32 %6, 0 + tail call void @llvm.assume(i1 noundef %11) #21 + %12 = load i32, ptr @__omp_rtl_assume_no_thread_state, align 4, !tbaa !63 + %13 = icmp eq i32 %12, 0 + %14 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8 + %15 = icmp ne i32 %14, 0 + %16 = select i1 %13, i1 %15, i1 false + br i1 %16, label %17, label %30 + +17: ; preds = %10 + %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 + %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %20 = zext nneg i32 %18 to i64 + %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20 + %22 = load ptr, ptr %21, align 8, !tbaa !74 + %23 = icmp eq ptr %22, null + br i1 %23, label %30, label %24, !prof !88 + +24: ; preds = %17 + %25 = getelementptr inbounds nuw i8, ptr %22, i64 32 + %26 = load ptr, ptr %25, align 8, !tbaa !89 + tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28 + %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20 + store ptr %26, ptr %28, align 8, !tbaa !74 + %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 + br label %30 + +30: ; preds = %10, %17, %24 + %31 = phi i32 [ 0, %10 ], [ 0, %17 ], [ %29, %24 ] + %32 = icmp ne i32 %31, 0 + %33 = select i1 %5, i1 %32, i1 false 
+ br i1 %33, label %34, label %35 + +34: ; preds = %30 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1124, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable -if.else8: ; preds = %_ZN4ompx5state19resetStateForThreadEj.exit - %tobool.i7.not = icmp eq i32 %10, 0 - tail call void @llvm.assume(i1 noundef %tobool.i7.not) #21 +35: ; preds = %30 + %36 = icmp eq i32 %31, 0 + tail call void @llvm.assume(i1 noundef %36) #21 ret void } ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #9 +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #10 ; Function Attrs: convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) -declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #13 +declare extern_weak void @free(ptr allocptr nocapture noundef) local_unnamed_addr #14 ; Function Attrs: convergent -declare i32 @vprintf(ptr noundef, ptr noundef) local_unnamed_addr #14 +declare i32 @vprintf(ptr, ptr) local_unnamed_addr #2 ; Function Attrs: cold noreturn nounwind memory(inaccessiblemem: write) declare void @llvm.trap() #15 ; Function Attrs: convergent nocallback nounwind -declare void @llvm.nvvm.barrier0() #10 +declare void @llvm.nvvm.barrier0() #11 ; Function Attrs: convergent mustprogress nounwind define internal void @__kmpc_target_deinit() #4 { -entry: - %WorkFn = alloca ptr, align 8 - %0 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !61 - %tobool.i.not = icmp eq i32 %0, 0 - br i1 %tobool.i.not, label %if.end, label %cleanup - -if.end: ; preds = %entry - %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18 - %sub.i.i = add i32 %1, -1 - %and.i.i = and i32 %sub.i.i, -32 - 
%2 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 - %cmp.i.i = icmp eq i32 %2, %and.i.i - br i1 %cmp.i.i, label %if.then3, label %if.else - -if.then3: ; preds = %if.end - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !76 - br label %cleanup - -if.else: ; preds = %if.end - %3 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !76 - %4 = load i8, ptr %3, align 8, !tbaa !91 - %tobool6.not = icmp eq i8 %4, 0 - br i1 %tobool6.not, label %if.then7, label %cleanup - -if.then7: ; preds = %if.else - call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %WorkFn) #29 - store ptr null, ptr %WorkFn, align 8, !tbaa !76 - %call8 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %WorkFn) #22 - %5 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !61 - %6 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !77 - %and.i.i1 = and i32 %5, 1 - %and.i = and i32 %and.i.i1, %6 - %tobool.i2.not = icmp eq i32 %and.i, 0 - %7 = load ptr, ptr %WorkFn, align 8 - %cmp = icmp eq ptr %7, null - %or.cond = select i1 %tobool.i2.not, i1 true, i1 %cmp - br i1 %or.cond, label %if.else11, label %if.then10 - -if.then10: ; preds = %if.then7 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(70) @.str15, i32 noundef 150, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 + %1 = alloca ptr, align 8 + %2 = load i32, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %4, label %27 + +4: ; preds = %0 + %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %6 = add 
nsw i32 %5, -1 + %7 = and i32 %6, -32 + %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %9 = icmp eq i32 %8, %7 + br i1 %9, label %10, label %11 + +10: ; preds = %4 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + br label %27 + +11: ; preds = %4 + %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 + %13 = load i8, ptr %12, align 8, !tbaa !91 + %14 = icmp eq i8 %13, 0 + br i1 %14, label %15, label %27 + +15: ; preds = %11 + call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29 + store ptr null, ptr %1, align 8, !tbaa !74 + %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #22 + %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 + %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %19 = and i32 %17, 1 + %20 = and i32 %19, %18 + %21 = icmp eq i32 %20, 0 + %22 = load ptr, ptr %1, align 8 + %23 = icmp eq ptr %22, null + %24 = select i1 %21, i1 true, i1 %23 + br i1 %24, label %26, label %25 + +25: ; preds = %15 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(18) @.str2, ptr noundef null, ptr nofree noundef nonnull dereferenceable(67) @.str15, i32 noundef 152, ptr nofree noundef nonnull dereferenceable(28) @__PRETTY_FUNCTION__.__kmpc_target_deinit) #20 unreachable -if.else11: ; preds = %if.then7 - tail call void @llvm.assume(i1 noundef %cmp) #21 - call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %WorkFn) #22 - br label %cleanup +26: ; preds = %15 + tail call void @llvm.assume(i1 noundef %23) #21 + call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %1) #22 + br label %27 -cleanup: ; preds = 
%if.else11, %if.else, %if.then3, %entry +27: ; preds = %26, %11, %10, %0 ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #5 - -attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #2 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #3 = { convergent noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx78,+sm_61" } -attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #0 = { convergent noinline norecurse nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "frame-pointer"="all" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="128" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #3 = { convergent noinline nounwind optnone 
"frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx83,+sm_70" } +attributes #4 = { convergent mustprogress nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } attributes #5 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -attributes #6 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #7 = { convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #8 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } -attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -attributes #10 = { convergent nocallback nounwind } -attributes #11 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #12 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #13 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } -attributes #14 = { convergent "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx63,+ptx78,+sm_61" } +attributes #6 = { nocallback nofree nounwind willreturn memory(argmem: write) } +attributes #7 = { convergent mustprogress noinline norecurse nounwind "frame-pointer"="all" "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #8 = { cold convergent mustprogress noreturn nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } +attributes #10 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #11 = { convergent nocallback nounwind } +attributes #12 = { convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #13 = { convergent mustprogress noinline nounwind "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } +attributes #14 = { convergent mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc" "frame-pointer"="all" "llvm.assume"="ompx_no_call_asm" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx63,+ptx83,+sm_70" } attributes #15 = { cold noreturn nounwind memory(inaccessiblemem: write) } attributes #16 = { convergent } attributes #17 = { nounwind } attributes #18 = { "llvm.assume"="ompx_no_call_asm" } -attributes #19 = { convergent nounwind "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #19 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" } attributes #20 = { noreturn nounwind "llvm.assume"="ompx_no_call_asm" } attributes #21 = { memory(write) "llvm.assume"="ompx_no_call_asm" } attributes #22 = { nounwind "llvm.assume"="ompx_no_call_asm" } attributes #23 = { convergent nounwind } attributes #24 = { convergent nounwind "llvm.assume"="ompx_no_call_asm" } -attributes #25 = { "llvm.assume"="ompx_no_call_asm,ompx_aligned_barrier" } +attributes #25 = { "llvm.assume"="ompx_aligned_barrier,ompx_no_call_asm" } attributes #26 = { noreturn "llvm.assume"="ompx_no_call_asm" } attributes #27 = { nofree willreturn "llvm.assume"="ompx_no_call_asm" } attributes #28 = { convergent nounwind willreturn "llvm.assume"="ompx_no_call_asm" } attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } -!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9} -!llvm.dbg.cu = !{!10} -!nvvm.annotations = !{!12, !13} -!omp_offload.info = !{!14} -!llvm.ident = !{!15, !16, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10} +!llvm.dbg.cu = !{!11} +!nvvm.annotations = !{!13, !14} +!omp_offload.info = !{!15} +!llvm.ident = !{!16, !17, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16, !16} +!nvvmir.version = !{!18} -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]} !1 = !{i32 7, !"Dwarf Version", i32 2} !2 = !{i32 2, !"Debug Info Version", i32 3} !3 = !{i32 1, 
!"wchar_size", i32 4} -!4 = !{i32 7, !"openmp", i32 51} -!5 = !{i32 7, !"openmp-device", i32 51} -!6 = !{i32 8, !"PIC Level", i32 2} -!7 = !{i32 7, !"frame-pointer", i32 2} -!8 = !{i32 1, !"ThinLTO", i32 0} -!9 = !{i32 1, !"EnableSplitLTOUnit", i32 1} -!10 = distinct !DICompileUnit(language: DW_LANG_C11, file: !11, producer: "clang version 19.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!11 = !DIFile(filename: "test.c", directory: "/tmp") -!12 = !{ptr @__omp_offloading_10305_5c00dd_h_l12_debug__, !"maxntidx", i32 128} -!13 = !{ptr @__omp_offloading_10305_5c00dd_h_l12, !"kernel", i32 1} -!14 = !{i32 0, i32 66309, i32 6029533, !"h", i32 12, i32 0, i32 0} -!15 = !{!"clang version 19.0.0git"} -!16 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} -!17 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12_debug__", scope: !11, file: !11, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) -!18 = !DISubroutineType(types: !19) -!19 = !{null, !20} -!20 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !21) -!21 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !22) -!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!23 = !{} -!24 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !17, type: !20, flags: DIFlagArtificial) -!25 = !DILocation(line: 0, scope: !17) -!26 = !DILocation(line: 13, column: 3, scope: !17) -!27 = !DILocalVariable(name: "i", scope: !28, file: !11, line: 14, type: !29) -!28 = distinct !DILexicalBlock(scope: !17, file: !11, line: 13, column: 3) -!29 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!30 = !DILocation(line: 14, column: 9, scope: !28) -!31 = !DILocalVariable(name: "a", scope: !28, file: !11, line: 15, type: !32) -!32 = !DICompositeType(tag: DW_TAG_array_type, baseType: !29, size: 64, elements: !33) 
-!33 = !{!34} -!34 = !DISubrange(count: 2) -!35 = !DILocation(line: 15, column: 9, scope: !28) -!36 = !DILocation(line: 16, column: 5, scope: !28) -!37 = !DILocation(line: 17, column: 5, scope: !28) -!38 = !DILocation(line: 18, column: 3, scope: !28) -!39 = !DILocation(line: 18, column: 3, scope: !17) -!40 = distinct !DISubprogram(name: "__omp_offloading_10305_5c00dd_h_l12", scope: !11, file: !11, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !10, retainedNodes: !23) -!41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) -!42 = !DILocation(line: 0, scope: !40) -!43 = !DILocation(line: 12, column: 1, scope: !40) -!44 = distinct !DISubprogram(name: "g", scope: !11, file: !11, line: 3, type: !45, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !10, retainedNodes: !23) -!45 = !DISubroutineType(types: !46) -!46 = !{null} -!47 = !DILocalVariable(name: "i", scope: !44, file: !11, line: 4, type: !29) -!48 = !DILocation(line: 4, column: 7, scope: !44) -!49 = !DILocalVariable(name: "a", scope: !44, file: !11, line: 5, type: !32) -!50 = !DILocation(line: 5, column: 7, scope: !44) -!51 = !DILocation(line: 6, column: 3, scope: !44) -!52 = !DILocation(line: 7, column: 3, scope: !44) -!53 = !DILocation(line: 8, column: 1, scope: !44) -!54 = !{!55, !58, i64 2} -!55 = !{!"_ZTS26ConfigurationEnvironmentTy", !56, i64 0, !56, i64 1, !58, i64 2, !59, i64 4, !59, i64 8, !59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} -!56 = !{!"omnipotent char", !57, i64 0} -!57 = !{!"Simple C++ TBAA"} -!58 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !56, i64 0} -!59 = !{!"int", !56, i64 0} -!60 = !{!55, !56, i64 0} -!61 = !{!59, !59, i64 0} -!62 = !{!56, !56, i64 0} -!63 = !{!64, !59, i64 0} -!64 = !{!"_ZTSN4ompx5state11TeamStateTyE", !65, i64 0, !59, i64 28, !59, i64 32, !66, i64 40} -!65 = !{!"_ZTSN4ompx5state10ICVStateTyE", !59, i64 0, !59, i64 4, !59, i64 8, 
!59, i64 12, !59, i64 16, !59, i64 20, !59, i64 24} -!66 = !{!"any pointer", !56, i64 0} -!67 = !{!64, !59, i64 4} -!68 = !{!64, !59, i64 8} -!69 = !{!64, !59, i64 12} -!70 = !{!64, !59, i64 16} -!71 = !{!64, !59, i64 20} -!72 = !{!64, !59, i64 24} -!73 = !{!64, !59, i64 28} -!74 = !{!64, !59, i64 32} -!75 = !{!64, !66, i64 40} -!76 = !{!66, !66, i64 0} -!77 = !{!78, !59, i64 0} -!78 = !{!"_ZTS19DeviceEnvironmentTy", !59, i64 0, !59, i64 4, !59, i64 8, !59, i64 12, !79, i64 16, !79, i64 24, !79, i64 32, !79, i64 40} -!79 = !{!"long", !56, i64 0} -!80 = !{!65, !59, i64 0} -!81 = !{!65, !59, i64 4} -!82 = !{!65, !59, i64 8} -!83 = !{!65, !59, i64 16} -!84 = !{!65, !59, i64 20} -!85 = !{!65, !59, i64 24} +!4 = !{i32 4, !"nvvm-reflect-ftz", i32 0} +!5 = !{i32 7, !"openmp", i32 51} +!6 = !{i32 7, !"openmp-device", i32 51} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = !{i32 1, !"ThinLTO", i32 0} +!10 = !{i32 1, !"EnableSplitLTOUnit", i32 1} +!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!12 = !DIFile(filename: "test.c", directory: "/tmp") +!13 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"maxntidx", i32 128} +!14 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"kernel", i32 1} +!15 = !{i32 0, i32 64770, i32 1102294, !"h", i32 12, i32 0, i32 0} +!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} +!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!18 = !{i32 2, i32 0} +!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_10d1d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!20 = !DISubroutineType(types: 
!21) +!21 = !{null, !22} +!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !23) +!23 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !24) +!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!25 = !{} +!26 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !19, type: !22, flags: DIFlagArtificial) +!27 = !DILocation(line: 0, scope: !19) +!28 = !DILocation(line: 13, column: 3, scope: !19) +!29 = !DILocalVariable(name: "i", scope: !30, file: !12, line: 14, type: !31) +!30 = distinct !DILexicalBlock(scope: !19, file: !12, line: 13, column: 3) +!31 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!32 = !DILocation(line: 14, column: 9, scope: !30) +!33 = !DILocalVariable(name: "a", scope: !30, file: !12, line: 15, type: !34) +!34 = !DICompositeType(tag: DW_TAG_array_type, baseType: !31, size: 64, elements: !35) +!35 = !{!36} +!36 = !DISubrange(count: 2) +!37 = !DILocation(line: 15, column: 9, scope: !30) +!38 = !DILocation(line: 16, column: 5, scope: !30) +!39 = !DILocation(line: 17, column: 5, scope: !30) +!40 = !DILocation(line: 18, column: 3, scope: !30) +!41 = !DILocation(line: 18, column: 3, scope: !19) +!42 = distinct !DISubprogram(name: "__omp_offloading_fd02_10d1d6_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!43 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !42, type: !22, flags: DIFlagArtificial) +!44 = !DILocation(line: 0, scope: !42) +!45 = !DILocation(line: 12, column: 1, scope: !42) +!46 = distinct !DISubprogram(name: "g", scope: !12, file: !12, line: 3, type: !47, scopeLine: 3, spFlags: DISPFlagDefinition, unit: !11, retainedNodes: !25) +!47 = !DISubroutineType(types: !48) +!48 = !{null} +!49 = !DILocalVariable(name: "i", scope: !46, file: !12, line: 4, type: !31) +!50 = !DILocation(line: 4, column: 7, scope: !46) +!51 = !DILocalVariable(name: "a", scope: 
!46, file: !12, line: 5, type: !34) +!52 = !DILocation(line: 5, column: 7, scope: !46) +!53 = !DILocation(line: 6, column: 3, scope: !46) +!54 = !DILocation(line: 7, column: 3, scope: !46) +!55 = !DILocation(line: 8, column: 1, scope: !46) +!56 = !{!57, !60, i64 2} +!57 = !{!"_ZTS26ConfigurationEnvironmentTy", !58, i64 0, !58, i64 1, !60, i64 2, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} +!58 = !{!"omnipotent char", !59, i64 0} +!59 = !{!"Simple C++ TBAA"} +!60 = !{!"_ZTSN4llvm3omp19OMPTgtExecModeFlagsE", !58, i64 0} +!61 = !{!"int", !58, i64 0} +!62 = !{!57, !58, i64 0} +!63 = !{!61, !61, i64 0} +!64 = !{!58, !58, i64 0} +!65 = !{!66, !61, i64 16} +!66 = !{!"_ZTSN4ompx5state11TeamStateTyE", !67, i64 0, !61, i64 28, !61, i64 32, !68, i64 40} +!67 = !{!"_ZTSN4ompx5state10ICVStateTyE", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} +!68 = !{!"any pointer", !58, i64 0} +!69 = !{!66, !61, i64 20} +!70 = !{!66, !61, i64 24} +!71 = !{!66, !61, i64 28} +!72 = !{!66, !61, i64 32} +!73 = !{!66, !68, i64 40} +!74 = !{!68, !68, i64 0} +!75 = !{i32 1, i32 1025} +!76 = !{!77, !61, i64 0} +!77 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !78, i64 16, !78, i64 24, !78, i64 32, !78, i64 40} +!78 = !{!"long", !58, i64 0} +!79 = !{!67, !61, i64 0} +!80 = !{!67, !61, i64 4} +!81 = !{!67, !61, i64 8} +!82 = !{!67, !61, i64 16} +!83 = !{!67, !61, i64 20} +!84 = !{!67, !61, i64 24} +!85 = !{i32 0, i32 1024} !86 = distinct !{!86, !87} !87 = !{!"llvm.loop.mustprogress"} -!88 = !{!"branch_weights", i32 2000, i32 1} -!89 = !{!90, !66, i64 32} -!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !65, i64 0, !66, i64 32} -!91 = !{!92, !56, i64 0} -!92 = !{!"_ZTS19KernelEnvironmentTy", !55, i64 0, !66, i64 32, !66, i64 40} +!88 = !{!"branch_weights", !"expected", i32 2000, i32 1} +!89 = !{!90, !68, i64 32} +!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !68, i64 32} +!91 = !{!92, 
!58, i64 0} +!92 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !68, i64 32, !68, i64 40} From baad223fc0772766f6a9463635a4bd681d435b2b Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Thu, 17 Oct 2024 15:44:34 -0400 Subject: [PATCH 099/114] Include LLVM value name in alloca report --- llvm/lib/Analysis/KernelInfo.cpp | 19 ++++++++++--------- llvm/test/Analysis/KernelInfo/allocas.ll | 16 +++++++++------- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 12 ++++++------ llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 12 ++++++------ 4 files changed, 31 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 3658f54923e3f..f9832a6deb75a 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -110,13 +110,13 @@ static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, const AllocaInst &Alloca, TypeSize::ScalarTy StaticSize) { ORE.emit([&] { - StringRef Name; + StringRef DbgName; DebugLoc Loc; bool Artificial = false; auto DVRs = findDVRDeclares(&const_cast(Alloca)); if (!DVRs.empty()) { const DbgVariableRecord &DVR = **DVRs.begin(); - Name = DVR.getVariable()->getName(); + DbgName = DVR.getVariable()->getName(); Loc = DVR.getDebugLoc(); Artificial = DVR.Variable->isArtificial(); } @@ -127,13 +127,14 @@ static void remarkAlloca(OptimizationRemarkEmitter &ORE, const Function &Caller, R << ", "; if (Artificial) R << "artificial "; - if (Name.empty()) { - R << "unnamed alloca "; - if (DVRs.empty()) - R << "(missing debug metadata) "; - } else { - R << "alloca '" << Name << "' "; - } + SmallString<20> ValName; + raw_svector_ostream OS(ValName); + Alloca.printAsOperand(OS, /*PrintType=*/false, Caller.getParent()); + R << "alloca ('" << ValName << "') "; + if (!DbgName.empty()) + R << "for '" << DbgName << "' "; + else + R << "without debug info "; R << "with "; if (StaticSize) R << "static size of " << itostr(StaticSize) << " bytes"; diff --git 
a/llvm/test/Analysis/KernelInfo/allocas.ll b/llvm/test/Analysis/KernelInfo/allocas.ll index 048d53799c33e..3ecde004a9b2a 100644 --- a/llvm/test/Analysis/KernelInfo/allocas.ll +++ b/llvm/test/Analysis/KernelInfo/allocas.ll @@ -9,26 +9,28 @@ target triple = "nvptx64-nvidia-cuda" define void @h() !dbg !3 { entry: - ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca 'dyn_ptr' with static size of 8 bytes + ; CHECK: remark: test.c:0:0: in artificial function 'h', artificial alloca ('%dyn_ptr.addr') for 'dyn_ptr' with static size of 8 bytes %dyn_ptr.addr = alloca ptr, align 8 - ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca 'i' with static size of 4 bytes + ; CHECK: remark: test.c:14:9: in artificial function 'h', alloca ('%i') for 'i' with static size of 4 bytes %i = alloca i32, align 4 - ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca 'a' with static size of 8 bytes + ; CHECK: remark: test.c:15:9: in artificial function 'h', alloca ('%a') for 'a' with static size of 8 bytes %a = alloca [2 x i32], align 4 + ; CHECK: remark: :0:0: in artificial function 'h', alloca ('%nodbg') without debug info with static size of 4 bytes + %nodbg = alloca i32, align 4 tail call void @llvm.dbg.declare(metadata ptr %dyn_ptr.addr, metadata !7, metadata !DIExpression()), !dbg !11 tail call void @llvm.dbg.declare(metadata ptr %i, metadata !12, metadata !DIExpression()), !dbg !15 tail call void @llvm.dbg.declare(metadata ptr %a, metadata !16, metadata !DIExpression()), !dbg !20 ret void } -; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 3 -; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 20 +; CHECK: remark: test.c:13:0: in artificial function 'h', Allocas = 4 +; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasStaticSizeSum = 24 ; CHECK: remark: test.c:13:0: in artificial function 'h', AllocasDyn = 0 define void @g() !dbg !21 { entry: - ; CHECK: remark: test.c:4:7: in 
function 'g', alloca 'i' with static size of 4 bytes + ; CHECK: remark: test.c:4:7: in function 'g', alloca ('%i') for 'i' with static size of 4 bytes %i = alloca i32, align 4 - ; CHECK: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes + ; CHECK: remark: test.c:5:7: in function 'g', alloca ('%a') for 'a' with static size of 8 bytes %a = alloca [2 x i32], align 4 tail call void @llvm.dbg.declare(metadata ptr %i, metadata !23, metadata !DIExpression()), !dbg !24 tail call void @llvm.dbg.declare(metadata ptr %a, metadata !25, metadata !DIExpression()), !dbg !26 diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index c2caf8267cae7..246eccaac2fc0 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -7,9 +7,9 @@ ; RUN: FileCheck -match-full-lines %s ; CHECK-NOT: remark: -; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function 
'[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' @@ -33,7 +33,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 -; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' @@ -56,8 +56,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 -; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll 
b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index e717599aab687..656171896a4ff 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -7,9 +7,9 @@ ; RUN: FileCheck -match-full-lines %s ; CHECK-NOT: remark: -; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca 'dyn_ptr' with static size of 8 bytes -; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca 'a' with static size of 8 bytes +; CHECK: remark: test.c:0:0: in artificial function '[[OFF_FUNC:__omp_offloading_[a-f0-9_]*_h_l12]]_debug__', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:14:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:15:9: in artificial function '[[OFF_FUNC]]_debug__', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]_debug__', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:13:3: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is '@__kmpc_target_init' ; CHECK-NEXT: remark: test.c:16:5: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@f' @@ -26,7 +26,7 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 -; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca 'dyn_ptr' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with 
static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' instruction accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', 'load' instruction ('%[[#]]') accesses memory in flat address space ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' @@ -43,8 +43,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 -; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca 'i' with static size of 4 bytes -; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca 'a' with static size of 8 bytes +; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes +; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 From 1d0a961aabe488e6d09b96a80329498b8f586923 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Fri, 25 Oct 2024 13:42:19 -0500 Subject: [PATCH 100/114] Add llvm-profdata substitution to offload tests --- offload/test/lit.cfg | 2 ++ offload/test/lit.site.cfg.in | 2 +- offload/test/offloading/pgo1.c | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 7994a08ba063f..cfd1ad6c3c1eb 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target) if config.libomptarget_has_libc: config.available_features.add('libc') +profdata_path = 
os.path.join(config.bin_llvm_tools_dir, "llvm-profdata") if config.libomptarget_test_pgo: config.available_features.add('pgo') + config.substitutions.append(("%profdata", profdata_path)) # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher. diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index a1cb5acc38a40..d998fb0c83970 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,6 +1,6 @@ @AUTO_GEN_COMMENT@ -config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" +config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@" diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 1ef540e430a27..51671afa62b0d 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,14 +1,14 @@ // RUN: %libomptarget-compile-generic -fprofile-generate \ // RUN: -Xclang "-fprofile-instrument=llvm" // RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" // RUN: %libomptarget-compile-generic -fprofile-instr-generate \ // RUN: -Xclang "-fprofile-instrument=clang" // RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %target_triple.clang.profraw | %fcheck-generic \ // RUN: --check-prefix="CLANG-PGO" From c6b34ad7a676a462955b2e7b534b12264363b430 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 28 Oct 2024 18:45:37 -0500 Subject: [PATCH 101/114] Prepend target prefix to basename --- 
compiler-rt/lib/profile/InstrProfilingFile.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index decafbcb1a535..6b6f47e239714 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1355,10 +1355,21 @@ int __llvm_write_custom_profile(const char *Target, TargetFilename = (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2); + /* Find file basename and path sizes */ + int32_t DirEnd = FilenameLength - 1; + while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) { + DirEnd--; + } + uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize; + /* Prepend "TARGET." to current filename */ - memcpy(TargetFilename, Target, TargetLength); - TargetFilename[TargetLength] = '.'; - memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength); + if (DirSize > 0) { + memcpy(TargetFilename, Filename, DirSize); + } + memcpy(TargetFilename + DirSize, Target, TargetLength); + TargetFilename[TargetLength + DirSize] = '.'; + memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize, + BaseSize); TargetFilename[FilenameLength + 1 + TargetLength] = 0; /* Check if there is llvm/runtime version mismatch. */ From c9aebce3b1d7fd489970e68f51621c1009559a62 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 27 Nov 2024 13:08:55 -0500 Subject: [PATCH 102/114] Update expected amdgpu-max-num-workgroups default values Due to 0b40f979298a. 
--- .../KernelInfo/launch-bounds/amdgpu.ll | 6 +++--- llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll index d37dceec003f9..7fbdb923d8800 100644 --- a/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/launch-bounds/amdgpu.ll @@ -25,9 +25,9 @@ entry: ; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_num_teams = {{.*}} ; CHECK-NOT: remark: test.c:11:0: in function 'none', omp_target_thread_limit = {{.*}} -; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] = 0 -; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 0 -; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 0 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK: remark: test.c:11:0: in function 'none', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[0] = 1 ; CHECK: remark: test.c:11:0: in function 'none', amdgpu-flat-work-group-size[1] = 1024 ; CHECK: remark: test.c:11:0: in function 'none', amdgpu-waves-per-eu[0] = 4 diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index 246eccaac2fc0..f9aadb21825f9 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: remark: test.c:17:5: in artificial function '[[OFF_FUNC]]_debug__', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:18:3: in artificial function '[[OFF_FUNC]]_debug__', direct call, callee is '@__kmpc_target_deinit' ; CHECK-NEXT: remark: 
test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ExternalNotKernel = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 0 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[0] = 1 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-flat-work-group-size[1] = 1024 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', amdgpu-waves-per-eu[0] = 4 @@ -39,9 +39,9 @@ ; CHECK-NEXT: remark: test.c:12:1: in artificial function '[[OFF_FUNC]]', direct call to defined function, callee is artificial '[[OFF_FUNC]]_debug__' ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ExternalNotKernel = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', omp_target_thread_limit = 256 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 0 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in 
artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[0] = 1 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-flat-work-group-size[1] = 256 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', amdgpu-waves-per-eu[0] = 1 @@ -61,9 +61,9 @@ ; CHECK-NEXT: remark: test.c:6:3: in function 'g', direct call, callee is '@f' ; CHECK-NEXT: remark: test.c:7:3: in function 'g', direct call to defined function, callee is 'g' ; CHECK-NEXT: remark: test.c:3:0: in function 'g', ExternalNotKernel = 1 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[0] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[1] = 4294967295 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-max-num-workgroups[2] = 4294967295 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[0] = 1 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-flat-work-group-size[1] = 1024 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', amdgpu-waves-per-eu[0] = 4 From e690e2ac256d414f49f6b89182f4398e37d23bbd Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Wed, 27 Nov 2024 16:14:26 -0500 Subject: [PATCH 103/114] Update llvm-profdata test fix That is, replace our local hack with the fix from 1d0a961 from PR#93365. 
--- offload/test/lit.cfg | 4 ++-- offload/test/lit.site.cfg.in | 3 +-- offload/test/offloading/gpupgo/pgo1.c | 4 ++-- offload/test/offloading/gpupgo/pgo2.c | 8 ++++---- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index bb2b3cd2e8e61..6914f1b2da416 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target) if config.libomptarget_has_libc: config.available_features.add('libc') +profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata") if config.libomptarget_test_pgo: config.available_features.add('pgo') + config.substitutions.append(("%profdata", profdata_path)) # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher. @@ -420,5 +422,3 @@ config.substitutions.append(("%flags", config.test_flags)) config.substitutions.append(("%not", config.libomptarget_not)) config.substitutions.append(("%offload-device-info", config.offload_device_info)) -config.substitutions.append(("llvm-profdata", - config.bin_dir + "/../../bin/llvm-profdata")) diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index c7713910fd39d..d998fb0c83970 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,7 +1,6 @@ @AUTO_GEN_COMMENT@ -config.bin_dir = "@CMAKE_BINARY_DIR@" -config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" +config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@" diff --git a/offload/test/offloading/gpupgo/pgo1.c b/offload/test/offloading/gpupgo/pgo1.c index 7196663fcfc90..746cc779af841 100644 --- a/offload/test/offloading/gpupgo/pgo1.c +++ b/offload/test/offloading/gpupgo/pgo1.c @@ -1,14 +1,14 @@ // RUN: 
%libomptarget-compile-generic -fprofile-generate-gpu // RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \ // RUN: %libomptarget-run-generic 2>&1 -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %target_triple.%basename_t.llvm.profraw | \ // RUN: %fcheck-generic --check-prefix="LLVM-PGO" // RUN: %libomptarget-compile-generic -fprofile-instr-generate-gpu // RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \ // RUN: %libomptarget-run-generic 2>&1 -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %target_triple.%basename_t.clang.profraw | \ // RUN: %fcheck-generic --check-prefix="CLANG-PGO" diff --git a/offload/test/offloading/gpupgo/pgo2.c b/offload/test/offloading/gpupgo/pgo2.c index 7f5c9ab744907..121d84c28618b 100644 --- a/offload/test/offloading/gpupgo/pgo2.c +++ b/offload/test/offloading/gpupgo/pgo2.c @@ -2,10 +2,10 @@ // RUN: -fprofile-generate-gpu // RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \ // RUN: %libomptarget-run-generic 2>&1 -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %basename_t.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-HOST" -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %target_triple.%basename_t.llvm.profraw \ // RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE" @@ -13,10 +13,10 @@ // RUN: -fprofile-instr-generate-gpu // RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \ // RUN: %libomptarget-run-generic 2>&1 -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: %basename_t.clang.profraw | %fcheck-generic \ // RUN: --check-prefix="CLANG-HOST" -// RUN: llvm-profdata show --all-functions --counts \ +// RUN: %profdata show --all-functions --counts \ // RUN: 
%target_triple.%basename_t.clang.profraw | \ // RUN: %fcheck-generic --check-prefix="CLANG-DEV" From e80f7ffaa6b7096624b6eadcf5eed4740f4aeef0 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 28 Oct 2024 18:45:37 -0500 Subject: [PATCH 104/114] Prepend target prefix to basename This fix is cherry-picked from c6b34ad7a676 from PR#93365. --- compiler-rt/lib/profile/InstrProfilingFile.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 9c4a59aa4bec0..f0fc1e4c17bd2 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1356,10 +1356,21 @@ int __llvm_write_custom_profile(const char *Target, TargetFilename = (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2); + /* Find file basename and path sizes */ + int32_t DirEnd = FilenameLength - 1; + while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) { + DirEnd--; + } + uint32_t DirSize = DirEnd + 1, BaseSize = FilenameLength - DirSize; + /* Prepend "TARGET." to current filename */ - memcpy(TargetFilename, Target, TargetLength); - TargetFilename[TargetLength] = '.'; - memcpy(TargetFilename + 1 + TargetLength, Filename, FilenameLength); + if (DirSize > 0) { + memcpy(TargetFilename, Filename, DirSize); + } + memcpy(TargetFilename + DirSize, Target, TargetLength); + TargetFilename[TargetLength + DirSize] = '.'; + memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize, + BaseSize); TargetFilename[FilenameLength + 1 + TargetLength] = 0; /* Check if there is llvm/runtime version mismatch. */ From 151bfb3529c8bf62ad98243c7583450a6d1354b7 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Fri, 27 Dec 2024 14:54:58 -0500 Subject: [PATCH 105/114] Regenerate OpenMP tests from current clang See llvm/test/Analysis/KernelInfo/openmp/README.md. 
--- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 29 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 247 ++++++++++-------- 2 files changed, 145 insertions(+), 131 deletions(-) diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index f9aadb21825f9..6016919ec8280 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -79,7 +79,6 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 ; CHECK-NOT: {{.}} - ; ModuleID = 'test-openmp-amdgcn-amd-amdhsa.bc' source_filename = "test.c" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" @@ -95,14 +94,14 @@ target triple = "amdgcn-amd-amdhsa" @__omp_rtl_assume_threads_oversubscription = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_thread_state = weak_odr hidden addrspace(1) constant i32 0 @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden addrspace(1) constant i32 0 -@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_6f0c0_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [57 x i8] c";test.c;__omp_offloading_fd02_624a0_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 56, ptr @0 }, align 8 -@__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_6f0c0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) 
@__omp_offloading_fd02_6f0c0_h_l12_dynamic_environment to ptr) } +@__omp_offloading_fd02_624a0_h_l12_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_624a0_h_l12_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr addrspacecast (ptr addrspace(1) @1 to ptr), ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_624a0_h_l12_dynamic_environment to ptr) } @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { +define internal void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !16 { %2 = alloca ptr, align 8, addrspace(5) %3 = alloca i32, align 4, addrspace(5) %4 = alloca [2 x i32], align 4, addrspace(5) @@ -111,7 +110,7 @@ define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noun %7 = addrspacecast ptr addrspace(5) %4 to ptr store ptr %0, ptr %5, align 8 #dbg_declare(ptr addrspace(5) %2, !24, !DIExpression(), !25) - %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_6f0c0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 + %8 = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @__omp_offloading_fd02_624a0_h_l12_kernel_environment to ptr), ptr %0), !dbg !26 %9 = icmp eq i32 %8, -1, !dbg !26 br i1 %9, label %10, label %11, !dbg !26 @@ -128,13 +127,13 @@ define internal void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr noalias noun } ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_6f0c0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { +define 
weak_odr protected amdgpu_kernel void @__omp_offloading_fd02_624a0_h_l12(ptr noalias noundef %0) #1 !dbg !40 { %2 = alloca ptr, align 8, addrspace(5) %3 = addrspacecast ptr addrspace(5) %2 to ptr store ptr %0, ptr %3, align 8 #dbg_declare(ptr addrspace(5) %2, !41, !DIExpression(), !42) %4 = load ptr, ptr %3, align 8, !dbg !43 - call void @__omp_offloading_fd02_6f0c0_h_l12_debug__(ptr %4) #5, !dbg !43 + call void @__omp_offloading_fd02_624a0_h_l12_debug__(ptr %4) #5, !dbg !43 ret void, !dbg !43 } @@ -172,10 +171,10 @@ attributes #5 = { nounwind } !llvm.ident = !{!13, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14, !14} !opencl.ocl.version = !{!15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15, !15} -!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "854099697e49b3ca7d3b3c08503e6fef") -!2 = !{i32 0, i32 64770, i32 454848, !"h", i32 12, i32 0, i32 0} -!3 = !{ptr @__omp_offloading_fd02_6f0c0_h_l12, !"kernel", i32 1} +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "44c4bbdbb9b7a9c7492ced3432d74b0c") +!2 = !{i32 0, i32 64770, i32 402592, !"h", i32 12, i32 0, i32 0} +!3 = !{ptr @__omp_offloading_fd02_624a0_h_l12, !"kernel", i32 1} !4 = !{i32 1, !"amdhsa_code_object_version", i32 500} !5 = !{i32 7, !"Dwarf Version", i32 5} !6 = !{i32 2, !"Debug Info Version", i32 3} @@ -185,10 +184,10 @@ attributes #5 = { nounwind } !10 
= !{i32 8, !"PIC Level", i32 2} !11 = !{i32 7, !"frame-pointer", i32 2} !12 = !{i32 4, !"amdgpu_hostcall", i32 1} -!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} +!13 = !{!"clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)"} !14 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"} !15 = !{i32 2, i32 0} -!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!16 = distinct !DISubprogram(name: "__omp_offloading_fd02_624a0_h_l12_debug__", scope: !17, file: !17, line: 13, type: !18, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !17 = !DIFile(filename: "test.c", directory: "/tmp") !18 = !DISubroutineType(types: !19) !19 = !{null, !20} @@ -212,7 +211,7 @@ attributes #5 = { nounwind } !37 = !DILocation(line: 17, column: 5, scope: !28) !38 = !DILocation(line: 18, column: 3, scope: !28) !39 = !DILocation(line: 18, column: 3, scope: !16) -!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_6f0c0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) +!40 = distinct !DISubprogram(name: "__omp_offloading_fd02_624a0_h_l12", scope: !17, file: !17, line: 12, type: !18, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !23) !41 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !40, type: !20, flags: DIFlagArtificial) !42 = !DILocation(line: 0, scope: !40) !43 = !DILocation(line: 12, column: 
1, scope: !40) diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index 656171896a4ff..0633c3fa687c1 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -62,7 +62,6 @@ ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't ; want to maintain a list of their allocas, calls, etc. in this test. - ; ModuleID = 'test-openmp-nvptx64-nvidia-cuda.bc' source_filename = "test.c" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" @@ -75,6 +74,8 @@ target triple = "nvptx64-nvidia-cuda" %struct.DeviceMemoryPoolTy = type { ptr, i64 } %struct.DeviceMemoryPoolTrackingTy = type { i64, i64, i64, i64 } %struct.DeviceEnvironmentTy = type { i32, i32, i32, i32, i64, i64, i64, i64 } +%"struct.rpc::Client" = type { %"struct.rpc::Process" } +%"struct.rpc::Process" = type { i32, ptr, ptr, ptr, ptr, [128 x i32] } %"struct.(anonymous namespace)::SharedMemorySmartStackTy" = type { [512 x i8], [1024 x i8] } %"struct.ompx::state::TeamStateTy" = type { %"struct.ompx::state::ICVStateTy", i32, i32, ptr } %"struct.ompx::state::ICVStateTy" = type { i32, i32, i32, i32, i32, i32, i32 } @@ -83,11 +84,11 @@ target triple = "nvptx64-nvidia-cuda" @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 -@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_10d1d6_h_l12_debug__;13;3;;\00", align 1 +@0 = private unnamed_addr constant [58 x i8] c";test.c;__omp_offloading_fd02_100102_h_l12_debug__;13;3;;\00", align 1 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 57, ptr @0 }, align 8 -@__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer -@__omp_offloading_fd02_10d1d6_h_l12_kernel_environment = weak_odr protected constant 
%struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_10d1d6_h_l12_dynamic_environment } -@llvm.used = appending global [3 x ptr] [ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" +@__omp_offloading_fd02_100102_h_l12_dynamic_environment = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer +@__omp_offloading_fd02_100102_h_l12_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 128, i32 -1, i32 -1, i32 0, i32 0 }, ptr @1, ptr @__omp_offloading_fd02_100102_h_l12_dynamic_environment } +@llvm.used = appending global [4 x ptr] [ptr @__llvm_rpc_client, ptr addrspacecast (ptr addrspace(4) @__omp_rtl_device_environment to ptr), ptr @__omp_rtl_device_memory_pool, ptr @__omp_rtl_device_memory_pool_tracker], section "llvm.metadata" @__omp_rtl_device_memory_pool = weak protected global %struct.DeviceMemoryPoolTy zeroinitializer, align 8 @__omp_rtl_device_memory_pool_tracker = weak protected global %struct.DeviceMemoryPoolTrackingTy zeroinitializer, align 8 @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 @@ -101,23 +102,24 @@ target triple = "nvptx64-nvidia-cuda" @.str2 = private unnamed_addr constant [18 x i8] c"WorkFn == nullptr\00", align 1 @__PRETTY_FUNCTION__.__kmpc_target_deinit = private unnamed_addr constant [28 x i8] c"void __kmpc_target_deinit()\00", align 1 @IsSPMDMode = internal local_unnamed_addr addrspace(3) global i32 undef, align 4 -@.str1124 = private unnamed_addr constant [48 x i8] c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 +@__llvm_rpc_client = weak protected global %"struct.rpc::Client" zeroinitializer, align 8 +@.str1125 = private unnamed_addr constant [48 x i8] 
c"/tmp/llvm/offload/DeviceRTL/src/Parallelism.cpp\00", align 1 @.str13 = private unnamed_addr constant [23 x i8] c"!mapping::isSPMDMode()\00", align 1 @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel = private unnamed_addr constant [34 x i8] c"void __kmpc_kernel_end_parallel()\00", align 1 @_ZL20KernelEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZL26KernelLaunchEnvironmentPtr = internal unnamed_addr addrspace(3) global ptr undef, align 8 @_ZN12_GLOBAL__N_122SharedMemorySmartStackE = internal addrspace(3) global %"struct.(anonymous namespace)::SharedMemorySmartStackTy" undef, align 16 -@.str541 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 -@.str844 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 +@.str542 = private unnamed_addr constant [42 x i8] c"/tmp/llvm/offload/DeviceRTL/src/State.cpp\00", align 1 +@.str845 = private unnamed_addr constant [33 x i8] c"NThreadsVar == Other.NThreadsVar\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_ = private unnamed_addr constant [68 x i8] c"void ompx::state::ICVStateTy::assertEqual(const ICVStateTy &) const\00", align 1 -@.str945 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 -@.str1046 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 -@.str1147 = private unnamed_addr constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 -@.str1248 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 -@.str1349 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 +@.str946 = private unnamed_addr constant [27 x i8] c"LevelVar == Other.LevelVar\00", align 1 +@.str1047 = private unnamed_addr constant [39 x i8] c"ActiveLevelVar == Other.ActiveLevelVar\00", align 1 +@.str1148 = private unnamed_addr 
constant [47 x i8] c"MaxActiveLevelsVar == Other.MaxActiveLevelsVar\00", align 1 +@.str1249 = private unnamed_addr constant [33 x i8] c"RunSchedVar == Other.RunSchedVar\00", align 1 +@.str1350 = private unnamed_addr constant [43 x i8] c"RunSchedChunkVar == Other.RunSchedChunkVar\00", align 1 @.str14 = private unnamed_addr constant [43 x i8] c"ParallelTeamSize == Other.ParallelTeamSize\00", align 1 @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_ = private unnamed_addr constant [64 x i8] c"void ompx::state::TeamStateTy::assertEqual(TeamStateTy &) const\00", align 1 -@.str1550 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 +@.str1551 = private unnamed_addr constant [39 x i8] c"HasThreadState == Other.HasThreadState\00", align 1 @.str24 = private unnamed_addr constant [32 x i8] c"mapping::isSPMDMode() == IsSPMD\00", align 1 @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb = private unnamed_addr constant [43 x i8] c"void ompx::state::assumeInitialState(bool)\00", align 1 @_ZL9ThreadDST = internal unnamed_addr addrspace(3) global ptr undef, align 8 @@ -125,13 +127,13 @@ target triple = "nvptx64-nvidia-cuda" @_ZN4ompx5state12ThreadStatesE = internal addrspace(3) global ptr undef, align 8 ; Function Attrs: convergent noinline norecurse nounwind optnone -define internal void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { +define internal void @__omp_offloading_fd02_100102_h_l12_debug__(ptr noalias noundef %0) #0 !dbg !19 { %2 = alloca ptr, align 8 %3 = alloca i32, align 4 %4 = alloca [2 x i32], align 4 store ptr %0, ptr %2, align 8 #dbg_declare(ptr %2, !26, !DIExpression(), !27) - %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_10d1d6_h_l12_kernel_environment, ptr %0), !dbg !28 + %5 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_100102_h_l12_kernel_environment, ptr %0), !dbg !28 %6 = icmp eq i32 %5, -1, !dbg !28 br i1 %6, label %7, label 
%8, !dbg !28 @@ -148,12 +150,12 @@ define internal void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr noalias nou } ; Function Attrs: convergent mustprogress noinline norecurse nounwind optnone -define weak_odr protected void @__omp_offloading_fd02_10d1d6_h_l12(ptr noalias noundef %0) #1 !dbg !42 { +define weak_odr protected void @__omp_offloading_fd02_100102_h_l12(ptr noalias noundef %0) #1 !dbg !42 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 #dbg_declare(ptr %2, !43, !DIExpression(), !44) %3 = load ptr, ptr %2, align 8, !dbg !45 - call void @__omp_offloading_fd02_10d1d6_h_l12_debug__(ptr %3) #17, !dbg !45 + call void @__omp_offloading_fd02_100102_h_l12_debug__(ptr %3) #17, !dbg !45 ret void, !dbg !45 } @@ -190,16 +192,16 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 13: ; preds = %10 store i32 1, ptr addrspace(3) @IsSPMDMode, align 4, !tbaa !63 store i8 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512) to ptr addrspace(3)), align 1, !tbaa !64 - tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr 
getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 - store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 - store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 
40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !79 br label %18 14: ; preds = %10 @@ -213,7 +215,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %12, label %19, label %20 19: ; preds = %18 - store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !81 br label %20 20: ; preds = %18, %19 @@ -221,7 +223,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br label %37 21: ; preds = %2 - %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %22 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %23 = add nsw i32 %22, -1 %24 = and i32 %23, -32 %25 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18 @@ -234,16 +236,16 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n %29 = getelementptr inbounds nuw [1024 x i8], ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN12_GLOBAL__N_122SharedMemorySmartStackE to ptr), i64 512), i64 0, i64 %28 %30 = addrspacecast ptr %29 to ptr addrspace(3) store i8 0, ptr addrspace(3) %30, align 1, !tbaa !64 - tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 + tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 8 dereferenceable(48) addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i8 noundef 0, i64 noundef 16, i1 noundef false) #18 store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr 
addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !65 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !69 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !70 - store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 - store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !73 - store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 - store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 - store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !74 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !70 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !71 + store i32 1, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa 
!72 + store i32 0, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 + store ptr %0, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 + store ptr %1, ptr addrspace(3) @_ZL26KernelLaunchEnvironmentPtr, align 8, !tbaa !79 br label %35 31: ; preds = %21 @@ -257,7 +259,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %26, label %36, label %37 36: ; preds = %35 - store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !74 + store ptr null, ptr addrspace(3) @_ZL9ThreadDST, align 8, !tbaa !81 br label %37 37: ; preds = %36, %35, %20 @@ -265,23 +267,23 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 38: ; preds = %37 %39 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %40 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 %41 = and i32 %39, 1 %42 = and i32 %41, %40 %43 = icmp ne i32 %42, 0 - %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !79 + %44 = load i32, ptr addrspace(3) @_ZN4ompx5state9TeamStateE, align 8, !tbaa !87 %45 = icmp ne i32 %44, 0 %46 = select i1 %43, i1 %45, i1 false br i1 %46, label %47, label %48 47: ; preds = %38 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str844, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) 
@__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str845, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 193, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 48: ; preds = %38 %49 = icmp eq i32 %44, 0 tail call void @llvm.assume(i1 noundef %49) #21 - %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !80 + %50 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 4) to ptr addrspace(3)), align 4, !tbaa !88 br i1 %43, label %51, label %54 51: ; preds = %48 @@ -289,14 +291,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %52, label %54, label %53 53: ; preds = %51 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str945, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(27) @.str946, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 194, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 54: ; preds = %51, %48 %55 = phi i32 [ 0, %51 ], [ %50, %48 ] %56 = icmp eq i32 %55, 0 tail call void @llvm.assume(i1 noundef %56) #21 - %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr 
addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !81 + %57 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 8) to ptr addrspace(3)), align 8, !tbaa !89 br i1 %43, label %58, label %61 58: ; preds = %54 @@ -304,14 +306,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %59, label %61, label %60 60: ; preds = %58 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1046, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1047, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 195, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 61: ; preds = %58, %54 %62 = phi i32 [ 0, %58 ], [ %57, %54 ] %63 = icmp eq i32 %62, 0 tail call void @llvm.assume(i1 noundef %63) #21 - %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !82 + %64 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 16) to ptr addrspace(3)), align 8, !tbaa !90 br i1 %43, label %65, label %68 65: ; preds = %61 @@ -319,14 +321,14 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %66, label %68, label %67 67: ; preds = %65 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull 
dereferenceable(47) @.str1147, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(47) @.str1148, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 196, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 68: ; preds = %65, %61 %69 = phi i32 [ 1, %65 ], [ %64, %61 ] %70 = icmp eq i32 %69, 1 tail call void @llvm.assume(i1 noundef %70) #21 - %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !83 + %71 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 20) to ptr addrspace(3)), align 4, !tbaa !91 br i1 %43, label %72, label %93 72: ; preds = %68 @@ -334,7 +336,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %73, label %75, label %74 74: ; preds = %72 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1248, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(33) @.str1249, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 197, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 75: ; preds = %72 @@ -343,30 
+345,30 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %43, label %77, label %95 77: ; preds = %75 - %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !84 + %78 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 24) to ptr addrspace(3)), align 8, !tbaa !92 %79 = icmp eq i32 %78, 1 br i1 %79, label %81, label %80 80: ; preds = %77 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1349, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str1350, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 198, ptr nofree noundef nonnull dereferenceable(68) @__PRETTY_FUNCTION__._ZNK4ompx5state10ICVStateTy11assertEqualERKS1_) #20 unreachable 81: ; preds = %77 - %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !71 + %82 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !72 %83 = icmp eq i32 %82, 1 br i1 %83, label %85, label %84 84: ; preds = %81 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 222, ptr nofree 
noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(43) @.str14, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 222, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable 85: ; preds = %81 - %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !72 + %86 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 32) to ptr addrspace(3)), align 8, !tbaa !73 %87 = icmp eq i32 %86, 0 br i1 %87, label %89, label %88 88: ; preds = %85 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1550, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(39) @.str1551, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 223, ptr nofree noundef nonnull dereferenceable(64) @__PRETTY_FUNCTION__._ZNK4ompx5state11TeamStateTy11assertEqualERS1_) #20 unreachable 89: ; preds = %85 @@ -375,7 +377,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br i1 %91, label %92, label %98 92: ; preds = %89 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str541, i32 noundef 326, ptr nofree noundef nonnull 
dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(32) @.str24, ptr noundef null, ptr nofree noundef nonnull dereferenceable(66) @.str542, i32 noundef 326, ptr nofree noundef nonnull dereferenceable(43) @__PRETTY_FUNCTION__._ZN4ompx5state18assumeInitialStateEb) #20 unreachable 93: ; preds = %68 @@ -395,10 +397,10 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n br label %130 100: ; preds = %37 - %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %101 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %102 = add nsw i32 %101, -1 %103 = and i32 %102, -32 - %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %104 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !93 %105 = icmp eq i32 %104, %103 br i1 %105, label %130, label %106 @@ -418,10 +420,10 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 116: ; preds = %110, %128 call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %3) #22 - store ptr null, ptr %3, align 8, !tbaa !74 + store ptr null, ptr %3, align 8, !tbaa !94 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 %117 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %3) #22 - %118 = load ptr, ptr %3, align 8, !tbaa !74 + %118 = load ptr, ptr %3, align 8, !tbaa !94 %119 = icmp eq ptr %118, null br i1 %119, label %129, label %120 @@ -448,7 +450,7 @@ define internal noundef range(i32 -1, 1024) i32 @__kmpc_target_init(ptr nofree n 128: ; preds = %126, %120 tail call void @llvm.nvvm.barrier.sync(i32 noundef 8) #18 call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 - br label %116, 
!llvm.loop !86 + br label %116, !llvm.loop !95 129: ; preds = %116 call void @llvm.lifetime.end.p0(i64 noundef 8, ptr noundef nonnull %3) #22 @@ -520,17 +522,17 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #10 declare void @llvm.nvvm.barrier.sync(i32) #11 ; Function Attrs: convergent mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, argmem: write, inaccessiblemem: none) -define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %0) local_unnamed_addr #12 { - %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 - store ptr %2, ptr %0, align 8, !tbaa !74 +define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) initializes((0, 8)) %0) local_unnamed_addr #12 { + %2 = load ptr, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !94 + store ptr %2, ptr %0, align 8, !tbaa !94 %3 = icmp eq ptr %2, null br i1 %3, label %15, label %4 4: ; preds = %1 - %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 + %5 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !93 %6 = load i32, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 28) to ptr addrspace(3)), align 4, !tbaa !63 %7 = icmp eq i32 %6, 0 - %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %8 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %9 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 %10 = 
icmp eq i32 %9, 0 %11 = select i1 %10, i32 -32, i32 0 @@ -547,7 +549,7 @@ define internal noundef zeroext i1 @__kmpc_kernel_parallel(ptr nocapture nofree ; Function Attrs: convergent mustprogress noinline nounwind define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { %1 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %2 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 %3 = and i32 %1, 1 %4 = and i32 %3, %2 %5 = icmp ne i32 %4, 0 @@ -557,7 +559,7 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %8, label %9, label %10 9: ; preds = %0 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1124, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1125, i32 noundef 298, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable 10: ; preds = %0 @@ -571,21 +573,21 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %16, label %17, label %30 17: ; preds = %10 - %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !85 - %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %18 = tail call noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #27, !range !93 + %19 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 %20 = zext nneg i32 %18 to i64 %21 = getelementptr inbounds nuw ptr, ptr %19, i64 %20 - %22 = load ptr, ptr %21, align 8, !tbaa !74 + %22 
= load ptr, ptr %21, align 8, !tbaa !97 %23 = icmp eq ptr %22, null - br i1 %23, label %30, label %24, !prof !88 + br i1 %23, label %30, label %24, !prof !99 24: ; preds = %17 %25 = getelementptr inbounds nuw i8, ptr %22, i64 32 - %26 = load ptr, ptr %25, align 8, !tbaa !89 + %26 = load ptr, ptr %25, align 8, !tbaa !100 tail call void @free(ptr noundef nonnull dereferenceable(40) %22) #28 - %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !74 + %27 = load ptr, ptr addrspace(3) @_ZN4ompx5state12ThreadStatesE, align 8, !tbaa !75 %28 = getelementptr inbounds nuw ptr, ptr %27, i64 %20 - store ptr %26, ptr %28, align 8, !tbaa !74 + store ptr %26, ptr %28, align 8, !tbaa !97 %29 = load i32, ptr addrspace(3) @IsSPMDMode, align 4 br label %30 @@ -596,7 +598,7 @@ define internal void @__kmpc_kernel_end_parallel() local_unnamed_addr #13 { br i1 %33, label %34, label %35 34: ; preds = %30 - tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1124, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 + tail call fastcc void @__assert_fail_internal(ptr nofree noundef nonnull dereferenceable(23) @.str13, ptr noundef null, ptr nofree noundef nonnull dereferenceable(72) @.str1125, i32 noundef 301, ptr nofree noundef nonnull dereferenceable(34) @__PRETTY_FUNCTION__.__kmpc_kernel_end_parallel) #20 unreachable 35: ; preds = %30 @@ -628,29 +630,29 @@ define internal void @__kmpc_target_deinit() #4 { br i1 %3, label %4, label %27 4: ; preds = %0 - %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !75 + %5 = tail call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #18, !range !83 %6 = add nsw i32 %5, -1 %7 = and i32 %6, -32 - %8 = tail call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !85 + %8 = tail call 
range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #18, !range !93 %9 = icmp eq i32 %8, %7 br i1 %9, label %10, label %11 10: ; preds = %4 - store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !74 + store ptr null, ptr addrspace(3) addrspacecast (ptr getelementptr inbounds nuw (i8, ptr addrspacecast (ptr addrspace(3) @_ZN4ompx5state9TeamStateE to ptr), i64 40) to ptr addrspace(3)), align 8, !tbaa !94 br label %27 11: ; preds = %4 - %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !74 - %13 = load i8, ptr %12, align 8, !tbaa !91 + %12 = load ptr, ptr addrspace(3) @_ZL20KernelEnvironmentPtr, align 8, !tbaa !77 + %13 = load i8, ptr %12, align 8, !tbaa !102 %14 = icmp eq i8 %13, 0 br i1 %14, label %15, label %27 15: ; preds = %11 call void @llvm.lifetime.start.p0(i64 noundef 8, ptr noundef nonnull align 8 dereferenceable(8) %1) #29 - store ptr null, ptr %1, align 8, !tbaa !74 + store ptr null, ptr %1, align 8, !tbaa !94 %16 = call zeroext i1 @__kmpc_kernel_parallel(ptr noalias nocapture nofree noundef nonnull writeonly align 8 dereferenceable(8) %1) #22 %17 = load i32, ptr @__omp_rtl_debug_kind, align 4, !tbaa !63 - %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !76 + %18 = load i32, ptr addrspace(4) @__omp_rtl_device_environment, align 8, !tbaa !84 %19 = and i32 %17, 1 %20 = and i32 %19, %18 %21 = icmp eq i32 %20, 0 @@ -721,15 +723,15 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !8 = !{i32 7, !"frame-pointer", i32 2} !9 = !{i32 1, !"ThinLTO", i32 0} !10 = !{i32 1, !"EnableSplitLTOUnit", i32 1} -!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, 
splitDebugInlining: false, nameTableKind: None) +!11 = distinct !DICompileUnit(language: DW_LANG_C11, file: !12, producer: "clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !12 = !DIFile(filename: "test.c", directory: "/tmp") -!13 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"maxntidx", i32 128} -!14 = !{ptr @__omp_offloading_fd02_10d1d6_h_l12, !"kernel", i32 1} -!15 = !{i32 0, i32 64770, i32 1102294, !"h", i32 12, i32 0, i32 0} -!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 0c30e7ceeb36294f4523da2590101314ca1c662d)"} +!13 = !{ptr @__omp_offloading_fd02_100102_h_l12, !"maxntidx", i32 128} +!14 = !{ptr @__omp_offloading_fd02_100102_h_l12, !"kernel", i32 1} +!15 = !{i32 0, i32 64770, i32 1048834, !"h", i32 12, i32 0, i32 0} +!16 = !{!"clang version 20.0.0git (/tmp/llvm/clang 8982f8ff551bd4c11d47afefe97364c3a5c25ec8)"} !17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} !18 = !{i32 2, i32 0} -!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_10d1d6_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!19 = distinct !DISubprogram(name: "__omp_offloading_fd02_100102_h_l12_debug__", scope: !12, file: !12, line: 13, type: !20, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) !20 = !DISubroutineType(types: !21) !21 = !{null, !22} !22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !23) @@ -752,7 +754,7 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !39 = !DILocation(line: 17, column: 5, scope: !30) !40 = !DILocation(line: 18, column: 3, scope: !30) !41 = !DILocation(line: 18, column: 3, scope: !19) -!42 = distinct !DISubprogram(name: 
"__omp_offloading_fd02_10d1d6_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) +!42 = distinct !DISubprogram(name: "__omp_offloading_fd02_100102_h_l12", scope: !12, file: !12, line: 12, type: !20, scopeLine: 12, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !11, retainedNodes: !25) !43 = !DILocalVariable(name: "dyn_ptr", arg: 1, scope: !42, type: !22, flags: DIFlagArtificial) !44 = !DILocation(line: 0, scope: !42) !45 = !DILocation(line: 12, column: 1, scope: !42) @@ -778,28 +780,41 @@ attributes #29 = { nofree nounwind willreturn "llvm.assume"="ompx_no_call_asm" } !65 = !{!66, !61, i64 16} !66 = !{!"_ZTSN4ompx5state11TeamStateTyE", !67, i64 0, !61, i64 28, !61, i64 32, !68, i64 40} !67 = !{!"_ZTSN4ompx5state10ICVStateTyE", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !61, i64 16, !61, i64 20, !61, i64 24} -!68 = !{!"any pointer", !58, i64 0} -!69 = !{!66, !61, i64 20} -!70 = !{!66, !61, i64 24} -!71 = !{!66, !61, i64 28} -!72 = !{!66, !61, i64 32} -!73 = !{!66, !68, i64 40} -!74 = !{!68, !68, i64 0} -!75 = !{i32 1, i32 1025} -!76 = !{!77, !61, i64 0} -!77 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !78, i64 16, !78, i64 24, !78, i64 32, !78, i64 40} -!78 = !{!"long", !58, i64 0} -!79 = !{!67, !61, i64 0} -!80 = !{!67, !61, i64 4} -!81 = !{!67, !61, i64 8} -!82 = !{!67, !61, i64 16} -!83 = !{!67, !61, i64 20} -!84 = !{!67, !61, i64 24} -!85 = !{i32 0, i32 1024} -!86 = distinct !{!86, !87} -!87 = !{!"llvm.loop.mustprogress"} -!88 = !{!"branch_weights", !"expected", i32 2000, i32 1} -!89 = !{!90, !68, i64 32} -!90 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !68, i64 32} -!91 = !{!92, !58, i64 0} -!92 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !68, i64 32, !68, i64 40} +!68 = !{!"p1 void", !69, i64 0} +!69 = !{!"any 
pointer", !58, i64 0} +!70 = !{!66, !61, i64 20} +!71 = !{!66, !61, i64 24} +!72 = !{!66, !61, i64 28} +!73 = !{!66, !61, i64 32} +!74 = !{!66, !68, i64 40} +!75 = !{!76, !76, i64 0} +!76 = !{!"p2 _ZTSN4ompx5state13ThreadStateTyE", !69, i64 0} +!77 = !{!78, !78, i64 0} +!78 = !{!"p1 _ZTS19KernelEnvironmentTy", !69, i64 0} +!79 = !{!80, !80, i64 0} +!80 = !{!"p1 _ZTS25KernelLaunchEnvironmentTy", !69, i64 0} +!81 = !{!82, !82, i64 0} +!82 = !{!"p2 _ZTS22DynamicScheduleTracker", !69, i64 0} +!83 = !{i32 1, i32 1025} +!84 = !{!85, !61, i64 0} +!85 = !{!"_ZTS19DeviceEnvironmentTy", !61, i64 0, !61, i64 4, !61, i64 8, !61, i64 12, !86, i64 16, !86, i64 24, !86, i64 32, !86, i64 40} +!86 = !{!"long", !58, i64 0} +!87 = !{!67, !61, i64 0} +!88 = !{!67, !61, i64 4} +!89 = !{!67, !61, i64 8} +!90 = !{!67, !61, i64 16} +!91 = !{!67, !61, i64 20} +!92 = !{!67, !61, i64 24} +!93 = !{i32 0, i32 1024} +!94 = !{!68, !68, i64 0} +!95 = distinct !{!95, !96} +!96 = !{!"llvm.loop.mustprogress"} +!97 = !{!98, !98, i64 0} +!98 = !{!"p1 _ZTSN4ompx5state13ThreadStateTyE", !69, i64 0} +!99 = !{!"branch_weights", !"expected", i32 2000, i32 1} +!100 = !{!101, !98, i64 32} +!101 = !{!"_ZTSN4ompx5state13ThreadStateTyE", !67, i64 0, !98, i64 32} +!102 = !{!103, !58, i64 0} +!103 = !{!"_ZTS19KernelEnvironmentTy", !57, i64 0, !104, i64 32, !105, i64 40} +!104 = !{!"p1 _ZTS7IdentTy", !69, i64 0} +!105 = !{!"p1 _ZTS20DynamicEnvironmentTy", !69, i64 0} From ffcc50d35abaa96996aced2a555046c67005a70c Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Wed, 9 Apr 2025 18:32:09 -0400 Subject: [PATCH 106/114] Update KernelInfo.rst for upstream PGO GPU interface changes --- llvm/docs/KernelInfo.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index 863fad12dbe23..24f587e19639f 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -66,8 +66,8 @@ PGO === Using LLVM's PGO implementation for GPUs, profile data can augment the info -reported by kernel-info. In particular, kernel-info can estimate of the number -of floating point operations executed. +reported by kernel-info. In particular, kernel-info can estimate the number of +floating point operations executed. For example, the following computes 2\ :sup:`4`\ , so we expect 4 fmul instructions to execute at run time: @@ -94,7 +94,7 @@ instructions to execute at run time: } $ clang -O1 -g -fopenmp --offload-arch=native test.c -o test \ - -fprofile-generate -fprofile-generate-gpu + -fprofile-generate $ LLVM_PROFILE_FILE=test.profraw ./test 2 4 16.000000 @@ -102,8 +102,8 @@ instructions to execute at run time: $ llvm-profdata merge -output=test.profdata *.profraw $ clang -O1 -g -fopenmp --offload-arch=native test.c -foffload-lto \ - -Rpass=kernel-info -fprofile-use-gpu=test.profdata | \ + -Rpass=kernel-info -fprofile-use=test.profdata | \ grep "test.c:.*Floating\|double" - test.c:13:0: in artificial function '__omp_offloading_35_126b72c_main_l13', FloatingPointOpProfileCount = 0 + test.c:13:0: in artificial function '__omp_offloading_34_1bc8484_main_l13', FloatingPointOpProfileCount = 0 test.c:7:9: in function 'test', double 'fmul' ('%9') executed 4 times test.c:4:0: in function 'test', FloatingPointOpProfileCount = 4 From 8149708a415472a4d59069225f4895794c46055a Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Sat, 10 May 2025 13:34:03 -0400 Subject: [PATCH 107/114] Extend to intrinsics (e.g., @llvm.fmuladd.*) --- llvm/lib/Analysis/KernelInfo.cpp | 7 +- llvm/test/Analysis/KernelInfo/flop-pgo.ll | 156 +++++++++++++--------- 2 files changed, 96 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 303ea098f26e1..26f639c4fd67f 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -89,14 +89,17 @@ class KernelInfo { // // TODO: Does this correctly identify floating point operations we care about? // For example, we skip phi and load even when they return floating point -// values. Should different operations have different weights? +// values. +// +// TODO: Should different operations have different weights? For example, +// @llvm.fmuladd.* might be expensive for some targets. static Type *getFloatingPointOpType(const Instruction &I) { if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->isFloatingPointOperation()) return At->getType(); return nullptr; } - if (!I.isBinaryOp() && !I.isUnaryOp()) + if (!I.isBinaryOp() && !I.isUnaryOp() && !isa(&I)) return nullptr; Type *Ty = I.getType(); if (Ty->isFPOrFPVectorTy()) diff --git a/llvm/test/Analysis/KernelInfo/flop-pgo.ll b/llvm/test/Analysis/KernelInfo/flop-pgo.ll index 18811a0dfa00b..8554d11828d3e 100644 --- a/llvm/test/Analysis/KernelInfo/flop-pgo.ll +++ b/llvm/test/Analysis/KernelInfo/flop-pgo.ll @@ -7,44 +7,50 @@ target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; Check function with profile data but no floating point operations. -define double @noFlop() !dbg !100 !prof !102 { +; Check function with neither profile data nor floating point operations. 
+define double @noFlopOrProf() !dbg !100 { ret double 0.000000e+00, !dbg !105 } +; CHECK: remark: test.c:1:0: in function 'noFlopOrProf', FloatingPointOpProfileCount = 0 + +; Check function with profile data but no floating point operations. +define double @noFlop() !dbg !200 !prof !202 { + ret double 0.000000e+00, !dbg !205 +} ; CHECK: remark: test.c:2:0: in function 'noFlop', FloatingPointOpProfileCount = 0 ; Check function with floating point operations but no profile data. -define double @noProf() !dbg !200 { +define double @noProf() !dbg !300 { ; CHECK: remark: test.c:3:9: in function 'noProf', double 'fadd' ('%fadd') has no profile data - %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !204 - ret double 0.000000e+00, !dbg !205 + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !304 + ret double 0.000000e+00, !dbg !305 } ; CHECK: remark: test.c:3:0: in function 'noProf', FloatingPointOpProfileCount = 0 ; Check function with floating point operations and profile data. -define double @f() !dbg !300 !prof !302 { +define double @f() !dbg !400 !prof !402 { ; Check floating point operation in entry block, which has a count of 1 per ; entry into the function. ; ; Also, check case of basic block with exactly 1 floating point operation. - %alloca = alloca double, align 8, addrspace(1), !dbg !398 + %alloca = alloca double, align 8, addrspace(1), !dbg !498 ; CHECK: remark: test.c:5:9: in function 'f', double 'fadd' ('%fadd') executed 2 times - %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !305 - br label %.none, !dbg !398 + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !405 + br label %.none, !dbg !498 ; Check floating point operation in ret block. ; ; branch_weights gives this block a count of 1 per entry into the function. 
.ret: ; preds = %.many ; CHECK: remark: test.c:6:9: in function 'f', double 'fsub' ('%fsub') executed 2 times - %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !306 + %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !406 ; CHECK: remark: test.c:7:9: in function 'f', double 'fmul' ('%fmul') executed 2 times - %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !307 - ret double 0.000000e+00, !dbg !398 + %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !407 + ret double 0.000000e+00, !dbg !498 ; Check case of 0 floating point operations in a basic block. .none: ; preds = %0 - br label %.many, !dbg !398 + br label %.many, !dbg !498 ; Check case of many floating point operations in a basic block. ; @@ -52,46 +58,57 @@ define double @f() !dbg !300 !prof !302 { .many: ; preds = %.none, %.many ; These are not considered floating point ops even though they return floating ; point values. - %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !398 - %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !398 + %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !498 + %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !498 ; Check simple floating point ops not already checked above, and check an ; unnamed value. ; ; CHECK: remark: test.c:8:9: in function 'f', double 'fdiv' ('%1') executed 6 times - %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !308 + %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !408 ; CHECK: remark: test.c:9:9: in function 'f', double 'fneg' ('%fneg') executed 6 times - %fneg = fneg double 0.000000e+00, !dbg !309 + %fneg = fneg double 0.000000e+00, !dbg !409 ; Check atomicrmw. 
; ; CHECK: remark: test.c:10:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !310 + atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !410 ; CHECK: remark: test.c:11:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !311 + atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !411 ; CHECK: remark: test.c:12:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !312 + atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !412 ; CHECK: remark: test.c:13:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !313 + atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !413 ; atomicrmw that is not a floating point op. - atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !398 + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !498 + + ; Check some intrinsics. + ; + ; CHECK: remark: test.c:14:9: in function 'f', double 'llvm.sqrt.f64' call ('%sqrt') executed 6 times + %sqrt = call double @llvm.sqrt.f64(double 0.000000e+00), !dbg !414 + ; CHECK: remark: test.c:15:9: in function 'f', double 'llvm.sin.f64' call ('%sin') executed 6 times + %sin = call double @llvm.sin.f64(double 0.000000e+00), !dbg !415 + ; CHECK: remark: test.c:16:9: in function 'f', double 'llvm.fmuladd.f64' call ('%fmuladd') executed 6 times + %fmuladd = call double @llvm.fmuladd.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !416 + ; Intrinsic that is not a floating point op. + %umax = call i32 @llvm.umax.i32(i32 0, i32 0), !dbg !498 ; Check floating point types besides double scalar. 
; - ; CHECK: remark: test.c:14:9: in function 'f', float 'fadd' ('%float') executed 6 times - %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !314 - ; CHECK: remark: test.c:15:9: in function 'f', half 'fadd' ('%half') executed 6 times - %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !315 - ; CHECK: remark: test.c:16:9: in function 'f', bfloat 'fadd' ('%bfloat') executed 6 times - %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !316 - ; CHECK: remark: test.c:17:9: in function 'f', fp128 'fadd' ('%fp128') executed 6 times - %fp128 = fadd fp128 0xL0, 0xL0, !dbg !317 - ; CHECK: remark: test.c:18:9: in function 'f', <2 x double> 'fadd' ('%vector') executed 6 times - %vector = fadd <2 x double> , , !dbg !318 - - br i1 false, label %.ret, label %.many, !prof !399, !dbg !398 + ; CHECK: remark: test.c:50:9: in function 'f', float 'fadd' ('%float') executed 6 times + %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !450 + ; CHECK: remark: test.c:51:9: in function 'f', half 'fadd' ('%half') executed 6 times + %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !451 + ; CHECK: remark: test.c:52:9: in function 'f', bfloat 'fadd' ('%bfloat') executed 6 times + %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !452 + ; CHECK: remark: test.c:53:9: in function 'f', fp128 'fadd' ('%fp128') executed 6 times + %fp128 = fadd fp128 0xL0, 0xL0, !dbg !453 + ; CHECK: remark: test.c:54:9: in function 'f', <2 x double> 'fadd' ('%vector') executed 6 times + %vector = fadd <2 x double> , , !dbg !454 + + br i1 false, label %.ret, label %.many, !prof !499, !dbg !498 } -; CHECK: remark: test.c:4:0: in function 'f', FloatingPointOpProfileCount = 72 +; CHECK: remark: test.c:4:0: in function 'f', FloatingPointOpProfileCount = 90 !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -101,38 +118,47 @@ define double @f() !dbg !300 !prof !302 { !2 = !DIFile(filename: "test.c", directory: "/tmp") !3 = !{} -!100 = distinct !DISubprogram(name: "noFlop", scope: !2, file: !2, line: 2, 
type: !101, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!100 = distinct !DISubprogram(name: "noFlopOrProf", scope: !2, file: !2, line: 1, type: !101, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) !101 = !DISubroutineType(types: !3) -!102 = !{!"function_entry_count", i64 5} -!103 = distinct !DILexicalBlock(scope: !104, file: !2, line: 2, column: 3) -!104 = distinct !DILexicalBlock(scope: !100, file: !2, line: 2, column: 3) -!105 = !DILocation(line: 2, column: 9, scope: !103) +!103 = distinct !DILexicalBlock(scope: !104, file: !2, line: 1, column: 3) +!104 = distinct !DILexicalBlock(scope: !100, file: !2, line: 1, column: 3) +!105 = !DILocation(line: 1, column: 9, scope: !103) -!200 = distinct !DISubprogram(name: "noProf", scope: !2, file: !2, line: 3, type: !201, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!200 = distinct !DISubprogram(name: "noFlop", scope: !2, file: !2, line: 2, type: !201, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) !201 = !DISubroutineType(types: !3) -!202 = distinct !DILexicalBlock(scope: !203, file: !2, line: 3, column: 3) -!203 = distinct !DILexicalBlock(scope: !200, file: !2, line: 3, column: 3) -!204 = !DILocation(line: 3, column: 9, scope: !202) -!205 = !DILocation(line: 4, column: 9, scope: !202) +!202 = !{!"function_entry_count", i64 5} +!203 = distinct !DILexicalBlock(scope: !204, file: !2, line: 2, column: 3) +!204 = distinct !DILexicalBlock(scope: !200, file: !2, line: 2, column: 3) +!205 = !DILocation(line: 2, column: 9, scope: !203) -!300 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 4, type: !301, scopeLine: 4, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!300 = distinct !DISubprogram(name: "noProf", scope: !2, file: !2, line: 3, type: !301, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) !301 = !DISubroutineType(types: !3) -!302 = !{!"function_entry_count", i64 2} -!303 = distinct !DILexicalBlock(scope: !304, file: !2, line: 6, column: 3) -!304 = distinct !DILexicalBlock(scope: !300, file: !2, line: 6, column: 3) -!305 = !DILocation(line: 5, column: 9, scope: !303) -!306 = !DILocation(line: 6, column: 9, scope: !303) -!307 = !DILocation(line: 7, column: 9, scope: !303) -!308 = !DILocation(line: 8, column: 9, scope: !303) -!309 = !DILocation(line: 9, column: 9, scope: !303) -!310 = !DILocation(line: 10, column: 9, scope: !303) -!311 = !DILocation(line: 11, column: 9, scope: !303) -!312 = !DILocation(line: 12, column: 9, scope: !303) -!313 = !DILocation(line: 13, column: 9, scope: !303) -!314 = !DILocation(line: 14, column: 9, scope: !303) -!315 = !DILocation(line: 15, column: 9, scope: !303) -!316 = !DILocation(line: 16, column: 9, scope: !303) -!317 = !DILocation(line: 17, column: 9, scope: !303) -!318 = !DILocation(line: 18, column: 9, scope: !303) -!398 = !DILocation(line: 999, column: 999, scope: !303) -!399 = !{!"branch_weights", i32 127, i32 257} +!302 = distinct !DILexicalBlock(scope: !303, file: !2, line: 3, column: 3) +!303 = distinct !DILexicalBlock(scope: !300, file: !2, line: 3, column: 3) +!304 = !DILocation(line: 3, column: 9, scope: !302) +!305 = !DILocation(line: 4, column: 9, scope: !302) + +!400 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 4, type: !401, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) +!401 = !DISubroutineType(types: !3) +!402 = !{!"function_entry_count", 
i64 2} +!403 = distinct !DILexicalBlock(scope: !404, file: !2, line: 6, column: 3) +!404 = distinct !DILexicalBlock(scope: !400, file: !2, line: 6, column: 3) +!405 = !DILocation(line: 5, column: 9, scope: !403) +!406 = !DILocation(line: 6, column: 9, scope: !403) +!407 = !DILocation(line: 7, column: 9, scope: !403) +!408 = !DILocation(line: 8, column: 9, scope: !403) +!409 = !DILocation(line: 9, column: 9, scope: !403) +!410 = !DILocation(line: 10, column: 9, scope: !403) +!411 = !DILocation(line: 11, column: 9, scope: !403) +!412 = !DILocation(line: 12, column: 9, scope: !403) +!413 = !DILocation(line: 13, column: 9, scope: !403) +!414 = !DILocation(line: 14, column: 9, scope: !403) +!415 = !DILocation(line: 15, column: 9, scope: !403) +!416 = !DILocation(line: 16, column: 9, scope: !403) +!450 = !DILocation(line: 50, column: 9, scope: !403) +!451 = !DILocation(line: 51, column: 9, scope: !403) +!452 = !DILocation(line: 52, column: 9, scope: !403) +!453 = !DILocation(line: 53, column: 9, scope: !403) +!454 = !DILocation(line: 54, column: 9, scope: !403) +!498 = !DILocation(line: 999, column: 999, scope: !403) +!499 = !{!"branch_weights", i32 127, i32 257} From 002f3933028bfb2a0f886ab389ecbd94eedeb154 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Sat, 10 May 2025 14:11:34 -0400 Subject: [PATCH 108/114] Use `-Xarch_device -fprofile-update=atomic` in example --- llvm/docs/KernelInfo.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index 24f587e19639f..f2f05170dcb8d 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -94,7 +94,7 @@ instructions to execute at run time: } $ clang -O1 -g -fopenmp --offload-arch=native test.c -o test \ - -fprofile-generate + -fprofile-generate -Xarch_device -fprofile-update=atomic $ LLVM_PROFILE_FILE=test.profraw ./test 2 4 16.000000 @@ -107,3 +107,6 @@ instructions to execute at run time: test.c:13:0: in artificial function '__omp_offloading_34_1bc8484_main_l13', FloatingPointOpProfileCount = 0 test.c:7:9: in function 'test', double 'fmul' ('%9') executed 4 times test.c:4:0: in function 'test', FloatingPointOpProfileCount = 4 + +While ``-Xarch_device -fprofile-update=atomic`` is not required for the simple +example above, it can be critical while profiling parallel code. From 6e7208e8b8e715b01947e1313f30ed20aee0f3f3 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 May 2025 21:46:34 -0400 Subject: [PATCH 109/114] Also report floating point bytes moved from profile --- llvm/lib/Analysis/KernelInfo.cpp | 68 +++++-- llvm/test/Analysis/KernelInfo/flop-pgo.ll | 181 +++++++++++------- .../test/Analysis/KernelInfo/openmp/amdgpu.ll | 9 +- llvm/test/Analysis/KernelInfo/openmp/nvptx.ll | 9 +- 4 files changed, 173 insertions(+), 94 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 26f639c4fd67f..1775656c09740 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -78,18 +78,26 @@ class KernelInfo { /// Estimate of the number of floating point operations typically executed /// based on any available profile data. If no profile data is available, the /// count is zero. 
- uint64_t FloatingPointOpProfileCount = 0; + uint64_t ProfileFloatingPointOpCount = 0; + + /// Estimate of the number of bytes of floating point memory typically moved + /// (e.g., load or store) based on any available profile data. If no profile + /// data is available, the count is zero. LLVM memory access operations + /// (e.g., llvm.memcpy.*, cmpxchg) that are always encoded as operating on + /// integer types and never on floating point types are not included. + uint64_t ProfileFloatingPointBytesMoved = 0; }; } // end anonymous namespace -// For the purposes of KernelInfo::FloatingPointOpProfileCount, should this be -// considered a floating point operation? If so, return the floating point -// type. Otherwise, return nullptr. +// For the purposes of KernelInfo::ProfileFloatingPointOpCount, should the +// specified Instruction be considered a floating point operation? If so, +// return the floating point type. Otherwise, return nullptr. // // TODO: Does this correctly identify floating point operations we care about? -// For example, we skip phi and load even when they return floating point -// values. +// For example, we skip phi even when it returns a floating point value, and +// load is covered by KernelInfo::ProfileFloatingPointBytesMoved instead. Is +// there anything missing that should be covered here? // // TODO: Should different operations have different weights? For example, // @llvm.fmuladd.* might be expensive for some targets. 
@@ -208,12 +216,16 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, }); } -static void remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, - const Function &Caller, const Instruction &I, - Type *Ty, - std::optional BlockProfileCount) { +static void +remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, const Function &Caller, + const Instruction &I, Type *Ty, + std::optional BlockProfileCount, + std::optional BytesMoved = std::nullopt) { ORE.emit([&] { - OptimizationRemark R(DEBUG_TYPE, "FloatingPointOp", &I); + OptimizationRemark R(DEBUG_TYPE, + BytesMoved ? "ProfileFloatingPointBytesMoved" + : "ProfileFloatingPointOpCount", + &I); R << "in "; identifyFunction(R, Caller); R << ", "; @@ -222,10 +234,15 @@ static void remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, Ty->print(OS); R << TyName << " "; identifyInstruction(R, I); - if (BlockProfileCount) - R << " executed " << utostr(*BlockProfileCount) << " times"; - else + if (BlockProfileCount) { + if (BytesMoved) + R << " moved " << itostr(*BytesMoved * *BlockProfileCount) + << " fp bytes"; + else + R << " executed " << utostr(*BlockProfileCount) << " flops"; + } else { R << " has no profile data"; + } return R; }); } @@ -239,6 +256,14 @@ void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, std::optional BlockProfileCount = BFI.getBlockProfileCount(&BB, /*AllowSynthetic=*/true); for (const Instruction &I : BB.instructionsWithoutDebug()) { + auto HandleFloatingPointBytesMoved = [&]() { + Type *Ty = I.getAccessType(); + if (!Ty || !Ty->isFPOrFPVectorTy()) + return; + TypeSize::ScalarTy Size = DL.getTypeAllocSize(Ty).getFixedValue(); + ProfileFloatingPointBytesMoved += BlockProfileCount.value_or(0) * Size; + remarkFloatingPointOp(ORE, F, I, Ty, BlockProfileCount, Size); + }; if (const AllocaInst *Alloca = dyn_cast(&I)) { ++Allocas; TypeSize::ScalarTy StaticSize = 0; @@ -296,30 +321,40 @@ void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, 
remarkFlatAddrspaceAccess(ORE, F, I); } } + // llvm.memcpy.*, llvm.memset.*, etc. are encoded as operating on + // integer types not floating point types, so + // HandleFloatingPointBytesMoved is useless here. } } else if (const LoadInst *Load = dyn_cast(&I)) { if (Load->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + HandleFloatingPointBytesMoved(); } else if (const StoreInst *Store = dyn_cast(&I)) { if (Store->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + HandleFloatingPointBytesMoved(); } else if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + // TODO: Because there is a read and write, should we double the bytes + // moved count? + HandleFloatingPointBytesMoved(); } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } + // cmpxchg is encoded as operating on integer types not floating point + // types, so HandleFloatingPointBytesMoved is useless here. 
} if (Type *Ty = getFloatingPointOpType(I)) { - FloatingPointOpProfileCount += BlockProfileCount.value_or(0); + ProfileFloatingPointOpCount += BlockProfileCount.value_or(0); remarkFloatingPointOp(ORE, F, I, Ty, BlockProfileCount); } } @@ -381,7 +416,8 @@ void KernelInfo::emitKernelInfo(Function &F, FunctionAnalysisManager &FAM, REMARK_PROPERTY(InlineAssemblyCalls); REMARK_PROPERTY(Invokes); REMARK_PROPERTY(FlatAddrspaceAccesses); - REMARK_PROPERTY(FloatingPointOpProfileCount); + REMARK_PROPERTY(ProfileFloatingPointOpCount); + REMARK_PROPERTY(ProfileFloatingPointBytesMoved); #undef REMARK_PROPERTY } diff --git a/llvm/test/Analysis/KernelInfo/flop-pgo.ll b/llvm/test/Analysis/KernelInfo/flop-pgo.ll index 8554d11828d3e..94e34374b776f 100644 --- a/llvm/test/Analysis/KernelInfo/flop-pgo.ll +++ b/llvm/test/Analysis/KernelInfo/flop-pgo.ll @@ -2,7 +2,9 @@ ; RUN: opt -pass-remarks=kernel-info -passes=kernel-info \ ; RUN: -disable-output %s 2>&1 | \ -; RUN: FileCheck -match-full-lines -implicit-check-not='floating point' %s +; RUN: FileCheck %s -match-full-lines \ +; RUN: -implicit-check-not='executed {{[0-9]+}} flops' \ +; RUN: -implicit-check-not='moved {{[0-9]+}} fp bytes' target datalayout = "e-i65:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -11,21 +13,27 @@ target triple = "nvptx64-nvidia-cuda" define double @noFlopOrProf() !dbg !100 { ret double 0.000000e+00, !dbg !105 } -; CHECK: remark: test.c:1:0: in function 'noFlopOrProf', FloatingPointOpProfileCount = 0 +; CHECK: remark: test.c:1:0: in function 'noFlopOrProf', ProfileFloatingPointOpCount = 0 +; CHECK: remark: test.c:1:0: in function 'noFlopOrProf', ProfileFloatingPointBytesMoved = 0 ; Check function with profile data but no floating point operations. 
define double @noFlop() !dbg !200 !prof !202 { ret double 0.000000e+00, !dbg !205 } -; CHECK: remark: test.c:2:0: in function 'noFlop', FloatingPointOpProfileCount = 0 +; CHECK: remark: test.c:2:0: in function 'noFlop', ProfileFloatingPointOpCount = 0 +; CHECK: remark: test.c:2:0: in function 'noFlop', ProfileFloatingPointBytesMoved = 0 -; Check function with floating point operations but no profile data. +; Check function with floating point operations (one that moves data and one +; that does not) but no profile data. define double @noProf() !dbg !300 { - ; CHECK: remark: test.c:3:9: in function 'noProf', double 'fadd' ('%fadd') has no profile data + ; CHECK: remark: test.c:4:9: in function 'noProf', double 'fadd' ('%fadd') has no profile data %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !304 - ret double 0.000000e+00, !dbg !305 + ; CHECK: remark: test.c:5:9: in function 'noProf', double 'load' ('%load') has no profile data + %load = load double, ptr null, align 4, !dbg !305 + ret double 0.000000e+00, !dbg !306 } -; CHECK: remark: test.c:3:0: in function 'noProf', FloatingPointOpProfileCount = 0 +; CHECK: remark: test.c:3:0: in function 'noProf', ProfileFloatingPointOpCount = 0 +; CHECK: remark: test.c:3:0: in function 'noProf', ProfileFloatingPointBytesMoved = 0 ; Check function with floating point operations and profile data. define double @f() !dbg !400 !prof !402 { @@ -33,82 +41,102 @@ define double @f() !dbg !400 !prof !402 { ; entry into the function. ; ; Also, check case of basic block with exactly 1 floating point operation. 
- %alloca = alloca double, align 8, addrspace(1), !dbg !498 - ; CHECK: remark: test.c:5:9: in function 'f', double 'fadd' ('%fadd') executed 2 times - %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !405 - br label %.none, !dbg !498 + %alloca = alloca double, align 8, addrspace(1), !dbg !405 + ; CHECK: remark: test.c:10:9: in function 'f', double 'fadd' ('%fadd') executed 2 flops + %fadd = fadd double 0.000000e+00, 0.000000e+00, !dbg !410 + br label %.none, !dbg !405 ; Check floating point operation in ret block. ; ; branch_weights gives this block a count of 1 per entry into the function. .ret: ; preds = %.many - ; CHECK: remark: test.c:6:9: in function 'f', double 'fsub' ('%fsub') executed 2 times - %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !406 - ; CHECK: remark: test.c:7:9: in function 'f', double 'fmul' ('%fmul') executed 2 times - %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !407 - ret double 0.000000e+00, !dbg !498 + ; CHECK: remark: test.c:20:9: in function 'f', double 'fsub' ('%fsub') executed 2 flops + %fsub = fsub double 0.000000e+00, 0.000000e+00, !dbg !420 + ; CHECK: remark: test.c:21:9: in function 'f', double 'fmul' ('%fmul') executed 2 flops + %fmul = fmul double 0.000000e+00, 0.000000e+00, !dbg !421 + ret double 0.000000e+00, !dbg !405 ; Check case of 0 floating point operations in a basic block. .none: ; preds = %0 - br label %.many, !dbg !498 + br label %.many, !dbg !405 ; Check case of many floating point operations in a basic block. ; ; branch_weights gives this block a count of 3 per entry into the function. .many: ; preds = %.none, %.many - ; These are not considered floating point ops even though they return floating - ; point values. - %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !498 - %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !498 + ; This is not counted as a floating point op even though it returns a floating + ; point value. 
+ %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !405 ; Check simple floating point ops not already checked above, and check an ; unnamed value. ; - ; CHECK: remark: test.c:8:9: in function 'f', double 'fdiv' ('%1') executed 6 times - %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !408 - ; CHECK: remark: test.c:9:9: in function 'f', double 'fneg' ('%fneg') executed 6 times - %fneg = fneg double 0.000000e+00, !dbg !409 + ; CHECK: remark: test.c:30:9: in function 'f', double 'fdiv' ('%1') executed 6 flops + %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !430 + ; CHECK: remark: test.c:31:9: in function 'f', double 'fneg' ('%fneg') executed 6 flops + %fneg = fneg double 0.000000e+00, !dbg !431 + ; CHECK: remark: test.c:32:9: in function 'f', double 'load' ('%load') moved 48 fp bytes + %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !432 + ; CHECK: remark: test.c:33:9: in function 'f', double 'store' moved 48 fp bytes + store double 0.000000e+00, ptr addrspace(1) %alloca, align 8, !dbg !433 ; Check atomicrmw. 
; - ; CHECK: remark: test.c:10:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !410 - ; CHECK: remark: test.c:11:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !411 - ; CHECK: remark: test.c:12:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !412 - ; CHECK: remark: test.c:13:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 times - atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !413 + ; CHECK: remark: test.c:40:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + atomicrmw xchg ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !440 + ; CHECK: remark: test.c:41:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:41:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fadd ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !441 + ; CHECK: remark: test.c:42:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:42:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fsub ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !442 + ; CHECK: remark: test.c:43:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:43:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fmax ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !443 + ; CHECK: remark: test.c:44:9: in function 'f', double 'atomicrmw' ('%[[#]]') moved 48 fp bytes + ; CHECK: remark: test.c:44:9: in function 'f', double 'atomicrmw' ('%[[#]]') executed 6 flops + atomicrmw fmin ptr addrspace(37) null, double 0.000000e+00 seq_cst, !dbg !444 ; atomicrmw that is not 
a floating point op. - atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !498 + atomicrmw add ptr addrspace(37) null, i32 10 seq_cst, !dbg !405 - ; Check some intrinsics. + ; Check some flop intrinsics. ; - ; CHECK: remark: test.c:14:9: in function 'f', double 'llvm.sqrt.f64' call ('%sqrt') executed 6 times - %sqrt = call double @llvm.sqrt.f64(double 0.000000e+00), !dbg !414 - ; CHECK: remark: test.c:15:9: in function 'f', double 'llvm.sin.f64' call ('%sin') executed 6 times - %sin = call double @llvm.sin.f64(double 0.000000e+00), !dbg !415 - ; CHECK: remark: test.c:16:9: in function 'f', double 'llvm.fmuladd.f64' call ('%fmuladd') executed 6 times - %fmuladd = call double @llvm.fmuladd.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !416 + ; CHECK: remark: test.c:50:9: in function 'f', double 'llvm.sqrt.f64' call ('%sqrt') executed 6 flops + %sqrt = call double @llvm.sqrt.f64(double 0.000000e+00), !dbg !450 + ; CHECK: remark: test.c:51:9: in function 'f', double 'llvm.sin.f64' call ('%sin') executed 6 flops + %sin = call double @llvm.sin.f64(double 0.000000e+00), !dbg !451 + ; CHECK: remark: test.c:52:9: in function 'f', double 'llvm.fmuladd.f64' call ('%fmuladd') executed 6 flops + %fmuladd = call double @llvm.fmuladd.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !452 ; Intrinsic that is not a floating point op. - %umax = call i32 @llvm.umax.i32(i32 0, i32 0), !dbg !498 + %umax = call i32 @llvm.umax.i32(i32 0, i32 0), !dbg !405 ; Check floating point types besides double scalar. 
; - ; CHECK: remark: test.c:50:9: in function 'f', float 'fadd' ('%float') executed 6 times - %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !450 - ; CHECK: remark: test.c:51:9: in function 'f', half 'fadd' ('%half') executed 6 times - %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !451 - ; CHECK: remark: test.c:52:9: in function 'f', bfloat 'fadd' ('%bfloat') executed 6 times - %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !452 - ; CHECK: remark: test.c:53:9: in function 'f', fp128 'fadd' ('%fp128') executed 6 times - %fp128 = fadd fp128 0xL0, 0xL0, !dbg !453 - ; CHECK: remark: test.c:54:9: in function 'f', <2 x double> 'fadd' ('%vector') executed 6 times - %vector = fadd <2 x double> , , !dbg !454 - - br i1 false, label %.ret, label %.many, !prof !499, !dbg !498 + ; CHECK: remark: test.c:60:9: in function 'f', float 'fadd' ('%float') executed 6 flops + %float = fadd float 0.000000e+00, 0.000000e+00, !dbg !460 + ; CHECK: remark: test.c:61:9: in function 'f', float 'store' moved 24 fp bytes + store float 0.000000e+00, ptr null, align 8, !dbg !461 + ; CHECK: remark: test.c:62:9: in function 'f', half 'fadd' ('%half') executed 6 flops + %half = fadd half 0.000000e+00, 0.000000e+00, !dbg !462 + ; CHECK: remark: test.c:63:9: in function 'f', half 'store' moved 12 fp bytes + store half 0.000000e+00, ptr null, align 8, !dbg !463 + ; CHECK: remark: test.c:64:9: in function 'f', bfloat 'fadd' ('%bfloat') executed 6 flops + %bfloat = fadd bfloat 0.000000e+00, 0.000000e+00, !dbg !464 + ; CHECK: remark: test.c:65:9: in function 'f', bfloat 'store' moved 12 fp bytes + store bfloat 0.000000e+00, ptr null, align 8, !dbg !465 + ; CHECK: remark: test.c:66:9: in function 'f', fp128 'fadd' ('%fp128') executed 6 flops + %fp128 = fadd fp128 0xL0, 0xL0, !dbg !466 + ; CHECK: remark: test.c:67:9: in function 'f', fp128 'store' moved 96 fp bytes + store fp128 0xL0, ptr null, align 8, !dbg !467 + ; CHECK: remark: test.c:68:9: in function 'f', <2 x double> 'fadd' 
('%vector') executed 6 flops + %vector = fadd <2 x double> , , !dbg !468 + ; CHECK: remark: test.c:69:9: in function 'f', <2 x double> 'store' moved 96 fp bytes + store <2 x double> , ptr null, align 8, !dbg !469 + + br i1 false, label %.ret, label %.many, !prof !499, !dbg !405 } -; CHECK: remark: test.c:4:0: in function 'f', FloatingPointOpProfileCount = 90 +; CHECK: remark: test.c:4:0: in function 'f', ProfileFloatingPointOpCount = 90 +; CHECK: remark: test.c:4:0: in function 'f', ProfileFloatingPointBytesMoved = 576 !llvm.module.flags = !{!0} !llvm.dbg.cu = !{!1} @@ -135,30 +163,39 @@ define double @f() !dbg !400 !prof !402 { !301 = !DISubroutineType(types: !3) !302 = distinct !DILexicalBlock(scope: !303, file: !2, line: 3, column: 3) !303 = distinct !DILexicalBlock(scope: !300, file: !2, line: 3, column: 3) -!304 = !DILocation(line: 3, column: 9, scope: !302) -!305 = !DILocation(line: 4, column: 9, scope: !302) +!304 = !DILocation(line: 4, column: 9, scope: !302) +!305 = !DILocation(line: 5, column: 9, scope: !302) +!306 = !DILocation(line: 6, column: 9, scope: !302) !400 = distinct !DISubprogram(name: "f", scope: !2, file: !2, line: 4, type: !401, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !3) !401 = !DISubroutineType(types: !3) !402 = !{!"function_entry_count", i64 2} !403 = distinct !DILexicalBlock(scope: !404, file: !2, line: 6, column: 3) !404 = distinct !DILexicalBlock(scope: !400, file: !2, line: 6, column: 3) -!405 = !DILocation(line: 5, column: 9, scope: !403) -!406 = !DILocation(line: 6, column: 9, scope: !403) -!407 = !DILocation(line: 7, column: 9, scope: !403) -!408 = !DILocation(line: 8, column: 9, scope: !403) -!409 = !DILocation(line: 9, column: 9, scope: !403) +!405 = !DILocation(line: 999, column: 999, scope: !403) !410 = !DILocation(line: 10, column: 9, scope: !403) -!411 = !DILocation(line: 11, column: 9, scope: !403) -!412 = !DILocation(line: 12, 
column: 9, scope: !403) -!413 = !DILocation(line: 13, column: 9, scope: !403) -!414 = !DILocation(line: 14, column: 9, scope: !403) -!415 = !DILocation(line: 15, column: 9, scope: !403) -!416 = !DILocation(line: 16, column: 9, scope: !403) +!420 = !DILocation(line: 20, column: 9, scope: !403) +!421 = !DILocation(line: 21, column: 9, scope: !403) +!430 = !DILocation(line: 30, column: 9, scope: !403) +!431 = !DILocation(line: 31, column: 9, scope: !403) +!432 = !DILocation(line: 32, column: 9, scope: !403) +!433 = !DILocation(line: 33, column: 9, scope: !403) +!440 = !DILocation(line: 40, column: 9, scope: !403) +!441 = !DILocation(line: 41, column: 9, scope: !403) +!442 = !DILocation(line: 42, column: 9, scope: !403) +!443 = !DILocation(line: 43, column: 9, scope: !403) +!444 = !DILocation(line: 44, column: 9, scope: !403) !450 = !DILocation(line: 50, column: 9, scope: !403) !451 = !DILocation(line: 51, column: 9, scope: !403) !452 = !DILocation(line: 52, column: 9, scope: !403) -!453 = !DILocation(line: 53, column: 9, scope: !403) -!454 = !DILocation(line: 54, column: 9, scope: !403) -!498 = !DILocation(line: 999, column: 999, scope: !403) -!499 = !{!"branch_weights", i32 127, i32 257} +!460 = !DILocation(line: 60, column: 9, scope: !403) +!461 = !DILocation(line: 61, column: 9, scope: !403) +!462 = !DILocation(line: 62, column: 9, scope: !403) +!463 = !DILocation(line: 63, column: 9, scope: !403) +!464 = !DILocation(line: 64, column: 9, scope: !403) +!465 = !DILocation(line: 65, column: 9, scope: !403) +!466 = !DILocation(line: 66, column: 9, scope: !403) +!467 = !DILocation(line: 67, column: 9, scope: !403) +!468 = !DILocation(line: 68, column: 9, scope: !403) +!469 = !DILocation(line: 69, column: 9, scope: !403) +!499 = !{!"branch_weights", i32 1, i32 2} diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll index b08a8e593106e..e40fdf1525069 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll +++ 
b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll @@ -32,7 +32,8 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FloatingPointOpProfileCount = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' accesses memory in flat address space @@ -56,7 +57,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FloatingPointOpProfileCount = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes @@ -79,7 +81,8 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: 
test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', FloatingPointOpProfileCount = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointBytesMoved = 0 ; CHECK-NOT: {{.}} ; ModuleID = 'test-openmp-amdgcn-amd-amdhsa-gfx906.bc' diff --git a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll index ef5605fba958f..f6b3b117ab12f 100644 --- a/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll +++ b/llvm/test/Analysis/KernelInfo/openmp/nvptx.ll @@ -25,7 +25,8 @@ ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', Invokes = 0 ; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FlatAddrspaceAccesses = 1 -; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', FloatingPointOpProfileCount = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:13:0: in artificial function '[[OFF_FUNC]]_debug__', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:0:0: in artificial function '[[OFF_FUNC]]', artificial alloca ('%[[#]]') for 'dyn_ptr' with static size of 8 bytes ; CHECK-NEXT: remark: :0:0: in artificial function '[[OFF_FUNC]]', 'store' accesses memory in flat address space @@ -43,7 +44,8 @@ ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', Invokes = 0 ; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FlatAddrspaceAccesses = 2 -; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', FloatingPointOpProfileCount = 0 +; CHECK-NEXT: remark: 
test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:12:0: in artificial function '[[OFF_FUNC]]', ProfileFloatingPointBytesMoved = 0 ; CHECK-NEXT: remark: test.c:4:7: in function 'g', alloca ('%[[#]]') for 'i' with static size of 4 bytes ; CHECK-NEXT: remark: test.c:5:7: in function 'g', alloca ('%[[#]]') for 'a' with static size of 8 bytes @@ -59,7 +61,8 @@ ; CHECK-NEXT: remark: test.c:3:0: in function 'g', InlineAssemblyCalls = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', Invokes = 0 ; CHECK-NEXT: remark: test.c:3:0: in function 'g', FlatAddrspaceAccesses = 0 -; CHECK-NEXT: remark: test.c:3:0: in function 'g', FloatingPointOpProfileCount = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointOpCount = 0 +; CHECK-NEXT: remark: test.c:3:0: in function 'g', ProfileFloatingPointBytesMoved = 0 ; CHECK-NOT: remark: {{.*: in function 'g',.*}} ; A lot of internal functions (e.g., __kmpc_target_init) come next, but we don't From 47d6b9a2d3acb29d7a5533450c7b90393fdfc570 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Mon, 12 May 2025 21:58:02 -0400 Subject: [PATCH 110/114] Update KernelInfo.rst example --- llvm/docs/KernelInfo.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index f2f05170dcb8d..e0e3e694ab16a 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -67,10 +67,10 @@ PGO Using LLVM's PGO implementation for GPUs, profile data can augment the info reported by kernel-info. In particular, kernel-info can estimate the number of -floating point operations executed. +floating point operations executed or bytes moved. For example, the following computes 2\ :sup:`4`\ , so we expect 4 fmul -instructions to execute at run time: +instructions to execute at run time, and we expect a load and store for ``x``: .. 
code-block:: shell @@ -104,9 +104,13 @@ instructions to execute at run time: $ clang -O1 -g -fopenmp --offload-arch=native test.c -foffload-lto \ -Rpass=kernel-info -fprofile-use=test.profdata | \ grep "test.c:.*Floating\|double" - test.c:13:0: in artificial function '__omp_offloading_34_1bc8484_main_l13', FloatingPointOpProfileCount = 0 - test.c:7:9: in function 'test', double 'fmul' ('%9') executed 4 times - test.c:4:0: in function 'test', FloatingPointOpProfileCount = 4 + test.c:14:14: in artificial function '__omp_offloading_34_1c64d55_main_l13', double 'load' ('%11') moved 8 fp bytes + test.c:14:7: in artificial function '__omp_offloading_34_1c64d55_main_l13', double 'store' moved 8 fp bytes + test.c:13:0: in artificial function '__omp_offloading_34_1c64d55_main_l13', ProfileFloatingPointOpCount = 0 + test.c:13:0: in artificial function '__omp_offloading_34_1c64d55_main_l13', ProfileFloatingPointBytesMoved = 16 + test.c:7:11: in function 'test', double 'fmul' ('%9') executed 4 flops + test.c:4:0: in function 'test', ProfileFloatingPointOpCount = 4 + test.c:4:0: in function 'test', ProfileFloatingPointBytesMoved = 0 While ``-Xarch_device -fprofile-update=atomic`` is not required for the simple example above, it can be critical while profiling parallel code. From 8daf9846c6190f3a1a924c25a75df16076c1c265 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Tue, 13 May 2025 14:42:46 -0400 Subject: [PATCH 111/114] Use getTypeStoreSize not getTypeAllocSize for bytes moved --- llvm/lib/Analysis/KernelInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 1775656c09740..75ae5d01d6067 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -260,7 +260,7 @@ void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, Type *Ty = I.getAccessType(); if (!Ty || !Ty->isFPOrFPVectorTy()) return; - TypeSize::ScalarTy Size = DL.getTypeAllocSize(Ty).getFixedValue(); + TypeSize::ScalarTy Size = DL.getTypeStoreSize(Ty).getFixedValue(); ProfileFloatingPointBytesMoved += BlockProfileCount.value_or(0) * Size; remarkFloatingPointOp(ORE, F, I, Ty, BlockProfileCount, Size); }; From e1c50c2a755f437548972f6a9cfbb0d4a2941d4a Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" Date: Tue, 13 May 2025 14:44:23 -0400 Subject: [PATCH 112/114] Remove an answered todo --- llvm/lib/Analysis/KernelInfo.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index 75ae5d01d6067..e7b08d2d712dc 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -342,8 +342,6 @@ void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, ++FlatAddrspaceAccesses; remarkFlatAddrspaceAccess(ORE, F, I); } - // TODO: Because there is a read and write, should we double the bytes - // moved count? HandleFloatingPointBytesMoved(); } else if (const AtomicCmpXchgInst *At = dyn_cast(&I)) { if (At->getPointerAddressSpace() == FlatAddrspace) { From 57a9848a060c1b680be7c212306ba9714041f6a2 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Wed, 28 May 2025 19:04:52 -0400 Subject: [PATCH 113/114] Adjust flops for some instructions based on amdgpu hw counters --- llvm/lib/Analysis/KernelInfo.cpp | 115 ++++++++++++++++++---- llvm/test/Analysis/KernelInfo/flop-pgo.ll | 80 +++++++++++---- 2 files changed, 160 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Analysis/KernelInfo.cpp b/llvm/lib/Analysis/KernelInfo.cpp index e7b08d2d712dc..abe9c2ce25c17 100644 --- a/llvm/lib/Analysis/KernelInfo.cpp +++ b/llvm/lib/Analysis/KernelInfo.cpp @@ -92,27 +92,100 @@ class KernelInfo { // For the purposes of KernelInfo::ProfileFloatingPointOpCount, should the // specified Instruction be considered a floating point operation? If so, -// return the floating point type. Otherwise, return nullptr. +// return the floating point type and a multiplier for its FLOP count. +// Otherwise, return std::nullopt. // // TODO: Does this correctly identify floating point operations we care about? // For example, we skip phi even when it returns a floating point value, and // load is covered by KernelInfo::ProfileFloatingPointBytesMoved instead. Is -// there anything missing that should be covered here? -// -// TODO: Should different operations have different weights? For example, -// @llvm.fmuladd.* might be expensive for some targets. -static Type *getFloatingPointOpType(const Instruction &I) { +// there anything missing that should be covered here? Is there anything else +// that we should exclude? For example, at least for AMD GPU, there are +// floating point instruction patterns (e.g., fmul with one operand in some +// category of immediate) that lower to instructions that do not trigger AMD's +// floating point hardware counters. Should we somehow query target-specific +// lowering to exclude such cases? 
+static std::optional> +getFloatingPointOp(const Instruction &I) { if (const AtomicRMWInst *At = dyn_cast(&I)) { if (At->isFloatingPointOperation()) - return At->getType(); - return nullptr; + return std::make_pair(At->getType(), 1); + return std::nullopt; + } + if (const CastInst *CI = dyn_cast(&I)) { + Type *SrcTy = CI->getSrcTy(); + Type *DestTy = CI->getDestTy(); + // For AMD GPU, conversions between fp and integer types where either is not + // 64-bit lower to instructions that do not trigger AMD's floating point + // hardware counters. TODO: Is that true for all archs, all non-64-bit + // floating point types, and all non-64-bit integer types? On AMD GPU, we + // have checked 64 vs. 32 and 32 vs. 32 so far. + if (SrcTy->getScalarSizeInBits() != 64 || + DestTy->getScalarSizeInBits() != 64) + return std::nullopt; + // For AMD GPU, uitofp and sitofp lower to FADD instructions. TODO: Is that + // true for all archs? + if (isa(I) || isa(I)) + return std::make_pair(DestTy, 1); + // For AMD GPU, fptoui and fptosi lower to FMA instructions. Thus, as for + // FMA instructions below, we multiply by 2. TODO: Is that true for all + // archs? + if (isa(I) || isa(I)) + return std::make_pair(SrcTy, 2); + return std::nullopt; } - if (!I.isBinaryOp() && !I.isUnaryOp() && !isa(&I)) - return nullptr; Type *Ty = I.getType(); - if (Ty->isFPOrFPVectorTy()) - return Ty; - return nullptr; + if (!Ty->isFPOrFPVectorTy()) + return std::nullopt; + if (I.isBinaryOp() || I.isUnaryOp()) { + switch (I.getOpcode()) { + // For AMD GPU, fneg lowers to instructions that do not trigger AMD's + // floating point hardware counters. TODO: Is that true for all archs and + // all floating point types? On AMD GPU, we have checked 64 bit. 
+ case Instruction::FNeg: + return std::nullopt; + // This multiplier is based on AMD hardware fp counters for fdiv: + // - SQ_INSTS_VALU_FMA_F64 = 6*2 + // - SQ_INSTS_VALU_MUL_F64 = 1 + // - SQ_INSTS_VALU_TRANS_F64 = 1 + // TODO: Is that true for all archs and all floating point types? On AMD + // GPU, we have checked 64 bit. Moreover, this is surely brittle. What if + // the implementation changes? + case Instruction::FDiv: + return std::make_pair(Ty, 14); + } + return std::make_pair(Ty, 1); + } + if (const IntrinsicInst *II = dyn_cast(&I)) { + switch (II->getIntrinsicID()) { + // For AMD GPU, these lower to instructions that do not trigger AMD's + // floating point hardware counters. TODO: Is that true for all archs and + // all floating point types? On AMD GPU, we have checked 64 bit. + case Intrinsic::copysign: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::ldexp: + case Intrinsic::minnum: + case Intrinsic::rint: + return std::nullopt; + // For FMA instructions, we mimic AMD's rocprofiler-compute, which + // multiplies SQ_INSTS_VALU_FMA_* counts by 2. + case Intrinsic::fmuladd: + case Intrinsic::fma: + return std::make_pair(Ty, 2); + // This multiplier is based on AMD hardware fp counters for this intrinsic: + // - SQ_INSTS_VALU_FMA_F64 = 7*2 + // - SQ_INSTS_VALU_MUL_F64 = 2 + // - SQ_INSTS_VALU_TRANS_F64 = 1 + // TODO: Is that true for all archs and all floating point types? On AMD + // GPU, we have checked 64 bit. Moreover, this is surely brittle. What if + // the implementation changes? 
+ case Intrinsic::sqrt: + return std::make_pair(Ty, 17); + default: + return std::make_pair(Ty, 1); + } + } + return std::nullopt; } static void identifyCallee(OptimizationRemark &R, const Module *M, @@ -218,7 +291,7 @@ static void remarkFlatAddrspaceAccess(OptimizationRemarkEmitter &ORE, static void remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, const Function &Caller, - const Instruction &I, Type *Ty, + const Instruction &I, Type *Ty, unsigned Multiplier, std::optional BlockProfileCount, std::optional BytesMoved = std::nullopt) { ORE.emit([&] { @@ -240,6 +313,8 @@ remarkFloatingPointOp(OptimizationRemarkEmitter &ORE, const Function &Caller, << " fp bytes"; else R << " executed " << utostr(*BlockProfileCount) << " flops"; + if (Multiplier != 1) + R << " x " << utostr(Multiplier); } else { R << " has no profile data"; } @@ -262,7 +337,8 @@ void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, return; TypeSize::ScalarTy Size = DL.getTypeStoreSize(Ty).getFixedValue(); ProfileFloatingPointBytesMoved += BlockProfileCount.value_or(0) * Size; - remarkFloatingPointOp(ORE, F, I, Ty, BlockProfileCount, Size); + remarkFloatingPointOp(ORE, F, I, Ty, /*Multiplier=*/1, BlockProfileCount, + Size); }; if (const AllocaInst *Alloca = dyn_cast(&I)) { ++Allocas; @@ -351,9 +427,12 @@ void KernelInfo::updateForBB(const BasicBlock &BB, BlockFrequencyInfo &BFI, // cmpxchg is encoded as operating on integer types not floating point // types, so HandleFloatingPointBytesMoved is useless here. 
} - if (Type *Ty = getFloatingPointOpType(I)) { - ProfileFloatingPointOpCount += BlockProfileCount.value_or(0); - remarkFloatingPointOp(ORE, F, I, Ty, BlockProfileCount); + if (auto Op = getFloatingPointOp(I)) { + Type *Ty; + unsigned Multiplier; + std::tie(Ty, Multiplier) = *Op; + ProfileFloatingPointOpCount += Multiplier * BlockProfileCount.value_or(0); + remarkFloatingPointOp(ORE, F, I, Ty, Multiplier, BlockProfileCount); } } } diff --git a/llvm/test/Analysis/KernelInfo/flop-pgo.ll b/llvm/test/Analysis/KernelInfo/flop-pgo.ll index 94e34374b776f..99c6920338a95 100644 --- a/llvm/test/Analysis/KernelInfo/flop-pgo.ll +++ b/llvm/test/Analysis/KernelInfo/flop-pgo.ll @@ -64,21 +64,28 @@ define double @f() !dbg !400 !prof !402 { ; ; branch_weights gives this block a count of 3 per entry into the function. .many: ; preds = %.none, %.many - ; This is not counted as a floating point op even though it returns a floating - ; point value. + ; These are not counted as floating point ops even though they return floating + ; point values. For AMD GPUs, we have seen no evidence that the hardware + ; instructions to which they lower ever trigger floating point hardware + ; counters. More appear with conversions below. %phi = phi double [ %fadd, %.none ], [ %load, %.many ], !dbg !405 + %fneg = fneg double 0.000000e+00, !dbg !405 + %copysign = call double @llvm.copysign.f64(double 0.000000e+00, double 0.000000e+00), !dbg !405 + %fabs = call double @llvm.fabs.f64(double 0.000000e+00), !dbg !405 + %floor = call double @llvm.floor.f64(double 0.000000e+00), !dbg !405 + %ldexp = call double @llvm.ldexp.f64.i32(double 0.000000e+00, i32 0), !dbg !405 + %minnum = call double @llvm.minnum.f64(double 0.000000e+00, double 0.000000e+00), !dbg !405 + %rint = call double @llvm.rint.f64(double 0.000000e+00), !dbg !405 ; Check simple floating point ops not already checked above, and check an ; unnamed value. 
; - ; CHECK: remark: test.c:30:9: in function 'f', double 'fdiv' ('%1') executed 6 flops + ; CHECK: remark: test.c:30:9: in function 'f', double 'fdiv' ('%1') executed 6 flops x 14 %1 = fdiv double 0.000000e+00, 0.000000e+00, !dbg !430 - ; CHECK: remark: test.c:31:9: in function 'f', double 'fneg' ('%fneg') executed 6 flops - %fneg = fneg double 0.000000e+00, !dbg !431 - ; CHECK: remark: test.c:32:9: in function 'f', double 'load' ('%load') moved 48 fp bytes - %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !432 - ; CHECK: remark: test.c:33:9: in function 'f', double 'store' moved 48 fp bytes - store double 0.000000e+00, ptr addrspace(1) %alloca, align 8, !dbg !433 + ; CHECK: remark: test.c:31:9: in function 'f', double 'load' ('%load') moved 48 fp bytes + %load = load double, ptr addrspace(1) %alloca, align 8, !dbg !431 + ; CHECK: remark: test.c:32:9: in function 'f', double 'store' moved 48 fp bytes + store double 0.000000e+00, ptr addrspace(1) %alloca, align 8, !dbg !432 ; Check atomicrmw. ; @@ -101,12 +108,18 @@ define double @f() !dbg !400 !prof !402 { ; Check some flop intrinsics. 
; - ; CHECK: remark: test.c:50:9: in function 'f', double 'llvm.sqrt.f64' call ('%sqrt') executed 6 flops - %sqrt = call double @llvm.sqrt.f64(double 0.000000e+00), !dbg !450 - ; CHECK: remark: test.c:51:9: in function 'f', double 'llvm.sin.f64' call ('%sin') executed 6 flops - %sin = call double @llvm.sin.f64(double 0.000000e+00), !dbg !451 - ; CHECK: remark: test.c:52:9: in function 'f', double 'llvm.fmuladd.f64' call ('%fmuladd') executed 6 flops - %fmuladd = call double @llvm.fmuladd.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !452 + ; CHECK: remark: test.c:50:9: in function 'f', double 'llvm.amdgcn.rcp.f64' call ('%rcp') executed 6 flops + %rcp = call double @llvm.amdgcn.rcp.f64(double 0.000000e+00), !dbg !450 + ; CHECK: remark: test.c:51:9: in function 'f', double 'llvm.amdgcn.trig.preop.f64' call ('%trig.preop') executed 6 flops + %trig.preop = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0), !dbg !451 + ; CHECK: remark: test.c:52:9: in function 'f', double 'llvm.fma.f64' call ('%fma') executed 6 flops x 2 + %fma = call double @llvm.fma.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !452 + ; CHECK: remark: test.c:53:9: in function 'f', double 'llvm.fmuladd.f64' call ('%fmuladd') executed 6 flops x 2 + %fmuladd = call double @llvm.fmuladd.f64(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00), !dbg !453 + ; CHECK: remark: test.c:54:9: in function 'f', double 'llvm.sin.f64' call ('%sin') executed 6 flops + %sin = call double @llvm.sin.f64(double 0.000000e+00), !dbg !454 + ; CHECK: remark: test.c:55:9: in function 'f', double 'llvm.sqrt.f64' call ('%sqrt') executed 6 flops x 17 + %sqrt = call double @llvm.sqrt.f64(double 0.000000e+00), !dbg !455 ; Intrinsic that is not a floating point op. 
%umax = call i32 @llvm.umax.i32(i32 0, i32 0), !dbg !405 @@ -133,9 +146,36 @@ define double @f() !dbg !400 !prof !402 { ; CHECK: remark: test.c:69:9: in function 'f', <2 x double> 'store' moved 96 fp bytes store <2 x double> , ptr null, align 8, !dbg !469 + ; Check conversions. + ; + ; CHECK: remark: test.c:70:9: in function 'f', double 'uitofp' ('%uitofp.64.64') executed 6 flops + %uitofp.64.64 = uitofp i64 0 to double, !dbg !470 + ; CHECK: remark: test.c:71:9: in function 'f', double 'sitofp' ('%sitofp.64.64') executed 6 flops + %sitofp.64.64 = sitofp i64 0 to double, !dbg !471 + ; CHECK: remark: test.c:72:9: in function 'f', double 'fptoui' ('%fptoui.64.64') executed 6 flops x 2 + %fptoui.64.64 = fptoui double 0.000000e+00 to i64, !dbg !472 + ; CHECK: remark: test.c:73:9: in function 'f', double 'fptosi' ('%fptosi.64.64') executed 6 flops x 2 + %fptosi.64.64 = fptosi double 0.000000e+00 to i64, !dbg !473 + %uitofp.32.64 = uitofp i32 0 to double, !dbg !405 + %sitofp.32.64 = sitofp i32 0 to double, !dbg !405 + %fptoui.64.32 = fptoui double 0.000000e+00 to i32, !dbg !405 + %fptosi.64.32 = fptosi double 0.000000e+00 to i32, !dbg !405 + %uitofp.64.32 = uitofp i64 0 to float, !dbg !405 + %sitofp.64.32 = sitofp i64 0 to float, !dbg !405 + %fptoui.32.64 = fptoui float 0.000000e+00 to i64, !dbg !405 + %fptosi.32.64 = fptosi float 0.000000e+00 to i64, !dbg !405 + %uitofp.32.32 = uitofp i32 0 to float, !dbg !405 + %sitofp.32.32 = sitofp i32 0 to float, !dbg !405 + %fptoui.32.32 = fptoui float 0.000000e+00 to i32, !dbg !405 + %fptosi.32.32 = fptosi float 0.000000e+00 to i32, !dbg !405 + %fptrunc.64.32 = fptrunc double 0.000000e+00 to float, !dbg !405 + %fpext.32.64 = fpext float 0.000000e+00 to double, !dbg !405 + %bitcast.double.i64 = bitcast double 0.000000e+00 to i64, !dbg !405 + %bitcast.i64.double = bitcast i64 0 to double, !dbg !405 + br i1 false, label %.ret, label %.many, !prof !499, !dbg !405 } -; CHECK: remark: test.c:4:0: in function 'f', 
ProfileFloatingPointOpCount = 90 +; CHECK: remark: test.c:4:0: in function 'f', ProfileFloatingPointOpCount = 324 ; CHECK: remark: test.c:4:0: in function 'f', ProfileFloatingPointBytesMoved = 576 !llvm.module.flags = !{!0} @@ -179,7 +219,6 @@ define double @f() !dbg !400 !prof !402 { !430 = !DILocation(line: 30, column: 9, scope: !403) !431 = !DILocation(line: 31, column: 9, scope: !403) !432 = !DILocation(line: 32, column: 9, scope: !403) -!433 = !DILocation(line: 33, column: 9, scope: !403) !440 = !DILocation(line: 40, column: 9, scope: !403) !441 = !DILocation(line: 41, column: 9, scope: !403) !442 = !DILocation(line: 42, column: 9, scope: !403) @@ -188,6 +227,9 @@ define double @f() !dbg !400 !prof !402 { !450 = !DILocation(line: 50, column: 9, scope: !403) !451 = !DILocation(line: 51, column: 9, scope: !403) !452 = !DILocation(line: 52, column: 9, scope: !403) +!453 = !DILocation(line: 53, column: 9, scope: !403) +!454 = !DILocation(line: 54, column: 9, scope: !403) +!455 = !DILocation(line: 55, column: 9, scope: !403) !460 = !DILocation(line: 60, column: 9, scope: !403) !461 = !DILocation(line: 61, column: 9, scope: !403) !462 = !DILocation(line: 62, column: 9, scope: !403) @@ -198,4 +240,8 @@ define double @f() !dbg !400 !prof !402 { !467 = !DILocation(line: 67, column: 9, scope: !403) !468 = !DILocation(line: 68, column: 9, scope: !403) !469 = !DILocation(line: 69, column: 9, scope: !403) +!470 = !DILocation(line: 70, column: 9, scope: !403) +!471 = !DILocation(line: 71, column: 9, scope: !403) +!472 = !DILocation(line: 72, column: 9, scope: !403) +!473 = !DILocation(line: 73, column: 9, scope: !403) !499 = !{!"branch_weights", i32 1, i32 2} From 30e7101ad7a1302a1fe45cabdccee1193b669f94 Mon Sep 17 00:00:00 2001 From: "Joel E. 
Denny" Date: Thu, 29 May 2025 10:31:34 -0400 Subject: [PATCH 114/114] KernelInfo.rst: Drop unnecessary -Xarch_device --- llvm/docs/KernelInfo.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/KernelInfo.rst b/llvm/docs/KernelInfo.rst index e0e3e694ab16a..071f28898c73d 100644 --- a/llvm/docs/KernelInfo.rst +++ b/llvm/docs/KernelInfo.rst @@ -94,7 +94,7 @@ instructions to execute at run time, and we expect a load and store for ``x``: } $ clang -O1 -g -fopenmp --offload-arch=native test.c -o test \ - -fprofile-generate -Xarch_device -fprofile-update=atomic + -fprofile-generate -fprofile-update=atomic $ LLVM_PROFILE_FILE=test.profraw ./test 2 4 16.000000 @@ -112,5 +112,5 @@ instructions to execute at run time, and we expect a load and store for ``x``: test.c:4:0: in function 'test', ProfileFloatingPointOpCount = 4 test.c:4:0: in function 'test', ProfileFloatingPointBytesMoved = 0 -While ``-Xarch_device -fprofile-update=atomic`` is not required for the simple -example above, it can be critical while profiling parallel code. +While ``-fprofile-update=atomic`` is not required for the simple example above, +it can be critical while profiling parallel code.