diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index a0fb32f67e385..41909a8fc1d59 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -307,12 +307,12 @@ enum GlobalValueSummarySymtabCodes { // [valueid, n x stackidindex] FS_PERMODULE_CALLSITE_INFO = 26, // Summary of per-module allocation memprof metadata. - // [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex), + // [nummib, nummib x (alloc type, context radix tree index), // [nummib x (numcontext x total size)]?] FS_PERMODULE_ALLOC_INFO = 27, // Summary of combined index memprof callsite metadata. - // [valueid, numstackindices, numver, - // numstackindices x stackidindex, numver x version] + // [valueid, context radix tree index, numver, + // numver x version] FS_COMBINED_CALLSITE_INFO = 28, // Summary of combined index allocation memprof metadata. // [nummib, numver, @@ -331,6 +331,10 @@ enum GlobalValueSummarySymtabCodes { // the entries must be in the exact same order as the corresponding sizes. // [nummib x (numcontext x full stack id)] FS_ALLOC_CONTEXT_IDS = 31, + // Linearized radix tree of allocation contexts. See the description above the + // CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format. + // [n x entry] + FS_CONTEXT_RADIX_TREE_ARRAY = 32, }; enum MetadataCodes { diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index 8f79ccdb9ff75..032c0de3c7a00 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -329,6 +329,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO) STRINGIFY_CODE(FS, STACK_IDS) STRINGIFY_CODE(FS, ALLOC_CONTEXT_IDS) + STRINGIFY_CODE(FS, CONTEXT_RADIX_TREE_ARRAY) } case bitc::METADATA_ATTACHMENT_ID: switch (CodeID) { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 3e6abacac2726..11fbe6e6158ee 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -987,6 +987,10 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase { /// ids from the lists in the callsite and alloc entries to the index. std::vector StackIds; + /// Linearized radix tree of allocation contexts. See the description above + /// the CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format. + std::vector RadixArray; + public: ModuleSummaryIndexBitcodeReader( BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex, @@ -1013,6 +1017,8 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase { TypeIdCompatibleVtableInfo &TypeId); std::vector parseParamAccesses(ArrayRef Record); + SmallVector parseAllocInfoContext(ArrayRef Record, + unsigned &I); template std::pair @@ -7544,6 +7550,48 @@ void ModuleSummaryIndexBitcodeReader::parseTypeIdCompatibleVtableSummaryRecord( parseTypeIdCompatibleVtableInfo(Record, Slot, TypeId); } +SmallVector ModuleSummaryIndexBitcodeReader::parseAllocInfoContext( + ArrayRef Record, unsigned &I) { + SmallVector StackIdList; + // For backwards compatibility with old format before radix tree was + // used, simply see if we found a radix tree array record (and thus if + // the RadixArray is non-empty). + if (RadixArray.empty()) { + unsigned NumStackEntries = Record[I++]; + assert(Record.size() - I >= NumStackEntries); + StackIdList.reserve(NumStackEntries); + for (unsigned J = 0; J < NumStackEntries; J++) { + assert(Record[I] < StackIds.size()); + StackIdList.push_back( + TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]])); + } + } else { + unsigned RadixIndex = Record[I++]; + // See the comments above CallStackRadixTreeBuilder in ProfileData/MemProf.h + // for a detailed description of the radix tree array format. Briefly, the + // first entry will be the number of frames, any negative values are the + // negative of the offset of the next frame, and otherwise the frames are in + // increasing linear order. + assert(RadixIndex < RadixArray.size()); + unsigned NumStackIds = RadixArray[RadixIndex++]; + StackIdList.reserve(NumStackIds); + while (NumStackIds--) { + assert(RadixIndex < RadixArray.size()); + unsigned Elem = RadixArray[RadixIndex]; + if (static_cast>(Elem) < 0) { + RadixIndex = RadixIndex - Elem; + assert(RadixIndex < RadixArray.size()); + Elem = RadixArray[RadixIndex]; + // We shouldn't encounter a second offset in a row. + assert(static_cast>(Elem) >= 0); + } + RadixIndex++; + StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[Elem])); + } + } + return StackIdList; +} + static void setSpecialRefs(SmallVectorImpl &Refs, unsigned ROCnt, unsigned WOCnt) { // Readonly and writeonly refs are in the end of the refs list. @@ -8010,6 +8058,11 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { break; } + case bitc::FS_CONTEXT_RADIX_TREE_ARRAY: { // [n x entry] + RadixArray = ArrayRef(Record); + break; + } + case bitc::FS_PERMODULE_CALLSITE_INFO: { unsigned ValueID = Record[0]; SmallVector StackIdList; @@ -8065,14 +8118,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { (Version < 10 && I < Record.size())) { assert(Record.size() - I >= 2); AllocationType AllocType = (AllocationType)Record[I++]; - unsigned NumStackEntries = Record[I++]; - assert(Record.size() - I >= NumStackEntries); - SmallVector StackIdList; - for (unsigned J = 0; J < NumStackEntries; J++) { - assert(Record[I] < StackIds.size()); - StackIdList.push_back( - TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]])); - } + auto StackIdList = parseAllocInfoContext(Record, I); MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList))); } // We either have nothing left or at least NumMIBs context size info @@ -8123,14 +8169,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { while (MIBsRead++ < NumMIBs) { assert(Record.size() - I >= 2); AllocationType AllocType = (AllocationType)Record[I++]; - unsigned NumStackEntries = Record[I++]; - assert(Record.size() - I >= NumStackEntries); - SmallVector StackIdList; - for (unsigned J = 0; J < NumStackEntries; J++) { - assert(Record[I] < StackIds.size()); - StackIdList.push_back( - TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]])); - } + auto StackIdList = parseAllocInfoContext(Record, I); MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList))); } assert(Record.size() - I >= NumVersions); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 59e070a511062..8f22a50a5e024 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -60,6 +60,7 @@ #include "llvm/MC/StringTableBuilder.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRSymtab.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -83,6 +84,7 @@ #include using namespace llvm; +using namespace llvm::memprof; static cl::opt IndexThreshold("bitcode-mdindex-threshold", cl::Hidden, cl::init(25), @@ -231,7 +233,8 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase { SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev, unsigned AllocAbbrev, unsigned ContextIdAbbvId, - const Function &F); + const Function &F, DenseMap &CallStackPos, + CallStackId &CallStackCount); void writeModuleLevelReferences(const GlobalVariable &V, SmallVector &NameVals, unsigned FSModRefsAbbrev, @@ -4195,12 +4198,58 @@ static void writeTypeIdCompatibleVtableSummaryRecord( } } +// Adds the allocation contexts to the CallStacks map. We simply use the +// size at the time the context was added as the CallStackId. This works because +// when we look up the call stacks later on we process the function summaries +// and their allocation records in the same exact order. +static void collectMemProfCallStacks( + FunctionSummary *FS, std::function GetStackIndex, + MapVector> &CallStacks) { + // The interfaces in ProfileData/MemProf.h use a type alias for a stack frame + // id offset into the index of the full stack frames. The ModuleSummaryIndex + // currently uses unsigned. Make sure these stay in sync. + static_assert(std::is_same_v); + for (auto &AI : FS->allocs()) { + for (auto &MIB : AI.MIBs) { + SmallVector StackIdIndices; + StackIdIndices.reserve(MIB.StackIdIndices.size()); + for (auto Id : MIB.StackIdIndices) + StackIdIndices.push_back(GetStackIndex(Id)); + // The CallStackId is the size at the time this context was inserted. + CallStacks.insert({CallStacks.size(), StackIdIndices}); + } + } +} + +// Build the radix tree from the accumulated CallStacks, write out the resulting +// linearized radix tree array, and return the map of call stack positions into +// this array for use when writing the allocation records. The returned map is +// indexed by a CallStackId which in this case is implicitly determined by the +// order of function summaries and their allocation infos being written. +static DenseMap writeMemoryProfileRadixTree( + MapVector> &&CallStacks, + BitstreamWriter &Stream, unsigned RadixAbbrev) { + assert(!CallStacks.empty()); + DenseMap FrameHistogram = + computeFrameHistogram(CallStacks); + CallStackRadixTreeBuilder Builder; + // We don't need a MemProfFrameIndexes map as we have already converted the + // full stack id hash to a linear offset into the StackIds array. + Builder.build(std::move(CallStacks), /*MemProfFrameIndexes=*/std::nullopt, + FrameHistogram); + Stream.EmitRecord(bitc::FS_CONTEXT_RADIX_TREE_ARRAY, Builder.getRadixArray(), + RadixAbbrev); + return Builder.takeCallStackPos(); +} + static void writeFunctionHeapProfileRecords( BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev, unsigned AllocAbbrev, unsigned ContextIdAbbvId, bool PerModule, std::function GetValueID, std::function GetStackIndex, - bool WriteContextSizeInfoIndex) { + bool WriteContextSizeInfoIndex, + DenseMap &CallStackPos, + CallStackId &CallStackCount) { SmallVector Record; for (auto &CI : FS->callsites()) { @@ -4234,9 +4283,9 @@ static void writeFunctionHeapProfileRecords( Record.push_back(AI.Versions.size()); for (auto &MIB : AI.MIBs) { Record.push_back((uint8_t)MIB.AllocType); - Record.push_back(MIB.StackIdIndices.size()); - for (auto Id : MIB.StackIdIndices) - Record.push_back(GetStackIndex(Id)); + // Record the index into the radix tree array for this context. + assert(CallStackCount <= CallStackPos.size()); + Record.push_back(CallStackPos[CallStackCount++]); } if (!PerModule) { for (auto V : AI.Versions) @@ -4282,7 +4331,9 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsRelBFAbbrev, unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev, - unsigned AllocAbbrev, unsigned ContextIdAbbvId, const Function &F) { + unsigned AllocAbbrev, unsigned ContextIdAbbvId, const Function &F, + DenseMap &CallStackPos, + CallStackId &CallStackCount) { NameVals.push_back(ValueID); FunctionSummary *FS = cast(Summary); @@ -4297,7 +4348,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( /*PerModule*/ true, /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); }, /*GetStackIndex*/ [&](unsigned I) { return I; }, - /*WriteContextSizeInfoIndex*/ true); + /*WriteContextSizeInfoIndex*/ true, CallStackPos, CallStackCount); auto SpecialRefCnts = FS->specialRefCounts(); NameVals.push_back(getEncodedGVSummaryFlags(FS->flags())); @@ -4530,12 +4581,54 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib - // n x (alloc type, numstackids, numstackids x stackidindex) + // n x (alloc type, context radix tree index) // optional: nummib x (numcontext x total size) Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_RADIX_TREE_ARRAY)); + // n x entry + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned RadixAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + + // First walk through all the functions and collect the allocation contexts in + // their associated summaries, for use in constructing a radix tree of + // contexts. Note that we need to do this in the same order as the functions + // are processed further below since the call stack positions in the resulting + // radix tree array are identified based on this order. + MapVector> CallStacks; + for (const Function &F : M) { + // Summary emission does not support anonymous functions, they have to be + // renamed using the anonymous function renaming pass. + if (!F.hasName()) + report_fatal_error("Unexpected anonymous function when writing summary"); + + ValueInfo VI = Index->getValueInfo(F.getGUID()); + if (!VI || VI.getSummaryList().empty()) { + // Only declarations should not have a summary (a declaration might + // however have a summary if the def was in module level asm). + assert(F.isDeclaration()); + continue; + } + auto *Summary = VI.getSummaryList()[0].get(); + FunctionSummary *FS = cast(Summary); + collectMemProfCallStacks( + FS, /*GetStackIndex*/ [](unsigned I) { return I; }, CallStacks); + } + // Finalize the radix tree, write it out, and get the map of positions in the + // linearized tree array. + DenseMap CallStackPos; + if (!CallStacks.empty()) { + CallStackPos = + writeMemoryProfileRadixTree(std::move(CallStacks), Stream, RadixAbbrev); + } + + // Keep track of the current index into the CallStackPos map. + CallStackId CallStackCount = 0; + SmallVector NameVals; // Iterate over the list of functions instead of the Index to // ensure the ordering is stable. @@ -4555,7 +4648,8 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { auto *Summary = VI.getSummaryList()[0].get(); writePerModuleFunctionSummaryRecord( NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev, - FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId, F); + FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId, F, + CallStackPos, CallStackCount); } // Capture references from GlobalVariable initializers, which are outside @@ -4692,13 +4786,20 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver - // nummib x (alloc type, numstackids, numstackids x stackidindex), + // nummib x (alloc type, context radix tree index), // numver x version // optional: nummib x total size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_RADIX_TREE_ARRAY)); + // n x entry + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned RadixAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + auto shouldImportValueAsDecl = [&](GlobalValueSummary *GVS) -> bool { if (DecSummaries == nullptr) return false; @@ -4735,6 +4836,41 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { NameVals.clear(); }; + // First walk through all the functions and collect the allocation contexts in + // their associated summaries, for use in constructing a radix tree of + // contexts. Note that we need to do this in the same order as the functions + // are processed further below since the call stack positions in the resulting + // radix tree array are identified based on this order. + MapVector> CallStacks; + forEachSummary([&](GVInfo I, bool IsAliasee) { + GlobalValueSummary *S = I.second; + assert(S); + auto *FS = dyn_cast(S); + if (!FS) + return; + collectMemProfCallStacks( + FS, + /*GetStackIndex*/ + [&](unsigned I) { + // Get the corresponding index into the list of StackIds actually + // being written for this combined index (which may be a subset in + // the case of distributed indexes). + assert(StackIdIndicesToIndex.contains(I)); + return StackIdIndicesToIndex[I]; + }, + CallStacks); + }); + // Finalize the radix tree, write it out, and get the map of positions in the + // linearized tree array. + DenseMap CallStackPos; + if (!CallStacks.empty()) { + CallStackPos = + writeMemoryProfileRadixTree(std::move(CallStacks), Stream, RadixAbbrev); + } + + // Keep track of the current index into the CallStackPos map. + CallStackId CallStackCount = 0; + DenseSet DefOrUseGUIDs; forEachSummary([&](GVInfo I, bool IsAliasee) { GlobalValueSummary *S = I.second; @@ -4813,7 +4949,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { assert(StackIdIndicesToIndex.contains(I)); return StackIdIndicesToIndex[I]; }, - /*WriteContextSizeInfoIndex*/ false); + /*WriteContextSizeInfoIndex*/ false, CallStackPos, CallStackCount); NameVals.push_back(*ValueId); assert(ModuleIdMap.count(FS->modulePath())); diff --git a/llvm/lib/Bitcode/Writer/CMakeLists.txt b/llvm/lib/Bitcode/Writer/CMakeLists.txt index 1cc1802bc9aaf..2c508ca9fae95 100644 --- a/llvm/lib/Bitcode/Writer/CMakeLists.txt +++ b/llvm/lib/Bitcode/Writer/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_component_library(LLVMBitWriter Core MC Object + ProfileData Support TargetParser ) diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 9615fdf77eb27..70741ee4850bd 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -510,6 +510,7 @@ void CallStackRadixTreeBuilder::build( // Explicitly instantiate class with the utilized FrameIdTy. template class CallStackRadixTreeBuilder; +template class CallStackRadixTreeBuilder; template llvm::DenseMap @@ -532,6 +533,10 @@ computeFrameHistogram(llvm::MapVector> template llvm::DenseMap computeFrameHistogram( llvm::MapVector> &MemProfCallStackData); +template llvm::DenseMap +computeFrameHistogram( + llvm::MapVector> + &MemProfCallStackData); void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) { for (const auto &AS : Record.AllocSites) { diff --git a/llvm/test/ThinLTO/X86/Inputs/memprof-old-alloc-context-summary.bc b/llvm/test/ThinLTO/X86/Inputs/memprof-old-alloc-context-summary.bc new file mode 100644 index 0000000000000..c98308f4637f8 Binary files /dev/null and b/llvm/test/ThinLTO/X86/Inputs/memprof-old-alloc-context-summary.bc differ diff --git a/llvm/test/ThinLTO/X86/memprof-old-alloc-context-summary.ll b/llvm/test/ThinLTO/X86/memprof-old-alloc-context-summary.ll new file mode 100644 index 0000000000000..20f95617915cc --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-old-alloc-context-summary.ll @@ -0,0 +1,28 @@ +;; Check that we can read the old *_ALLOC_INFO summary format that placed the +;; stack id indexes directly in the alloc info summary, rather than encoding as +;; a separate radix tree. +;; +;; The old bitcode was generated by the older compiler from `opt -thinlto-bc` +;; on the following LLVM assembly: +;; +;; target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +;; target triple = "x86_64-unknown-linux-gnu" +;; +;; define internal ptr @_Z3barv() #0 { +;; entry: +;; %call = call ptr @_Znam(i64 0), !memprof !1, !callsite !6 +;; ret ptr null +;; } +;; +;; declare ptr @_Znam(i64) +;; +;; !1 = !{!2, !4} +;; !2 = !{!3, !"notcold"} +;; !3 = !{i64 9086428284934609951, i64 8632435727821051414} +;; !4 = !{!5, !"cold"} +;; !5 = !{i64 9086428284934609951, i64 2732490490862098848} +;; !6 = !{i64 9086428284934609951} + +; RUN: llvm-dis %S/Inputs/memprof-old-alloc-context-summary.bc -o - | FileCheck %s +; CHECK: stackIds: (8632435727821051414) +; CHECK-SAME: stackIds: (2732490490862098848)