//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;
STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds still need to be tuned here, because the size
// used here is based on machine code, not LLVM IR.
extern cl::opt<int> SampleHotCallSiteThreshold;
extern cl::opt<int> SampleColdCallSiteThreshold;
extern cl::opt<int> ProfileInlineGrowthLimit;
extern cl::opt<int> ProfileInlineLimitMin;
extern cl::opt<int> ProfileInlineLimitMax;
extern cl::opt<bool> SortProfiledSCC;

cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5
      // codes as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold callsite thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 3000;
}
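
// Build a top-down processing order over the profiled call graph. LLVM's SCC
// iterator enumerates SCCs in reverse topological order (callees before
// callers), so reversing the collected order yields callers before callees,
// which is what the top-down pre-inline decisions below require.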
std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
  std::vector<StringRef> Order;
  ProfiledCallGraph ProfiledCG(ContextTracker);

  // Now that we have a profiled call graph, construct top-down order
  // by building up SCC and reversing SCC order.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
  std::reverse(Order.begin(), Order.end());

  return Order;
}
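
// Collect inline candidates for a caller: walk the caller's children in the
// context trie, estimate each callee's call count as the larger of the
// callsite target count and the callee's head-sample estimate, and push the
// candidates into the queue. Returns true if at least one candidate was added.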
bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as the
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);
  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in caller's context profile to retrieve call site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
      auto It = TargetCounts.find(CalleeSamples->getName());
      if (It != TargetCounts.end())
        CallsiteCount = It->second;
    }
    // TODO: Callsite count and callee entry count should be mostly consistent;
    // add a check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}
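
// Return the size estimate used as the inline cost for a function in the
// given context: the context-sensitive binary byte size when enabled,
// otherwise the number of sampled body locations as a rough proxy.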
uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

  return ContextNode->getFunctionSamples()->getBodySamples().size();
}
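
// Decide whether a candidate should be pre-inlined: either replay the inline
// decisions observed in the profiled binary, or compare the candidate's size
// cost against a threshold that scales linearly with the callsite's hotness.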
bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return Candidate.CalleeSamples->getContext().hasAttribute(
        ContextWasInlined);

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust the threshold based on normalized hotness, i.e., a value
    // in [0,1]. Use the 10% cutoff instead of the max count as the
    // normalization upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // happen when SampleColdCallSiteThreshold is 0. This is when we do not
    // want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
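    // For example, a callsite at or above the 10% hotness cutoff saturates at
    // NormalizedHotness = 1.0 and gets the full hot threshold, while a
    // callsite just above the cold count threshold gets a threshold close to
    // SampleColdCallSiteThreshold + 1.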
  }

  return (Candidate.SizeCost < SampleThreshold);
}
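
// Pre-inline a single function top-down: seed the candidate queue with its
// callee context profiles, then repeatedly pop the top candidate, inline it
// if shouldInline agrees, grow the running size by the callee's size cost,
// and enqueue that callee's own callees, until the queue is empty or the
// per-function size limit is reached.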
void CSPreInliner::processFunction(const StringRef Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;

    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark the context as inlined since the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? " Inlined" : " Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << " Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << " "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}
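
// Driver for the pre-inliner: walk functions in top-down order making inline
// decisions, then drop context profiles that were not marked inlined (they
// have been merged into their base profiles), so the output only keeps
// contexts that are expected to be inlined by the compiler.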
void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate global top-down inline
  // decisions and merge context profiles accordingly. This helps profile
  // merging for ThinLTO; without it we would not be able to merge profiles
  // back into the base profile across module/thin-backend boundaries.
  // It also helps compress the context profile to control profile size,
  // since we now only need context profiles for functions that are going
  // to be inlined.
  for (StringRef FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that were not inlined have been merged into their base
  // profiles, so we can trim them from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}