//===- PartialInlining.cpp - Inline parts of functions --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass performs partial inlining, typically by inlining an if statement // that surrounds the body of the function. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "partial-inlining" STATISTIC(NumPartialInlined, "Number of callsites functions partially inlined into."); STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " "cold outlined regions were partially " "inlined into its caller(s)."); STATISTIC(NumColdRegionsFound, "Number of cold single entry/exit regions found."); STATISTIC(NumColdRegionsOutlined, "Number of cold single entry/exit regions outlined."); // Command line option to disable partial-inlining. The default is false: static cl::opt DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining")); // Command line option to disable multi-region partial-inlining. The default is // false: static cl::opt DisableMultiRegionPartialInline( "disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining")); // Command line option to force outlining in regions with live exit variables. // The default is false: static cl::opt ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits")); // Command line option to enable marking outline functions with Cold Calling // Convention. The default is false: static cl::opt MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC")); // This is an option used by testing: static cl::opt SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false), cl::ZeroOrMore, cl::ReallyHidden, cl::desc("Skip Cost Analysis")); // Used to determine if a cold region is worth outlining based on // its inlining cost compared to the original function. Default is set at 10%. // ie. if the cold region reduces the inlining cost of the original function by // at least 10%. static cl::opt MinRegionSizeRatio( "min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function")); // Used to tune the minimum number of execution counts needed in the predecessor // block to the cold edge. ie. confidence interval. static cl::opt MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid")); // Used to determine when an edge is considered cold. Default is set to 10%. ie. // if the branch probability is 10% or less, then it is deemed as 'cold'. static cl::opt ColdBranchRatio( "cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold.")); static cl::opt MaxNumInlineBlocks( "max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined")); // Command line option to set the maximum number of partial inlining allowed // for the module. The default value of -1 means no limit. static cl::opt MaxNumPartialInlining( "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of partial inlining. The default is unlimited")); // Used only when PGO or user annotated branch data is absent. It is // the least value that is used to weigh the outline region. If BFI // produces larger value, the BFI value will be used. static cl::opt OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::ZeroOrMore, cl::desc("Relative frequency of outline region to " "the entry block")); static cl::opt ExtraOutliningPenalty( "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one.")); namespace { struct FunctionOutliningInfo { FunctionOutliningInfo() = default; // Returns the number of blocks to be inlined including all blocks // in Entries and one return block. unsigned getNumInlinedBlocks() const { return Entries.size() + 1; } // A set of blocks including the function entry that guard // the region to be outlined. SmallVector Entries; // The return block that is not included in the outlined region. BasicBlock *ReturnBlock = nullptr; // The dominating block of the region to be outlined. BasicBlock *NonReturnBlock = nullptr; // The set of blocks in Entries that that are predecessors to ReturnBlock SmallVector ReturnBlockPreds; }; struct FunctionOutliningMultiRegionInfo { FunctionOutliningMultiRegionInfo() {} // Container for outline regions struct OutlineRegionInfo { OutlineRegionInfo(ArrayRef Region, BasicBlock *EntryBlock, BasicBlock *ExitBlock, BasicBlock *ReturnBlock) : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} SmallVector Region; BasicBlock *EntryBlock; BasicBlock *ExitBlock; BasicBlock *ReturnBlock; }; SmallVector ORI; }; struct PartialInlinerImpl { PartialInlinerImpl( function_ref GetAC, function_ref LookupAC, function_ref GTTI, function_ref GTLI, ProfileSummaryInfo &ProfSI, function_ref GBFI = nullptr) : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {} bool run(Module &M); // Main part of the transformation that calls helper functions to find // outlining candidates, clone & outline the function, and attempt to // partially inline the resulting function. Returns true if // inlining was successful, false otherwise. Also returns the outline // function (only if we partially inlined early returns) as there is a // possibility to further "peel" early return statements that were left in the // outline function due to code size. std::pair unswitchFunction(Function &F); // This class speculatively clones the function to be partial inlined. // At the end of partial inlining, the remaining callsites to the cloned // function that are not partially inlined will be fixed up to reference // the original function, and the cloned function will be erased. struct FunctionCloner { // Two constructors, one for single region outlining, the other for // multi-region outlining. FunctionCloner(Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, function_ref LookupAC, function_ref GetTTI); FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, OptimizationRemarkEmitter &ORE, function_ref LookupAC, function_ref GetTTI); ~FunctionCloner(); // Prepare for function outlining: making sure there is only // one incoming edge from the extracted/outlined region to // the return block. void normalizeReturnBlock() const; // Do function outlining for cold regions. bool doMultiRegionFunctionOutlining(); // Do function outlining for region after early return block(s). // NOTE: For vararg functions that do the vararg handling in the outlined // function, we temporarily generate IR that does not properly // forward varargs to the outlined function. Calling InlineFunction // will update calls to the outlined functions to properly forward // the varargs. Function *doSingleRegionFunctionOutlining(); Function *OrigFunc = nullptr; Function *ClonedFunc = nullptr; typedef std::pair FuncBodyCallerPair; // Keep track of Outlined Functions and the basic block they're called from. SmallVector OutlinedFunctions; // ClonedFunc is inlined in one of its callers after function // outlining. bool IsFunctionInlined = false; // The cost of the region to be outlined. InstructionCost OutlinedRegionCost = 0; // ClonedOI is specific to outlining non-early return blocks. std::unique_ptr ClonedOI = nullptr; // ClonedOMRI is specific to outlining cold regions. std::unique_ptr ClonedOMRI = nullptr; std::unique_ptr ClonedFuncBFI = nullptr; OptimizationRemarkEmitter &ORE; function_ref LookupAC; function_ref GetTTI; }; private: int NumPartialInlining = 0; function_ref GetAssumptionCache; function_ref LookupAssumptionCache; function_ref GetTTI; function_ref GetBFI; function_ref GetTLI; ProfileSummaryInfo &PSI; // Return the frequency of the OutlininingBB relative to F's entry point. // The result is no larger than 1 and is represented using BP. // (Note that the outlined region's 'head' block can only have incoming // edges from the guarding entry blocks). BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const; // Return true if the callee of CB should be partially inlined with // profit. bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, OptimizationRemarkEmitter &ORE) const; // Try to inline DuplicateFunction (cloned from F with call to // the OutlinedFunction into its callers. Return true // if there is any successful inlining. bool tryPartialInline(FunctionCloner &Cloner); // Compute the mapping from use site of DuplicationFunction to the enclosing // BB's profile count. void computeCallsiteToProfCountMap(Function *DuplicateFunction, DenseMap &SiteCountMap) const; bool isLimitReached() const { return (MaxNumPartialInlining != -1 && NumPartialInlining >= MaxNumPartialInlining); } static CallBase *getSupportedCallBase(User *U) { if (isa(U) || isa(U)) return cast(U); llvm_unreachable("All uses must be calls"); return nullptr; } static CallBase *getOneCallSiteTo(Function &F) { User *User = *F.user_begin(); return getSupportedCallBase(User); } std::tuple getOneDebugLoc(Function &F) const { CallBase *CB = getOneCallSiteTo(F); DebugLoc DLoc = CB->getDebugLoc(); BasicBlock *Block = CB->getParent(); return std::make_tuple(DLoc, Block); } // Returns the costs associated with function outlining: // - The first value is the non-weighted runtime cost for making the call // to the outlined function, including the addtional setup cost in the // outlined function itself; // - The second value is the estimated size of the new call sequence in // basic block Cloner.OutliningCallBB; std::tuple computeOutliningCosts(FunctionCloner &Cloner) const; // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to // approximate both the size and runtime cost (Note that in the current // inline cost analysis, there is no clear distinction there either). static InstructionCost computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI); std::unique_ptr computeOutliningInfo(Function &F) const; std::unique_ptr computeOutliningColdRegionsInfo(Function &F, OptimizationRemarkEmitter &ORE) const; }; struct PartialInlinerLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid PartialInlinerLegacyPass() : ModulePass(ID) { initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; AssumptionCacheTracker *ACT = &getAnalysis(); TargetTransformInfoWrapperPass *TTIWP = &getAnalysis(); ProfileSummaryInfo &PSI = getAnalysis().getPSI(); auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { return ACT->lookupAssumptionCache(F); }; auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & { return TTIWP->getTTI(F); }; auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { return this->getAnalysis().getTLI(F); }; return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, GetTLI, PSI) .run(M); } }; } // end anonymous namespace std::unique_ptr PartialInlinerImpl::computeOutliningColdRegionsInfo( Function &F, OptimizationRemarkEmitter &ORE) const { BasicBlock *EntryBlock = &F.front(); DominatorTree DT(F); LoopInfo LI(DT); BranchProbabilityInfo BPI(F, LI); std::unique_ptr ScopedBFI; BlockFrequencyInfo *BFI; if (!GetBFI) { ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); BFI = ScopedBFI.get(); } else BFI = &(GetBFI(F)); // Return if we don't have profiling information. if (!PSI.hasInstrumentationProfile()) return std::unique_ptr(); std::unique_ptr OutliningInfo = std::make_unique(); auto IsSingleExit = [&ORE](SmallVectorImpl &BlockList) -> BasicBlock * { BasicBlock *ExitBlock = nullptr; for (auto *Block : BlockList) { for (BasicBlock *Succ : successors(Block)) { if (!is_contained(BlockList, Succ)) { if (ExitBlock) { ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", &Succ->front()) << "Region dominated by " << ore::NV("Block", BlockList.front()->getName()) << " has more than one region exit edge."; }); return nullptr; } ExitBlock = Block; } } } return ExitBlock; }; auto BBProfileCount = [BFI](BasicBlock *BB) { return BFI->getBlockProfileCount(BB).getValueOr(0); }; // Use the same computeBBInlineCost function to compute the cost savings of // the outlining the candidate region. TargetTransformInfo *FTTI = &GetTTI(F); InstructionCost OverallFunctionCost = 0; for (auto &BB : F) OverallFunctionCost += computeBBInlineCost(&BB, FTTI); LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";); InstructionCost MinOutlineRegionCost = OverallFunctionCost.map( [&](auto Cost) { return Cost * MinRegionSizeRatio; }); BranchProbability MinBranchProbability( static_cast(ColdBranchRatio * MinBlockCounterExecution), MinBlockCounterExecution); bool ColdCandidateFound = false; BasicBlock *CurrEntry = EntryBlock; std::vector DFS; DenseMap VisitedMap; DFS.push_back(CurrEntry); VisitedMap[CurrEntry] = true; // Use Depth First Search on the basic blocks to find CFG edges that are // considered cold. // Cold regions considered must also have its inline cost compared to the // overall inline cost of the original function. The region is outlined only // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or // more. while (!DFS.empty()) { auto *ThisBB = DFS.back(); DFS.pop_back(); // Only consider regions with predecessor blocks that are considered // not-cold (default: part of the top 99.99% of all block counters) // AND greater than our minimum block execution count (default: 100). if (PSI.isColdBlock(ThisBB, BFI) || BBProfileCount(ThisBB) < MinBlockCounterExecution) continue; for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) { if (VisitedMap[*SI]) continue; VisitedMap[*SI] = true; DFS.push_back(*SI); // If branch isn't cold, we skip to the next one. BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI); if (SuccProb > MinBranchProbability) continue; LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->" << SI->getName() << "\nBranch Probability = " << SuccProb << "\n";); SmallVector DominateVector; DT.getDescendants(*SI, DominateVector); assert(!DominateVector.empty() && "SI should be reachable and have at least itself as descendant"); // We can only outline single entry regions (for now). if (!DominateVector.front()->hasNPredecessors(1)) { LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() << " doesn't have a single predecessor in the " "dominator tree\n";); continue; } BasicBlock *ExitBlock = nullptr; // We can only outline single exit regions (for now). if (!(ExitBlock = IsSingleExit(DominateVector))) { LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() << " doesn't have a unique successor\n";); continue; } InstructionCost OutlineRegionCost = 0; for (auto *BB : DominateVector) OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";); if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) { ORE.emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &SI->front()) << ore::NV("Callee", &F) << " inline cost-savings smaller than " << ore::NV("Cost", MinOutlineRegionCost); }); LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than " << MinOutlineRegionCost << "\n";); continue; } // For now, ignore blocks that belong to a SISE region that is a // candidate for outlining. In the future, we may want to look // at inner regions because the outer region may have live-exit // variables. for (auto *BB : DominateVector) VisitedMap[BB] = true; // ReturnBlock here means the block after the outline call BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); OutliningInfo->ORI.push_back(RegInfo); LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: " << DominateVector.front()->getName() << "\n";); ColdCandidateFound = true; NumColdRegionsFound++; } } if (ColdCandidateFound) return OutliningInfo; return std::unique_ptr(); } std::unique_ptr PartialInlinerImpl::computeOutliningInfo(Function &F) const { BasicBlock *EntryBlock = &F.front(); BranchInst *BR = dyn_cast(EntryBlock->getTerminator()); if (!BR || BR->isUnconditional()) return std::unique_ptr(); // Returns true if Succ is BB's successor auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { return is_contained(successors(BB), Succ); }; auto IsReturnBlock = [](BasicBlock *BB) { Instruction *TI = BB->getTerminator(); return isa(TI); }; auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { if (IsReturnBlock(Succ1)) return std::make_tuple(Succ1, Succ2); if (IsReturnBlock(Succ2)) return std::make_tuple(Succ2, Succ1); return std::make_tuple(nullptr, nullptr); }; // Detect a triangular shape: auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { if (IsSuccessor(Succ1, Succ2)) return std::make_tuple(Succ1, Succ2); if (IsSuccessor(Succ2, Succ1)) return std::make_tuple(Succ2, Succ1); return std::make_tuple(nullptr, nullptr); }; std::unique_ptr OutliningInfo = std::make_unique(); BasicBlock *CurrEntry = EntryBlock; bool CandidateFound = false; do { // The number of blocks to be inlined has already reached // the limit. When MaxNumInlineBlocks is set to 0 or 1, this // disables partial inlining for the function. if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks) break; if (succ_size(CurrEntry) != 2) break; BasicBlock *Succ1 = *succ_begin(CurrEntry); BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); BasicBlock *ReturnBlock, *NonReturnBlock; std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); if (ReturnBlock) { OutliningInfo->Entries.push_back(CurrEntry); OutliningInfo->ReturnBlock = ReturnBlock; OutliningInfo->NonReturnBlock = NonReturnBlock; CandidateFound = true; break; } BasicBlock *CommSucc, *OtherSucc; std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); if (!CommSucc) break; OutliningInfo->Entries.push_back(CurrEntry); CurrEntry = OtherSucc; } while (true); if (!CandidateFound) return std::unique_ptr(); // There should not be any successors (not in the entry set) other than // {ReturnBlock, NonReturnBlock} assert(OutliningInfo->Entries[0] == &F.front() && "Function Entry must be the first in Entries vector"); DenseSet Entries; for (BasicBlock *E : OutliningInfo->Entries) Entries.insert(E); // Returns true of BB has Predecessor which is not // in Entries set. auto HasNonEntryPred = [Entries](BasicBlock *BB) { for (auto *Pred : predecessors(BB)) { if (!Entries.count(Pred)) return true; } return false; }; auto CheckAndNormalizeCandidate = [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { for (BasicBlock *E : OutliningInfo->Entries) { for (auto *Succ : successors(E)) { if (Entries.count(Succ)) continue; if (Succ == OutliningInfo->ReturnBlock) OutliningInfo->ReturnBlockPreds.push_back(E); else if (Succ != OutliningInfo->NonReturnBlock) return false; } // There should not be any outside incoming edges either: if (HasNonEntryPred(E)) return false; } return true; }; if (!CheckAndNormalizeCandidate(OutliningInfo.get())) return std::unique_ptr(); // Now further growing the candidate's inlining region by // peeling off dominating blocks from the outlining region: while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) { BasicBlock *Cand = OutliningInfo->NonReturnBlock; if (succ_size(Cand) != 2) break; if (HasNonEntryPred(Cand)) break; BasicBlock *Succ1 = *succ_begin(Cand); BasicBlock *Succ2 = *(succ_begin(Cand) + 1); BasicBlock *ReturnBlock, *NonReturnBlock; std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) break; if (NonReturnBlock->getSinglePredecessor() != Cand) break; // Now grow and update OutlininigInfo: OutliningInfo->Entries.push_back(Cand); OutliningInfo->NonReturnBlock = NonReturnBlock; OutliningInfo->ReturnBlockPreds.push_back(Cand); Entries.insert(Cand); } return OutliningInfo; } // Check if there is PGO data or user annotated branch data: static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) { if (F.hasProfileData()) return true; // Now check if any of the entry block has MD_prof data: for (auto *E : OI.Entries) { BranchInst *BR = dyn_cast(E->getTerminator()); if (!BR || BR->isUnconditional()) continue; uint64_t T, F; if (BR->extractProfMetadata(T, F)) return true; } return false; } BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( FunctionCloner &Cloner) const { BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; auto EntryFreq = Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); auto OutliningCallFreq = Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE // we outlined any regions, so we may encounter situations where the // OutliningCallFreq is *slightly* bigger than the EntryFreq. if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) OutliningCallFreq = EntryFreq; auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get())) return OutlineRegionRelFreq; // When profile data is not available, we need to be conservative in // estimating the overall savings. Static branch prediction can usually // guess the branch direction right (taken/non-taken), but the guessed // branch probability is usually not biased enough. In case when the // outlined region is predicted to be likely, its probability needs // to be made higher (more biased) to not under-estimate the cost of // function outlining. On the other hand, if the outlined region // is predicted to be less likely, the predicted probablity is usually // higher than the actual. For instance, the actual probability of the // less likely target is only 5%, but the guessed probablity can be // 40%. In the latter case, there is no need for further adjustement. // FIXME: add an option for this. if (OutlineRegionRelFreq < BranchProbability(45, 100)) return OutlineRegionRelFreq; OutlineRegionRelFreq = std::max( OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); return OutlineRegionRelFreq; } bool PartialInlinerImpl::shouldPartialInline( CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, OptimizationRemarkEmitter &ORE) const { using namespace ore; Function *Callee = CB.getCalledFunction(); assert(Callee == Cloner.ClonedFunc); if (SkipCostAnalysis) return isInlineViable(*Callee).isSuccess(); Function *Caller = CB.getCaller(); auto &CalleeTTI = GetTTI(*Callee); bool RemarksEnabled = Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled( DEBUG_TYPE); InlineCost IC = getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache, GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr); if (IC.isAlways()) { ORE.emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB) << NV("Callee", Cloner.OrigFunc) << " should always be fully inlined, not partially"; }); return false; } if (IC.isNever()) { ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB) << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " << NV("Caller", Caller) << " because it should never be inlined (cost=never)"; }); return false; } if (!IC) { ORE.emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB) << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; }); return false; } const DataLayout &DL = Caller->getParent()->getDataLayout(); // The savings of eliminating the call: int NonWeightedSavings = getCallsiteCost(CB, DL); BlockFrequency NormWeightedSavings(NonWeightedSavings); // Weighted saving is smaller than weighted cost, return false if (NormWeightedSavings < WeightedOutliningRcost) { ORE.emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", &CB) << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " << NV("Caller", Caller) << " runtime overhead (overhead=" << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) << ", savings=" << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" << " of making the outlined call is too high"; }); return false; } ORE.emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB) << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) << " (threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; }); return true; } // TODO: Ideally we should share Inliner's InlineCost Analysis code. // For now use a simplified version. The returned 'InlineCost' will be used // to esimate the size cost as well as runtime cost of the BB. InstructionCost PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI) { InstructionCost InlineCost = 0; const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip free instructions. switch (I.getOpcode()) { case Instruction::BitCast: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::Alloca: case Instruction::PHI: continue; case Instruction::GetElementPtr: if (cast(&I)->hasAllZeroIndices()) continue; break; default: break; } if (I.isLifetimeStartOrEnd()) continue; if (auto *II = dyn_cast(&I)) { Intrinsic::ID IID = II->getIntrinsicID(); SmallVector Tys; FastMathFlags FMF; for (Value *Val : II->args()) Tys.push_back(Val->getType()); if (auto *FPMO = dyn_cast(II)) FMF = FPMO->getFastMathFlags(); IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); continue; } if (CallInst *CI = dyn_cast(&I)) { InlineCost += getCallsiteCost(*CI, DL); continue; } if (InvokeInst *II = dyn_cast(&I)) { InlineCost += getCallsiteCost(*II, DL); continue; } if (SwitchInst *SI = dyn_cast(&I)) { InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; continue; } InlineCost += InlineConstants::InstrCost; } return InlineCost; } std::tuple PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; for (auto FuncBBPair : Cloner.OutlinedFunctions) { Function *OutlinedFunc = FuncBBPair.first; BasicBlock* OutliningCallBB = FuncBBPair.second; // Now compute the cost of the call sequence to the outlined function // 'OutlinedFunction' in BB 'OutliningCallBB': auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); // Now compute the cost of the extracted/outlined function itself: for (BasicBlock &BB : *OutlinedFunc) OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); } assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region"); // The code extractor introduces a new root and exit stub blocks with // additional unconditional branches. Those branches will be eliminated // later with bb layout. The cost should be adjusted accordingly: OutlinedFunctionCost -= 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); InstructionCost OutliningRuntimeOverhead = OutliningFuncCallCost + (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + ExtraOutliningPenalty.getValue(); return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); } // Create the callsite to profile count map which is // used to update the original function's entry count, // after the function is partially inlined into the callsite. void PartialInlinerImpl::computeCallsiteToProfCountMap( Function *DuplicateFunction, DenseMap &CallSiteToProfCountMap) const { std::vector Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); Function *CurrentCaller = nullptr; std::unique_ptr TempBFI; BlockFrequencyInfo *CurrentCallerBFI = nullptr; auto ComputeCurrBFI = [&,this](Function *Caller) { // For the old pass manager: if (!GetBFI) { DominatorTree DT(*Caller); LoopInfo LI(DT); BranchProbabilityInfo BPI(*Caller, LI); TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); CurrentCallerBFI = TempBFI.get(); } else { // New pass manager: CurrentCallerBFI = &(GetBFI(*Caller)); } }; for (User *User : Users) { // Don't bother with BlockAddress used by CallBr for asm goto. if (isa(User)) continue; CallBase *CB = getSupportedCallBase(User); Function *Caller = CB->getCaller(); if (CurrentCaller != Caller) { CurrentCaller = Caller; ComputeCurrBFI(Caller); } else { assert(CurrentCallerBFI && "CallerBFI is not set"); } BasicBlock *CallBB = CB->getParent(); auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); if (Count) CallSiteToProfCountMap[User] = *Count; else CallSiteToProfCountMap[User] = 0; } } PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, function_ref LookupAC, function_ref GetTTI) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { ClonedOI = std::make_unique(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; ClonedFunc = CloneFunction(F, VMap); ClonedOI->ReturnBlock = cast(VMap[OI->ReturnBlock]); ClonedOI->NonReturnBlock = cast(VMap[OI->NonReturnBlock]); for (BasicBlock *BB : OI->Entries) ClonedOI->Entries.push_back(cast(VMap[BB])); for (BasicBlock *E : OI->ReturnBlockPreds) { BasicBlock *NewE = cast(VMap[E]); ClonedOI->ReturnBlockPreds.push_back(NewE); } // Go ahead and update all uses to the duplicate, so that we can just // use the inliner functionality when we're done hacking. F->replaceAllUsesWith(ClonedFunc); } PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningMultiRegionInfo *OI, OptimizationRemarkEmitter &ORE, function_ref LookupAC, function_ref GetTTI) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { ClonedOMRI = std::make_unique(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; ClonedFunc = CloneFunction(F, VMap); // Go through all Outline Candidate Regions and update all BasicBlock // information. for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : OI->ORI) { SmallVector Region; for (BasicBlock *BB : RegionInfo.Region) Region.push_back(cast(VMap[BB])); BasicBlock *NewEntryBlock = cast(VMap[RegionInfo.EntryBlock]); BasicBlock *NewExitBlock = cast(VMap[RegionInfo.ExitBlock]); BasicBlock *NewReturnBlock = nullptr; if (RegionInfo.ReturnBlock) NewReturnBlock = cast(VMap[RegionInfo.ReturnBlock]); FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( Region, NewEntryBlock, NewExitBlock, NewReturnBlock); ClonedOMRI->ORI.push_back(MappedRegionInfo); } // Go ahead and update all uses to the duplicate, so that we can just // use the inliner functionality when we're done hacking. F->replaceAllUsesWith(ClonedFunc); } void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { auto GetFirstPHI = [](BasicBlock *BB) { BasicBlock::iterator I = BB->begin(); PHINode *FirstPhi = nullptr; while (I != BB->end()) { PHINode *Phi = dyn_cast(I); if (!Phi) break; if (!FirstPhi) { FirstPhi = Phi; break; } } return FirstPhi; }; // Shouldn't need to normalize PHIs if we're not outlining non-early return // blocks. if (!ClonedOI) return; // Special hackery is needed with PHI nodes that have inputs from more than // one extracted block. For simplicity, just split the PHIs into a two-level // sequence of PHIs, some of which will go in the extracted region, and some // of which will go outside. BasicBlock *PreReturn = ClonedOI->ReturnBlock; // only split block when necessary: PHINode *FirstPhi = GetFirstPHI(PreReturn); unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) return; auto IsTrivialPhi = [](PHINode *PN) -> Value * { Value *CommonValue = PN->getIncomingValue(0); if (all_of(PN->incoming_values(), [&](Value *V) { return V == CommonValue; })) return CommonValue; return nullptr; }; ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = PreReturn->begin(); Instruction *Ins = &ClonedOI->ReturnBlock->front(); SmallVector DeadPhis; while (I != PreReturn->end()) { PHINode *OldPhi = dyn_cast(I); if (!OldPhi) break; PHINode *RetPhi = PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); OldPhi->replaceAllUsesWith(RetPhi); Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); RetPhi->addIncoming(&*I, PreReturn); for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); OldPhi->removeIncomingValue(E); } // After incoming values splitting, the old phi may become trivial. // Keeping the trivial phi can introduce definition inside the outline // region which is live-out, causing necessary overhead (load, store // arg passing etc). if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { OldPhi->replaceAllUsesWith(OldPhiVal); DeadPhis.push_back(OldPhi); } ++I; } for (auto *DP : DeadPhis) DP->eraseFromParent(); for (auto *E : ClonedOI->ReturnBlockPreds) E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); } bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { auto ComputeRegionCost = [&](SmallVectorImpl &Region) -> InstructionCost { InstructionCost Cost = 0; for (BasicBlock* BB : Region) Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); return Cost; }; assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); if (ClonedOMRI->ORI.empty()) return false; // The CodeExtractor needs a dominator tree. DominatorTree DT; DT.recalculate(*ClonedFunc); // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. LoopInfo LI(DT); BranchProbabilityInfo BPI(*ClonedFunc, LI); ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. CodeExtractorAnalysisCache CEAC(*ClonedFunc); SetVector Inputs, Outputs, Sinks; for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : ClonedOMRI->ORI) { InstructionCost CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI, LookupAC(*RegionInfo.EntryBlock->getParent()), /* AllowVarargs */ false); CE.findInputsOutputs(Inputs, Outputs, Sinks); LLVM_DEBUG({ dbgs() << "inputs: " << Inputs.size() << "\n"; dbgs() << "outputs: " << Outputs.size() << "\n"; for (Value *value : Inputs) dbgs() << "value used in func: " << *value << "\n"; for (Value *output : Outputs) dbgs() << "instr used in func: " << *output << "\n"; }); // Do not extract regions that have live exit variables. if (Outputs.size() > 0 && !ForceLiveExit) continue; if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) { CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc); BasicBlock *OutliningCallBB = OCS->getParent(); assert(OutliningCallBB->getParent() == ClonedFunc); OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); NumColdRegionsOutlined++; OutlinedRegionCost += CurrentOutlinedRegionCost; if (MarkOutlinedColdCC) { OutlinedFunc->setCallingConv(CallingConv::Cold); OCS->setCallingConv(CallingConv::Cold); } } else ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", &RegionInfo.Region.front()->front()) << "Failed to extract region at block " << ore::NV("Block", RegionInfo.Region.front()); }); } return !OutlinedFunctions.empty(); } Function * PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // Returns true if the block is to be partial inlined into the caller // (i.e. not to be extracted to the out of line function) auto ToBeInlined = [&, this](BasicBlock *BB) { return BB == ClonedOI->ReturnBlock || llvm::is_contained(ClonedOI->Entries, BB); }; assert(ClonedOI && "Expecting OutlineInfo for single region outline"); // The CodeExtractor needs a dominator tree. DominatorTree DT; DT.recalculate(*ClonedFunc); // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. LoopInfo LI(DT); BranchProbabilityInfo BPI(*ClonedFunc, LI); ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); // Gather up the blocks that we're going to extract. std::vector ToExtract; auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); ToExtract.push_back(ClonedOI->NonReturnBlock); OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( ClonedOI->NonReturnBlock, ClonedFuncTTI); for (BasicBlock &BB : *ClonedFunc) if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { ToExtract.push_back(&BB); // FIXME: the code extractor may hoist/sink more code // into the outlined function which may make the outlining // overhead (the difference of the outlined function cost // and OutliningRegionCost) look larger. OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); } // Extract the body of the if. CodeExtractorAnalysisCache CEAC(*ClonedFunc); Function *OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), /* AllowVarargs */ true) .extractCodeRegion(CEAC); if (OutlinedFunc) { BasicBlock *OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent(); assert(OutliningCallBB->getParent() == ClonedFunc); OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); } else ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", &ToExtract.front()->front()) << "Failed to extract region at block " << ore::NV("Block", ToExtract.front()); }); return OutlinedFunc; } PartialInlinerImpl::FunctionCloner::~FunctionCloner() { // Ditch the duplicate, since we're done with it, and rewrite all remaining // users (function pointers, etc.) back to the original function. ClonedFunc->replaceAllUsesWith(OrigFunc); ClonedFunc->eraseFromParent(); if (!IsFunctionInlined) { // Remove each function that was speculatively created if there is no // reference. for (auto FuncBBPair : OutlinedFunctions) { Function *Func = FuncBBPair.first; Func->eraseFromParent(); } } } std::pair PartialInlinerImpl::unswitchFunction(Function &F) { if (F.hasAddressTaken()) return {false, nullptr}; // Let inliner handle it if (F.hasFnAttribute(Attribute::AlwaysInline)) return {false, nullptr}; if (F.hasFnAttribute(Attribute::NoInline)) return {false, nullptr}; if (PSI.isFunctionEntryCold(&F)) return {false, nullptr}; if (F.users().empty()) return {false, nullptr}; OptimizationRemarkEmitter ORE(&F); // Only try to outline cold regions if we have a profile summary, which // implies we have profiling information. if (PSI.hasProfileSummary() && F.hasProfileData() && !DisableMultiRegionPartialInline) { std::unique_ptr OMRI = computeOutliningColdRegionsInfo(F, ORE); if (OMRI) { FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); LLVM_DEBUG({ dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n"; dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold() << "\n"; }); bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); if (DidOutline) { LLVM_DEBUG({ dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; Cloner.ClonedFunc->print(dbgs()); dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; }); if (tryPartialInline(Cloner)) return {true, nullptr}; } } } // Fall-thru to regular partial inlining if we: // i) can't find any cold regions to outline, or // ii) can't inline the outlined function anywhere. std::unique_ptr OI = computeOutliningInfo(F); if (!OI) return {false, nullptr}; FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI); Cloner.normalizeReturnBlock(); Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); if (!OutlinedFunction) return {false, nullptr}; if (tryPartialInline(Cloner)) return {true, OutlinedFunction}; return {false, nullptr}; } bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { if (Cloner.OutlinedFunctions.empty()) return false; int SizeCost = 0; BlockFrequency WeightedRcost; int NonWeightedRcost; auto OutliningCosts = computeOutliningCosts(Cloner); assert(std::get<0>(OutliningCosts).isValid() && std::get<1>(OutliningCosts).isValid() && "Expected valid costs"); SizeCost = *std::get<0>(OutliningCosts).getValue(); NonWeightedRcost = *std::get<1>(OutliningCosts).getValue(); // Only calculate RelativeToEntryFreq when we are doing single region // outlining. BranchProbability RelativeToEntryFreq; if (Cloner.ClonedOI) RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); else // RelativeToEntryFreq doesn't make sense when we have more than one // outlined call because each call will have a different relative frequency // to the entry block. We can consider using the average, but the // usefulness of that information is questionable. For now, assume we never // execute the calls to outlined functions. RelativeToEntryFreq = BranchProbability(0, 1); WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; // The call sequence(s) to the outlined function(s) are larger than the sum of // the original outlined region size(s), it does not increase the chances of // inlining the function with outlining (The inliner uses the size increase to // model the cost of inlining a callee). if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); DebugLoc DLoc; BasicBlock *Block; std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc); OrigFuncORE.emit([&]() { return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", DLoc, Block) << ore::NV("Function", Cloner.OrigFunc) << " not partially inlined into callers (Original Size = " << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) << ", Size of call sequence to outlined function = " << ore::NV("NewSize", SizeCost) << ")"; }); return false; } assert(Cloner.OrigFunc->users().empty() && "F's users should all be replaced!"); std::vector Users(Cloner.ClonedFunc->user_begin(), Cloner.ClonedFunc->user_end()); DenseMap CallSiteToProfCountMap; auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); if (CalleeEntryCount) computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); uint64_t CalleeEntryCountV = (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); bool AnyInline = false; for (User *User : Users) { // Don't bother with BlockAddress used by CallBr for asm goto. if (isa(User)) continue; CallBase *CB = getSupportedCallBase(User); if (isLimitReached()) continue; OptimizationRemarkEmitter CallerORE(CB->getCaller()); if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE)) continue; // Construct remark before doing the inlining, as after successful inlining // the callsite is removed. OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB); OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " << ore::NV("Caller", CB->getCaller()); InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI); // We can only forward varargs when we outlined a single region, else we // bail on vararg functions. if (!InlineFunction(*CB, IFI, nullptr, true, (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first : nullptr)) .isSuccess()) continue; CallerORE.emit(OR); // Now update the entry count: if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { uint64_t CallSiteCount = CallSiteToProfCountMap[User]; CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); } AnyInline = true; NumPartialInlining++; // Update the stats if (Cloner.ClonedOI) NumPartialInlined++; else NumColdOutlinePartialInlined++; } if (AnyInline) { Cloner.IsFunctionInlined = true; if (CalleeEntryCount) Cloner.OrigFunc->setEntryCount(Function::ProfileCount( CalleeEntryCountV, CalleeEntryCount->getType())); OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); OrigFuncORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) << "Partially inlined into at least one caller"; }); } return AnyInline; } bool PartialInlinerImpl::run(Module &M) { if (DisablePartialInlining) return false; std::vector Worklist; Worklist.reserve(M.size()); for (Function &F : M) if (!F.use_empty() && !F.isDeclaration()) Worklist.push_back(&F); bool Changed = false; while (!Worklist.empty()) { Function *CurrFunc = Worklist.back(); Worklist.pop_back(); if (CurrFunc->use_empty()) continue; bool Recursive = false; for (User *U : CurrFunc->users()) if (Instruction *I = dyn_cast(U)) if (I->getParent()->getParent() == CurrFunc) { Recursive = true; break; } if (Recursive) continue; std::pair Result = unswitchFunction(*CurrFunc); if (Result.second) Worklist.push_back(Result.second); Changed |= Result.first; } return Changed; } char PartialInlinerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) ModulePass *llvm::createPartialInliningPass() { return new PartialInlinerLegacyPass(); } PreservedAnalyses PartialInlinerPass::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { return FAM.getCachedResult(F); }; auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { return FAM.getResult(F); }; auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { return FAM.getResult(F); }; ProfileSummaryInfo &PSI = AM.getResult(M); if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, GetTLI, PSI, GetBFI) .run(M)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); }