12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562 |
- //===- PartialInlining.cpp - Inline parts of functions --------------------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This pass performs partial inlining, typically by inlining an if statement
- // that surrounds the body of the function.
- //
- //===----------------------------------------------------------------------===//
- #include "llvm/Transforms/IPO/PartialInlining.h"
- #include "llvm/ADT/DenseMap.h"
- #include "llvm/ADT/DenseSet.h"
- #include "llvm/ADT/None.h"
- #include "llvm/ADT/Optional.h"
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/Statistic.h"
- #include "llvm/Analysis/BlockFrequencyInfo.h"
- #include "llvm/Analysis/BranchProbabilityInfo.h"
- #include "llvm/Analysis/InlineCost.h"
- #include "llvm/Analysis/LoopInfo.h"
- #include "llvm/Analysis/OptimizationRemarkEmitter.h"
- #include "llvm/Analysis/ProfileSummaryInfo.h"
- #include "llvm/Analysis/TargetLibraryInfo.h"
- #include "llvm/Analysis/TargetTransformInfo.h"
- #include "llvm/IR/Attributes.h"
- #include "llvm/IR/BasicBlock.h"
- #include "llvm/IR/CFG.h"
- #include "llvm/IR/DebugLoc.h"
- #include "llvm/IR/DiagnosticInfo.h"
- #include "llvm/IR/Dominators.h"
- #include "llvm/IR/Function.h"
- #include "llvm/IR/InstrTypes.h"
- #include "llvm/IR/Instruction.h"
- #include "llvm/IR/Instructions.h"
- #include "llvm/IR/IntrinsicInst.h"
- #include "llvm/IR/Intrinsics.h"
- #include "llvm/IR/Module.h"
- #include "llvm/IR/User.h"
- #include "llvm/InitializePasses.h"
- #include "llvm/Pass.h"
- #include "llvm/Support/BlockFrequency.h"
- #include "llvm/Support/BranchProbability.h"
- #include "llvm/Support/Casting.h"
- #include "llvm/Support/CommandLine.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "llvm/Transforms/IPO.h"
- #include "llvm/Transforms/Utils/Cloning.h"
- #include "llvm/Transforms/Utils/CodeExtractor.h"
- #include "llvm/Transforms/Utils/ValueMapper.h"
- #include <algorithm>
- #include <cassert>
- #include <cstdint>
- #include <functional>
- #include <iterator>
- #include <memory>
- #include <tuple>
- #include <vector>
- using namespace llvm;
- #define DEBUG_TYPE "partial-inlining"
- STATISTIC(NumPartialInlined,
- "Number of callsites functions partially inlined into.");
- STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
- "cold outlined regions were partially "
- "inlined into its caller(s).");
- STATISTIC(NumColdRegionsFound,
- "Number of cold single entry/exit regions found.");
- STATISTIC(NumColdRegionsOutlined,
- "Number of cold single entry/exit regions outlined.");
- // Command line option to disable partial-inlining. The default is false:
- static cl::opt<bool>
- DisablePartialInlining("disable-partial-inlining", cl::init(false),
- cl::Hidden, cl::desc("Disable partial inlining"));
- // Command line option to disable multi-region partial-inlining. The default is
- // false:
- static cl::opt<bool> DisableMultiRegionPartialInline(
- "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
- cl::desc("Disable multi-region partial inlining"));
- // Command line option to force outlining in regions with live exit variables.
- // The default is false:
- static cl::opt<bool>
- ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
- cl::desc("Force outline regions with live exits"));
- // Command line option to enable marking outline functions with Cold Calling
- // Convention. The default is false:
- static cl::opt<bool>
- MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
- cl::desc("Mark outline function calls with ColdCC"));
- // This is an option used by testing:
- static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
- cl::init(false), cl::ZeroOrMore,
- cl::ReallyHidden,
- cl::desc("Skip Cost Analysis"));
- // Used to determine if a cold region is worth outlining based on
- // its inlining cost compared to the original function. Default is set at 10%.
- // ie. if the cold region reduces the inlining cost of the original function by
- // at least 10%.
- static cl::opt<float> MinRegionSizeRatio(
- "min-region-size-ratio", cl::init(0.1), cl::Hidden,
- cl::desc("Minimum ratio comparing relative sizes of each "
- "outline candidate and original function"));
- // Used to tune the minimum number of execution counts needed in the predecessor
- // block to the cold edge. ie. confidence interval.
- static cl::opt<unsigned>
- MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
- cl::desc("Minimum block executions to consider "
- "its BranchProbabilityInfo valid"));
- // Used to determine when an edge is considered cold. Default is set to 10%. ie.
- // if the branch probability is 10% or less, then it is deemed as 'cold'.
- static cl::opt<float> ColdBranchRatio(
- "cold-branch-ratio", cl::init(0.1), cl::Hidden,
- cl::desc("Minimum BranchProbability to consider a region cold."));
- static cl::opt<unsigned> MaxNumInlineBlocks(
- "max-num-inline-blocks", cl::init(5), cl::Hidden,
- cl::desc("Max number of blocks to be partially inlined"));
- // Command line option to set the maximum number of partial inlining allowed
- // for the module. The default value of -1 means no limit.
- static cl::opt<int> MaxNumPartialInlining(
- "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of partial inlining. The default is unlimited"));
- // Used only when PGO or user annotated branch data is absent. It is
- // the least value that is used to weigh the outline region. If BFI
- // produces larger value, the BFI value will be used.
- static cl::opt<int>
- OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("Relative frequency of outline region to "
- "the entry block"));
- static cl::opt<unsigned> ExtraOutliningPenalty(
- "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
- cl::desc("A debug option to add additional penalty to the computed one."));
- namespace {
- struct FunctionOutliningInfo {
- FunctionOutliningInfo() = default;
- // Returns the number of blocks to be inlined including all blocks
- // in Entries and one return block.
- unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
- // A set of blocks including the function entry that guard
- // the region to be outlined.
- SmallVector<BasicBlock *, 4> Entries;
- // The return block that is not included in the outlined region.
- BasicBlock *ReturnBlock = nullptr;
- // The dominating block of the region to be outlined.
- BasicBlock *NonReturnBlock = nullptr;
- // The set of blocks in Entries that that are predecessors to ReturnBlock
- SmallVector<BasicBlock *, 4> ReturnBlockPreds;
- };
- struct FunctionOutliningMultiRegionInfo {
- FunctionOutliningMultiRegionInfo() {}
- // Container for outline regions
- struct OutlineRegionInfo {
- OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
- BasicBlock *EntryBlock, BasicBlock *ExitBlock,
- BasicBlock *ReturnBlock)
- : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
- ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
- SmallVector<BasicBlock *, 8> Region;
- BasicBlock *EntryBlock;
- BasicBlock *ExitBlock;
- BasicBlock *ReturnBlock;
- };
- SmallVector<OutlineRegionInfo, 4> ORI;
- };
- struct PartialInlinerImpl {
- PartialInlinerImpl(
- function_ref<AssumptionCache &(Function &)> GetAC,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GTTI,
- function_ref<const TargetLibraryInfo &(Function &)> GTLI,
- ProfileSummaryInfo &ProfSI,
- function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
- : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
- GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
- bool run(Module &M);
- // Main part of the transformation that calls helper functions to find
- // outlining candidates, clone & outline the function, and attempt to
- // partially inline the resulting function. Returns true if
- // inlining was successful, false otherwise. Also returns the outline
- // function (only if we partially inlined early returns) as there is a
- // possibility to further "peel" early return statements that were left in the
- // outline function due to code size.
- std::pair<bool, Function *> unswitchFunction(Function &F);
- // This class speculatively clones the function to be partial inlined.
- // At the end of partial inlining, the remaining callsites to the cloned
- // function that are not partially inlined will be fixed up to reference
- // the original function, and the cloned function will be erased.
- struct FunctionCloner {
- // Two constructors, one for single region outlining, the other for
- // multi-region outlining.
- FunctionCloner(Function *F, FunctionOutliningInfo *OI,
- OptimizationRemarkEmitter &ORE,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GetTTI);
- FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
- OptimizationRemarkEmitter &ORE,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GetTTI);
- ~FunctionCloner();
- // Prepare for function outlining: making sure there is only
- // one incoming edge from the extracted/outlined region to
- // the return block.
- void normalizeReturnBlock() const;
- // Do function outlining for cold regions.
- bool doMultiRegionFunctionOutlining();
- // Do function outlining for region after early return block(s).
- // NOTE: For vararg functions that do the vararg handling in the outlined
- // function, we temporarily generate IR that does not properly
- // forward varargs to the outlined function. Calling InlineFunction
- // will update calls to the outlined functions to properly forward
- // the varargs.
- Function *doSingleRegionFunctionOutlining();
- Function *OrigFunc = nullptr;
- Function *ClonedFunc = nullptr;
- typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
- // Keep track of Outlined Functions and the basic block they're called from.
- SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
- // ClonedFunc is inlined in one of its callers after function
- // outlining.
- bool IsFunctionInlined = false;
- // The cost of the region to be outlined.
- InstructionCost OutlinedRegionCost = 0;
- // ClonedOI is specific to outlining non-early return blocks.
- std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
- // ClonedOMRI is specific to outlining cold regions.
- std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
- std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
- OptimizationRemarkEmitter &ORE;
- function_ref<AssumptionCache *(Function &)> LookupAC;
- function_ref<TargetTransformInfo &(Function &)> GetTTI;
- };
- private:
- int NumPartialInlining = 0;
- function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
- function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
- function_ref<TargetTransformInfo &(Function &)> GetTTI;
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
- ProfileSummaryInfo &PSI;
- // Return the frequency of the OutlininingBB relative to F's entry point.
- // The result is no larger than 1 and is represented using BP.
- // (Note that the outlined region's 'head' block can only have incoming
- // edges from the guarding entry blocks).
- BranchProbability
- getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
- // Return true if the callee of CB should be partially inlined with
- // profit.
- bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost,
- OptimizationRemarkEmitter &ORE) const;
- // Try to inline DuplicateFunction (cloned from F with call to
- // the OutlinedFunction into its callers. Return true
- // if there is any successful inlining.
- bool tryPartialInline(FunctionCloner &Cloner);
- // Compute the mapping from use site of DuplicationFunction to the enclosing
- // BB's profile count.
- void
- computeCallsiteToProfCountMap(Function *DuplicateFunction,
- DenseMap<User *, uint64_t> &SiteCountMap) const;
- bool isLimitReached() const {
- return (MaxNumPartialInlining != -1 &&
- NumPartialInlining >= MaxNumPartialInlining);
- }
- static CallBase *getSupportedCallBase(User *U) {
- if (isa<CallInst>(U) || isa<InvokeInst>(U))
- return cast<CallBase>(U);
- llvm_unreachable("All uses must be calls");
- return nullptr;
- }
- static CallBase *getOneCallSiteTo(Function &F) {
- User *User = *F.user_begin();
- return getSupportedCallBase(User);
- }
- std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
- CallBase *CB = getOneCallSiteTo(F);
- DebugLoc DLoc = CB->getDebugLoc();
- BasicBlock *Block = CB->getParent();
- return std::make_tuple(DLoc, Block);
- }
- // Returns the costs associated with function outlining:
- // - The first value is the non-weighted runtime cost for making the call
- // to the outlined function, including the addtional setup cost in the
- // outlined function itself;
- // - The second value is the estimated size of the new call sequence in
- // basic block Cloner.OutliningCallBB;
- std::tuple<InstructionCost, InstructionCost>
- computeOutliningCosts(FunctionCloner &Cloner) const;
- // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
- // approximate both the size and runtime cost (Note that in the current
- // inline cost analysis, there is no clear distinction there either).
- static InstructionCost computeBBInlineCost(BasicBlock *BB,
- TargetTransformInfo *TTI);
- std::unique_ptr<FunctionOutliningInfo>
- computeOutliningInfo(Function &F) const;
- std::unique_ptr<FunctionOutliningMultiRegionInfo>
- computeOutliningColdRegionsInfo(Function &F,
- OptimizationRemarkEmitter &ORE) const;
- };
- struct PartialInlinerLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- PartialInlinerLegacyPass() : ModulePass(ID) {
- initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
- TargetTransformInfoWrapperPass *TTIWP =
- &getAnalysis<TargetTransformInfoWrapperPass>();
- ProfileSummaryInfo &PSI =
- getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
- auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
- return ACT->lookupAssumptionCache(F);
- };
- auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- };
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
- GetTLI, PSI)
- .run(M);
- }
- };
- } // end anonymous namespace
- std::unique_ptr<FunctionOutliningMultiRegionInfo>
- PartialInlinerImpl::computeOutliningColdRegionsInfo(
- Function &F, OptimizationRemarkEmitter &ORE) const {
- BasicBlock *EntryBlock = &F.front();
- DominatorTree DT(F);
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(F, LI);
- std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
- BlockFrequencyInfo *BFI;
- if (!GetBFI) {
- ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
- BFI = ScopedBFI.get();
- } else
- BFI = &(GetBFI(F));
- // Return if we don't have profiling information.
- if (!PSI.hasInstrumentationProfile())
- return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
- std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
- std::make_unique<FunctionOutliningMultiRegionInfo>();
- auto IsSingleExit =
- [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
- BasicBlock *ExitBlock = nullptr;
- for (auto *Block : BlockList) {
- for (BasicBlock *Succ : successors(Block)) {
- if (!is_contained(BlockList, Succ)) {
- if (ExitBlock) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
- &Succ->front())
- << "Region dominated by "
- << ore::NV("Block", BlockList.front()->getName())
- << " has more than one region exit edge.";
- });
- return nullptr;
- }
- ExitBlock = Block;
- }
- }
- }
- return ExitBlock;
- };
- auto BBProfileCount = [BFI](BasicBlock *BB) {
- return BFI->getBlockProfileCount(BB).getValueOr(0);
- };
- // Use the same computeBBInlineCost function to compute the cost savings of
- // the outlining the candidate region.
- TargetTransformInfo *FTTI = &GetTTI(F);
- InstructionCost OverallFunctionCost = 0;
- for (auto &BB : F)
- OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
- LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
- << "\n";);
- InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
- [&](auto Cost) { return Cost * MinRegionSizeRatio; });
- BranchProbability MinBranchProbability(
- static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
- MinBlockCounterExecution);
- bool ColdCandidateFound = false;
- BasicBlock *CurrEntry = EntryBlock;
- std::vector<BasicBlock *> DFS;
- DenseMap<BasicBlock *, bool> VisitedMap;
- DFS.push_back(CurrEntry);
- VisitedMap[CurrEntry] = true;
- // Use Depth First Search on the basic blocks to find CFG edges that are
- // considered cold.
- // Cold regions considered must also have its inline cost compared to the
- // overall inline cost of the original function. The region is outlined only
- // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
- // more.
- while (!DFS.empty()) {
- auto *ThisBB = DFS.back();
- DFS.pop_back();
- // Only consider regions with predecessor blocks that are considered
- // not-cold (default: part of the top 99.99% of all block counters)
- // AND greater than our minimum block execution count (default: 100).
- if (PSI.isColdBlock(ThisBB, BFI) ||
- BBProfileCount(ThisBB) < MinBlockCounterExecution)
- continue;
- for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
- if (VisitedMap[*SI])
- continue;
- VisitedMap[*SI] = true;
- DFS.push_back(*SI);
- // If branch isn't cold, we skip to the next one.
- BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
- if (SuccProb > MinBranchProbability)
- continue;
- LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
- << SI->getName()
- << "\nBranch Probability = " << SuccProb << "\n";);
- SmallVector<BasicBlock *, 8> DominateVector;
- DT.getDescendants(*SI, DominateVector);
- assert(!DominateVector.empty() &&
- "SI should be reachable and have at least itself as descendant");
- // We can only outline single entry regions (for now).
- if (!DominateVector.front()->hasNPredecessors(1)) {
- LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
- << " doesn't have a single predecessor in the "
- "dominator tree\n";);
- continue;
- }
- BasicBlock *ExitBlock = nullptr;
- // We can only outline single exit regions (for now).
- if (!(ExitBlock = IsSingleExit(DominateVector))) {
- LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
- << " doesn't have a unique successor\n";);
- continue;
- }
- InstructionCost OutlineRegionCost = 0;
- for (auto *BB : DominateVector)
- OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
- LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
- << "\n";);
- if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
- &SI->front())
- << ore::NV("Callee", &F)
- << " inline cost-savings smaller than "
- << ore::NV("Cost", MinOutlineRegionCost);
- });
- LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
- << MinOutlineRegionCost << "\n";);
- continue;
- }
- // For now, ignore blocks that belong to a SISE region that is a
- // candidate for outlining. In the future, we may want to look
- // at inner regions because the outer region may have live-exit
- // variables.
- for (auto *BB : DominateVector)
- VisitedMap[BB] = true;
- // ReturnBlock here means the block after the outline call
- BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
- FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
- DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
- OutliningInfo->ORI.push_back(RegInfo);
- LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
- << DominateVector.front()->getName() << "\n";);
- ColdCandidateFound = true;
- NumColdRegionsFound++;
- }
- }
- if (ColdCandidateFound)
- return OutliningInfo;
- return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
- }
- std::unique_ptr<FunctionOutliningInfo>
- PartialInlinerImpl::computeOutliningInfo(Function &F) const {
- BasicBlock *EntryBlock = &F.front();
- BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
- if (!BR || BR->isUnconditional())
- return std::unique_ptr<FunctionOutliningInfo>();
- // Returns true if Succ is BB's successor
- auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
- return is_contained(successors(BB), Succ);
- };
- auto IsReturnBlock = [](BasicBlock *BB) {
- Instruction *TI = BB->getTerminator();
- return isa<ReturnInst>(TI);
- };
- auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
- if (IsReturnBlock(Succ1))
- return std::make_tuple(Succ1, Succ2);
- if (IsReturnBlock(Succ2))
- return std::make_tuple(Succ2, Succ1);
- return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
- };
- // Detect a triangular shape:
- auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
- if (IsSuccessor(Succ1, Succ2))
- return std::make_tuple(Succ1, Succ2);
- if (IsSuccessor(Succ2, Succ1))
- return std::make_tuple(Succ2, Succ1);
- return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
- };
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- std::make_unique<FunctionOutliningInfo>();
- BasicBlock *CurrEntry = EntryBlock;
- bool CandidateFound = false;
- do {
- // The number of blocks to be inlined has already reached
- // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
- // disables partial inlining for the function.
- if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
- break;
- if (succ_size(CurrEntry) != 2)
- break;
- BasicBlock *Succ1 = *succ_begin(CurrEntry);
- BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
- BasicBlock *ReturnBlock, *NonReturnBlock;
- std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
- if (ReturnBlock) {
- OutliningInfo->Entries.push_back(CurrEntry);
- OutliningInfo->ReturnBlock = ReturnBlock;
- OutliningInfo->NonReturnBlock = NonReturnBlock;
- CandidateFound = true;
- break;
- }
- BasicBlock *CommSucc, *OtherSucc;
- std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
- if (!CommSucc)
- break;
- OutliningInfo->Entries.push_back(CurrEntry);
- CurrEntry = OtherSucc;
- } while (true);
- if (!CandidateFound)
- return std::unique_ptr<FunctionOutliningInfo>();
- // There should not be any successors (not in the entry set) other than
- // {ReturnBlock, NonReturnBlock}
- assert(OutliningInfo->Entries[0] == &F.front() &&
- "Function Entry must be the first in Entries vector");
- DenseSet<BasicBlock *> Entries;
- for (BasicBlock *E : OutliningInfo->Entries)
- Entries.insert(E);
- // Returns true of BB has Predecessor which is not
- // in Entries set.
- auto HasNonEntryPred = [Entries](BasicBlock *BB) {
- for (auto *Pred : predecessors(BB)) {
- if (!Entries.count(Pred))
- return true;
- }
- return false;
- };
- auto CheckAndNormalizeCandidate =
- [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
- for (BasicBlock *E : OutliningInfo->Entries) {
- for (auto *Succ : successors(E)) {
- if (Entries.count(Succ))
- continue;
- if (Succ == OutliningInfo->ReturnBlock)
- OutliningInfo->ReturnBlockPreds.push_back(E);
- else if (Succ != OutliningInfo->NonReturnBlock)
- return false;
- }
- // There should not be any outside incoming edges either:
- if (HasNonEntryPred(E))
- return false;
- }
- return true;
- };
- if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
- return std::unique_ptr<FunctionOutliningInfo>();
- // Now further growing the candidate's inlining region by
- // peeling off dominating blocks from the outlining region:
- while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
- BasicBlock *Cand = OutliningInfo->NonReturnBlock;
- if (succ_size(Cand) != 2)
- break;
- if (HasNonEntryPred(Cand))
- break;
- BasicBlock *Succ1 = *succ_begin(Cand);
- BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
- BasicBlock *ReturnBlock, *NonReturnBlock;
- std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
- if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
- break;
- if (NonReturnBlock->getSinglePredecessor() != Cand)
- break;
- // Now grow and update OutlininigInfo:
- OutliningInfo->Entries.push_back(Cand);
- OutliningInfo->NonReturnBlock = NonReturnBlock;
- OutliningInfo->ReturnBlockPreds.push_back(Cand);
- Entries.insert(Cand);
- }
- return OutliningInfo;
- }
- // Check if there is PGO data or user annotated branch data:
- static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
- if (F.hasProfileData())
- return true;
- // Now check if any of the entry block has MD_prof data:
- for (auto *E : OI.Entries) {
- BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
- if (!BR || BR->isUnconditional())
- continue;
- uint64_t T, F;
- if (BR->extractProfMetadata(T, F))
- return true;
- }
- return false;
- }
- BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
- FunctionCloner &Cloner) const {
- BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
- auto EntryFreq =
- Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
- auto OutliningCallFreq =
- Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
- // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
- // we outlined any regions, so we may encounter situations where the
- // OutliningCallFreq is *slightly* bigger than the EntryFreq.
- if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
- OutliningCallFreq = EntryFreq;
- auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
- OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
- if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
- return OutlineRegionRelFreq;
- // When profile data is not available, we need to be conservative in
- // estimating the overall savings. Static branch prediction can usually
- // guess the branch direction right (taken/non-taken), but the guessed
- // branch probability is usually not biased enough. In case when the
- // outlined region is predicted to be likely, its probability needs
- // to be made higher (more biased) to not under-estimate the cost of
- // function outlining. On the other hand, if the outlined region
- // is predicted to be less likely, the predicted probablity is usually
- // higher than the actual. For instance, the actual probability of the
- // less likely target is only 5%, but the guessed probablity can be
- // 40%. In the latter case, there is no need for further adjustement.
- // FIXME: add an option for this.
- if (OutlineRegionRelFreq < BranchProbability(45, 100))
- return OutlineRegionRelFreq;
- OutlineRegionRelFreq = std::max(
- OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
- return OutlineRegionRelFreq;
- }
- bool PartialInlinerImpl::shouldPartialInline(
- CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
- OptimizationRemarkEmitter &ORE) const {
- using namespace ore;
- Function *Callee = CB.getCalledFunction();
- assert(Callee == Cloner.ClonedFunc);
- if (SkipCostAnalysis)
- return isInlineViable(*Callee).isSuccess();
- Function *Caller = CB.getCaller();
- auto &CalleeTTI = GetTTI(*Callee);
- bool RemarksEnabled =
- Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
- DEBUG_TYPE);
- InlineCost IC =
- getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
- GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
- if (IC.isAlways()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
- << NV("Callee", Cloner.OrigFunc)
- << " should always be fully inlined, not partially";
- });
- return false;
- }
- if (IC.isNever()) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller)
- << " because it should never be inlined (cost=never)";
- });
- return false;
- }
- if (!IC) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller) << " because too costly to inline (cost="
- << NV("Cost", IC.getCost()) << ", threshold="
- << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
- });
- return false;
- }
- const DataLayout &DL = Caller->getParent()->getDataLayout();
- // The savings of eliminating the call:
- int NonWeightedSavings = getCallsiteCost(CB, DL);
- BlockFrequency NormWeightedSavings(NonWeightedSavings);
- // Weighted saving is smaller than weighted cost, return false
- if (NormWeightedSavings < WeightedOutliningRcost) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
- &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller) << " runtime overhead (overhead="
- << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
- << ", savings="
- << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
- << ")"
- << " of making the outlined call is too high";
- });
- return false;
- }
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
- << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
- << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
- << " (threshold="
- << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
- });
- return true;
- }
- // TODO: Ideally we should share Inliner's InlineCost Analysis code.
- // For now use a simplified version. The returned 'InlineCost' will be used
- // to esimate the size cost as well as runtime cost of the BB.
- InstructionCost
- PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
- TargetTransformInfo *TTI) {
- InstructionCost InlineCost = 0;
- const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- // Skip free instructions.
- switch (I.getOpcode()) {
- case Instruction::BitCast:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::Alloca:
- case Instruction::PHI:
- continue;
- case Instruction::GetElementPtr:
- if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
- continue;
- break;
- default:
- break;
- }
- if (I.isLifetimeStartOrEnd())
- continue;
- if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
- Intrinsic::ID IID = II->getIntrinsicID();
- SmallVector<Type *, 4> Tys;
- FastMathFlags FMF;
- for (Value *Val : II->args())
- Tys.push_back(Val->getType());
- if (auto *FPMO = dyn_cast<FPMathOperator>(II))
- FMF = FPMO->getFastMathFlags();
- IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
- InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
- continue;
- }
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- InlineCost += getCallsiteCost(*CI, DL);
- continue;
- }
- if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- InlineCost += getCallsiteCost(*II, DL);
- continue;
- }
- if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
- InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
- continue;
- }
- InlineCost += InlineConstants::InstrCost;
- }
- return InlineCost;
- }
- std::tuple<InstructionCost, InstructionCost>
- PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
- InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
- for (auto FuncBBPair : Cloner.OutlinedFunctions) {
- Function *OutlinedFunc = FuncBBPair.first;
- BasicBlock* OutliningCallBB = FuncBBPair.second;
- // Now compute the cost of the call sequence to the outlined function
- // 'OutlinedFunction' in BB 'OutliningCallBB':
- auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
- OutliningFuncCallCost +=
- computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
- // Now compute the cost of the extracted/outlined function itself:
- for (BasicBlock &BB : *OutlinedFunc)
- OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
- }
- assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
- "Outlined function cost should be no less than the outlined region");
- // The code extractor introduces a new root and exit stub blocks with
- // additional unconditional branches. Those branches will be eliminated
- // later with bb layout. The cost should be adjusted accordingly:
- OutlinedFunctionCost -=
- 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
- InstructionCost OutliningRuntimeOverhead =
- OutliningFuncCallCost +
- (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
- ExtraOutliningPenalty.getValue();
- return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
- }
- // Create the callsite to profile count map which is
- // used to update the original function's entry count,
- // after the function is partially inlined into the callsite.
- void PartialInlinerImpl::computeCallsiteToProfCountMap(
- Function *DuplicateFunction,
- DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
- std::vector<User *> Users(DuplicateFunction->user_begin(),
- DuplicateFunction->user_end());
- Function *CurrentCaller = nullptr;
- std::unique_ptr<BlockFrequencyInfo> TempBFI;
- BlockFrequencyInfo *CurrentCallerBFI = nullptr;
- auto ComputeCurrBFI = [&,this](Function *Caller) {
- // For the old pass manager:
- if (!GetBFI) {
- DominatorTree DT(*Caller);
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*Caller, LI);
- TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
- CurrentCallerBFI = TempBFI.get();
- } else {
- // New pass manager:
- CurrentCallerBFI = &(GetBFI(*Caller));
- }
- };
- for (User *User : Users) {
- // Don't bother with BlockAddress used by CallBr for asm goto.
- if (isa<BlockAddress>(User))
- continue;
- CallBase *CB = getSupportedCallBase(User);
- Function *Caller = CB->getCaller();
- if (CurrentCaller != Caller) {
- CurrentCaller = Caller;
- ComputeCurrBFI(Caller);
- } else {
- assert(CurrentCallerBFI && "CallerBFI is not set");
- }
- BasicBlock *CallBB = CB->getParent();
- auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
- if (Count)
- CallSiteToProfCountMap[User] = *Count;
- else
- CallSiteToProfCountMap[User] = 0;
- }
- }
- PartialInlinerImpl::FunctionCloner::FunctionCloner(
- Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GetTTI)
- : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
- ClonedOI = std::make_unique<FunctionOutliningInfo>();
- // Clone the function, so that we can hack away on it.
- ValueToValueMapTy VMap;
- ClonedFunc = CloneFunction(F, VMap);
- ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
- ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
- for (BasicBlock *BB : OI->Entries)
- ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
- for (BasicBlock *E : OI->ReturnBlockPreds) {
- BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
- ClonedOI->ReturnBlockPreds.push_back(NewE);
- }
- // Go ahead and update all uses to the duplicate, so that we can just
- // use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(ClonedFunc);
- }
- PartialInlinerImpl::FunctionCloner::FunctionCloner(
- Function *F, FunctionOutliningMultiRegionInfo *OI,
- OptimizationRemarkEmitter &ORE,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GetTTI)
- : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
- ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
- // Clone the function, so that we can hack away on it.
- ValueToValueMapTy VMap;
- ClonedFunc = CloneFunction(F, VMap);
- // Go through all Outline Candidate Regions and update all BasicBlock
- // information.
- for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
- OI->ORI) {
- SmallVector<BasicBlock *, 8> Region;
- for (BasicBlock *BB : RegionInfo.Region)
- Region.push_back(cast<BasicBlock>(VMap[BB]));
- BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
- BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
- BasicBlock *NewReturnBlock = nullptr;
- if (RegionInfo.ReturnBlock)
- NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
- FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
- Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
- ClonedOMRI->ORI.push_back(MappedRegionInfo);
- }
- // Go ahead and update all uses to the duplicate, so that we can just
- // use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(ClonedFunc);
- }
- void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
- auto GetFirstPHI = [](BasicBlock *BB) {
- BasicBlock::iterator I = BB->begin();
- PHINode *FirstPhi = nullptr;
- while (I != BB->end()) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi)
- break;
- if (!FirstPhi) {
- FirstPhi = Phi;
- break;
- }
- }
- return FirstPhi;
- };
- // Shouldn't need to normalize PHIs if we're not outlining non-early return
- // blocks.
- if (!ClonedOI)
- return;
- // Special hackery is needed with PHI nodes that have inputs from more than
- // one extracted block. For simplicity, just split the PHIs into a two-level
- // sequence of PHIs, some of which will go in the extracted region, and some
- // of which will go outside.
- BasicBlock *PreReturn = ClonedOI->ReturnBlock;
- // only split block when necessary:
- PHINode *FirstPhi = GetFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
- if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
- return;
- auto IsTrivialPhi = [](PHINode *PN) -> Value * {
- Value *CommonValue = PN->getIncomingValue(0);
- if (all_of(PN->incoming_values(),
- [&](Value *V) { return V == CommonValue; }))
- return CommonValue;
- return nullptr;
- };
- ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
- ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
- BasicBlock::iterator I = PreReturn->begin();
- Instruction *Ins = &ClonedOI->ReturnBlock->front();
- SmallVector<Instruction *, 4> DeadPhis;
- while (I != PreReturn->end()) {
- PHINode *OldPhi = dyn_cast<PHINode>(I);
- if (!OldPhi)
- break;
- PHINode *RetPhi =
- PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
- OldPhi->replaceAllUsesWith(RetPhi);
- Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
- RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
- RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
- OldPhi->removeIncomingValue(E);
- }
- // After incoming values splitting, the old phi may become trivial.
- // Keeping the trivial phi can introduce definition inside the outline
- // region which is live-out, causing necessary overhead (load, store
- // arg passing etc).
- if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
- OldPhi->replaceAllUsesWith(OldPhiVal);
- DeadPhis.push_back(OldPhi);
- }
- ++I;
- }
- for (auto *DP : DeadPhis)
- DP->eraseFromParent();
- for (auto *E : ClonedOI->ReturnBlockPreds)
- E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
- }
- bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
- auto ComputeRegionCost =
- [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
- InstructionCost Cost = 0;
- for (BasicBlock* BB : Region)
- Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
- return Cost;
- };
- assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
- if (ClonedOMRI->ORI.empty())
- return false;
- // The CodeExtractor needs a dominator tree.
- DominatorTree DT;
- DT.recalculate(*ClonedFunc);
- // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*ClonedFunc, LI);
- ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
- // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
- CodeExtractorAnalysisCache CEAC(*ClonedFunc);
- SetVector<Value *> Inputs, Outputs, Sinks;
- for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
- ClonedOMRI->ORI) {
- InstructionCost CurrentOutlinedRegionCost =
- ComputeRegionCost(RegionInfo.Region);
- CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
- ClonedFuncBFI.get(), &BPI,
- LookupAC(*RegionInfo.EntryBlock->getParent()),
- /* AllowVarargs */ false);
- CE.findInputsOutputs(Inputs, Outputs, Sinks);
- LLVM_DEBUG({
- dbgs() << "inputs: " << Inputs.size() << "\n";
- dbgs() << "outputs: " << Outputs.size() << "\n";
- for (Value *value : Inputs)
- dbgs() << "value used in func: " << *value << "\n";
- for (Value *output : Outputs)
- dbgs() << "instr used in func: " << *output << "\n";
- });
- // Do not extract regions that have live exit variables.
- if (Outputs.size() > 0 && !ForceLiveExit)
- continue;
- if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
- CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
- BasicBlock *OutliningCallBB = OCS->getParent();
- assert(OutliningCallBB->getParent() == ClonedFunc);
- OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
- NumColdRegionsOutlined++;
- OutlinedRegionCost += CurrentOutlinedRegionCost;
- if (MarkOutlinedColdCC) {
- OutlinedFunc->setCallingConv(CallingConv::Cold);
- OCS->setCallingConv(CallingConv::Cold);
- }
- } else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &RegionInfo.Region.front()->front())
- << "Failed to extract region at block "
- << ore::NV("Block", RegionInfo.Region.front());
- });
- }
- return !OutlinedFunctions.empty();
- }
- Function *
- PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
- // Returns true if the block is to be partial inlined into the caller
- // (i.e. not to be extracted to the out of line function)
- auto ToBeInlined = [&, this](BasicBlock *BB) {
- return BB == ClonedOI->ReturnBlock ||
- llvm::is_contained(ClonedOI->Entries, BB);
- };
- assert(ClonedOI && "Expecting OutlineInfo for single region outline");
- // The CodeExtractor needs a dominator tree.
- DominatorTree DT;
- DT.recalculate(*ClonedFunc);
- // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*ClonedFunc, LI);
- ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
- // Gather up the blocks that we're going to extract.
- std::vector<BasicBlock *> ToExtract;
- auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
- ToExtract.push_back(ClonedOI->NonReturnBlock);
- OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
- ClonedOI->NonReturnBlock, ClonedFuncTTI);
- for (BasicBlock &BB : *ClonedFunc)
- if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
- ToExtract.push_back(&BB);
- // FIXME: the code extractor may hoist/sink more code
- // into the outlined function which may make the outlining
- // overhead (the difference of the outlined function cost
- // and OutliningRegionCost) look larger.
- OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
- }
- // Extract the body of the if.
- CodeExtractorAnalysisCache CEAC(*ClonedFunc);
- Function *OutlinedFunc =
- CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
- ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
- /* AllowVarargs */ true)
- .extractCodeRegion(CEAC);
- if (OutlinedFunc) {
- BasicBlock *OutliningCallBB =
- PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
- assert(OutliningCallBB->getParent() == ClonedFunc);
- OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
- } else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &ToExtract.front()->front())
- << "Failed to extract region at block "
- << ore::NV("Block", ToExtract.front());
- });
- return OutlinedFunc;
- }
- PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- ClonedFunc->replaceAllUsesWith(OrigFunc);
- ClonedFunc->eraseFromParent();
- if (!IsFunctionInlined) {
- // Remove each function that was speculatively created if there is no
- // reference.
- for (auto FuncBBPair : OutlinedFunctions) {
- Function *Func = FuncBBPair.first;
- Func->eraseFromParent();
- }
- }
- }
- std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
- if (F.hasAddressTaken())
- return {false, nullptr};
- // Let inliner handle it
- if (F.hasFnAttribute(Attribute::AlwaysInline))
- return {false, nullptr};
- if (F.hasFnAttribute(Attribute::NoInline))
- return {false, nullptr};
- if (PSI.isFunctionEntryCold(&F))
- return {false, nullptr};
- if (F.users().empty())
- return {false, nullptr};
- OptimizationRemarkEmitter ORE(&F);
- // Only try to outline cold regions if we have a profile summary, which
- // implies we have profiling information.
- if (PSI.hasProfileSummary() && F.hasProfileData() &&
- !DisableMultiRegionPartialInline) {
- std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
- computeOutliningColdRegionsInfo(F, ORE);
- if (OMRI) {
- FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
- LLVM_DEBUG({
- dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
- dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
- << "\n";
- });
- bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
- if (DidOutline) {
- LLVM_DEBUG({
- dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
- Cloner.ClonedFunc->print(dbgs());
- dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
- });
- if (tryPartialInline(Cloner))
- return {true, nullptr};
- }
- }
- }
- // Fall-thru to regular partial inlining if we:
- // i) can't find any cold regions to outline, or
- // ii) can't inline the outlined function anywhere.
- std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OI)
- return {false, nullptr};
- FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
- Cloner.normalizeReturnBlock();
- Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
- if (!OutlinedFunction)
- return {false, nullptr};
- if (tryPartialInline(Cloner))
- return {true, OutlinedFunction};
- return {false, nullptr};
- }
- bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
- if (Cloner.OutlinedFunctions.empty())
- return false;
- int SizeCost = 0;
- BlockFrequency WeightedRcost;
- int NonWeightedRcost;
- auto OutliningCosts = computeOutliningCosts(Cloner);
- assert(std::get<0>(OutliningCosts).isValid() &&
- std::get<1>(OutliningCosts).isValid() && "Expected valid costs");
- SizeCost = *std::get<0>(OutliningCosts).getValue();
- NonWeightedRcost = *std::get<1>(OutliningCosts).getValue();
- // Only calculate RelativeToEntryFreq when we are doing single region
- // outlining.
- BranchProbability RelativeToEntryFreq;
- if (Cloner.ClonedOI)
- RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
- else
- // RelativeToEntryFreq doesn't make sense when we have more than one
- // outlined call because each call will have a different relative frequency
- // to the entry block. We can consider using the average, but the
- // usefulness of that information is questionable. For now, assume we never
- // execute the calls to outlined functions.
- RelativeToEntryFreq = BranchProbability(0, 1);
- WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
- // The call sequence(s) to the outlined function(s) are larger than the sum of
- // the original outlined region size(s), it does not increase the chances of
- // inlining the function with outlining (The inliner uses the size increase to
- // model the cost of inlining a callee).
- if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
- OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
- DebugLoc DLoc;
- BasicBlock *Block;
- std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
- OrigFuncORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
- DLoc, Block)
- << ore::NV("Function", Cloner.OrigFunc)
- << " not partially inlined into callers (Original Size = "
- << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
- << ", Size of call sequence to outlined function = "
- << ore::NV("NewSize", SizeCost) << ")";
- });
- return false;
- }
- assert(Cloner.OrigFunc->users().empty() &&
- "F's users should all be replaced!");
- std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
- Cloner.ClonedFunc->user_end());
- DenseMap<User *, uint64_t> CallSiteToProfCountMap;
- auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
- if (CalleeEntryCount)
- computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
- uint64_t CalleeEntryCountV =
- (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
- bool AnyInline = false;
- for (User *User : Users) {
- // Don't bother with BlockAddress used by CallBr for asm goto.
- if (isa<BlockAddress>(User))
- continue;
- CallBase *CB = getSupportedCallBase(User);
- if (isLimitReached())
- continue;
- OptimizationRemarkEmitter CallerORE(CB->getCaller());
- if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
- continue;
- // Construct remark before doing the inlining, as after successful inlining
- // the callsite is removed.
- OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
- OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
- << ore::NV("Caller", CB->getCaller());
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
- // We can only forward varargs when we outlined a single region, else we
- // bail on vararg functions.
- if (!InlineFunction(*CB, IFI, nullptr, true,
- (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
- : nullptr))
- .isSuccess())
- continue;
- CallerORE.emit(OR);
- // Now update the entry count:
- if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
- uint64_t CallSiteCount = CallSiteToProfCountMap[User];
- CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
- }
- AnyInline = true;
- NumPartialInlining++;
- // Update the stats
- if (Cloner.ClonedOI)
- NumPartialInlined++;
- else
- NumColdOutlinePartialInlined++;
- }
- if (AnyInline) {
- Cloner.IsFunctionInlined = true;
- if (CalleeEntryCount)
- Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
- CalleeEntryCountV, CalleeEntryCount->getType()));
- OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
- OrigFuncORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
- << "Partially inlined into at least one caller";
- });
- }
- return AnyInline;
- }
- bool PartialInlinerImpl::run(Module &M) {
- if (DisablePartialInlining)
- return false;
- std::vector<Function *> Worklist;
- Worklist.reserve(M.size());
- for (Function &F : M)
- if (!F.use_empty() && !F.isDeclaration())
- Worklist.push_back(&F);
- bool Changed = false;
- while (!Worklist.empty()) {
- Function *CurrFunc = Worklist.back();
- Worklist.pop_back();
- if (CurrFunc->use_empty())
- continue;
- bool Recursive = false;
- for (User *U : CurrFunc->users())
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->getParent()->getParent() == CurrFunc) {
- Recursive = true;
- break;
- }
- if (Recursive)
- continue;
- std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
- if (Result.second)
- Worklist.push_back(Result.second);
- Changed |= Result.first;
- }
- return Changed;
- }
- char PartialInlinerLegacyPass::ID = 0;
- INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
- "Partial Inliner", false, false)
- INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
- INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
- INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
- "Partial Inliner", false, false)
- ModulePass *llvm::createPartialInliningPass() {
- return new PartialInlinerLegacyPass();
- }
- PreservedAnalyses PartialInlinerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
- auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
- return FAM.getCachedResult<AssumptionAnalysis>(F);
- };
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
- if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
- GetTLI, PSI, GetBFI)
- .run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
- }
|