//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::ZeroOrMore,
                     cl::desc("Default amount of inlining to perform"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for each instruction based on inline cost "
             "analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore,
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::ZeroOrMore,
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::ZeroOrMore,
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

namespace {

/// This function behaves more like CallBase::hasFnAttr: when it looks for the
/// requested attribute, it checks both the call instruction and the called
/// function (if it's available and operand bundles don't prohibit that).
Attribute getFnAttr(CallBase &CB, StringRef AttrKind) {
  Attribute CallAttr = CB.getFnAttr(AttrKind);
  if (CallAttr.isValid())
    return CallAttr;

  // Operand bundles override attributes on the called function, but don't
  // override attributes directly present on the call instruction.
  if (!CB.isFnAttrDisallowedByOpBundle(AttrKind))
    if (const Function *F = CB.getCalledFunction())
      return F->getFnAttribute(AttrKind);

  return {};
}

} // namespace

namespace llvm {
Optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  Attribute Attr = getFnAttr(CB, AttrKind);
  int AttrValue;
  if (Attr.getValueAsString().getAsInteger(10, AttrValue))
    return None;
  return AttrValue;
}
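
// Example (the attribute value here is hypothetical): for a call site
// annotated with "function-inline-cost"="42", an attribute name this file
// consumes below, getStringFnAttrAsInt(CB, "function-inline-cost") returns
// 42; a missing or non-integer value yields None.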
} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}

  virtual void emitInstructionAnnot(const Instruction *I,
                                    formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() {}

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.

  /// Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  Optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.find(I) != SimplifiedValues.end())
      return SimplifiedValues[I];
    return None;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
//    f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This will lead to a binary tree where the leaf should be either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number from non-leaf nodes should be:
//    2^(log2(n) - 1) - 1
//      = 2^log2(n) * 2^-1 - 1
//      = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
//    n + n / 2 - 1 = n * 3 / 2 - 1
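// For example, with n = 8 case clusters this estimates 8 * 3 / 2 - 1 = 11
// comparisons: 8 at the leaves plus 8 / 2 - 1 = 3 at the internal nodes.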
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;

  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;

  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  Optional<CostBenefitPair> CostBenefit = None;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = std::min<int>(UpperBound, Cost + Inc);
  }
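
  // Usage sketch (mirrors onFinalizeSwitch below): addCost(JTCost,
  // CostUpperBound) clamps the running Cost at CostUpperBound, so a very
  // large jump-table estimate cannot push the 'int' accumulator past the
  // chosen upper bound.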
  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (Optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (Optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InlineConstants::InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(CallPenalty);
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // The maximum valid cost is increased in this function.
    if (JumpTableSize) {
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InlineConstants::InstrCost +
          4 * InlineConstants::InstrCost;
      addCost(JTCost, static_cast<int64_t>(CostUpperBound));
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost =
        ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
    addCost(SwitchCost, static_cast<int64_t>(CostUpperBound));
  }

  void onMissedSimplification() override {
    addCost(InlineConstants::InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    SROAArgCosts[Arg] = 0;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InlineConstants::InstrCost;
    SROACostSavings += InlineConstants::InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      assert(ProfileCount.hasValue());
      if (ProfileCount.getValue() == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return None if we don't
  // have sufficient profiling information to make a determination.
  Optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return None;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return None;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InlineConstants::InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^24 (or 2^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InlineConstants::InstrCost;
          }
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InlineConstants::InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      assert(ProfileCount.hasValue());
      CurrentSavings *= ProfileCount.getValue();
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount.hasValue() && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(this->CandidateCall, DL);
    CycleSavings *= CallerBFI->getBlockProfileCount(CallerBB).getValue();

    // Remove the cost of the cold basic blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Return true if the savings justify the cost of inlining. Specifically,
    // we evaluate the following inequality:
    //
    //    CycleSavings      PSI->getOrCompHotCountThreshold()
    //    ------------  >=  ----------------------------------
    //        Size               InlineSavingsMultiplier
    //
    // Note that the left hand side is specific to a call site. The right hand
    // side is a constant for the entire executable.
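    //
    // For example (numbers are illustrative, not from a real profile): with
    // InlineSavingsMultiplier at its default of 8 and a hot count threshold
    // of 10000, a call site with Size = 100 is accepted only if
    // CycleSavings * 8 >= 10000 * 100, i.e. each call must save at least
    // 125000 units of estimated instruction cost.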
    APInt LHS = CycleSavings;
    LHS *= InlineSavingsMultiplier;
    APInt RHS(128, PSI->getOrCompHotCountThreshold());
    RHS *= Size;
    return LHS.uge(RHS);
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (Optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (Optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (Optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (Result.getValue())
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InlineConstants::InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = Params.AllowRecursiveCall.getValue();
  }

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  Optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    if (InstructionCostDetailMap.find(I) != InstructionCostDetailMap.end())
      return InstructionCostDetailMap[I];
    return None;
  }

  virtual ~InlineCostCallAnalyzer() {}
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  Optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};
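
// Usage sketch (mirrors the indirect-call boosting in onLoweredCall above;
// the variable names are illustrative): construct the analyzer for a concrete
// callsite, run the analysis, then query the outcome.
//
//   InlineCostCallAnalyzer CA(Callee, Call, Params, TTI, GetAssumptionCache,
//                             GetBFI, PSI, ORE);
//   if (CA.analyze().isSuccess())
//     errs() << "cost " << CA.getCost() << " vs threshold "
//            << CA.getThreshold() << "\n";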

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 4;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;
    increment(InlineCostFeatureIndex::SROALosses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::LoadElimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::CallPenalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::CallArgumentSetup,
              Call.arg_size() * InlineConstants::InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::LoadRelativeIntrinsic,
              3 * InlineConstants::InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::LoweredCallArgSetup,
              Call.arg_size() * InlineConstants::InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::NestedInlineCostEstimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::NestedInlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    if (JumpTableSize) {
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InlineConstants::InstrCost +
          JTCostMultiplier * InlineConstants::InstrCost;
      increment(InlineCostFeatureIndex::JumpTablePenalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::CaseClusterPenalty,
                NumCaseCluster * CaseClusterCostMultiplier *
                    InlineConstants::InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost = ExpectedNumberOfCompare * SwitchCostMultiplier *
                         InlineConstants::InstrCost;
    increment(InlineCostFeatureIndex::SwitchPenalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::UnsimplifiedCommonInstructions,
              InlineConstants::InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override { SROACosts[Arg] = 0; }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InlineConstants::InstrCost;
    SROACostSavingOpportunities += InlineConstants::InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::IsMultipleBlocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::NumLoops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::DeadBlocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::SimplifiedInstructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::ConstantArgs, NumConstantArgs);
    set(InlineCostFeatureIndex::ConstantOffsetPtrArgs,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::SROASavings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::Threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::LoadElimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::CallSiteCost,
              -1 * getCallsiteCost(this->CandidateCall, DL));

    set(InlineCostFeatureIndex::ColdCcPenalty,
        (F.getCallingConv() == CallingConv::Cold));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed. The
  // threshold delta is printed only when it is non-zero, which happens when
  // we decided to give a bonus at a particular instruction.
  Optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    C.getValue()->print(OS, true);
  }
  OS << "\n";
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
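
// Illustrative note (not part of upstream): for %S = type { i32, i64 } with
// the i64 at byte offset 8 and an alloc size of 16 bytes, a GEP such as
//   %p = getelementptr inbounds %S, %S* %b, i64 1, i32 1
// accumulates 1 * 16 + 8 == 24 into Offset, provided both indices are (or
// simplify to) ConstantInts as required above.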

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getUserCost(&GEP, Operands,
                         TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen above
      // a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end up
      // being too pessimistic and prevent inlining non-problematic code. This
      // could result in unintended perf regressions. A better overall strategy
      // is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(),
          DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize =
        SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}
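
// Illustrative note (not part of upstream): given
//   %buf = alloca i32, i32 %n
// where %n simplifies to the constant 4 at this call site, the code above
// books 16 bytes against AllocatedSize and keeps the alloca inlinable;
// HasDynamicAlloca is only set once the running total exceeds
// MaxSimplifiedDynamicAllocaToInline, or when the size stays unknown.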

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
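
// Illustrative note (not part of upstream): given
//   %phi = phi i32* [ %a, %bb1 ], [ %a, %bb2 ]
// where every live incoming value carries the same base-and-offset pair for
// %a, the phi inherits that pair (and %a's SROA candidacy), so later GEPs and
// loads through %phi can still be simplified.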

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
          SmallVector<Constant *, 2> Indices;
          for (unsigned int Index = 1; Index < COps.size(); ++Index)
            Indices.push_back(COps[Index]);
          return ConstantExpr::getGetElementPtr(
              I.getSourceElementType(), COps[0], Indices, I.isInBounds());
        }))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}
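
// Illustrative note (not part of upstream): a typical caller looks like
//   simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
//     return ConstantExpr::getBitCast(COps[0], I.getType());
//   });
// (see visitBitCast below); the lambda only runs once every operand is, or
// simplifies to, a Constant.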

/// Try to simplify a call to llvm.is.constant.
///
/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
/// we expect calls of this specific intrinsic to be infrequent.
///
/// FIXME: Given that we know CB's parent (F) caller
/// (CandidateCall->getParent()->getParent()), we might be able to determine
/// whether inlining F into F's caller would change how the call to
/// llvm.is.constant would evaluate.
bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
  Value *Arg = CB.getArgOperand(0);
  auto *C = dyn_cast<Constant>(Arg);

  if (!C)
    C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));

  Type *RT = CB.getFunctionType()->getReturnType();
  SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
  return true;
}
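
// Illustrative note (not part of upstream): for
//   %r = call i1 @llvm.is.constant.i32(i32 %x)
// the call folds to true when %x is (or has been simplified to) a constant at
// this call site and to false otherwise, and the call itself is treated as
// free either way.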

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
    SROAArgValues[&I] = SROAArg;

  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't explicitly list
  // elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the below scenario inlining hot_call_X() may be
  // beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(Call, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}
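
// Illustrative note (not part of upstream): HotCallSiteRelFreq is a cl::opt
// scale factor; with it set to, say, 60 and a caller entry frequency of 8, a
// callsite block frequency of 480 or more qualifies the callsite as locally
// hot and returns LocallyHotCallSiteThreshold.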

void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  Threshold += TTI.adjustInliningThreshold(&Call);

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneLiveUse() &&
                                    &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}
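
// Illustrative note (not part of upstream): with Threshold == 225 after the
// multiplier, SingleBBBonusPercent == 50 and VectorBonusPercent == 150, the
// lines above yield SingleBBBonus = 112 and VectorBonus = 337 (integer
// division); the analyzer then starts from Threshold plus both bonuses and
// withdraws each bonus when its precondition fails (see the same arithmetic
// in InlineCostFeaturesAnalyzer::onAnalysisStart above).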

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}
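
// Illustrative note (not part of upstream): if %p1 and %p2 were tracked by
// ConstantOffsetPtrs as (%base, 24) and (%base, 8), e.g. via ptrtoint of two
// GEPs off the same argument, then
//   %d = sub i64 %p1, %p2
// folds to the constant 16 and NumConstantPtrDiffs is bumped, which is how
// pointer-difference idioms become free.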

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
                            FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    onCallPenalty();

  return false;
}

bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = SimplifyFNegInst(
      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    onLoadEliminationOpportunity();
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitExtractValue(I);
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitInsertValue(I);
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (!onCallBaseVisitStart(Call))
    return true;

  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  Value *Callee = Call.getCalledOperand();
  Function *F = dyn_cast_or_null<Function>(Callee);
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known function
    // in this inline context. If not, we've done all we can.
    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    if (!F) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
        disableLoadElimination();
      return Base::visitCallBase(Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(Call);
    }
  }

  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (TTI.isLoweredToCall(F)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         isa_and_nonnull<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  unsigned JumpTableSize = 0;
  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);

  onFinalizeSwitch(JumpTableSize, NumCaseCluster);
  return false;
}
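
// Illustrative note (not part of upstream): a non-foldable switch with, say,
// 8 case clusters and no jump table reaches the balanced-binary-tree case of
// onFinalizeSwitch above, charging roughly
// getExpectedNumberOfCompare(8) * SwitchCostMultiplier * InstrCost rather
// than one unit per case.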

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
      TargetTransformInfo::TCC_Free)
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (const Use &Op : I.operands())
    disableSROA(Op);

  return false;
}

/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(&I);

    if (Base::visit(&I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(&I);
    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      auto IR =
          InlineResult::failure("recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}

/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns nullptr if V is not a pointer, and returns the constant '0' if
/// there are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
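///
/// A small worked example (hypothetical, for illustration only): suppose
/// \p CurrBB branches on a condition that simplified to true, so \p NextBB is
/// the taken successor and FalseBB is the other one. If FalseBB's only
/// predecessor is \p CurrBB, the edge CurrBB->FalseBB is dead and FalseBB
/// becomes dead; any block reachable only through FalseBB is then marked dead
/// in turn by the worklist below.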
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

/// Analyze a call site for potential inlining.
///
/// Returns a successful InlineResult if inlining this call is viable, and a
/// failure otherwise. It computes the cost and adjusts the threshold based on
/// numerous factors and heuristics. If this method returns a failure but the
/// computed cost is below the computed threshold, then inlining was forcibly
/// disabled by some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
    ++CAI;
  }
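  // For instance (a hypothetical call, for illustration only): given
  // `foo(i32 42, i32* %p)` where %p traces through inbounds GEPs back to an
  // alloca, the first formal argument maps to the constant 42 in
  // SimplifiedValues and the second becomes an SROA candidate recorded in
  // SROAArgValues.
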
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, we use a small-size optimized SetVector, prioritizing
  // small iterations because we exit after crossing our threshold.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
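    // For example (a hypothetical IR sketch): if the callee contains
    //   @tbl = global i8* blockaddress(@callee, %bb)
    // and later indirectly branches to the address loaded from @tbl, an
    // inlined copy of that branch would target a block in the original
    // @callee rather than in the caller.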
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns a failure, and we can bail out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
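    // For instance (hypothetical): if the caller passes `true` for an i1
    // argument %c and the block ends in `br i1 %c, label %then, label %else`,
    // SimplifiedValues maps %c to that constant, only %then is enqueued, and
    // findDeadBlocks can prune %else if it has no other live predecessors.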
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    onBlockAnalyzed(BB);
  }

  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneLiveUse() &&
                                    &F == CandidateCall.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  return finalizeAnalysis();
}

void InlineCostCallAnalyzer::print(raw_ostream &OS) {
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
  // object, and always returns the same object (which is overwritten on each
  // GetTLI call). Therefore we copy the first result.
  auto CalleeTLI = GetTLI(*Callee);
  return TTI.areInlineCompatible(Caller, Callee) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
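      // Worked example (hypothetical numbers): a 128-byte byval struct on a
      // target with 64-bit pointers gives TypeSize = 1024 and
      // PointerSize = 64, so NumStores = ceil(1024 / 64) = 16, clamped to 8;
      // the argument then contributes 2 * 8 * InstrCost to the cost.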
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}

Optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return None;
  return CA.getCost();
}

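// A minimal caller-side usage sketch (hypothetical; assumes the callbacks are
// already wired up to a pass manager's analyses):
//   if (Optional<int> EstimatedCost = getInliningCostEstimate(
//           CB, CalleeTTI, GetAssumptionCache, GetBFI, &PSI))
//     dbgs() << "estimated cost: " << *EstimatedCost << "\n";
// A None result means the analysis hit an uninlinable pattern.
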
Optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                 ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return None;
  return CFA.features();
}

Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine function is inlined into a caller coroutine
  // function before the coro-split pass, the coro-early pass cannot handle
  // it well. So we don't inline a coroutine function that has not yet been
  // split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return None;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision.hasValue()) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make the cost-benefit-based decision explicit.
  // We use always/never here since the threshold is not meaningful,
  // as it's not what drives the cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the
          // current backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing
          // this correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
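  // For example, running `opt -inline-threshold=500 ...` makes 500 the
  // default threshold regardless of the Threshold argument passed in here.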
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inlinehint-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
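// To make the mapping above concrete: (OptLevel=3, SizeOptLevel=0) yields
// OptAggressiveThreshold, (2, 1) yields OptSizeThreshold, (2, 2) yields
// OptMinSizeThreshold, and plain -O2 (2, 0) falls through to DefaultThreshold.
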
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}

PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
  // In the current implementation, the type of InlineParams doesn't matter as
  // the pass serves only for verification of inliner's decisions.
  // We can add a flag which determines InlineParams for this run. Right now,
  // the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
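
// Usage note (to the best of my knowledge; the registration lives outside
// this file): the annotation printer above is exposed in the new pass manager
// as "print<inline-cost>", so it can be exercised with, e.g.:
//   opt -passes='print<inline-cost>' -disable-output input.ll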