BlockFrequencyInfoImpl.h 69 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // Shared implementation of BlockFrequency for IR and Machine Instructions.
  15. // See the documentation below for BlockFrequencyInfoImpl for details.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  19. #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  20. #include "llvm/ADT/BitVector.h"
  21. #include "llvm/ADT/DenseMap.h"
  22. #include "llvm/ADT/DenseSet.h"
  23. #include "llvm/ADT/GraphTraits.h"
  24. #include "llvm/ADT/Optional.h"
  25. #include "llvm/ADT/PostOrderIterator.h"
  26. #include "llvm/ADT/SmallVector.h"
  27. #include "llvm/ADT/SparseBitVector.h"
  28. #include "llvm/ADT/Twine.h"
  29. #include "llvm/ADT/iterator_range.h"
  30. #include "llvm/IR/BasicBlock.h"
  31. #include "llvm/IR/ValueHandle.h"
  32. #include "llvm/Support/BlockFrequency.h"
  33. #include "llvm/Support/BranchProbability.h"
  34. #include "llvm/Support/CommandLine.h"
  35. #include "llvm/Support/DOTGraphTraits.h"
  36. #include "llvm/Support/Debug.h"
  37. #include "llvm/Support/ErrorHandling.h"
  38. #include "llvm/Support/Format.h"
  39. #include "llvm/Support/ScaledNumber.h"
  40. #include "llvm/Support/raw_ostream.h"
  41. #include <algorithm>
  42. #include <cassert>
  43. #include <cstddef>
  44. #include <cstdint>
  45. #include <deque>
  46. #include <iterator>
  47. #include <limits>
  48. #include <list>
  49. #include <queue>
  50. #include <string>
  51. #include <unordered_set>
  52. #include <utility>
  53. #include <vector>
  54. #define DEBUG_TYPE "block-freq"
  55. namespace llvm {
  56. extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries;
  57. extern llvm::cl::opt<bool> UseIterativeBFIInference;
  58. extern llvm::cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock;
  59. extern llvm::cl::opt<double> IterativeBFIPrecision;
  60. class BranchProbabilityInfo;
  61. class Function;
  62. class Loop;
  63. class LoopInfo;
  64. class MachineBasicBlock;
  65. class MachineBranchProbabilityInfo;
  66. class MachineFunction;
  67. class MachineLoop;
  68. class MachineLoopInfo;
  69. namespace bfi_detail {
  70. struct IrreducibleGraph;
  71. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  72. template <class BT> struct BlockEdgesAdder;
  73. /// Mass of a block.
  74. ///
  75. /// This class implements a sort of fixed-point fraction always between 0.0 and
  76. /// 1.0. getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
  77. /// 1.0.
  78. ///
  79. /// Masses can be added and subtracted. Simple saturation arithmetic is used,
  80. /// so arithmetic operations never overflow or underflow.
  81. ///
  82. /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
  83. /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
  84. /// quite, maximum precision).
  85. ///
  86. /// Masses can be scaled by \a BranchProbability at maximum precision.
  87. class BlockMass {
  88. uint64_t Mass = 0;
  89. public:
  90. BlockMass() = default;
  91. explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
  92. static BlockMass getEmpty() { return BlockMass(); }
  93. static BlockMass getFull() {
  94. return BlockMass(std::numeric_limits<uint64_t>::max());
  95. }
  96. uint64_t getMass() const { return Mass; }
  97. bool isFull() const { return Mass == std::numeric_limits<uint64_t>::max(); }
  98. bool isEmpty() const { return !Mass; }
  99. bool operator!() const { return isEmpty(); }
  100. /// Add another mass.
  101. ///
  102. /// Adds another mass, saturating at \a isFull() rather than overflowing.
  103. BlockMass &operator+=(BlockMass X) {
  104. uint64_t Sum = Mass + X.Mass;
  105. Mass = Sum < Mass ? std::numeric_limits<uint64_t>::max() : Sum;
  106. return *this;
  107. }
  108. /// Subtract another mass.
  109. ///
  110. /// Subtracts another mass, saturating at \a isEmpty() rather than
  111. /// undeflowing.
  112. BlockMass &operator-=(BlockMass X) {
  113. uint64_t Diff = Mass - X.Mass;
  114. Mass = Diff > Mass ? 0 : Diff;
  115. return *this;
  116. }
  117. BlockMass &operator*=(BranchProbability P) {
  118. Mass = P.scale(Mass);
  119. return *this;
  120. }
  121. bool operator==(BlockMass X) const { return Mass == X.Mass; }
  122. bool operator!=(BlockMass X) const { return Mass != X.Mass; }
  123. bool operator<=(BlockMass X) const { return Mass <= X.Mass; }
  124. bool operator>=(BlockMass X) const { return Mass >= X.Mass; }
  125. bool operator<(BlockMass X) const { return Mass < X.Mass; }
  126. bool operator>(BlockMass X) const { return Mass > X.Mass; }
  127. /// Convert to scaled number.
  128. ///
  129. /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
  130. /// gives slightly above 0.0.
  131. ScaledNumber<uint64_t> toScaled() const;
  132. void dump() const;
  133. raw_ostream &print(raw_ostream &OS) const;
  134. };
  135. inline BlockMass operator+(BlockMass L, BlockMass R) {
  136. return BlockMass(L) += R;
  137. }
  138. inline BlockMass operator-(BlockMass L, BlockMass R) {
  139. return BlockMass(L) -= R;
  140. }
  141. inline BlockMass operator*(BlockMass L, BranchProbability R) {
  142. return BlockMass(L) *= R;
  143. }
  144. inline BlockMass operator*(BranchProbability L, BlockMass R) {
  145. return BlockMass(R) *= L;
  146. }
  147. inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) {
  148. return X.print(OS);
  149. }
  150. } // end namespace bfi_detail
  151. /// Base class for BlockFrequencyInfoImpl
  152. ///
  153. /// BlockFrequencyInfoImplBase has supporting data structures and some
  154. /// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on
  155. /// the block type (or that call such algorithms) are skipped here.
  156. ///
  157. /// Nevertheless, the majority of the overall algorithm documentation lives with
  158. /// BlockFrequencyInfoImpl. See there for details.
  159. class BlockFrequencyInfoImplBase {
  160. public:
  161. using Scaled64 = ScaledNumber<uint64_t>;
  162. using BlockMass = bfi_detail::BlockMass;
  163. /// Representative of a block.
  164. ///
  165. /// This is a simple wrapper around an index into the reverse-post-order
  166. /// traversal of the blocks.
  167. ///
  168. /// Unlike a block pointer, its order has meaning (location in the
  169. /// topological sort) and it's class is the same regardless of block type.
  170. struct BlockNode {
  171. using IndexType = uint32_t;
  172. IndexType Index;
  173. BlockNode() : Index(std::numeric_limits<uint32_t>::max()) {}
  174. BlockNode(IndexType Index) : Index(Index) {}
  175. bool operator==(const BlockNode &X) const { return Index == X.Index; }
  176. bool operator!=(const BlockNode &X) const { return Index != X.Index; }
  177. bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
  178. bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
  179. bool operator<(const BlockNode &X) const { return Index < X.Index; }
  180. bool operator>(const BlockNode &X) const { return Index > X.Index; }
  181. bool isValid() const { return Index <= getMaxIndex(); }
  182. static size_t getMaxIndex() {
  183. return std::numeric_limits<uint32_t>::max() - 1;
  184. }
  185. };
  186. /// Stats about a block itself.
  187. struct FrequencyData {
  188. Scaled64 Scaled;
  189. uint64_t Integer;
  190. };
  191. /// Data about a loop.
  192. ///
  193. /// Contains the data necessary to represent a loop as a pseudo-node once it's
  194. /// packaged.
  195. struct LoopData {
  196. using ExitMap = SmallVector<std::pair<BlockNode, BlockMass>, 4>;
  197. using NodeList = SmallVector<BlockNode, 4>;
  198. using HeaderMassList = SmallVector<BlockMass, 1>;
  199. LoopData *Parent; ///< The parent loop.
  200. bool IsPackaged = false; ///< Whether this has been packaged.
  201. uint32_t NumHeaders = 1; ///< Number of headers.
  202. ExitMap Exits; ///< Successor edges (and weights).
  203. NodeList Nodes; ///< Header and the members of the loop.
  204. HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
  205. BlockMass Mass;
  206. Scaled64 Scale;
  207. LoopData(LoopData *Parent, const BlockNode &Header)
  208. : Parent(Parent), Nodes(1, Header), BackedgeMass(1) {}
  209. template <class It1, class It2>
  210. LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
  211. It2 LastOther)
  212. : Parent(Parent), Nodes(FirstHeader, LastHeader) {
  213. NumHeaders = Nodes.size();
  214. Nodes.insert(Nodes.end(), FirstOther, LastOther);
  215. BackedgeMass.resize(NumHeaders);
  216. }
  217. bool isHeader(const BlockNode &Node) const {
  218. if (isIrreducible())
  219. return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
  220. Node);
  221. return Node == Nodes[0];
  222. }
  223. BlockNode getHeader() const { return Nodes[0]; }
  224. bool isIrreducible() const { return NumHeaders > 1; }
  225. HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) {
  226. assert(isHeader(B) && "this is only valid on loop header blocks");
  227. if (isIrreducible())
  228. return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
  229. Nodes.begin();
  230. return 0;
  231. }
  232. NodeList::const_iterator members_begin() const {
  233. return Nodes.begin() + NumHeaders;
  234. }
  235. NodeList::const_iterator members_end() const { return Nodes.end(); }
  236. iterator_range<NodeList::const_iterator> members() const {
  237. return make_range(members_begin(), members_end());
  238. }
  239. };
  240. /// Index of loop information.
  241. struct WorkingData {
  242. BlockNode Node; ///< This node.
  243. LoopData *Loop = nullptr; ///< The loop this block is inside.
  244. BlockMass Mass; ///< Mass distribution from the entry block.
  245. WorkingData(const BlockNode &Node) : Node(Node) {}
  246. bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
  247. bool isDoubleLoopHeader() const {
  248. return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
  249. Loop->Parent->isHeader(Node);
  250. }
  251. LoopData *getContainingLoop() const {
  252. if (!isLoopHeader())
  253. return Loop;
  254. if (!isDoubleLoopHeader())
  255. return Loop->Parent;
  256. return Loop->Parent->Parent;
  257. }
  258. /// Resolve a node to its representative.
  259. ///
  260. /// Get the node currently representing Node, which could be a containing
  261. /// loop.
  262. ///
  263. /// This function should only be called when distributing mass. As long as
  264. /// there are no irreducible edges to Node, then it will have complexity
  265. /// O(1) in this context.
  266. ///
  267. /// In general, the complexity is O(L), where L is the number of loop
  268. /// headers Node has been packaged into. Since this method is called in
  269. /// the context of distributing mass, L will be the number of loop headers
  270. /// an early exit edge jumps out of.
  271. BlockNode getResolvedNode() const {
  272. auto L = getPackagedLoop();
  273. return L ? L->getHeader() : Node;
  274. }
  275. LoopData *getPackagedLoop() const {
  276. if (!Loop || !Loop->IsPackaged)
  277. return nullptr;
  278. auto L = Loop;
  279. while (L->Parent && L->Parent->IsPackaged)
  280. L = L->Parent;
  281. return L;
  282. }
  283. /// Get the appropriate mass for a node.
  284. ///
  285. /// Get appropriate mass for Node. If Node is a loop-header (whose loop
  286. /// has been packaged), returns the mass of its pseudo-node. If it's a
  287. /// node inside a packaged loop, it returns the loop's mass.
  288. BlockMass &getMass() {
  289. if (!isAPackage())
  290. return Mass;
  291. if (!isADoublePackage())
  292. return Loop->Mass;
  293. return Loop->Parent->Mass;
  294. }
  295. /// Has ContainingLoop been packaged up?
  296. bool isPackaged() const { return getResolvedNode() != Node; }
  297. /// Has Loop been packaged up?
  298. bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
  299. /// Has Loop been packaged up twice?
  300. bool isADoublePackage() const {
  301. return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
  302. }
  303. };
  304. /// Unscaled probability weight.
  305. ///
  306. /// Probability weight for an edge in the graph (including the
  307. /// successor/target node).
  308. ///
  309. /// All edges in the original function are 32-bit. However, exit edges from
  310. /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
  311. /// space in general.
  312. ///
  313. /// In addition to the raw weight amount, Weight stores the type of the edge
  314. /// in the current context (i.e., the context of the loop being processed).
  315. /// Is this a local edge within the loop, an exit from the loop, or a
  316. /// backedge to the loop header?
  317. struct Weight {
  318. enum DistType { Local, Exit, Backedge };
  319. DistType Type = Local;
  320. BlockNode TargetNode;
  321. uint64_t Amount = 0;
  322. Weight() = default;
  323. Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
  324. : Type(Type), TargetNode(TargetNode), Amount(Amount) {}
  325. };
  326. /// Distribution of unscaled probability weight.
  327. ///
  328. /// Distribution of unscaled probability weight to a set of successors.
  329. ///
  330. /// This class collates the successor edge weights for later processing.
  331. ///
  332. /// \a DidOverflow indicates whether \a Total did overflow while adding to
  333. /// the distribution. It should never overflow twice.
  334. struct Distribution {
  335. using WeightList = SmallVector<Weight, 4>;
  336. WeightList Weights; ///< Individual successor weights.
  337. uint64_t Total = 0; ///< Sum of all weights.
  338. bool DidOverflow = false; ///< Whether \a Total did overflow.
  339. Distribution() = default;
  340. void addLocal(const BlockNode &Node, uint64_t Amount) {
  341. add(Node, Amount, Weight::Local);
  342. }
  343. void addExit(const BlockNode &Node, uint64_t Amount) {
  344. add(Node, Amount, Weight::Exit);
  345. }
  346. void addBackedge(const BlockNode &Node, uint64_t Amount) {
  347. add(Node, Amount, Weight::Backedge);
  348. }
  349. /// Normalize the distribution.
  350. ///
  351. /// Combines multiple edges to the same \a Weight::TargetNode and scales
  352. /// down so that \a Total fits into 32-bits.
  353. ///
  354. /// This is linear in the size of \a Weights. For the vast majority of
  355. /// cases, adjacent edge weights are combined by sorting WeightList and
  356. /// combining adjacent weights. However, for very large edge lists an
  357. /// auxiliary hash table is used.
  358. void normalize();
  359. private:
  360. void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
  361. };
  362. /// Data about each block. This is used downstream.
  363. std::vector<FrequencyData> Freqs;
  364. /// Whether each block is an irreducible loop header.
  365. /// This is used downstream.
  366. SparseBitVector<> IsIrrLoopHeader;
  367. /// Loop data: see initializeLoops().
  368. std::vector<WorkingData> Working;
  369. /// Indexed information about loops.
  370. std::list<LoopData> Loops;
  371. /// Virtual destructor.
  372. ///
  373. /// Need a virtual destructor to mask the compiler warning about
  374. /// getBlockName().
  375. virtual ~BlockFrequencyInfoImplBase() = default;
  376. /// Add all edges out of a packaged loop to the distribution.
  377. ///
  378. /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
  379. /// successor edge.
  380. ///
  381. /// \return \c true unless there's an irreducible backedge.
  382. bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
  383. Distribution &Dist);
  384. /// Add an edge to the distribution.
  385. ///
  386. /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
  387. /// edge is local/exit/backedge is in the context of LoopHead. Otherwise,
  388. /// every edge should be a local edge (since all the loops are packaged up).
  389. ///
  390. /// \return \c true unless aborted due to an irreducible backedge.
  391. bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
  392. const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
  393. /// Analyze irreducible SCCs.
  394. ///
  395. /// Separate irreducible SCCs from \c G, which is an explicit graph of \c
  396. /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
  397. /// Insert them into \a Loops before \c Insert.
  398. ///
  399. /// \return the \c LoopData nodes representing the irreducible SCCs.
  400. iterator_range<std::list<LoopData>::iterator>
  401. analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
  402. std::list<LoopData>::iterator Insert);
  403. /// Update a loop after packaging irreducible SCCs inside of it.
  404. ///
  405. /// Update \c OuterLoop. Before finding irreducible control flow, it was
  406. /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
  407. /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged
  408. /// up need to be removed from \a OuterLoop::Nodes.
  409. void updateLoopWithIrreducible(LoopData &OuterLoop);
  410. /// Distribute mass according to a distribution.
  411. ///
  412. /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
  413. /// backedges and exits are stored in its entry in Loops.
  414. ///
  415. /// Mass is distributed in parallel from two copies of the source mass.
  416. void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
  417. Distribution &Dist);
  418. /// Compute the loop scale for a loop.
  419. void computeLoopScale(LoopData &Loop);
  420. /// Adjust the mass of all headers in an irreducible loop.
  421. ///
  422. /// Initially, irreducible loops are assumed to distribute their mass
  423. /// equally among its headers. This can lead to wrong frequency estimates
  424. /// since some headers may be executed more frequently than others.
  425. ///
  426. /// This adjusts header mass distribution so it matches the weights of
  427. /// the backedges going into each of the loop headers.
  428. void adjustLoopHeaderMass(LoopData &Loop);
  429. void distributeIrrLoopHeaderMass(Distribution &Dist);
  430. /// Package up a loop.
  431. void packageLoop(LoopData &Loop);
  432. /// Unwrap loops.
  433. void unwrapLoops();
  434. /// Finalize frequency metrics.
  435. ///
  436. /// Calculates final frequencies and cleans up no-longer-needed data
  437. /// structures.
  438. void finalizeMetrics();
  439. /// Clear all memory.
  440. void clear();
  441. virtual std::string getBlockName(const BlockNode &Node) const;
  442. std::string getLoopName(const LoopData &Loop) const;
  443. virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
  444. void dump() const { print(dbgs()); }
  445. Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
  446. BlockFrequency getBlockFreq(const BlockNode &Node) const;
  447. Optional<uint64_t> getBlockProfileCount(const Function &F,
  448. const BlockNode &Node,
  449. bool AllowSynthetic = false) const;
  450. Optional<uint64_t> getProfileCountFromFreq(const Function &F,
  451. uint64_t Freq,
  452. bool AllowSynthetic = false) const;
  453. bool isIrrLoopHeader(const BlockNode &Node);
  454. void setBlockFreq(const BlockNode &Node, uint64_t Freq);
  455. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
  456. raw_ostream &printBlockFreq(raw_ostream &OS,
  457. const BlockFrequency &Freq) const;
  458. uint64_t getEntryFreq() const {
  459. assert(!Freqs.empty());
  460. return Freqs[0].Integer;
  461. }
  462. };
  463. namespace bfi_detail {
  464. template <class BlockT> struct TypeMap {};
  465. template <> struct TypeMap<BasicBlock> {
  466. using BlockT = BasicBlock;
  467. using BlockKeyT = AssertingVH<const BasicBlock>;
  468. using FunctionT = Function;
  469. using BranchProbabilityInfoT = BranchProbabilityInfo;
  470. using LoopT = Loop;
  471. using LoopInfoT = LoopInfo;
  472. };
  473. template <> struct TypeMap<MachineBasicBlock> {
  474. using BlockT = MachineBasicBlock;
  475. using BlockKeyT = const MachineBasicBlock *;
  476. using FunctionT = MachineFunction;
  477. using BranchProbabilityInfoT = MachineBranchProbabilityInfo;
  478. using LoopT = MachineLoop;
  479. using LoopInfoT = MachineLoopInfo;
  480. };
  481. template <class BlockT, class BFIImplT>
  482. class BFICallbackVH;
  483. /// Get the name of a MachineBasicBlock.
  484. ///
  485. /// Get the name of a MachineBasicBlock. It's templated so that including from
  486. /// CodeGen is unnecessary (that would be a layering issue).
  487. ///
  488. /// This is used mainly for debug output. The name is similar to
  489. /// MachineBasicBlock::getFullName(), but skips the name of the function.
  490. template <class BlockT> std::string getBlockName(const BlockT *BB) {
  491. assert(BB && "Unexpected nullptr");
  492. auto MachineName = "BB" + Twine(BB->getNumber());
  493. if (BB->getBasicBlock())
  494. return (MachineName + "[" + BB->getName() + "]").str();
  495. return MachineName.str();
  496. }
  497. /// Get the name of a BasicBlock.
  498. template <> inline std::string getBlockName(const BasicBlock *BB) {
  499. assert(BB && "Unexpected nullptr");
  500. return BB->getName().str();
  501. }
  502. /// Graph of irreducible control flow.
  503. ///
  504. /// This graph is used for determining the SCCs in a loop (or top-level
  505. /// function) that has irreducible control flow.
  506. ///
  507. /// During the block frequency algorithm, the local graphs are defined in a
  508. /// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
  509. /// graphs for most edges, but getting others from \a LoopData::ExitMap. The
  510. /// latter only has successor information.
  511. ///
  512. /// \a IrreducibleGraph makes this graph explicit. It's in a form that can use
  513. /// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
  514. /// and it explicitly lists predecessors and successors. The initialization
  515. /// that relies on \c MachineBasicBlock is defined in the header.
  516. struct IrreducibleGraph {
  517. using BFIBase = BlockFrequencyInfoImplBase;
  518. BFIBase &BFI;
  519. using BlockNode = BFIBase::BlockNode;
  520. struct IrrNode {
  521. BlockNode Node;
  522. unsigned NumIn = 0;
  523. std::deque<const IrrNode *> Edges;
  524. IrrNode(const BlockNode &Node) : Node(Node) {}
  525. using iterator = std::deque<const IrrNode *>::const_iterator;
  526. iterator pred_begin() const { return Edges.begin(); }
  527. iterator succ_begin() const { return Edges.begin() + NumIn; }
  528. iterator pred_end() const { return succ_begin(); }
  529. iterator succ_end() const { return Edges.end(); }
  530. };
  531. BlockNode Start;
  532. const IrrNode *StartIrr = nullptr;
  533. std::vector<IrrNode> Nodes;
  534. SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
  535. /// Construct an explicit graph containing irreducible control flow.
  536. ///
  537. /// Construct an explicit graph of the control flow in \c OuterLoop (or the
  538. /// top-level function, if \c OuterLoop is \c nullptr). Uses \c
  539. /// addBlockEdges to add block successors that have not been packaged into
  540. /// loops.
  541. ///
  542. /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
  543. /// user of this.
  544. template <class BlockEdgesAdder>
  545. IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
  546. BlockEdgesAdder addBlockEdges) : BFI(BFI) {
  547. initialize(OuterLoop, addBlockEdges);
  548. }
  549. template <class BlockEdgesAdder>
  550. void initialize(const BFIBase::LoopData *OuterLoop,
  551. BlockEdgesAdder addBlockEdges);
  552. void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
  553. void addNodesInFunction();
  554. void addNode(const BlockNode &Node) {
  555. Nodes.emplace_back(Node);
  556. BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
  557. }
  558. void indexNodes();
  559. template <class BlockEdgesAdder>
  560. void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
  561. BlockEdgesAdder addBlockEdges);
  562. void addEdge(IrrNode &Irr, const BlockNode &Succ,
  563. const BFIBase::LoopData *OuterLoop);
  564. };
  565. template <class BlockEdgesAdder>
  566. void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
  567. BlockEdgesAdder addBlockEdges) {
  568. if (OuterLoop) {
  569. addNodesInLoop(*OuterLoop);
  570. for (auto N : OuterLoop->Nodes)
  571. addEdges(N, OuterLoop, addBlockEdges);
  572. } else {
  573. addNodesInFunction();
  574. for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
  575. addEdges(Index, OuterLoop, addBlockEdges);
  576. }
  577. StartIrr = Lookup[Start.Index];
  578. }
  579. template <class BlockEdgesAdder>
  580. void IrreducibleGraph::addEdges(const BlockNode &Node,
  581. const BFIBase::LoopData *OuterLoop,
  582. BlockEdgesAdder addBlockEdges) {
  583. auto L = Lookup.find(Node.Index);
  584. if (L == Lookup.end())
  585. return;
  586. IrrNode &Irr = *L->second;
  587. const auto &Working = BFI.Working[Node.Index];
  588. if (Working.isAPackage())
  589. for (const auto &I : Working.Loop->Exits)
  590. addEdge(Irr, I.first, OuterLoop);
  591. else
  592. addBlockEdges(*this, Irr, OuterLoop);
  593. }
  594. } // end namespace bfi_detail
  595. /// Shared implementation for block frequency analysis.
  596. ///
  597. /// This is a shared implementation of BlockFrequencyInfo and
  598. /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
  599. /// blocks.
  600. ///
  601. /// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
  602. /// which is called the header. A given loop, L, can have sub-loops, which are
  603. /// loops within the subgraph of L that exclude its header. (A "trivial" SCC
  604. /// consists of a single block that does not have a self-edge.)
  605. ///
  606. /// In addition to loops, this algorithm has limited support for irreducible
  607. /// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are
  608. /// discovered on the fly, and modelled as loops with multiple headers.
  609. ///
  610. /// The headers of irreducible sub-SCCs consist of its entry blocks and all
  611. /// nodes that are targets of a backedge within it (excluding backedges within
  612. /// true sub-loops). Block frequency calculations act as if a block is
  613. /// inserted that intercepts all the edges to the headers. All backedges and
  614. /// entries point to this block. Its successors are the headers, which split
  615. /// the frequency evenly.
  616. ///
  617. /// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
  618. /// separates mass distribution from loop scaling, and dithers to eliminate
  619. /// probability mass loss.
  620. ///
  621. /// The implementation is split between BlockFrequencyInfoImpl, which knows the
  622. /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
  623. /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
  624. /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
  625. /// reverse-post order. This gives two advantages: it's easy to compare the
  626. /// relative ordering of two nodes, and maps keyed on BlockT can be represented
  627. /// by vectors.
  628. ///
  629. /// This algorithm is O(V+E), unless there is irreducible control flow, in
  630. /// which case it's O(V*E) in the worst case.
  631. ///
  632. /// These are the main stages:
  633. ///
  634. /// 0. Reverse post-order traversal (\a initializeRPOT()).
  635. ///
  636. /// Run a single post-order traversal and save it (in reverse) in RPOT.
  637. /// All other stages make use of this ordering. Save a lookup from BlockT
  638. /// to BlockNode (the index into RPOT) in Nodes.
  639. ///
  640. /// 1. Loop initialization (\a initializeLoops()).
  641. ///
  642. /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
  643. /// the algorithm. In particular, store the immediate members of each loop
  644. /// in reverse post-order.
  645. ///
  646. /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
  647. ///
  648. /// For each loop (bottom-up), distribute mass through the DAG resulting
  649. /// from ignoring backedges and treating sub-loops as a single pseudo-node.
  650. /// Track the backedge mass distributed to the loop header, and use it to
  651. /// calculate the loop scale (number of loop iterations). Immediate
  652. /// members that represent sub-loops will already have been visited and
  653. /// packaged into a pseudo-node.
  654. ///
  655. /// Distributing mass in a loop is a reverse-post-order traversal through
  656. /// the loop. Start by assigning full mass to the Loop header. For each
  657. /// node in the loop:
  658. ///
  659. /// - Fetch and categorize the weight distribution for its successors.
  660. /// If this is a packaged-subloop, the weight distribution is stored
  661. /// in \a LoopData::Exits. Otherwise, fetch it from
  662. /// BranchProbabilityInfo.
  663. ///
  664. /// - Each successor is categorized as \a Weight::Local, a local edge
  665. /// within the current loop, \a Weight::Backedge, a backedge to the
  666. /// loop header, or \a Weight::Exit, any successor outside the loop.
  667. /// The weight, the successor, and its category are stored in \a
  668. /// Distribution. There can be multiple edges to each successor.
  669. ///
  670. /// - If there's a backedge to a non-header, there's an irreducible SCC.
  671. /// The usual flow is temporarily aborted. \a
  672. /// computeIrreducibleMass() finds the irreducible SCCs within the
  673. /// loop, packages them up, and restarts the flow.
  674. ///
  675. /// - Normalize the distribution: scale weights down so that their sum
  676. /// is 32-bits, and coalesce multiple edges to the same node.
  677. ///
  678. /// - Distribute the mass accordingly, dithering to minimize mass loss,
  679. /// as described in \a distributeMass().
  680. ///
  681. /// In the case of irreducible loops, instead of a single loop header,
  682. /// there will be several. The computation of backedge masses is similar
  683. /// but instead of having a single backedge mass, there will be one
  684. /// backedge per loop header. In these cases, each backedge will carry
  685. /// a mass proportional to the edge weights along the corresponding
  686. /// path.
  687. ///
  688. /// At the end of propagation, the full mass assigned to the loop will be
  689. /// distributed among the loop headers proportionally according to the
  690. /// mass flowing through their backedges.
  691. ///
  692. /// Finally, calculate the loop scale from the accumulated backedge mass.
  693. ///
  694. /// 3. Distribute mass in the function (\a computeMassInFunction()).
  695. ///
  696. /// Finally, distribute mass through the DAG resulting from packaging all
  697. /// loops in the function. This uses the same algorithm as distributing
  698. /// mass in a loop, except that there are no exit or backedge edges.
  699. ///
  700. /// 4. Unpackage loops (\a unwrapLoops()).
  701. ///
  702. /// Initialize each block's frequency to a floating point representation of
  703. /// its mass.
  704. ///
  705. /// Visit loops top-down, scaling the frequencies of its immediate members
  706. /// by the loop's pseudo-node's frequency.
  707. ///
  708. /// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
  709. ///
  710. /// Using the min and max frequencies as a guide, translate floating point
  711. /// frequencies to an appropriate range in uint64_t.
  712. ///
  713. /// It has some known flaws.
  714. ///
  715. /// - The model of irreducible control flow is a rough approximation.
  716. ///
  717. /// Modelling irreducible control flow exactly involves setting up and
  718. /// solving a group of infinite geometric series. Such precision is
  719. /// unlikely to be worthwhile, since most of our algorithms give up on
  720. /// irreducible control flow anyway.
  721. ///
  722. /// Nevertheless, we might find that we need to get closer. Here's a sort
  723. /// of TODO list for the model with diminishing returns, to be completed as
  724. /// necessary.
  725. ///
  726. /// - The headers for the \a LoopData representing an irreducible SCC
  727. /// include non-entry blocks. When these extra blocks exist, they
  728. /// indicate a self-contained irreducible sub-SCC. We could treat them
  729. /// as sub-loops, rather than arbitrarily shoving the problematic
  730. /// blocks into the headers of the main irreducible SCC.
  731. ///
  732. /// - Entry frequencies are assumed to be evenly split between the
  733. /// headers of a given irreducible SCC, which is the only option if we
  734. /// need to compute mass in the SCC before its parent loop. Instead,
  735. /// we could partially compute mass in the parent loop, and stop when
  736. /// we get to the SCC. Here, we have the correct ratio of entry
  737. /// masses, which we can use to adjust their relative frequencies.
  738. /// Compute mass in the SCC, and then continue propagation in the
  739. /// parent.
  740. ///
  741. /// - We can propagate mass iteratively through the SCC, for some fixed
  742. /// number of iterations. Each iteration starts by assigning the entry
  743. /// blocks their backedge mass from the prior iteration. The final
  744. /// mass for each block (and each exit, and the total backedge mass
  745. /// used for computing loop scale) is the sum of all iterations.
  746. /// (Running this until fixed point would "solve" the geometric
  747. /// series by simulation.)
  748. template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
  749. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  750. friend struct bfi_detail::BlockEdgesAdder<BT>;
  751. using BlockT = typename bfi_detail::TypeMap<BT>::BlockT;
  752. using BlockKeyT = typename bfi_detail::TypeMap<BT>::BlockKeyT;
  753. using FunctionT = typename bfi_detail::TypeMap<BT>::FunctionT;
  754. using BranchProbabilityInfoT =
  755. typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT;
  756. using LoopT = typename bfi_detail::TypeMap<BT>::LoopT;
  757. using LoopInfoT = typename bfi_detail::TypeMap<BT>::LoopInfoT;
  758. using Successor = GraphTraits<const BlockT *>;
  759. using Predecessor = GraphTraits<Inverse<const BlockT *>>;
  760. using BFICallbackVH =
  761. bfi_detail::BFICallbackVH<BlockT, BlockFrequencyInfoImpl>;
  762. const BranchProbabilityInfoT *BPI = nullptr;
  763. const LoopInfoT *LI = nullptr;
  764. const FunctionT *F = nullptr;
  765. // All blocks in reverse postorder.
  766. std::vector<const BlockT *> RPOT;
  767. DenseMap<BlockKeyT, std::pair<BlockNode, BFICallbackVH>> Nodes;
  768. using rpot_iterator = typename std::vector<const BlockT *>::const_iterator;
  769. rpot_iterator rpot_begin() const { return RPOT.begin(); }
  770. rpot_iterator rpot_end() const { return RPOT.end(); }
  771. size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
  772. BlockNode getNode(const rpot_iterator &I) const {
  773. return BlockNode(getIndex(I));
  774. }
  775. BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB).first; }
  776. const BlockT *getBlock(const BlockNode &Node) const {
  777. assert(Node.Index < RPOT.size());
  778. return RPOT[Node.Index];
  779. }
  780. /// Run (and save) a post-order traversal.
  781. ///
  782. /// Saves a reverse post-order traversal of all the nodes in \a F.
  783. void initializeRPOT();
  784. /// Initialize loop data.
  785. ///
  786. /// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
  787. /// each block to the deepest loop it's in, but we need the inverse. For each
  788. /// loop, we store in reverse post-order its "immediate" members, defined as
  789. /// the header, the headers of immediate sub-loops, and all other blocks in
  790. /// the loop that are not in sub-loops.
  791. void initializeLoops();
  792. /// Propagate to a block's successors.
  793. ///
  794. /// In the context of distributing mass through \c OuterLoop, divide the mass
  795. /// currently assigned to \c Node between its successors.
  796. ///
  797. /// \return \c true unless there's an irreducible backedge.
  798. bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
  799. /// Compute mass in a particular loop.
  800. ///
  801. /// Assign mass to \c Loop's header, and then for each block in \c Loop in
  802. /// reverse post-order, distribute mass to its successors. Only visits nodes
  803. /// that have not been packaged into sub-loops.
  804. ///
  805. /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
  806. /// \return \c true unless there's an irreducible backedge.
  807. bool computeMassInLoop(LoopData &Loop);
  808. /// Try to compute mass in the top-level function.
  809. ///
  810. /// Assign mass to the entry block, and then for each block in reverse
  811. /// post-order, distribute mass to its successors. Skips nodes that have
  812. /// been packaged into loops.
  813. ///
  814. /// \pre \a computeMassInLoops() has been called.
  815. /// \return \c true unless there's an irreducible backedge.
  816. bool tryToComputeMassInFunction();
  817. /// Compute mass in (and package up) irreducible SCCs.
  818. ///
  819. /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
  820. /// of \c Insert), and call \a computeMassInLoop() on each of them.
  821. ///
  822. /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
  823. ///
  824. /// \pre \a computeMassInLoop() has been called for each subloop of \c
  825. /// OuterLoop.
  826. /// \pre \c Insert points at the last loop successfully processed by \a
  827. /// computeMassInLoop().
  828. /// \pre \c OuterLoop has irreducible SCCs.
  829. void computeIrreducibleMass(LoopData *OuterLoop,
  830. std::list<LoopData>::iterator Insert);
  831. /// Compute mass in all loops.
  832. ///
  833. /// For each loop bottom-up, call \a computeMassInLoop().
  834. ///
  835. /// \a computeMassInLoop() aborts (and returns \c false) on loops that
  836. /// contain a irreducible sub-SCCs. Use \a computeIrreducibleMass() and then
  837. /// re-enter \a computeMassInLoop().
  838. ///
  839. /// \post \a computeMassInLoop() has returned \c true for every loop.
  840. void computeMassInLoops();
  841. /// Compute mass in the top-level function.
  842. ///
  843. /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
  844. /// compute mass in the top-level function.
  845. ///
  846. /// \post \a tryToComputeMassInFunction() has returned \c true.
  847. void computeMassInFunction();
  848. std::string getBlockName(const BlockNode &Node) const override {
  849. return bfi_detail::getBlockName(getBlock(Node));
  850. }
  851. /// The current implementation for computing relative block frequencies does
  852. /// not handle correctly control-flow graphs containing irreducible loops. To
  853. /// resolve the problem, we apply a post-processing step, which iteratively
  854. /// updates block frequencies based on the frequencies of their predesessors.
  855. /// This corresponds to finding the stationary point of the Markov chain by
  856. /// an iterative method aka "PageRank computation".
  857. /// The algorithm takes at most O(|E| * IterativeBFIMaxIterations) steps but
  858. /// typically converges faster.
  859. ///
  860. /// Decide whether we want to apply iterative inference for a given function.
  861. bool needIterativeInference() const;
  862. /// Apply an iterative post-processing to infer correct counts for irr loops.
  863. void applyIterativeInference();
  864. using ProbMatrixType = std::vector<std::vector<std::pair<size_t, Scaled64>>>;
  865. /// Run iterative inference for a probability matrix and initial frequencies.
  866. void iterativeInference(const ProbMatrixType &ProbMatrix,
  867. std::vector<Scaled64> &Freq) const;
  868. /// Find all blocks to apply inference on, that is, reachable from the entry
  869. /// and backward reachable from exists along edges with positive probability.
  870. void findReachableBlocks(std::vector<const BlockT *> &Blocks) const;
  871. /// Build a matrix of probabilities with transitions (edges) between the
  872. /// blocks: ProbMatrix[I] holds pairs (J, P), where Pr[J -> I | J] = P
  873. void initTransitionProbabilities(
  874. const std::vector<const BlockT *> &Blocks,
  875. const DenseMap<const BlockT *, size_t> &BlockIndex,
  876. ProbMatrixType &ProbMatrix) const;
  877. #ifndef NDEBUG
  878. /// Compute the discrepancy between current block frequencies and the
  879. /// probability matrix.
  880. Scaled64 discrepancy(const ProbMatrixType &ProbMatrix,
  881. const std::vector<Scaled64> &Freq) const;
  882. #endif
  883. public:
  884. BlockFrequencyInfoImpl() = default;
  885. const FunctionT *getFunction() const { return F; }
  886. void calculate(const FunctionT &F, const BranchProbabilityInfoT &BPI,
  887. const LoopInfoT &LI);
  888. using BlockFrequencyInfoImplBase::getEntryFreq;
  889. BlockFrequency getBlockFreq(const BlockT *BB) const {
  890. return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
  891. }
  892. Optional<uint64_t> getBlockProfileCount(const Function &F,
  893. const BlockT *BB,
  894. bool AllowSynthetic = false) const {
  895. return BlockFrequencyInfoImplBase::getBlockProfileCount(F, getNode(BB),
  896. AllowSynthetic);
  897. }
  898. Optional<uint64_t> getProfileCountFromFreq(const Function &F,
  899. uint64_t Freq,
  900. bool AllowSynthetic = false) const {
  901. return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq,
  902. AllowSynthetic);
  903. }
  904. bool isIrrLoopHeader(const BlockT *BB) {
  905. return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
  906. }
  907. void setBlockFreq(const BlockT *BB, uint64_t Freq);
  908. void forgetBlock(const BlockT *BB) {
  909. // We don't erase corresponding items from `Freqs`, `RPOT` and other to
  910. // avoid invalidating indices. Doing so would have saved some memory, but
  911. // it's not worth it.
  912. Nodes.erase(BB);
  913. }
  914. Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
  915. return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
  916. }
  917. const BranchProbabilityInfoT &getBPI() const { return *BPI; }
  918. /// Print the frequencies for the current function.
  919. ///
  920. /// Prints the frequencies for the blocks in the current function.
  921. ///
  922. /// Blocks are printed in the natural iteration order of the function, rather
  923. /// than reverse post-order. This provides two advantages: writing -analyze
  924. /// tests is easier (since blocks come out in source order), and even
  925. /// unreachable blocks are printed.
  926. ///
  927. /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
  928. /// we need to override it here.
  929. raw_ostream &print(raw_ostream &OS) const override;
  930. using BlockFrequencyInfoImplBase::dump;
  931. using BlockFrequencyInfoImplBase::printBlockFreq;
  932. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
  933. return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  934. }
  935. void verifyMatch(BlockFrequencyInfoImpl<BT> &Other) const;
  936. };
  937. namespace bfi_detail {
  938. template <class BFIImplT>
  939. class BFICallbackVH<BasicBlock, BFIImplT> : public CallbackVH {
  940. BFIImplT *BFIImpl;
  941. public:
  942. BFICallbackVH() = default;
  943. BFICallbackVH(const BasicBlock *BB, BFIImplT *BFIImpl)
  944. : CallbackVH(BB), BFIImpl(BFIImpl) {}
  945. virtual ~BFICallbackVH() = default;
  946. void deleted() override {
  947. BFIImpl->forgetBlock(cast<BasicBlock>(getValPtr()));
  948. }
  949. };
  950. /// Dummy implementation since MachineBasicBlocks aren't Values, so ValueHandles
  951. /// don't apply to them.
  952. template <class BFIImplT>
  953. class BFICallbackVH<MachineBasicBlock, BFIImplT> {
  954. public:
  955. BFICallbackVH() = default;
  956. BFICallbackVH(const MachineBasicBlock *, BFIImplT *) {}
  957. };
  958. } // end namespace bfi_detail
  959. template <class BT>
  960. void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
  961. const BranchProbabilityInfoT &BPI,
  962. const LoopInfoT &LI) {
  963. // Save the parameters.
  964. this->BPI = &BPI;
  965. this->LI = &LI;
  966. this->F = &F;
  967. // Clean up left-over data structures.
  968. BlockFrequencyInfoImplBase::clear();
  969. RPOT.clear();
  970. Nodes.clear();
  971. // Initialize.
  972. LLVM_DEBUG(dbgs() << "\nblock-frequency: " << F.getName()
  973. << "\n================="
  974. << std::string(F.getName().size(), '=') << "\n");
  975. initializeRPOT();
  976. initializeLoops();
  977. // Visit loops in post-order to find the local mass distribution, and then do
  978. // the full function.
  979. computeMassInLoops();
  980. computeMassInFunction();
  981. unwrapLoops();
  982. // Apply a post-processing step improving computed frequencies for functions
  983. // with irreducible loops.
  984. if (needIterativeInference())
  985. applyIterativeInference();
  986. finalizeMetrics();
  987. if (CheckBFIUnknownBlockQueries) {
  988. // To detect BFI queries for unknown blocks, add entries for unreachable
  989. // blocks, if any. This is to distinguish between known/existing unreachable
  990. // blocks and unknown blocks.
  991. for (const BlockT &BB : F)
  992. if (!Nodes.count(&BB))
  993. setBlockFreq(&BB, 0);
  994. }
  995. }
  996. template <class BT>
  997. void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, uint64_t Freq) {
  998. if (Nodes.count(BB))
  999. BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq);
  1000. else {
  1001. // If BB is a newly added block after BFI is done, we need to create a new
  1002. // BlockNode for it assigned with a new index. The index can be determined
  1003. // by the size of Freqs.
  1004. BlockNode NewNode(Freqs.size());
  1005. Nodes[BB] = {NewNode, BFICallbackVH(BB, this)};
  1006. Freqs.emplace_back();
  1007. BlockFrequencyInfoImplBase::setBlockFreq(NewNode, Freq);
  1008. }
  1009. }
  1010. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
  1011. const BlockT *Entry = &F->front();
  1012. RPOT.reserve(F->size());
  1013. std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
  1014. std::reverse(RPOT.begin(), RPOT.end());
  1015. assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
  1016. "More nodes in function than Block Frequency Info supports");
  1017. LLVM_DEBUG(dbgs() << "reverse-post-order-traversal\n");
  1018. for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
  1019. BlockNode Node = getNode(I);
  1020. LLVM_DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node)
  1021. << "\n");
  1022. Nodes[*I] = {Node, BFICallbackVH(*I, this)};
  1023. }
  1024. Working.reserve(RPOT.size());
  1025. for (size_t Index = 0; Index < RPOT.size(); ++Index)
  1026. Working.emplace_back(Index);
  1027. Freqs.resize(RPOT.size());
  1028. }
  1029. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
  1030. LLVM_DEBUG(dbgs() << "loop-detection\n");
  1031. if (LI->empty())
  1032. return;
  1033. // Visit loops top down and assign them an index.
  1034. std::deque<std::pair<const LoopT *, LoopData *>> Q;
  1035. for (const LoopT *L : *LI)
  1036. Q.emplace_back(L, nullptr);
  1037. while (!Q.empty()) {
  1038. const LoopT *Loop = Q.front().first;
  1039. LoopData *Parent = Q.front().second;
  1040. Q.pop_front();
  1041. BlockNode Header = getNode(Loop->getHeader());
  1042. assert(Header.isValid());
  1043. Loops.emplace_back(Parent, Header);
  1044. Working[Header.Index].Loop = &Loops.back();
  1045. LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
  1046. for (const LoopT *L : *Loop)
  1047. Q.emplace_back(L, &Loops.back());
  1048. }
  1049. // Visit nodes in reverse post-order and add them to their deepest containing
  1050. // loop.
  1051. for (size_t Index = 0; Index < RPOT.size(); ++Index) {
  1052. // Loop headers have already been mostly mapped.
  1053. if (Working[Index].isLoopHeader()) {
  1054. LoopData *ContainingLoop = Working[Index].getContainingLoop();
  1055. if (ContainingLoop)
  1056. ContainingLoop->Nodes.push_back(Index);
  1057. continue;
  1058. }
  1059. const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
  1060. if (!Loop)
  1061. continue;
  1062. // Add this node to its containing loop's member list.
  1063. BlockNode Header = getNode(Loop->getHeader());
  1064. assert(Header.isValid());
  1065. const auto &HeaderData = Working[Header.Index];
  1066. assert(HeaderData.isLoopHeader());
  1067. Working[Index].Loop = HeaderData.Loop;
  1068. HeaderData.Loop->Nodes.push_back(Index);
  1069. LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header)
  1070. << ": member = " << getBlockName(Index) << "\n");
  1071. }
  1072. }
  1073. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
  1074. // Visit loops with the deepest first, and the top-level loops last.
  1075. for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
  1076. if (computeMassInLoop(*L))
  1077. continue;
  1078. auto Next = std::next(L);
  1079. computeIrreducibleMass(&*L, L.base());
  1080. L = std::prev(Next);
  1081. if (computeMassInLoop(*L))
  1082. continue;
  1083. llvm_unreachable("unhandled irreducible control flow");
  1084. }
  1085. }
  1086. template <class BT>
  1087. bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
  1088. // Compute mass in loop.
  1089. LLVM_DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
  1090. if (Loop.isIrreducible()) {
  1091. LLVM_DEBUG(dbgs() << "isIrreducible = true\n");
  1092. Distribution Dist;
  1093. unsigned NumHeadersWithWeight = 0;
  1094. Optional<uint64_t> MinHeaderWeight;
  1095. DenseSet<uint32_t> HeadersWithoutWeight;
  1096. HeadersWithoutWeight.reserve(Loop.NumHeaders);
  1097. for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
  1098. auto &HeaderNode = Loop.Nodes[H];
  1099. const BlockT *Block = getBlock(HeaderNode);
  1100. IsIrrLoopHeader.set(Loop.Nodes[H].Index);
  1101. Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
  1102. if (!HeaderWeight) {
  1103. LLVM_DEBUG(dbgs() << "Missing irr loop header metadata on "
  1104. << getBlockName(HeaderNode) << "\n");
  1105. HeadersWithoutWeight.insert(H);
  1106. continue;
  1107. }
  1108. LLVM_DEBUG(dbgs() << getBlockName(HeaderNode)
  1109. << " has irr loop header weight "
  1110. << HeaderWeight.getValue() << "\n");
  1111. NumHeadersWithWeight++;
  1112. uint64_t HeaderWeightValue = HeaderWeight.getValue();
  1113. if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
  1114. MinHeaderWeight = HeaderWeightValue;
  1115. if (HeaderWeightValue) {
  1116. Dist.addLocal(HeaderNode, HeaderWeightValue);
  1117. }
  1118. }
  1119. // As a heuristic, if some headers don't have a weight, give them the
  1120. // minimum weight seen (not to disrupt the existing trends too much by
  1121. // using a weight that's in the general range of the other headers' weights,
  1122. // and the minimum seems to perform better than the average.)
  1123. // FIXME: better update in the passes that drop the header weight.
  1124. // If no headers have a weight, give them even weight (use weight 1).
  1125. if (!MinHeaderWeight)
  1126. MinHeaderWeight = 1;
  1127. for (uint32_t H : HeadersWithoutWeight) {
  1128. auto &HeaderNode = Loop.Nodes[H];
  1129. assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() &&
  1130. "Shouldn't have a weight metadata");
  1131. uint64_t MinWeight = MinHeaderWeight.getValue();
  1132. LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to "
  1133. << getBlockName(HeaderNode) << "\n");
  1134. if (MinWeight)
  1135. Dist.addLocal(HeaderNode, MinWeight);
  1136. }
  1137. distributeIrrLoopHeaderMass(Dist);
  1138. for (const BlockNode &M : Loop.Nodes)
  1139. if (!propagateMassToSuccessors(&Loop, M))
  1140. llvm_unreachable("unhandled irreducible control flow");
  1141. if (NumHeadersWithWeight == 0)
  1142. // No headers have a metadata. Adjust header mass.
  1143. adjustLoopHeaderMass(Loop);
  1144. } else {
  1145. Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
  1146. if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
  1147. llvm_unreachable("irreducible control flow to loop header!?");
  1148. for (const BlockNode &M : Loop.members())
  1149. if (!propagateMassToSuccessors(&Loop, M))
  1150. // Irreducible backedge.
  1151. return false;
  1152. }
  1153. computeLoopScale(Loop);
  1154. packageLoop(Loop);
  1155. return true;
  1156. }
  1157. template <class BT>
  1158. bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
  1159. // Compute mass in function.
  1160. LLVM_DEBUG(dbgs() << "compute-mass-in-function\n");
  1161. assert(!Working.empty() && "no blocks in function");
  1162. assert(!Working[0].isLoopHeader() && "entry block is a loop header");
  1163. Working[0].getMass() = BlockMass::getFull();
  1164. for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
  1165. // Check for nodes that have been packaged.
  1166. BlockNode Node = getNode(I);
  1167. if (Working[Node.Index].isPackaged())
  1168. continue;
  1169. if (!propagateMassToSuccessors(nullptr, Node))
  1170. return false;
  1171. }
  1172. return true;
  1173. }
  1174. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
  1175. if (tryToComputeMassInFunction())
  1176. return;
  1177. computeIrreducibleMass(nullptr, Loops.begin());
  1178. if (tryToComputeMassInFunction())
  1179. return;
  1180. llvm_unreachable("unhandled irreducible control flow");
  1181. }
  1182. template <class BT>
  1183. bool BlockFrequencyInfoImpl<BT>::needIterativeInference() const {
  1184. if (!UseIterativeBFIInference)
  1185. return false;
  1186. if (!F->getFunction().hasProfileData())
  1187. return false;
  1188. // Apply iterative inference only if the function contains irreducible loops;
  1189. // otherwise, computed block frequencies are reasonably correct.
  1190. for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
  1191. if (L->isIrreducible())
  1192. return true;
  1193. }
  1194. return false;
  1195. }
  1196. template <class BT> void BlockFrequencyInfoImpl<BT>::applyIterativeInference() {
  1197. // Extract blocks for processing: a block is considered for inference iff it
  1198. // can be reached from the entry by edges with a positive probability.
  1199. // Non-processed blocks are assigned with the zero frequency and are ignored
  1200. // in the computation
  1201. std::vector<const BlockT *> ReachableBlocks;
  1202. findReachableBlocks(ReachableBlocks);
  1203. if (ReachableBlocks.empty())
  1204. return;
  1205. // The map is used to to index successors/predecessors of reachable blocks in
  1206. // the ReachableBlocks vector
  1207. DenseMap<const BlockT *, size_t> BlockIndex;
  1208. // Extract initial frequencies for the reachable blocks
  1209. auto Freq = std::vector<Scaled64>(ReachableBlocks.size());
  1210. Scaled64 SumFreq;
  1211. for (size_t I = 0; I < ReachableBlocks.size(); I++) {
  1212. const BlockT *BB = ReachableBlocks[I];
  1213. BlockIndex[BB] = I;
  1214. Freq[I] = getFloatingBlockFreq(BB);
  1215. SumFreq += Freq[I];
  1216. }
  1217. assert(!SumFreq.isZero() && "empty initial block frequencies");
  1218. LLVM_DEBUG(dbgs() << "Applying iterative inference for " << F->getName()
  1219. << " with " << ReachableBlocks.size() << " blocks\n");
  1220. // Normalizing frequencies so they sum up to 1.0
  1221. for (auto &Value : Freq) {
  1222. Value /= SumFreq;
  1223. }
  1224. // Setting up edge probabilities using sparse matrix representation:
  1225. // ProbMatrix[I] holds a vector of pairs (J, P) where Pr[J -> I | J] = P
  1226. ProbMatrixType ProbMatrix;
  1227. initTransitionProbabilities(ReachableBlocks, BlockIndex, ProbMatrix);
  1228. // Run the propagation
  1229. iterativeInference(ProbMatrix, Freq);
  1230. // Assign computed frequency values
  1231. for (const BlockT &BB : *F) {
  1232. auto Node = getNode(&BB);
  1233. if (!Node.isValid())
  1234. continue;
  1235. if (BlockIndex.count(&BB)) {
  1236. Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]];
  1237. } else {
  1238. Freqs[Node.Index].Scaled = Scaled64::getZero();
  1239. }
  1240. }
  1241. }
  1242. template <class BT>
  1243. void BlockFrequencyInfoImpl<BT>::iterativeInference(
  1244. const ProbMatrixType &ProbMatrix, std::vector<Scaled64> &Freq) const {
  1245. assert(0.0 < IterativeBFIPrecision && IterativeBFIPrecision < 1.0 &&
  1246. "incorrectly specified precision");
  1247. // Convert double precision to Scaled64
  1248. const auto Precision =
  1249. Scaled64::getInverse(static_cast<uint64_t>(1.0 / IterativeBFIPrecision));
  1250. const size_t MaxIterations = IterativeBFIMaxIterationsPerBlock * Freq.size();
  1251. #ifndef NDEBUG
  1252. LLVM_DEBUG(dbgs() << " Initial discrepancy = "
  1253. << discrepancy(ProbMatrix, Freq).toString() << "\n");
  1254. #endif
  1255. // Successors[I] holds unique sucessors of the I-th block
  1256. auto Successors = std::vector<std::vector<size_t>>(Freq.size());
  1257. for (size_t I = 0; I < Freq.size(); I++) {
  1258. for (auto &Jump : ProbMatrix[I]) {
  1259. Successors[Jump.first].push_back(I);
  1260. }
  1261. }
  1262. // To speedup computation, we maintain a set of "active" blocks whose
  1263. // frequencies need to be updated based on the incoming edges.
  1264. // The set is dynamic and changes after every update. Initially all blocks
  1265. // with a positive frequency are active
  1266. auto IsActive = BitVector(Freq.size(), false);
  1267. std::queue<size_t> ActiveSet;
  1268. for (size_t I = 0; I < Freq.size(); I++) {
  1269. if (Freq[I] > 0) {
  1270. ActiveSet.push(I);
  1271. IsActive[I] = true;
  1272. }
  1273. }
  1274. // Iterate over the blocks propagating frequencies
  1275. size_t It = 0;
  1276. while (It++ < MaxIterations && !ActiveSet.empty()) {
  1277. size_t I = ActiveSet.front();
  1278. ActiveSet.pop();
  1279. IsActive[I] = false;
  1280. // Compute a new frequency for the block: NewFreq := Freq \times ProbMatrix.
  1281. // A special care is taken for self-edges that needs to be scaled by
  1282. // (1.0 - SelfProb), where SelfProb is the sum of probabilities on the edges
  1283. Scaled64 NewFreq;
  1284. Scaled64 OneMinusSelfProb = Scaled64::getOne();
  1285. for (auto &Jump : ProbMatrix[I]) {
  1286. if (Jump.first == I) {
  1287. OneMinusSelfProb -= Jump.second;
  1288. } else {
  1289. NewFreq += Freq[Jump.first] * Jump.second;
  1290. }
  1291. }
  1292. if (OneMinusSelfProb != Scaled64::getOne())
  1293. NewFreq /= OneMinusSelfProb;
  1294. // If the block's frequency has changed enough, then
  1295. // make sure the block and its successors are in the active set
  1296. auto Change = Freq[I] >= NewFreq ? Freq[I] - NewFreq : NewFreq - Freq[I];
  1297. if (Change > Precision) {
  1298. ActiveSet.push(I);
  1299. IsActive[I] = true;
  1300. for (size_t Succ : Successors[I]) {
  1301. if (!IsActive[Succ]) {
  1302. ActiveSet.push(Succ);
  1303. IsActive[Succ] = true;
  1304. }
  1305. }
  1306. }
  1307. // Update the frequency for the block
  1308. Freq[I] = NewFreq;
  1309. }
  1310. LLVM_DEBUG(dbgs() << " Completed " << It << " inference iterations"
  1311. << format(" (%0.0f per block)", double(It) / Freq.size())
  1312. << "\n");
  1313. #ifndef NDEBUG
  1314. LLVM_DEBUG(dbgs() << " Final discrepancy = "
  1315. << discrepancy(ProbMatrix, Freq).toString() << "\n");
  1316. #endif
  1317. }
  1318. template <class BT>
  1319. void BlockFrequencyInfoImpl<BT>::findReachableBlocks(
  1320. std::vector<const BlockT *> &Blocks) const {
  1321. // Find all blocks to apply inference on, that is, reachable from the entry
  1322. // along edges with non-zero probablities
  1323. std::queue<const BlockT *> Queue;
  1324. std::unordered_set<const BlockT *> Reachable;
  1325. const BlockT *Entry = &F->front();
  1326. Queue.push(Entry);
  1327. Reachable.insert(Entry);
  1328. while (!Queue.empty()) {
  1329. const BlockT *SrcBB = Queue.front();
  1330. Queue.pop();
  1331. for (const BlockT *DstBB : children<const BlockT *>(SrcBB)) {
  1332. auto EP = BPI->getEdgeProbability(SrcBB, DstBB);
  1333. if (EP.isZero())
  1334. continue;
  1335. if (Reachable.find(DstBB) == Reachable.end()) {
  1336. Queue.push(DstBB);
  1337. Reachable.insert(DstBB);
  1338. }
  1339. }
  1340. }
  1341. // Find all blocks to apply inference on, that is, backward reachable from
  1342. // the entry along (backward) edges with non-zero probablities
  1343. std::unordered_set<const BlockT *> InverseReachable;
  1344. for (const BlockT &BB : *F) {
  1345. // An exit block is a block without any successors
  1346. bool HasSucc = GraphTraits<const BlockT *>::child_begin(&BB) !=
  1347. GraphTraits<const BlockT *>::child_end(&BB);
  1348. if (!HasSucc && Reachable.count(&BB)) {
  1349. Queue.push(&BB);
  1350. InverseReachable.insert(&BB);
  1351. }
  1352. }
  1353. while (!Queue.empty()) {
  1354. const BlockT *SrcBB = Queue.front();
  1355. Queue.pop();
  1356. for (const BlockT *DstBB : children<Inverse<const BlockT *>>(SrcBB)) {
  1357. auto EP = BPI->getEdgeProbability(DstBB, SrcBB);
  1358. if (EP.isZero())
  1359. continue;
  1360. if (InverseReachable.find(DstBB) == InverseReachable.end()) {
  1361. Queue.push(DstBB);
  1362. InverseReachable.insert(DstBB);
  1363. }
  1364. }
  1365. }
  1366. // Collect the result
  1367. Blocks.reserve(F->size());
  1368. for (const BlockT &BB : *F) {
  1369. if (Reachable.count(&BB) && InverseReachable.count(&BB)) {
  1370. Blocks.push_back(&BB);
  1371. }
  1372. }
  1373. }
  1374. template <class BT>
  1375. void BlockFrequencyInfoImpl<BT>::initTransitionProbabilities(
  1376. const std::vector<const BlockT *> &Blocks,
  1377. const DenseMap<const BlockT *, size_t> &BlockIndex,
  1378. ProbMatrixType &ProbMatrix) const {
  1379. const size_t NumBlocks = Blocks.size();
  1380. auto Succs = std::vector<std::vector<std::pair<size_t, Scaled64>>>(NumBlocks);
  1381. auto SumProb = std::vector<Scaled64>(NumBlocks);
  1382. // Find unique successors and corresponding probabilities for every block
  1383. for (size_t Src = 0; Src < NumBlocks; Src++) {
  1384. const BlockT *BB = Blocks[Src];
  1385. std::unordered_set<const BlockT *> UniqueSuccs;
  1386. for (const auto SI : children<const BlockT *>(BB)) {
  1387. // Ignore cold blocks
  1388. if (BlockIndex.find(SI) == BlockIndex.end())
  1389. continue;
  1390. // Ignore parallel edges between BB and SI blocks
  1391. if (UniqueSuccs.find(SI) != UniqueSuccs.end())
  1392. continue;
  1393. UniqueSuccs.insert(SI);
  1394. // Ignore jumps with zero probability
  1395. auto EP = BPI->getEdgeProbability(BB, SI);
  1396. if (EP.isZero())
  1397. continue;
  1398. auto EdgeProb =
  1399. Scaled64::getFraction(EP.getNumerator(), EP.getDenominator());
  1400. size_t Dst = BlockIndex.find(SI)->second;
  1401. Succs[Src].push_back(std::make_pair(Dst, EdgeProb));
  1402. SumProb[Src] += EdgeProb;
  1403. }
  1404. }
  1405. // Add transitions for every jump with positive branch probability
  1406. ProbMatrix = ProbMatrixType(NumBlocks);
  1407. for (size_t Src = 0; Src < NumBlocks; Src++) {
  1408. // Ignore blocks w/o successors
  1409. if (Succs[Src].empty())
  1410. continue;
  1411. assert(!SumProb[Src].isZero() && "Zero sum probability of non-exit block");
  1412. for (auto &Jump : Succs[Src]) {
  1413. size_t Dst = Jump.first;
  1414. Scaled64 Prob = Jump.second;
  1415. ProbMatrix[Dst].push_back(std::make_pair(Src, Prob / SumProb[Src]));
  1416. }
  1417. }
  1418. // Add transitions from sinks to the source
  1419. size_t EntryIdx = BlockIndex.find(&F->front())->second;
  1420. for (size_t Src = 0; Src < NumBlocks; Src++) {
  1421. if (Succs[Src].empty()) {
  1422. ProbMatrix[EntryIdx].push_back(std::make_pair(Src, Scaled64::getOne()));
  1423. }
  1424. }
  1425. }
  1426. #ifndef NDEBUG
  1427. template <class BT>
  1428. BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl<BT>::discrepancy(
  1429. const ProbMatrixType &ProbMatrix, const std::vector<Scaled64> &Freq) const {
  1430. assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block");
  1431. Scaled64 Discrepancy;
  1432. for (size_t I = 0; I < ProbMatrix.size(); I++) {
  1433. Scaled64 Sum;
  1434. for (const auto &Jump : ProbMatrix[I]) {
  1435. Sum += Freq[Jump.first] * Jump.second;
  1436. }
  1437. Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I];
  1438. }
  1439. // Normalizing by the frequency of the entry block
  1440. return Discrepancy / Freq[0];
  1441. }
  1442. #endif
  1443. /// \note This should be a lambda, but that crashes GCC 4.7.
  1444. namespace bfi_detail {
  1445. template <class BT> struct BlockEdgesAdder {
  1446. using BlockT = BT;
  1447. using LoopData = BlockFrequencyInfoImplBase::LoopData;
  1448. using Successor = GraphTraits<const BlockT *>;
  1449. const BlockFrequencyInfoImpl<BT> &BFI;
  1450. explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
  1451. : BFI(BFI) {}
  1452. void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
  1453. const LoopData *OuterLoop) {
  1454. const BlockT *BB = BFI.RPOT[Irr.Node.Index];
  1455. for (const auto Succ : children<const BlockT *>(BB))
  1456. G.addEdge(Irr, BFI.getNode(Succ), OuterLoop);
  1457. }
  1458. };
  1459. } // end namespace bfi_detail
  1460. template <class BT>
  1461. void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
  1462. LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
  1463. LLVM_DEBUG(dbgs() << "analyze-irreducible-in-";
  1464. if (OuterLoop) dbgs()
  1465. << "loop: " << getLoopName(*OuterLoop) << "\n";
  1466. else dbgs() << "function\n");
  1467. using namespace bfi_detail;
  1468. // Ideally, addBlockEdges() would be declared here as a lambda, but that
  1469. // crashes GCC 4.7.
  1470. BlockEdgesAdder<BT> addBlockEdges(*this);
  1471. IrreducibleGraph G(*this, OuterLoop, addBlockEdges);
  1472. for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
  1473. computeMassInLoop(L);
  1474. if (!OuterLoop)
  1475. return;
  1476. updateLoopWithIrreducible(*OuterLoop);
  1477. }
  1478. // A helper function that converts a branch probability into weight.
  1479. inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) {
  1480. return Prob.getNumerator();
  1481. }
  1482. template <class BT>
  1483. bool
  1484. BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
  1485. const BlockNode &Node) {
  1486. LLVM_DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
  1487. // Calculate probability for successors.
  1488. Distribution Dist;
  1489. if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
  1490. assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
  1491. if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
  1492. // Irreducible backedge.
  1493. return false;
  1494. } else {
  1495. const BlockT *BB = getBlock(Node);
  1496. for (auto SI = GraphTraits<const BlockT *>::child_begin(BB),
  1497. SE = GraphTraits<const BlockT *>::child_end(BB);
  1498. SI != SE; ++SI)
  1499. if (!addToDist(
  1500. Dist, OuterLoop, Node, getNode(*SI),
  1501. getWeightFromBranchProb(BPI->getEdgeProbability(BB, SI))))
  1502. // Irreducible backedge.
  1503. return false;
  1504. }
  1505. // Distribute mass to successors, saving exit and backedge data in the
  1506. // loop header.
  1507. distributeMass(Node, OuterLoop, Dist);
  1508. return true;
  1509. }
  1510. template <class BT>
  1511. raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
  1512. if (!F)
  1513. return OS;
  1514. OS << "block-frequency-info: " << F->getName() << "\n";
  1515. for (const BlockT &BB : *F) {
  1516. OS << " - " << bfi_detail::getBlockName(&BB) << ": float = ";
  1517. getFloatingBlockFreq(&BB).print(OS, 5)
  1518. << ", int = " << getBlockFreq(&BB).getFrequency();
  1519. if (Optional<uint64_t> ProfileCount =
  1520. BlockFrequencyInfoImplBase::getBlockProfileCount(
  1521. F->getFunction(), getNode(&BB)))
  1522. OS << ", count = " << ProfileCount.getValue();
  1523. if (Optional<uint64_t> IrrLoopHeaderWeight =
  1524. BB.getIrrLoopHeaderWeight())
  1525. OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue();
  1526. OS << "\n";
  1527. }
  1528. // Add an extra newline for readability.
  1529. OS << "\n";
  1530. return OS;
  1531. }
  1532. template <class BT>
  1533. void BlockFrequencyInfoImpl<BT>::verifyMatch(
  1534. BlockFrequencyInfoImpl<BT> &Other) const {
  1535. bool Match = true;
  1536. DenseMap<const BlockT *, BlockNode> ValidNodes;
  1537. DenseMap<const BlockT *, BlockNode> OtherValidNodes;
  1538. for (auto &Entry : Nodes) {
  1539. const BlockT *BB = Entry.first;
  1540. if (BB) {
  1541. ValidNodes[BB] = Entry.second.first;
  1542. }
  1543. }
  1544. for (auto &Entry : Other.Nodes) {
  1545. const BlockT *BB = Entry.first;
  1546. if (BB) {
  1547. OtherValidNodes[BB] = Entry.second.first;
  1548. }
  1549. }
  1550. unsigned NumValidNodes = ValidNodes.size();
  1551. unsigned NumOtherValidNodes = OtherValidNodes.size();
  1552. if (NumValidNodes != NumOtherValidNodes) {
  1553. Match = false;
  1554. dbgs() << "Number of blocks mismatch: " << NumValidNodes << " vs "
  1555. << NumOtherValidNodes << "\n";
  1556. } else {
  1557. for (auto &Entry : ValidNodes) {
  1558. const BlockT *BB = Entry.first;
  1559. BlockNode Node = Entry.second;
  1560. if (OtherValidNodes.count(BB)) {
  1561. BlockNode OtherNode = OtherValidNodes[BB];
  1562. const auto &Freq = Freqs[Node.Index];
  1563. const auto &OtherFreq = Other.Freqs[OtherNode.Index];
  1564. if (Freq.Integer != OtherFreq.Integer) {
  1565. Match = false;
  1566. dbgs() << "Freq mismatch: " << bfi_detail::getBlockName(BB) << " "
  1567. << Freq.Integer << " vs " << OtherFreq.Integer << "\n";
  1568. }
  1569. } else {
  1570. Match = false;
  1571. dbgs() << "Block " << bfi_detail::getBlockName(BB) << " index "
  1572. << Node.Index << " does not exist in Other.\n";
  1573. }
  1574. }
  1575. // If there's a valid node in OtherValidNodes that's not in ValidNodes,
  1576. // either the above num check or the check on OtherValidNodes will fail.
  1577. }
  1578. if (!Match) {
  1579. dbgs() << "This\n";
  1580. print(dbgs());
  1581. dbgs() << "Other\n";
  1582. Other.print(dbgs());
  1583. }
  1584. assert(Match && "BFI mismatch");
  1585. }
  1586. // Graph trait base class for block frequency information graph
  1587. // viewer.
  1588. enum GVDAGType { GVDT_None, GVDT_Fraction, GVDT_Integer, GVDT_Count };
  1589. template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
  1590. struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits {
  1591. using GTraits = GraphTraits<BlockFrequencyInfoT *>;
  1592. using NodeRef = typename GTraits::NodeRef;
  1593. using EdgeIter = typename GTraits::ChildIteratorType;
  1594. using NodeIter = typename GTraits::nodes_iterator;
  1595. uint64_t MaxFrequency = 0;
  1596. explicit BFIDOTGraphTraitsBase(bool isSimple = false)
  1597. : DefaultDOTGraphTraits(isSimple) {}
  1598. static StringRef getGraphName(const BlockFrequencyInfoT *G) {
  1599. return G->getFunction()->getName();
  1600. }
  1601. std::string getNodeAttributes(NodeRef Node, const BlockFrequencyInfoT *Graph,
  1602. unsigned HotPercentThreshold = 0) {
  1603. std::string Result;
  1604. if (!HotPercentThreshold)
  1605. return Result;
  1606. // Compute MaxFrequency on the fly:
  1607. if (!MaxFrequency) {
  1608. for (NodeIter I = GTraits::nodes_begin(Graph),
  1609. E = GTraits::nodes_end(Graph);
  1610. I != E; ++I) {
  1611. NodeRef N = *I;
  1612. MaxFrequency =
  1613. std::max(MaxFrequency, Graph->getBlockFreq(N).getFrequency());
  1614. }
  1615. }
  1616. BlockFrequency Freq = Graph->getBlockFreq(Node);
  1617. BlockFrequency HotFreq =
  1618. (BlockFrequency(MaxFrequency) *
  1619. BranchProbability::getBranchProbability(HotPercentThreshold, 100));
  1620. if (Freq < HotFreq)
  1621. return Result;
  1622. raw_string_ostream OS(Result);
  1623. OS << "color=\"red\"";
  1624. OS.flush();
  1625. return Result;
  1626. }
  1627. std::string getNodeLabel(NodeRef Node, const BlockFrequencyInfoT *Graph,
  1628. GVDAGType GType, int layout_order = -1) {
  1629. std::string Result;
  1630. raw_string_ostream OS(Result);
  1631. if (layout_order != -1)
  1632. OS << Node->getName() << "[" << layout_order << "] : ";
  1633. else
  1634. OS << Node->getName() << " : ";
  1635. switch (GType) {
  1636. case GVDT_Fraction:
  1637. Graph->printBlockFreq(OS, Node);
  1638. break;
  1639. case GVDT_Integer:
  1640. OS << Graph->getBlockFreq(Node).getFrequency();
  1641. break;
  1642. case GVDT_Count: {
  1643. auto Count = Graph->getBlockProfileCount(Node);
  1644. if (Count)
  1645. OS << Count.getValue();
  1646. else
  1647. OS << "Unknown";
  1648. break;
  1649. }
  1650. case GVDT_None:
  1651. llvm_unreachable("If we are not supposed to render a graph we should "
  1652. "never reach this point.");
  1653. }
  1654. return Result;
  1655. }
  1656. std::string getEdgeAttributes(NodeRef Node, EdgeIter EI,
  1657. const BlockFrequencyInfoT *BFI,
  1658. const BranchProbabilityInfoT *BPI,
  1659. unsigned HotPercentThreshold = 0) {
  1660. std::string Str;
  1661. if (!BPI)
  1662. return Str;
  1663. BranchProbability BP = BPI->getEdgeProbability(Node, EI);
  1664. uint32_t N = BP.getNumerator();
  1665. uint32_t D = BP.getDenominator();
  1666. double Percent = 100.0 * N / D;
  1667. raw_string_ostream OS(Str);
  1668. OS << format("label=\"%.1f%%\"", Percent);
  1669. if (HotPercentThreshold) {
  1670. BlockFrequency EFreq = BFI->getBlockFreq(Node) * BP;
  1671. BlockFrequency HotFreq = BlockFrequency(MaxFrequency) *
  1672. BranchProbability(HotPercentThreshold, 100);
  1673. if (EFreq >= HotFreq) {
  1674. OS << ",color=\"red\"";
  1675. }
  1676. }
  1677. OS.flush();
  1678. return Str;
  1679. }
  1680. };
  1681. } // end namespace llvm
  1682. #undef DEBUG_TYPE
  1683. #endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  1684. #ifdef __GNUC__
  1685. #pragma GCC diagnostic pop
  1686. #endif