BlockFrequencyInfoImpl.h 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // Shared implementation of BlockFrequency for IR and Machine Instructions.
  15. // See the documentation below for BlockFrequencyInfoImpl for details.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  19. #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  20. #include "llvm/ADT/DenseMap.h"
  21. #include "llvm/ADT/DenseSet.h"
  22. #include "llvm/ADT/GraphTraits.h"
  23. #include "llvm/ADT/Optional.h"
  24. #include "llvm/ADT/PostOrderIterator.h"
  25. #include "llvm/ADT/SmallVector.h"
  26. #include "llvm/ADT/SparseBitVector.h"
  27. #include "llvm/ADT/Twine.h"
  28. #include "llvm/ADT/iterator_range.h"
  29. #include "llvm/IR/BasicBlock.h"
  30. #include "llvm/IR/ValueHandle.h"
  31. #include "llvm/Support/BlockFrequency.h"
  32. #include "llvm/Support/BranchProbability.h"
  33. #include "llvm/Support/CommandLine.h"
  34. #include "llvm/Support/DOTGraphTraits.h"
  35. #include "llvm/Support/Debug.h"
  36. #include "llvm/Support/ErrorHandling.h"
  37. #include "llvm/Support/Format.h"
  38. #include "llvm/Support/ScaledNumber.h"
  39. #include "llvm/Support/raw_ostream.h"
  40. #include <algorithm>
  41. #include <cassert>
  42. #include <cstddef>
  43. #include <cstdint>
  44. #include <deque>
  45. #include <iterator>
  46. #include <limits>
  47. #include <list>
  48. #include <string>
  49. #include <utility>
  50. #include <vector>
  51. #define DEBUG_TYPE "block-freq"
  52. extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries;
  53. namespace llvm {
  54. class BranchProbabilityInfo;
  55. class Function;
  56. class Loop;
  57. class LoopInfo;
  58. class MachineBasicBlock;
  59. class MachineBranchProbabilityInfo;
  60. class MachineFunction;
  61. class MachineLoop;
  62. class MachineLoopInfo;
  63. namespace bfi_detail {
  64. struct IrreducibleGraph;
  65. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  66. template <class BT> struct BlockEdgesAdder;
  67. /// Mass of a block.
  68. ///
  69. /// This class implements a sort of fixed-point fraction always between 0.0 and
  70. /// 1.0. getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
  71. /// 1.0.
  72. ///
  73. /// Masses can be added and subtracted. Simple saturation arithmetic is used,
  74. /// so arithmetic operations never overflow or underflow.
  75. ///
  76. /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
  77. /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
  78. /// quite, maximum precision).
  79. ///
  80. /// Masses can be scaled by \a BranchProbability at maximum precision.
  81. class BlockMass {
  82. uint64_t Mass = 0;
  83. public:
  84. BlockMass() = default;
  85. explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
  86. static BlockMass getEmpty() { return BlockMass(); }
  87. static BlockMass getFull() {
  88. return BlockMass(std::numeric_limits<uint64_t>::max());
  89. }
  90. uint64_t getMass() const { return Mass; }
  91. bool isFull() const { return Mass == std::numeric_limits<uint64_t>::max(); }
  92. bool isEmpty() const { return !Mass; }
  93. bool operator!() const { return isEmpty(); }
  94. /// Add another mass.
  95. ///
  96. /// Adds another mass, saturating at \a isFull() rather than overflowing.
  97. BlockMass &operator+=(BlockMass X) {
  98. uint64_t Sum = Mass + X.Mass;
  99. Mass = Sum < Mass ? std::numeric_limits<uint64_t>::max() : Sum;
  100. return *this;
  101. }
  102. /// Subtract another mass.
  103. ///
  104. /// Subtracts another mass, saturating at \a isEmpty() rather than
  105. /// undeflowing.
  106. BlockMass &operator-=(BlockMass X) {
  107. uint64_t Diff = Mass - X.Mass;
  108. Mass = Diff > Mass ? 0 : Diff;
  109. return *this;
  110. }
  111. BlockMass &operator*=(BranchProbability P) {
  112. Mass = P.scale(Mass);
  113. return *this;
  114. }
  115. bool operator==(BlockMass X) const { return Mass == X.Mass; }
  116. bool operator!=(BlockMass X) const { return Mass != X.Mass; }
  117. bool operator<=(BlockMass X) const { return Mass <= X.Mass; }
  118. bool operator>=(BlockMass X) const { return Mass >= X.Mass; }
  119. bool operator<(BlockMass X) const { return Mass < X.Mass; }
  120. bool operator>(BlockMass X) const { return Mass > X.Mass; }
  121. /// Convert to scaled number.
  122. ///
  123. /// Convert to \a ScaledNumber. \a isFull() gives 1.0, while \a isEmpty()
  124. /// gives slightly above 0.0.
  125. ScaledNumber<uint64_t> toScaled() const;
  126. void dump() const;
  127. raw_ostream &print(raw_ostream &OS) const;
  128. };
  129. inline BlockMass operator+(BlockMass L, BlockMass R) {
  130. return BlockMass(L) += R;
  131. }
  132. inline BlockMass operator-(BlockMass L, BlockMass R) {
  133. return BlockMass(L) -= R;
  134. }
  135. inline BlockMass operator*(BlockMass L, BranchProbability R) {
  136. return BlockMass(L) *= R;
  137. }
  138. inline BlockMass operator*(BranchProbability L, BlockMass R) {
  139. return BlockMass(R) *= L;
  140. }
  141. inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) {
  142. return X.print(OS);
  143. }
  144. } // end namespace bfi_detail
  145. /// Base class for BlockFrequencyInfoImpl
  146. ///
  147. /// BlockFrequencyInfoImplBase has supporting data structures and some
  148. /// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on
  149. /// the block type (or that call such algorithms) are skipped here.
  150. ///
  151. /// Nevertheless, the majority of the overall algorithm documentation lives with
  152. /// BlockFrequencyInfoImpl. See there for details.
  153. class BlockFrequencyInfoImplBase {
  154. public:
  155. using Scaled64 = ScaledNumber<uint64_t>;
  156. using BlockMass = bfi_detail::BlockMass;
  157. /// Representative of a block.
  158. ///
  159. /// This is a simple wrapper around an index into the reverse-post-order
  160. /// traversal of the blocks.
  161. ///
  162. /// Unlike a block pointer, its order has meaning (location in the
  163. /// topological sort) and it's class is the same regardless of block type.
  164. struct BlockNode {
  165. using IndexType = uint32_t;
  166. IndexType Index;
  167. BlockNode() : Index(std::numeric_limits<uint32_t>::max()) {}
  168. BlockNode(IndexType Index) : Index(Index) {}
  169. bool operator==(const BlockNode &X) const { return Index == X.Index; }
  170. bool operator!=(const BlockNode &X) const { return Index != X.Index; }
  171. bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
  172. bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
  173. bool operator<(const BlockNode &X) const { return Index < X.Index; }
  174. bool operator>(const BlockNode &X) const { return Index > X.Index; }
  175. bool isValid() const { return Index <= getMaxIndex(); }
  176. static size_t getMaxIndex() {
  177. return std::numeric_limits<uint32_t>::max() - 1;
  178. }
  179. };
  180. /// Stats about a block itself.
  181. struct FrequencyData {
  182. Scaled64 Scaled;
  183. uint64_t Integer;
  184. };
  185. /// Data about a loop.
  186. ///
  187. /// Contains the data necessary to represent a loop as a pseudo-node once it's
  188. /// packaged.
  189. struct LoopData {
  190. using ExitMap = SmallVector<std::pair<BlockNode, BlockMass>, 4>;
  191. using NodeList = SmallVector<BlockNode, 4>;
  192. using HeaderMassList = SmallVector<BlockMass, 1>;
  193. LoopData *Parent; ///< The parent loop.
  194. bool IsPackaged = false; ///< Whether this has been packaged.
  195. uint32_t NumHeaders = 1; ///< Number of headers.
  196. ExitMap Exits; ///< Successor edges (and weights).
  197. NodeList Nodes; ///< Header and the members of the loop.
  198. HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
  199. BlockMass Mass;
  200. Scaled64 Scale;
  201. LoopData(LoopData *Parent, const BlockNode &Header)
  202. : Parent(Parent), Nodes(1, Header), BackedgeMass(1) {}
  203. template <class It1, class It2>
  204. LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
  205. It2 LastOther)
  206. : Parent(Parent), Nodes(FirstHeader, LastHeader) {
  207. NumHeaders = Nodes.size();
  208. Nodes.insert(Nodes.end(), FirstOther, LastOther);
  209. BackedgeMass.resize(NumHeaders);
  210. }
  211. bool isHeader(const BlockNode &Node) const {
  212. if (isIrreducible())
  213. return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
  214. Node);
  215. return Node == Nodes[0];
  216. }
  217. BlockNode getHeader() const { return Nodes[0]; }
  218. bool isIrreducible() const { return NumHeaders > 1; }
  219. HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) {
  220. assert(isHeader(B) && "this is only valid on loop header blocks");
  221. if (isIrreducible())
  222. return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
  223. Nodes.begin();
  224. return 0;
  225. }
  226. NodeList::const_iterator members_begin() const {
  227. return Nodes.begin() + NumHeaders;
  228. }
  229. NodeList::const_iterator members_end() const { return Nodes.end(); }
  230. iterator_range<NodeList::const_iterator> members() const {
  231. return make_range(members_begin(), members_end());
  232. }
  233. };
  234. /// Index of loop information.
  235. struct WorkingData {
  236. BlockNode Node; ///< This node.
  237. LoopData *Loop = nullptr; ///< The loop this block is inside.
  238. BlockMass Mass; ///< Mass distribution from the entry block.
  239. WorkingData(const BlockNode &Node) : Node(Node) {}
  240. bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
  241. bool isDoubleLoopHeader() const {
  242. return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
  243. Loop->Parent->isHeader(Node);
  244. }
  245. LoopData *getContainingLoop() const {
  246. if (!isLoopHeader())
  247. return Loop;
  248. if (!isDoubleLoopHeader())
  249. return Loop->Parent;
  250. return Loop->Parent->Parent;
  251. }
  252. /// Resolve a node to its representative.
  253. ///
  254. /// Get the node currently representing Node, which could be a containing
  255. /// loop.
  256. ///
  257. /// This function should only be called when distributing mass. As long as
  258. /// there are no irreducible edges to Node, then it will have complexity
  259. /// O(1) in this context.
  260. ///
  261. /// In general, the complexity is O(L), where L is the number of loop
  262. /// headers Node has been packaged into. Since this method is called in
  263. /// the context of distributing mass, L will be the number of loop headers
  264. /// an early exit edge jumps out of.
  265. BlockNode getResolvedNode() const {
  266. auto L = getPackagedLoop();
  267. return L ? L->getHeader() : Node;
  268. }
  269. LoopData *getPackagedLoop() const {
  270. if (!Loop || !Loop->IsPackaged)
  271. return nullptr;
  272. auto L = Loop;
  273. while (L->Parent && L->Parent->IsPackaged)
  274. L = L->Parent;
  275. return L;
  276. }
  277. /// Get the appropriate mass for a node.
  278. ///
  279. /// Get appropriate mass for Node. If Node is a loop-header (whose loop
  280. /// has been packaged), returns the mass of its pseudo-node. If it's a
  281. /// node inside a packaged loop, it returns the loop's mass.
  282. BlockMass &getMass() {
  283. if (!isAPackage())
  284. return Mass;
  285. if (!isADoublePackage())
  286. return Loop->Mass;
  287. return Loop->Parent->Mass;
  288. }
  289. /// Has ContainingLoop been packaged up?
  290. bool isPackaged() const { return getResolvedNode() != Node; }
  291. /// Has Loop been packaged up?
  292. bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
  293. /// Has Loop been packaged up twice?
  294. bool isADoublePackage() const {
  295. return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
  296. }
  297. };
  298. /// Unscaled probability weight.
  299. ///
  300. /// Probability weight for an edge in the graph (including the
  301. /// successor/target node).
  302. ///
  303. /// All edges in the original function are 32-bit. However, exit edges from
  304. /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
  305. /// space in general.
  306. ///
  307. /// In addition to the raw weight amount, Weight stores the type of the edge
  308. /// in the current context (i.e., the context of the loop being processed).
  309. /// Is this a local edge within the loop, an exit from the loop, or a
  310. /// backedge to the loop header?
  311. struct Weight {
  312. enum DistType { Local, Exit, Backedge };
  313. DistType Type = Local;
  314. BlockNode TargetNode;
  315. uint64_t Amount = 0;
  316. Weight() = default;
  317. Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
  318. : Type(Type), TargetNode(TargetNode), Amount(Amount) {}
  319. };
  320. /// Distribution of unscaled probability weight.
  321. ///
  322. /// Distribution of unscaled probability weight to a set of successors.
  323. ///
  324. /// This class collates the successor edge weights for later processing.
  325. ///
  326. /// \a DidOverflow indicates whether \a Total did overflow while adding to
  327. /// the distribution. It should never overflow twice.
  328. struct Distribution {
  329. using WeightList = SmallVector<Weight, 4>;
  330. WeightList Weights; ///< Individual successor weights.
  331. uint64_t Total = 0; ///< Sum of all weights.
  332. bool DidOverflow = false; ///< Whether \a Total did overflow.
  333. Distribution() = default;
  334. void addLocal(const BlockNode &Node, uint64_t Amount) {
  335. add(Node, Amount, Weight::Local);
  336. }
  337. void addExit(const BlockNode &Node, uint64_t Amount) {
  338. add(Node, Amount, Weight::Exit);
  339. }
  340. void addBackedge(const BlockNode &Node, uint64_t Amount) {
  341. add(Node, Amount, Weight::Backedge);
  342. }
  343. /// Normalize the distribution.
  344. ///
  345. /// Combines multiple edges to the same \a Weight::TargetNode and scales
  346. /// down so that \a Total fits into 32-bits.
  347. ///
  348. /// This is linear in the size of \a Weights. For the vast majority of
  349. /// cases, adjacent edge weights are combined by sorting WeightList and
  350. /// combining adjacent weights. However, for very large edge lists an
  351. /// auxiliary hash table is used.
  352. void normalize();
  353. private:
  354. void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
  355. };
  356. /// Data about each block. This is used downstream.
  357. std::vector<FrequencyData> Freqs;
  358. /// Whether each block is an irreducible loop header.
  359. /// This is used downstream.
  360. SparseBitVector<> IsIrrLoopHeader;
  361. /// Loop data: see initializeLoops().
  362. std::vector<WorkingData> Working;
  363. /// Indexed information about loops.
  364. std::list<LoopData> Loops;
  365. /// Virtual destructor.
  366. ///
  367. /// Need a virtual destructor to mask the compiler warning about
  368. /// getBlockName().
  369. virtual ~BlockFrequencyInfoImplBase() = default;
  370. /// Add all edges out of a packaged loop to the distribution.
  371. ///
  372. /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
  373. /// successor edge.
  374. ///
  375. /// \return \c true unless there's an irreducible backedge.
  376. bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
  377. Distribution &Dist);
  378. /// Add an edge to the distribution.
  379. ///
  380. /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
  381. /// edge is local/exit/backedge is in the context of LoopHead. Otherwise,
  382. /// every edge should be a local edge (since all the loops are packaged up).
  383. ///
  384. /// \return \c true unless aborted due to an irreducible backedge.
  385. bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
  386. const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
  387. LoopData &getLoopPackage(const BlockNode &Head) {
  388. assert(Head.Index < Working.size());
  389. assert(Working[Head.Index].isLoopHeader());
  390. return *Working[Head.Index].Loop;
  391. }
  392. /// Analyze irreducible SCCs.
  393. ///
  394. /// Separate irreducible SCCs from \c G, which is an explicit graph of \c
  395. /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
  396. /// Insert them into \a Loops before \c Insert.
  397. ///
  398. /// \return the \c LoopData nodes representing the irreducible SCCs.
  399. iterator_range<std::list<LoopData>::iterator>
  400. analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
  401. std::list<LoopData>::iterator Insert);
  402. /// Update a loop after packaging irreducible SCCs inside of it.
  403. ///
  404. /// Update \c OuterLoop. Before finding irreducible control flow, it was
  405. /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
  406. /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged
  407. /// up need to be removed from \a OuterLoop::Nodes.
  408. void updateLoopWithIrreducible(LoopData &OuterLoop);
  409. /// Distribute mass according to a distribution.
  410. ///
  411. /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
  412. /// backedges and exits are stored in its entry in Loops.
  413. ///
  414. /// Mass is distributed in parallel from two copies of the source mass.
  415. void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
  416. Distribution &Dist);
  417. /// Compute the loop scale for a loop.
  418. void computeLoopScale(LoopData &Loop);
  419. /// Adjust the mass of all headers in an irreducible loop.
  420. ///
  421. /// Initially, irreducible loops are assumed to distribute their mass
  422. /// equally among its headers. This can lead to wrong frequency estimates
  423. /// since some headers may be executed more frequently than others.
  424. ///
  425. /// This adjusts header mass distribution so it matches the weights of
  426. /// the backedges going into each of the loop headers.
  427. void adjustLoopHeaderMass(LoopData &Loop);
  428. void distributeIrrLoopHeaderMass(Distribution &Dist);
  429. /// Package up a loop.
  430. void packageLoop(LoopData &Loop);
  431. /// Unwrap loops.
  432. void unwrapLoops();
  433. /// Finalize frequency metrics.
  434. ///
  435. /// Calculates final frequencies and cleans up no-longer-needed data
  436. /// structures.
  437. void finalizeMetrics();
  438. /// Clear all memory.
  439. void clear();
  440. virtual std::string getBlockName(const BlockNode &Node) const;
  441. std::string getLoopName(const LoopData &Loop) const;
  442. virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
  443. void dump() const { print(dbgs()); }
  444. Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
  445. BlockFrequency getBlockFreq(const BlockNode &Node) const;
  446. Optional<uint64_t> getBlockProfileCount(const Function &F,
  447. const BlockNode &Node,
  448. bool AllowSynthetic = false) const;
  449. Optional<uint64_t> getProfileCountFromFreq(const Function &F,
  450. uint64_t Freq,
  451. bool AllowSynthetic = false) const;
  452. bool isIrrLoopHeader(const BlockNode &Node);
  453. void setBlockFreq(const BlockNode &Node, uint64_t Freq);
  454. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
  455. raw_ostream &printBlockFreq(raw_ostream &OS,
  456. const BlockFrequency &Freq) const;
  457. uint64_t getEntryFreq() const {
  458. assert(!Freqs.empty());
  459. return Freqs[0].Integer;
  460. }
  461. };
  462. namespace bfi_detail {
  463. template <class BlockT> struct TypeMap {};
  464. template <> struct TypeMap<BasicBlock> {
  465. using BlockT = BasicBlock;
  466. using BlockKeyT = AssertingVH<const BasicBlock>;
  467. using FunctionT = Function;
  468. using BranchProbabilityInfoT = BranchProbabilityInfo;
  469. using LoopT = Loop;
  470. using LoopInfoT = LoopInfo;
  471. };
  472. template <> struct TypeMap<MachineBasicBlock> {
  473. using BlockT = MachineBasicBlock;
  474. using BlockKeyT = const MachineBasicBlock *;
  475. using FunctionT = MachineFunction;
  476. using BranchProbabilityInfoT = MachineBranchProbabilityInfo;
  477. using LoopT = MachineLoop;
  478. using LoopInfoT = MachineLoopInfo;
  479. };
  480. template <class BlockT, class BFIImplT>
  481. class BFICallbackVH;
  482. /// Get the name of a MachineBasicBlock.
  483. ///
  484. /// Get the name of a MachineBasicBlock. It's templated so that including from
  485. /// CodeGen is unnecessary (that would be a layering issue).
  486. ///
  487. /// This is used mainly for debug output. The name is similar to
  488. /// MachineBasicBlock::getFullName(), but skips the name of the function.
  489. template <class BlockT> std::string getBlockName(const BlockT *BB) {
  490. assert(BB && "Unexpected nullptr");
  491. auto MachineName = "BB" + Twine(BB->getNumber());
  492. if (BB->getBasicBlock())
  493. return (MachineName + "[" + BB->getName() + "]").str();
  494. return MachineName.str();
  495. }
  496. /// Get the name of a BasicBlock.
  497. template <> inline std::string getBlockName(const BasicBlock *BB) {
  498. assert(BB && "Unexpected nullptr");
  499. return BB->getName().str();
  500. }
  501. /// Graph of irreducible control flow.
  502. ///
  503. /// This graph is used for determining the SCCs in a loop (or top-level
  504. /// function) that has irreducible control flow.
  505. ///
  506. /// During the block frequency algorithm, the local graphs are defined in a
  507. /// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
  508. /// graphs for most edges, but getting others from \a LoopData::ExitMap. The
  509. /// latter only has successor information.
  510. ///
  511. /// \a IrreducibleGraph makes this graph explicit. It's in a form that can use
  512. /// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
  513. /// and it explicitly lists predecessors and successors. The initialization
  514. /// that relies on \c MachineBasicBlock is defined in the header.
  515. struct IrreducibleGraph {
  516. using BFIBase = BlockFrequencyInfoImplBase;
  517. BFIBase &BFI;
  518. using BlockNode = BFIBase::BlockNode;
  519. struct IrrNode {
  520. BlockNode Node;
  521. unsigned NumIn = 0;
  522. std::deque<const IrrNode *> Edges;
  523. IrrNode(const BlockNode &Node) : Node(Node) {}
  524. using iterator = std::deque<const IrrNode *>::const_iterator;
  525. iterator pred_begin() const { return Edges.begin(); }
  526. iterator succ_begin() const { return Edges.begin() + NumIn; }
  527. iterator pred_end() const { return succ_begin(); }
  528. iterator succ_end() const { return Edges.end(); }
  529. };
  530. BlockNode Start;
  531. const IrrNode *StartIrr = nullptr;
  532. std::vector<IrrNode> Nodes;
  533. SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
  534. /// Construct an explicit graph containing irreducible control flow.
  535. ///
  536. /// Construct an explicit graph of the control flow in \c OuterLoop (or the
  537. /// top-level function, if \c OuterLoop is \c nullptr). Uses \c
  538. /// addBlockEdges to add block successors that have not been packaged into
  539. /// loops.
  540. ///
  541. /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
  542. /// user of this.
  543. template <class BlockEdgesAdder>
  544. IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
  545. BlockEdgesAdder addBlockEdges) : BFI(BFI) {
  546. initialize(OuterLoop, addBlockEdges);
  547. }
  548. template <class BlockEdgesAdder>
  549. void initialize(const BFIBase::LoopData *OuterLoop,
  550. BlockEdgesAdder addBlockEdges);
  551. void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
  552. void addNodesInFunction();
  553. void addNode(const BlockNode &Node) {
  554. Nodes.emplace_back(Node);
  555. BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
  556. }
  557. void indexNodes();
  558. template <class BlockEdgesAdder>
  559. void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
  560. BlockEdgesAdder addBlockEdges);
  561. void addEdge(IrrNode &Irr, const BlockNode &Succ,
  562. const BFIBase::LoopData *OuterLoop);
  563. };
  564. template <class BlockEdgesAdder>
  565. void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
  566. BlockEdgesAdder addBlockEdges) {
  567. if (OuterLoop) {
  568. addNodesInLoop(*OuterLoop);
  569. for (auto N : OuterLoop->Nodes)
  570. addEdges(N, OuterLoop, addBlockEdges);
  571. } else {
  572. addNodesInFunction();
  573. for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
  574. addEdges(Index, OuterLoop, addBlockEdges);
  575. }
  576. StartIrr = Lookup[Start.Index];
  577. }
  578. template <class BlockEdgesAdder>
  579. void IrreducibleGraph::addEdges(const BlockNode &Node,
  580. const BFIBase::LoopData *OuterLoop,
  581. BlockEdgesAdder addBlockEdges) {
  582. auto L = Lookup.find(Node.Index);
  583. if (L == Lookup.end())
  584. return;
  585. IrrNode &Irr = *L->second;
  586. const auto &Working = BFI.Working[Node.Index];
  587. if (Working.isAPackage())
  588. for (const auto &I : Working.Loop->Exits)
  589. addEdge(Irr, I.first, OuterLoop);
  590. else
  591. addBlockEdges(*this, Irr, OuterLoop);
  592. }
  593. } // end namespace bfi_detail
  594. /// Shared implementation for block frequency analysis.
  595. ///
  596. /// This is a shared implementation of BlockFrequencyInfo and
  597. /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
  598. /// blocks.
  599. ///
  600. /// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
  601. /// which is called the header. A given loop, L, can have sub-loops, which are
  602. /// loops within the subgraph of L that exclude its header. (A "trivial" SCC
  603. /// consists of a single block that does not have a self-edge.)
  604. ///
  605. /// In addition to loops, this algorithm has limited support for irreducible
  606. /// SCCs, which are SCCs with multiple entry blocks. Irreducible SCCs are
  607. /// discovered on the fly, and modelled as loops with multiple headers.
  608. ///
  609. /// The headers of irreducible sub-SCCs consist of its entry blocks and all
  610. /// nodes that are targets of a backedge within it (excluding backedges within
  611. /// true sub-loops). Block frequency calculations act as if a block is
  612. /// inserted that intercepts all the edges to the headers. All backedges and
  613. /// entries point to this block. Its successors are the headers, which split
  614. /// the frequency evenly.
  615. ///
  616. /// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
  617. /// separates mass distribution from loop scaling, and dithers to eliminate
  618. /// probability mass loss.
  619. ///
  620. /// The implementation is split between BlockFrequencyInfoImpl, which knows the
  621. /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
  622. /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
  623. /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
  624. /// reverse-post order. This gives two advantages: it's easy to compare the
  625. /// relative ordering of two nodes, and maps keyed on BlockT can be represented
  626. /// by vectors.
  627. ///
  628. /// This algorithm is O(V+E), unless there is irreducible control flow, in
  629. /// which case it's O(V*E) in the worst case.
  630. ///
  631. /// These are the main stages:
  632. ///
  633. /// 0. Reverse post-order traversal (\a initializeRPOT()).
  634. ///
  635. /// Run a single post-order traversal and save it (in reverse) in RPOT.
  636. /// All other stages make use of this ordering. Save a lookup from BlockT
  637. /// to BlockNode (the index into RPOT) in Nodes.
  638. ///
  639. /// 1. Loop initialization (\a initializeLoops()).
  640. ///
  641. /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
  642. /// the algorithm. In particular, store the immediate members of each loop
  643. /// in reverse post-order.
  644. ///
  645. /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
  646. ///
  647. /// For each loop (bottom-up), distribute mass through the DAG resulting
  648. /// from ignoring backedges and treating sub-loops as a single pseudo-node.
  649. /// Track the backedge mass distributed to the loop header, and use it to
  650. /// calculate the loop scale (number of loop iterations). Immediate
  651. /// members that represent sub-loops will already have been visited and
  652. /// packaged into a pseudo-node.
  653. ///
  654. /// Distributing mass in a loop is a reverse-post-order traversal through
  655. /// the loop. Start by assigning full mass to the Loop header. For each
  656. /// node in the loop:
  657. ///
  658. /// - Fetch and categorize the weight distribution for its successors.
  659. /// If this is a packaged-subloop, the weight distribution is stored
  660. /// in \a LoopData::Exits. Otherwise, fetch it from
  661. /// BranchProbabilityInfo.
  662. ///
  663. /// - Each successor is categorized as \a Weight::Local, a local edge
  664. /// within the current loop, \a Weight::Backedge, a backedge to the
  665. /// loop header, or \a Weight::Exit, any successor outside the loop.
  666. /// The weight, the successor, and its category are stored in \a
  667. /// Distribution. There can be multiple edges to each successor.
  668. ///
  669. /// - If there's a backedge to a non-header, there's an irreducible SCC.
  670. /// The usual flow is temporarily aborted. \a
  671. /// computeIrreducibleMass() finds the irreducible SCCs within the
  672. /// loop, packages them up, and restarts the flow.
  673. ///
  674. /// - Normalize the distribution: scale weights down so that their sum
  675. /// is 32-bits, and coalesce multiple edges to the same node.
  676. ///
  677. /// - Distribute the mass accordingly, dithering to minimize mass loss,
  678. /// as described in \a distributeMass().
  679. ///
  680. /// In the case of irreducible loops, instead of a single loop header,
  681. /// there will be several. The computation of backedge masses is similar
  682. /// but instead of having a single backedge mass, there will be one
  683. /// backedge per loop header. In these cases, each backedge will carry
  684. /// a mass proportional to the edge weights along the corresponding
  685. /// path.
  686. ///
  687. /// At the end of propagation, the full mass assigned to the loop will be
  688. /// distributed among the loop headers proportionally according to the
  689. /// mass flowing through their backedges.
  690. ///
  691. /// Finally, calculate the loop scale from the accumulated backedge mass.
  692. ///
  693. /// 3. Distribute mass in the function (\a computeMassInFunction()).
  694. ///
  695. /// Finally, distribute mass through the DAG resulting from packaging all
  696. /// loops in the function. This uses the same algorithm as distributing
  697. /// mass in a loop, except that there are no exit or backedge edges.
  698. ///
  699. /// 4. Unpackage loops (\a unwrapLoops()).
  700. ///
  701. /// Initialize each block's frequency to a floating point representation of
  702. /// its mass.
  703. ///
  704. /// Visit loops top-down, scaling the frequencies of its immediate members
  705. /// by the loop's pseudo-node's frequency.
  706. ///
  707. /// 5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
  708. ///
  709. /// Using the min and max frequencies as a guide, translate floating point
  710. /// frequencies to an appropriate range in uint64_t.
  711. ///
  712. /// It has some known flaws.
  713. ///
  714. /// - The model of irreducible control flow is a rough approximation.
  715. ///
  716. /// Modelling irreducible control flow exactly involves setting up and
  717. /// solving a group of infinite geometric series. Such precision is
  718. /// unlikely to be worthwhile, since most of our algorithms give up on
  719. /// irreducible control flow anyway.
  720. ///
  721. /// Nevertheless, we might find that we need to get closer. Here's a sort
  722. /// of TODO list for the model with diminishing returns, to be completed as
  723. /// necessary.
  724. ///
  725. /// - The headers for the \a LoopData representing an irreducible SCC
  726. /// include non-entry blocks. When these extra blocks exist, they
  727. /// indicate a self-contained irreducible sub-SCC. We could treat them
  728. /// as sub-loops, rather than arbitrarily shoving the problematic
  729. /// blocks into the headers of the main irreducible SCC.
  730. ///
  731. /// - Entry frequencies are assumed to be evenly split between the
  732. /// headers of a given irreducible SCC, which is the only option if we
  733. /// need to compute mass in the SCC before its parent loop. Instead,
  734. /// we could partially compute mass in the parent loop, and stop when
  735. /// we get to the SCC. Here, we have the correct ratio of entry
  736. /// masses, which we can use to adjust their relative frequencies.
  737. /// Compute mass in the SCC, and then continue propagation in the
  738. /// parent.
  739. ///
  740. /// - We can propagate mass iteratively through the SCC, for some fixed
  741. /// number of iterations. Each iteration starts by assigning the entry
  742. /// blocks their backedge mass from the prior iteration. The final
  743. /// mass for each block (and each exit, and the total backedge mass
  744. /// used for computing loop scale) is the sum of all iterations.
  745. /// (Running this until fixed point would "solve" the geometric
  746. /// series by simulation.)
  747. template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
  748. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  749. friend struct bfi_detail::BlockEdgesAdder<BT>;
  750. using BlockT = typename bfi_detail::TypeMap<BT>::BlockT;
  751. using BlockKeyT = typename bfi_detail::TypeMap<BT>::BlockKeyT;
  752. using FunctionT = typename bfi_detail::TypeMap<BT>::FunctionT;
  753. using BranchProbabilityInfoT =
  754. typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT;
  755. using LoopT = typename bfi_detail::TypeMap<BT>::LoopT;
  756. using LoopInfoT = typename bfi_detail::TypeMap<BT>::LoopInfoT;
  757. using Successor = GraphTraits<const BlockT *>;
  758. using Predecessor = GraphTraits<Inverse<const BlockT *>>;
  759. using BFICallbackVH =
  760. bfi_detail::BFICallbackVH<BlockT, BlockFrequencyInfoImpl>;
  761. const BranchProbabilityInfoT *BPI = nullptr;
  762. const LoopInfoT *LI = nullptr;
  763. const FunctionT *F = nullptr;
  764. // All blocks in reverse postorder.
  765. std::vector<const BlockT *> RPOT;
  766. DenseMap<BlockKeyT, std::pair<BlockNode, BFICallbackVH>> Nodes;
  767. using rpot_iterator = typename std::vector<const BlockT *>::const_iterator;
  768. rpot_iterator rpot_begin() const { return RPOT.begin(); }
  769. rpot_iterator rpot_end() const { return RPOT.end(); }
  770. size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
  771. BlockNode getNode(const rpot_iterator &I) const {
  772. return BlockNode(getIndex(I));
  773. }
  774. BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB).first; }
  775. const BlockT *getBlock(const BlockNode &Node) const {
  776. assert(Node.Index < RPOT.size());
  777. return RPOT[Node.Index];
  778. }
  779. /// Run (and save) a post-order traversal.
  780. ///
  781. /// Saves a reverse post-order traversal of all the nodes in \a F.
  782. void initializeRPOT();
  783. /// Initialize loop data.
  784. ///
  785. /// Build up \a Loops using \a LoopInfo. \a LoopInfo gives us a mapping from
  786. /// each block to the deepest loop it's in, but we need the inverse. For each
  787. /// loop, we store in reverse post-order its "immediate" members, defined as
  788. /// the header, the headers of immediate sub-loops, and all other blocks in
  789. /// the loop that are not in sub-loops.
  790. void initializeLoops();
  791. /// Propagate to a block's successors.
  792. ///
  793. /// In the context of distributing mass through \c OuterLoop, divide the mass
  794. /// currently assigned to \c Node between its successors.
  795. ///
  796. /// \return \c true unless there's an irreducible backedge.
  797. bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
  798. /// Compute mass in a particular loop.
  799. ///
  800. /// Assign mass to \c Loop's header, and then for each block in \c Loop in
  801. /// reverse post-order, distribute mass to its successors. Only visits nodes
  802. /// that have not been packaged into sub-loops.
  803. ///
  804. /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
  805. /// \return \c true unless there's an irreducible backedge.
  806. bool computeMassInLoop(LoopData &Loop);
  807. /// Try to compute mass in the top-level function.
  808. ///
  809. /// Assign mass to the entry block, and then for each block in reverse
  810. /// post-order, distribute mass to its successors. Skips nodes that have
  811. /// been packaged into loops.
  812. ///
  813. /// \pre \a computeMassInLoops() has been called.
  814. /// \return \c true unless there's an irreducible backedge.
  815. bool tryToComputeMassInFunction();
  816. /// Compute mass in (and package up) irreducible SCCs.
  817. ///
  818. /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
  819. /// of \c Insert), and call \a computeMassInLoop() on each of them.
  820. ///
  821. /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
  822. ///
  823. /// \pre \a computeMassInLoop() has been called for each subloop of \c
  824. /// OuterLoop.
  825. /// \pre \c Insert points at the last loop successfully processed by \a
  826. /// computeMassInLoop().
  827. /// \pre \c OuterLoop has irreducible SCCs.
  828. void computeIrreducibleMass(LoopData *OuterLoop,
  829. std::list<LoopData>::iterator Insert);
  830. /// Compute mass in all loops.
  831. ///
  832. /// For each loop bottom-up, call \a computeMassInLoop().
  833. ///
  834. /// \a computeMassInLoop() aborts (and returns \c false) on loops that
  835. /// contain a irreducible sub-SCCs. Use \a computeIrreducibleMass() and then
  836. /// re-enter \a computeMassInLoop().
  837. ///
  838. /// \post \a computeMassInLoop() has returned \c true for every loop.
  839. void computeMassInLoops();
  840. /// Compute mass in the top-level function.
  841. ///
  842. /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
  843. /// compute mass in the top-level function.
  844. ///
  845. /// \post \a tryToComputeMassInFunction() has returned \c true.
  846. void computeMassInFunction();
  847. std::string getBlockName(const BlockNode &Node) const override {
  848. return bfi_detail::getBlockName(getBlock(Node));
  849. }
  850. public:
  851. BlockFrequencyInfoImpl() = default;
  852. const FunctionT *getFunction() const { return F; }
  853. void calculate(const FunctionT &F, const BranchProbabilityInfoT &BPI,
  854. const LoopInfoT &LI);
  855. using BlockFrequencyInfoImplBase::getEntryFreq;
  856. BlockFrequency getBlockFreq(const BlockT *BB) const {
  857. return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
  858. }
  859. Optional<uint64_t> getBlockProfileCount(const Function &F,
  860. const BlockT *BB,
  861. bool AllowSynthetic = false) const {
  862. return BlockFrequencyInfoImplBase::getBlockProfileCount(F, getNode(BB),
  863. AllowSynthetic);
  864. }
  865. Optional<uint64_t> getProfileCountFromFreq(const Function &F,
  866. uint64_t Freq,
  867. bool AllowSynthetic = false) const {
  868. return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq,
  869. AllowSynthetic);
  870. }
  871. bool isIrrLoopHeader(const BlockT *BB) {
  872. return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
  873. }
  874. void setBlockFreq(const BlockT *BB, uint64_t Freq);
  875. void forgetBlock(const BlockT *BB) {
  876. // We don't erase corresponding items from `Freqs`, `RPOT` and other to
  877. // avoid invalidating indices. Doing so would have saved some memory, but
  878. // it's not worth it.
  879. Nodes.erase(BB);
  880. }
  881. Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
  882. return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
  883. }
  884. const BranchProbabilityInfoT &getBPI() const { return *BPI; }
  885. /// Print the frequencies for the current function.
  886. ///
  887. /// Prints the frequencies for the blocks in the current function.
  888. ///
  889. /// Blocks are printed in the natural iteration order of the function, rather
  890. /// than reverse post-order. This provides two advantages: writing -analyze
  891. /// tests is easier (since blocks come out in source order), and even
  892. /// unreachable blocks are printed.
  893. ///
  894. /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
  895. /// we need to override it here.
  896. raw_ostream &print(raw_ostream &OS) const override;
  897. using BlockFrequencyInfoImplBase::dump;
  898. using BlockFrequencyInfoImplBase::printBlockFreq;
  899. raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
  900. return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  901. }
  902. void verifyMatch(BlockFrequencyInfoImpl<BT> &Other) const;
  903. };
  904. namespace bfi_detail {
  905. template <class BFIImplT>
  906. class BFICallbackVH<BasicBlock, BFIImplT> : public CallbackVH {
  907. BFIImplT *BFIImpl;
  908. public:
  909. BFICallbackVH() = default;
  910. BFICallbackVH(const BasicBlock *BB, BFIImplT *BFIImpl)
  911. : CallbackVH(BB), BFIImpl(BFIImpl) {}
  912. virtual ~BFICallbackVH() = default;
  913. void deleted() override {
  914. BFIImpl->forgetBlock(cast<BasicBlock>(getValPtr()));
  915. }
  916. };
  917. /// Dummy implementation since MachineBasicBlocks aren't Values, so ValueHandles
  918. /// don't apply to them.
  919. template <class BFIImplT>
  920. class BFICallbackVH<MachineBasicBlock, BFIImplT> {
  921. public:
  922. BFICallbackVH() = default;
  923. BFICallbackVH(const MachineBasicBlock *, BFIImplT *) {}
  924. };
  925. } // end namespace bfi_detail
  926. template <class BT>
  927. void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
  928. const BranchProbabilityInfoT &BPI,
  929. const LoopInfoT &LI) {
  930. // Save the parameters.
  931. this->BPI = &BPI;
  932. this->LI = &LI;
  933. this->F = &F;
  934. // Clean up left-over data structures.
  935. BlockFrequencyInfoImplBase::clear();
  936. RPOT.clear();
  937. Nodes.clear();
  938. // Initialize.
  939. LLVM_DEBUG(dbgs() << "\nblock-frequency: " << F.getName()
  940. << "\n================="
  941. << std::string(F.getName().size(), '=') << "\n");
  942. initializeRPOT();
  943. initializeLoops();
  944. // Visit loops in post-order to find the local mass distribution, and then do
  945. // the full function.
  946. computeMassInLoops();
  947. computeMassInFunction();
  948. unwrapLoops();
  949. finalizeMetrics();
  950. if (CheckBFIUnknownBlockQueries) {
  951. // To detect BFI queries for unknown blocks, add entries for unreachable
  952. // blocks, if any. This is to distinguish between known/existing unreachable
  953. // blocks and unknown blocks.
  954. for (const BlockT &BB : F)
  955. if (!Nodes.count(&BB))
  956. setBlockFreq(&BB, 0);
  957. }
  958. }
  959. template <class BT>
  960. void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, uint64_t Freq) {
  961. if (Nodes.count(BB))
  962. BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq);
  963. else {
  964. // If BB is a newly added block after BFI is done, we need to create a new
  965. // BlockNode for it assigned with a new index. The index can be determined
  966. // by the size of Freqs.
  967. BlockNode NewNode(Freqs.size());
  968. Nodes[BB] = {NewNode, BFICallbackVH(BB, this)};
  969. Freqs.emplace_back();
  970. BlockFrequencyInfoImplBase::setBlockFreq(NewNode, Freq);
  971. }
  972. }
  973. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
  974. const BlockT *Entry = &F->front();
  975. RPOT.reserve(F->size());
  976. std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
  977. std::reverse(RPOT.begin(), RPOT.end());
  978. assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
  979. "More nodes in function than Block Frequency Info supports");
  980. LLVM_DEBUG(dbgs() << "reverse-post-order-traversal\n");
  981. for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
  982. BlockNode Node = getNode(I);
  983. LLVM_DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node)
  984. << "\n");
  985. Nodes[*I] = {Node, BFICallbackVH(*I, this)};
  986. }
  987. Working.reserve(RPOT.size());
  988. for (size_t Index = 0; Index < RPOT.size(); ++Index)
  989. Working.emplace_back(Index);
  990. Freqs.resize(RPOT.size());
  991. }
  992. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
  993. LLVM_DEBUG(dbgs() << "loop-detection\n");
  994. if (LI->empty())
  995. return;
  996. // Visit loops top down and assign them an index.
  997. std::deque<std::pair<const LoopT *, LoopData *>> Q;
  998. for (const LoopT *L : *LI)
  999. Q.emplace_back(L, nullptr);
  1000. while (!Q.empty()) {
  1001. const LoopT *Loop = Q.front().first;
  1002. LoopData *Parent = Q.front().second;
  1003. Q.pop_front();
  1004. BlockNode Header = getNode(Loop->getHeader());
  1005. assert(Header.isValid());
  1006. Loops.emplace_back(Parent, Header);
  1007. Working[Header.Index].Loop = &Loops.back();
  1008. LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
  1009. for (const LoopT *L : *Loop)
  1010. Q.emplace_back(L, &Loops.back());
  1011. }
  1012. // Visit nodes in reverse post-order and add them to their deepest containing
  1013. // loop.
  1014. for (size_t Index = 0; Index < RPOT.size(); ++Index) {
  1015. // Loop headers have already been mostly mapped.
  1016. if (Working[Index].isLoopHeader()) {
  1017. LoopData *ContainingLoop = Working[Index].getContainingLoop();
  1018. if (ContainingLoop)
  1019. ContainingLoop->Nodes.push_back(Index);
  1020. continue;
  1021. }
  1022. const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
  1023. if (!Loop)
  1024. continue;
  1025. // Add this node to its containing loop's member list.
  1026. BlockNode Header = getNode(Loop->getHeader());
  1027. assert(Header.isValid());
  1028. const auto &HeaderData = Working[Header.Index];
  1029. assert(HeaderData.isLoopHeader());
  1030. Working[Index].Loop = HeaderData.Loop;
  1031. HeaderData.Loop->Nodes.push_back(Index);
  1032. LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header)
  1033. << ": member = " << getBlockName(Index) << "\n");
  1034. }
  1035. }
  1036. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
  1037. // Visit loops with the deepest first, and the top-level loops last.
  1038. for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
  1039. if (computeMassInLoop(*L))
  1040. continue;
  1041. auto Next = std::next(L);
  1042. computeIrreducibleMass(&*L, L.base());
  1043. L = std::prev(Next);
  1044. if (computeMassInLoop(*L))
  1045. continue;
  1046. llvm_unreachable("unhandled irreducible control flow");
  1047. }
  1048. }
  1049. template <class BT>
  1050. bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
  1051. // Compute mass in loop.
  1052. LLVM_DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
  1053. if (Loop.isIrreducible()) {
  1054. LLVM_DEBUG(dbgs() << "isIrreducible = true\n");
  1055. Distribution Dist;
  1056. unsigned NumHeadersWithWeight = 0;
  1057. Optional<uint64_t> MinHeaderWeight;
  1058. DenseSet<uint32_t> HeadersWithoutWeight;
  1059. HeadersWithoutWeight.reserve(Loop.NumHeaders);
  1060. for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
  1061. auto &HeaderNode = Loop.Nodes[H];
  1062. const BlockT *Block = getBlock(HeaderNode);
  1063. IsIrrLoopHeader.set(Loop.Nodes[H].Index);
  1064. Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
  1065. if (!HeaderWeight) {
  1066. LLVM_DEBUG(dbgs() << "Missing irr loop header metadata on "
  1067. << getBlockName(HeaderNode) << "\n");
  1068. HeadersWithoutWeight.insert(H);
  1069. continue;
  1070. }
  1071. LLVM_DEBUG(dbgs() << getBlockName(HeaderNode)
  1072. << " has irr loop header weight "
  1073. << HeaderWeight.getValue() << "\n");
  1074. NumHeadersWithWeight++;
  1075. uint64_t HeaderWeightValue = HeaderWeight.getValue();
  1076. if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
  1077. MinHeaderWeight = HeaderWeightValue;
  1078. if (HeaderWeightValue) {
  1079. Dist.addLocal(HeaderNode, HeaderWeightValue);
  1080. }
  1081. }
  1082. // As a heuristic, if some headers don't have a weight, give them the
  1083. // minimum weight seen (not to disrupt the existing trends too much by
  1084. // using a weight that's in the general range of the other headers' weights,
  1085. // and the minimum seems to perform better than the average.)
  1086. // FIXME: better update in the passes that drop the header weight.
  1087. // If no headers have a weight, give them even weight (use weight 1).
  1088. if (!MinHeaderWeight)
  1089. MinHeaderWeight = 1;
  1090. for (uint32_t H : HeadersWithoutWeight) {
  1091. auto &HeaderNode = Loop.Nodes[H];
  1092. assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() &&
  1093. "Shouldn't have a weight metadata");
  1094. uint64_t MinWeight = MinHeaderWeight.getValue();
  1095. LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to "
  1096. << getBlockName(HeaderNode) << "\n");
  1097. if (MinWeight)
  1098. Dist.addLocal(HeaderNode, MinWeight);
  1099. }
  1100. distributeIrrLoopHeaderMass(Dist);
  1101. for (const BlockNode &M : Loop.Nodes)
  1102. if (!propagateMassToSuccessors(&Loop, M))
  1103. llvm_unreachable("unhandled irreducible control flow");
  1104. if (NumHeadersWithWeight == 0)
  1105. // No headers have a metadata. Adjust header mass.
  1106. adjustLoopHeaderMass(Loop);
  1107. } else {
  1108. Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
  1109. if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
  1110. llvm_unreachable("irreducible control flow to loop header!?");
  1111. for (const BlockNode &M : Loop.members())
  1112. if (!propagateMassToSuccessors(&Loop, M))
  1113. // Irreducible backedge.
  1114. return false;
  1115. }
  1116. computeLoopScale(Loop);
  1117. packageLoop(Loop);
  1118. return true;
  1119. }
  1120. template <class BT>
  1121. bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
  1122. // Compute mass in function.
  1123. LLVM_DEBUG(dbgs() << "compute-mass-in-function\n");
  1124. assert(!Working.empty() && "no blocks in function");
  1125. assert(!Working[0].isLoopHeader() && "entry block is a loop header");
  1126. Working[0].getMass() = BlockMass::getFull();
  1127. for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
  1128. // Check for nodes that have been packaged.
  1129. BlockNode Node = getNode(I);
  1130. if (Working[Node.Index].isPackaged())
  1131. continue;
  1132. if (!propagateMassToSuccessors(nullptr, Node))
  1133. return false;
  1134. }
  1135. return true;
  1136. }
  1137. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
  1138. if (tryToComputeMassInFunction())
  1139. return;
  1140. computeIrreducibleMass(nullptr, Loops.begin());
  1141. if (tryToComputeMassInFunction())
  1142. return;
  1143. llvm_unreachable("unhandled irreducible control flow");
  1144. }
  1145. /// \note This should be a lambda, but that crashes GCC 4.7.
  1146. namespace bfi_detail {
  1147. template <class BT> struct BlockEdgesAdder {
  1148. using BlockT = BT;
  1149. using LoopData = BlockFrequencyInfoImplBase::LoopData;
  1150. using Successor = GraphTraits<const BlockT *>;
  1151. const BlockFrequencyInfoImpl<BT> &BFI;
  1152. explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
  1153. : BFI(BFI) {}
  1154. void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
  1155. const LoopData *OuterLoop) {
  1156. const BlockT *BB = BFI.RPOT[Irr.Node.Index];
  1157. for (const auto Succ : children<const BlockT *>(BB))
  1158. G.addEdge(Irr, BFI.getNode(Succ), OuterLoop);
  1159. }
  1160. };
  1161. } // end namespace bfi_detail
  1162. template <class BT>
  1163. void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
  1164. LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
  1165. LLVM_DEBUG(dbgs() << "analyze-irreducible-in-";
  1166. if (OuterLoop) dbgs()
  1167. << "loop: " << getLoopName(*OuterLoop) << "\n";
  1168. else dbgs() << "function\n");
  1169. using namespace bfi_detail;
  1170. // Ideally, addBlockEdges() would be declared here as a lambda, but that
  1171. // crashes GCC 4.7.
  1172. BlockEdgesAdder<BT> addBlockEdges(*this);
  1173. IrreducibleGraph G(*this, OuterLoop, addBlockEdges);
  1174. for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
  1175. computeMassInLoop(L);
  1176. if (!OuterLoop)
  1177. return;
  1178. updateLoopWithIrreducible(*OuterLoop);
  1179. }
  1180. // A helper function that converts a branch probability into weight.
  1181. inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) {
  1182. return Prob.getNumerator();
  1183. }
  1184. template <class BT>
  1185. bool
  1186. BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
  1187. const BlockNode &Node) {
  1188. LLVM_DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
  1189. // Calculate probability for successors.
  1190. Distribution Dist;
  1191. if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
  1192. assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
  1193. if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
  1194. // Irreducible backedge.
  1195. return false;
  1196. } else {
  1197. const BlockT *BB = getBlock(Node);
  1198. for (auto SI = GraphTraits<const BlockT *>::child_begin(BB),
  1199. SE = GraphTraits<const BlockT *>::child_end(BB);
  1200. SI != SE; ++SI)
  1201. if (!addToDist(
  1202. Dist, OuterLoop, Node, getNode(*SI),
  1203. getWeightFromBranchProb(BPI->getEdgeProbability(BB, SI))))
  1204. // Irreducible backedge.
  1205. return false;
  1206. }
  1207. // Distribute mass to successors, saving exit and backedge data in the
  1208. // loop header.
  1209. distributeMass(Node, OuterLoop, Dist);
  1210. return true;
  1211. }
  1212. template <class BT>
  1213. raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
  1214. if (!F)
  1215. return OS;
  1216. OS << "block-frequency-info: " << F->getName() << "\n";
  1217. for (const BlockT &BB : *F) {
  1218. OS << " - " << bfi_detail::getBlockName(&BB) << ": float = ";
  1219. getFloatingBlockFreq(&BB).print(OS, 5)
  1220. << ", int = " << getBlockFreq(&BB).getFrequency();
  1221. if (Optional<uint64_t> ProfileCount =
  1222. BlockFrequencyInfoImplBase::getBlockProfileCount(
  1223. F->getFunction(), getNode(&BB)))
  1224. OS << ", count = " << ProfileCount.getValue();
  1225. if (Optional<uint64_t> IrrLoopHeaderWeight =
  1226. BB.getIrrLoopHeaderWeight())
  1227. OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue();
  1228. OS << "\n";
  1229. }
  1230. // Add an extra newline for readability.
  1231. OS << "\n";
  1232. return OS;
  1233. }
  1234. template <class BT>
  1235. void BlockFrequencyInfoImpl<BT>::verifyMatch(
  1236. BlockFrequencyInfoImpl<BT> &Other) const {
  1237. bool Match = true;
  1238. DenseMap<const BlockT *, BlockNode> ValidNodes;
  1239. DenseMap<const BlockT *, BlockNode> OtherValidNodes;
  1240. for (auto &Entry : Nodes) {
  1241. const BlockT *BB = Entry.first;
  1242. if (BB) {
  1243. ValidNodes[BB] = Entry.second.first;
  1244. }
  1245. }
  1246. for (auto &Entry : Other.Nodes) {
  1247. const BlockT *BB = Entry.first;
  1248. if (BB) {
  1249. OtherValidNodes[BB] = Entry.second.first;
  1250. }
  1251. }
  1252. unsigned NumValidNodes = ValidNodes.size();
  1253. unsigned NumOtherValidNodes = OtherValidNodes.size();
  1254. if (NumValidNodes != NumOtherValidNodes) {
  1255. Match = false;
  1256. dbgs() << "Number of blocks mismatch: " << NumValidNodes << " vs "
  1257. << NumOtherValidNodes << "\n";
  1258. } else {
  1259. for (auto &Entry : ValidNodes) {
  1260. const BlockT *BB = Entry.first;
  1261. BlockNode Node = Entry.second;
  1262. if (OtherValidNodes.count(BB)) {
  1263. BlockNode OtherNode = OtherValidNodes[BB];
  1264. const auto &Freq = Freqs[Node.Index];
  1265. const auto &OtherFreq = Other.Freqs[OtherNode.Index];
  1266. if (Freq.Integer != OtherFreq.Integer) {
  1267. Match = false;
  1268. dbgs() << "Freq mismatch: " << bfi_detail::getBlockName(BB) << " "
  1269. << Freq.Integer << " vs " << OtherFreq.Integer << "\n";
  1270. }
  1271. } else {
  1272. Match = false;
  1273. dbgs() << "Block " << bfi_detail::getBlockName(BB) << " index "
  1274. << Node.Index << " does not exist in Other.\n";
  1275. }
  1276. }
  1277. // If there's a valid node in OtherValidNodes that's not in ValidNodes,
  1278. // either the above num check or the check on OtherValidNodes will fail.
  1279. }
  1280. if (!Match) {
  1281. dbgs() << "This\n";
  1282. print(dbgs());
  1283. dbgs() << "Other\n";
  1284. Other.print(dbgs());
  1285. }
  1286. assert(Match && "BFI mismatch");
  1287. }
  1288. // Graph trait base class for block frequency information graph
  1289. // viewer.
  1290. enum GVDAGType { GVDT_None, GVDT_Fraction, GVDT_Integer, GVDT_Count };
  1291. template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
  1292. struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits {
  1293. using GTraits = GraphTraits<BlockFrequencyInfoT *>;
  1294. using NodeRef = typename GTraits::NodeRef;
  1295. using EdgeIter = typename GTraits::ChildIteratorType;
  1296. using NodeIter = typename GTraits::nodes_iterator;
  1297. uint64_t MaxFrequency = 0;
  1298. explicit BFIDOTGraphTraitsBase(bool isSimple = false)
  1299. : DefaultDOTGraphTraits(isSimple) {}
  1300. static StringRef getGraphName(const BlockFrequencyInfoT *G) {
  1301. return G->getFunction()->getName();
  1302. }
  1303. std::string getNodeAttributes(NodeRef Node, const BlockFrequencyInfoT *Graph,
  1304. unsigned HotPercentThreshold = 0) {
  1305. std::string Result;
  1306. if (!HotPercentThreshold)
  1307. return Result;
  1308. // Compute MaxFrequency on the fly:
  1309. if (!MaxFrequency) {
  1310. for (NodeIter I = GTraits::nodes_begin(Graph),
  1311. E = GTraits::nodes_end(Graph);
  1312. I != E; ++I) {
  1313. NodeRef N = *I;
  1314. MaxFrequency =
  1315. std::max(MaxFrequency, Graph->getBlockFreq(N).getFrequency());
  1316. }
  1317. }
  1318. BlockFrequency Freq = Graph->getBlockFreq(Node);
  1319. BlockFrequency HotFreq =
  1320. (BlockFrequency(MaxFrequency) *
  1321. BranchProbability::getBranchProbability(HotPercentThreshold, 100));
  1322. if (Freq < HotFreq)
  1323. return Result;
  1324. raw_string_ostream OS(Result);
  1325. OS << "color=\"red\"";
  1326. OS.flush();
  1327. return Result;
  1328. }
  1329. std::string getNodeLabel(NodeRef Node, const BlockFrequencyInfoT *Graph,
  1330. GVDAGType GType, int layout_order = -1) {
  1331. std::string Result;
  1332. raw_string_ostream OS(Result);
  1333. if (layout_order != -1)
  1334. OS << Node->getName() << "[" << layout_order << "] : ";
  1335. else
  1336. OS << Node->getName() << " : ";
  1337. switch (GType) {
  1338. case GVDT_Fraction:
  1339. Graph->printBlockFreq(OS, Node);
  1340. break;
  1341. case GVDT_Integer:
  1342. OS << Graph->getBlockFreq(Node).getFrequency();
  1343. break;
  1344. case GVDT_Count: {
  1345. auto Count = Graph->getBlockProfileCount(Node);
  1346. if (Count)
  1347. OS << Count.getValue();
  1348. else
  1349. OS << "Unknown";
  1350. break;
  1351. }
  1352. case GVDT_None:
  1353. llvm_unreachable("If we are not supposed to render a graph we should "
  1354. "never reach this point.");
  1355. }
  1356. return Result;
  1357. }
  1358. std::string getEdgeAttributes(NodeRef Node, EdgeIter EI,
  1359. const BlockFrequencyInfoT *BFI,
  1360. const BranchProbabilityInfoT *BPI,
  1361. unsigned HotPercentThreshold = 0) {
  1362. std::string Str;
  1363. if (!BPI)
  1364. return Str;
  1365. BranchProbability BP = BPI->getEdgeProbability(Node, EI);
  1366. uint32_t N = BP.getNumerator();
  1367. uint32_t D = BP.getDenominator();
  1368. double Percent = 100.0 * N / D;
  1369. raw_string_ostream OS(Str);
  1370. OS << format("label=\"%.1f%%\"", Percent);
  1371. if (HotPercentThreshold) {
  1372. BlockFrequency EFreq = BFI->getBlockFreq(Node) * BP;
  1373. BlockFrequency HotFreq = BlockFrequency(MaxFrequency) *
  1374. BranchProbability(HotPercentThreshold, 100);
  1375. if (EFreq >= HotFreq) {
  1376. OS << ",color=\"red\"";
  1377. }
  1378. }
  1379. OS.flush();
  1380. return Str;
  1381. }
  1382. };
  1383. } // end namespace llvm
  1384. #undef DEBUG_TYPE
  1385. #endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  1386. #ifdef __GNUC__
  1387. #pragma GCC diagnostic pop
  1388. #endif