SampleProfileInference.cpp 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954
  1. //===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements a profile inference algorithm. Given an incomplete and
  10. // possibly imprecise block counts, the algorithm reconstructs realistic block
  11. // and edge counts that satisfy flow conservation rules, while minimally modify
  12. // input block counts.
  13. //
  14. //===----------------------------------------------------------------------===//
  15. #include "llvm/Transforms/Utils/SampleProfileInference.h"
  16. #include "llvm/ADT/BitVector.h"
  17. #include "llvm/Support/Debug.h"
  18. #include <queue>
  19. #include <set>
  20. using namespace llvm;
  21. #define DEBUG_TYPE "sample-profile-inference"
  22. namespace {
/// A value indicating an infinite flow/capacity/weight of a block/edge.
/// Not using numeric_limits<int64_t>::max(), as the values can be summed up
/// during the execution. (2^50 leaves ample headroom below INT64_MAX for
/// such sums.)
static constexpr int64_t INF = ((int64_t)1) << 50;
  27. /// The minimum-cost maximum flow algorithm.
  28. ///
  29. /// The algorithm finds the maximum flow of minimum cost on a given (directed)
  30. /// network using a modified version of the classical Moore-Bellman-Ford
  31. /// approach. The algorithm applies a number of augmentation iterations in which
  32. /// flow is sent along paths of positive capacity from the source to the sink.
  33. /// The worst-case time complexity of the implementation is O(v(f)*m*n), where
  34. /// where m is the number of edges, n is the number of vertices, and v(f) is the
  35. /// value of the maximum flow. However, the observed running time on typical
  36. /// instances is sub-quadratic, that is, o(n^2).
  37. ///
  38. /// The input is a set of edges with specified costs and capacities, and a pair
  39. /// of nodes (source and sink). The output is the flow along each edge of the
  40. /// minimum total cost respecting the given edge capacities.
  41. class MinCostMaxFlow {
  42. public:
  43. // Initialize algorithm's data structures for a network of a given size.
  44. void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
  45. Source = SourceNode;
  46. Target = SinkNode;
  47. Nodes = std::vector<Node>(NodeCount);
  48. Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
  49. }
  50. // Run the algorithm.
  51. int64_t run() {
  52. // Find an augmenting path and update the flow along the path
  53. size_t AugmentationIters = 0;
  54. while (findAugmentingPath()) {
  55. augmentFlowAlongPath();
  56. AugmentationIters++;
  57. }
  58. // Compute the total flow and its cost
  59. int64_t TotalCost = 0;
  60. int64_t TotalFlow = 0;
  61. for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
  62. for (auto &Edge : Edges[Src]) {
  63. if (Edge.Flow > 0) {
  64. TotalCost += Edge.Cost * Edge.Flow;
  65. if (Src == Source)
  66. TotalFlow += Edge.Flow;
  67. }
  68. }
  69. }
  70. LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
  71. << " iterations with " << TotalFlow << " total flow"
  72. << " of " << TotalCost << " cost\n");
  73. (void)TotalFlow;
  74. return TotalCost;
  75. }
  76. /// Adding an edge to the network with a specified capacity and a cost.
  77. /// Multiple edges between a pair of nodes are allowed but self-edges
  78. /// are not supported.
  79. void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
  80. assert(Capacity > 0 && "adding an edge of zero capacity");
  81. assert(Src != Dst && "loop edge are not supported");
  82. Edge SrcEdge;
  83. SrcEdge.Dst = Dst;
  84. SrcEdge.Cost = Cost;
  85. SrcEdge.Capacity = Capacity;
  86. SrcEdge.Flow = 0;
  87. SrcEdge.RevEdgeIndex = Edges[Dst].size();
  88. Edge DstEdge;
  89. DstEdge.Dst = Src;
  90. DstEdge.Cost = -Cost;
  91. DstEdge.Capacity = 0;
  92. DstEdge.Flow = 0;
  93. DstEdge.RevEdgeIndex = Edges[Src].size();
  94. Edges[Src].push_back(SrcEdge);
  95. Edges[Dst].push_back(DstEdge);
  96. }
  97. /// Adding an edge to the network of infinite capacity and a given cost.
  98. void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
  99. addEdge(Src, Dst, INF, Cost);
  100. }
  101. /// Get the total flow from a given source node.
  102. /// Returns a list of pairs (target node, amount of flow to the target).
  103. const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
  104. std::vector<std::pair<uint64_t, int64_t>> Flow;
  105. for (auto &Edge : Edges[Src]) {
  106. if (Edge.Flow > 0)
  107. Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
  108. }
  109. return Flow;
  110. }
  111. /// Get the total flow between a pair of nodes.
  112. int64_t getFlow(uint64_t Src, uint64_t Dst) const {
  113. int64_t Flow = 0;
  114. for (auto &Edge : Edges[Src]) {
  115. if (Edge.Dst == Dst) {
  116. Flow += Edge.Flow;
  117. }
  118. }
  119. return Flow;
  120. }
  121. /// A cost of increasing a block's count by one.
  122. static constexpr int64_t AuxCostInc = 10;
  123. /// A cost of decreasing a block's count by one.
  124. static constexpr int64_t AuxCostDec = 20;
  125. /// A cost of increasing a count of zero-weight block by one.
  126. static constexpr int64_t AuxCostIncZero = 11;
  127. /// A cost of increasing the entry block's count by one.
  128. static constexpr int64_t AuxCostIncEntry = 40;
  129. /// A cost of decreasing the entry block's count by one.
  130. static constexpr int64_t AuxCostDecEntry = 10;
  131. /// A cost of taking an unlikely jump.
  132. static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30;
  133. private:
  134. /// Check for existence of an augmenting path with a positive capacity.
  135. bool findAugmentingPath() {
  136. // Initialize data structures
  137. for (auto &Node : Nodes) {
  138. Node.Distance = INF;
  139. Node.ParentNode = uint64_t(-1);
  140. Node.ParentEdgeIndex = uint64_t(-1);
  141. Node.Taken = false;
  142. }
  143. std::queue<uint64_t> Queue;
  144. Queue.push(Source);
  145. Nodes[Source].Distance = 0;
  146. Nodes[Source].Taken = true;
  147. while (!Queue.empty()) {
  148. uint64_t Src = Queue.front();
  149. Queue.pop();
  150. Nodes[Src].Taken = false;
  151. // Although the residual network contains edges with negative costs
  152. // (in particular, backward edges), it can be shown that there are no
  153. // negative-weight cycles and the following two invariants are maintained:
  154. // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes V,
  155. // where Dist is the length of the shortest path between two nodes. This
  156. // allows to prune the search-space of the path-finding algorithm using
  157. // the following early-stop criteria:
  158. // -- If we find a path with zero-distance from Source to Target, stop the
  159. // search, as the path is the shortest since Dist[Source, Target] >= 0;
  160. // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
  161. // process node V, as it is guaranteed _not_ to be on a shortest path
  162. // from Source to Target; it follows from inequalities
  163. // Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
  164. // >= Dist[Source, V]
  165. if (Nodes[Target].Distance == 0)
  166. break;
  167. if (Nodes[Src].Distance > Nodes[Target].Distance)
  168. continue;
  169. // Process adjacent edges
  170. for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
  171. auto &Edge = Edges[Src][EdgeIdx];
  172. if (Edge.Flow < Edge.Capacity) {
  173. uint64_t Dst = Edge.Dst;
  174. int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
  175. if (Nodes[Dst].Distance > NewDistance) {
  176. // Update the distance and the parent node/edge
  177. Nodes[Dst].Distance = NewDistance;
  178. Nodes[Dst].ParentNode = Src;
  179. Nodes[Dst].ParentEdgeIndex = EdgeIdx;
  180. // Add the node to the queue, if it is not there yet
  181. if (!Nodes[Dst].Taken) {
  182. Queue.push(Dst);
  183. Nodes[Dst].Taken = true;
  184. }
  185. }
  186. }
  187. }
  188. }
  189. return Nodes[Target].Distance != INF;
  190. }
  191. /// Update the current flow along the augmenting path.
  192. void augmentFlowAlongPath() {
  193. // Find path capacity
  194. int64_t PathCapacity = INF;
  195. uint64_t Now = Target;
  196. while (Now != Source) {
  197. uint64_t Pred = Nodes[Now].ParentNode;
  198. auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
  199. PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
  200. Now = Pred;
  201. }
  202. assert(PathCapacity > 0 && "found an incorrect augmenting path");
  203. // Update the flow along the path
  204. Now = Target;
  205. while (Now != Source) {
  206. uint64_t Pred = Nodes[Now].ParentNode;
  207. auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
  208. auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];
  209. Edge.Flow += PathCapacity;
  210. RevEdge.Flow -= PathCapacity;
  211. Now = Pred;
  212. }
  213. }
  214. /// A node in a flow network.
  215. struct Node {
  216. /// The cost of the cheapest path from the source to the current node.
  217. int64_t Distance;
  218. /// The node preceding the current one in the path.
  219. uint64_t ParentNode;
  220. /// The index of the edge between ParentNode and the current node.
  221. uint64_t ParentEdgeIndex;
  222. /// An indicator of whether the current node is in a queue.
  223. bool Taken;
  224. };
  225. /// An edge in a flow network.
  226. struct Edge {
  227. /// The cost of the edge.
  228. int64_t Cost;
  229. /// The capacity of the edge.
  230. int64_t Capacity;
  231. /// The current flow on the edge.
  232. int64_t Flow;
  233. /// The destination node of the edge.
  234. uint64_t Dst;
  235. /// The index of the reverse edge between Dst and the current node.
  236. uint64_t RevEdgeIndex;
  237. };
  238. /// The set of network nodes.
  239. std::vector<Node> Nodes;
  240. /// The set of network edges.
  241. std::vector<std::vector<Edge>> Edges;
  242. /// Source node of the flow.
  243. uint64_t Source;
  244. /// Target (sink) node of the flow.
  245. uint64_t Target;
  246. };
  247. /// A post-processing adjustment of control flow. It applies two steps by
  248. /// rerouting some flow and making it more realistic:
  249. ///
  250. /// - First, it removes all isolated components ("islands") with a positive flow
  251. /// that are unreachable from the entry block. For every such component, we
  252. /// find the shortest path from the entry to an exit passing through the component,
  253. /// and increase the flow by one unit along the path.
  254. ///
  255. /// - Second, it identifies all "unknown subgraphs" consisting of basic blocks
  256. /// with no sampled counts. Then it rebalances the flow that goes through such
  257. /// a subgraph so that each branch is taken with probability 50%.
  258. /// An unknown subgraph is such that for every two nodes u and v:
  259. /// - u dominates v and u is not unknown;
  260. /// - v post-dominates u; and
  261. /// - all inner-nodes of all (u,v)-paths are unknown.
  262. ///
class FlowAdjuster {
public:
  /// Wrap the given function; requires Func.Entry to index a real entry block.
  FlowAdjuster(FlowFunction &Func) : Func(Func) {
    assert(Func.Blocks[Func.Entry].isEntry() &&
           "incorrect index of the entry block");
  }

  /// Run the post-processing.
  void run() {
    // Adjust the flow to get rid of isolated components.
    joinIsolatedComponents();

    // Rebalance the flow inside unknown subgraphs.
    rebalanceUnknownSubgraphs();
  }

private:
  /// Attach every positive-flow component that is unreachable from the entry
  /// to the main flow by pushing one extra unit of flow along a shortest
  /// entry-to-exit path through the component.
  void joinIsolatedComponents() {
    // Find blocks that are reachable from the source.
    auto Visited = BitVector(NumBlocks(), false);
    findReachable(Func.Entry, Visited);

    // Iterate over all non-reachable blocks and adjust their weights.
    for (uint64_t I = 0; I < NumBlocks(); I++) {
      auto &Block = Func.Blocks[I];
      if (Block.Flow > 0 && !Visited[I]) {
        // Find a path from the entry to an exit passing through the block I.
        auto Path = findShortestPath(I);
        // Increase the flow along the path by one unit.
        assert(Path.size() > 0 && Path[0]->Source == Func.Entry &&
               "incorrectly computed path adjusting control flow");
        Func.Blocks[Func.Entry].Flow += 1;
        for (auto &Jump : Path) {
          Jump->Flow += 1;
          Func.Blocks[Jump->Target].Flow += 1;
          // Update reachability: the path may have made more blocks reachable.
          findReachable(Jump->Target, Visited);
        }
      }
    }
  }

  /// Run BFS from a given block along the jumps with a positive flow and mark
  /// all reachable blocks.
  void findReachable(uint64_t Src, BitVector &Visited) {
    if (Visited[Src])
      return;
    std::queue<uint64_t> Queue;
    Queue.push(Src);
    Visited[Src] = true;
    while (!Queue.empty()) {
      Src = Queue.front();
      Queue.pop();
      for (auto Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        // Only jumps that already carry flow are traversed.
        if (Jump->Flow > 0 && !Visited[Dst]) {
          Queue.push(Dst);
          Visited[Dst] = true;
        }
      }
    }
  }

  /// Find the shortest path from the entry block to an exit block passing
  /// through a given block.
  std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) {
    // A path from the entry block to BlockIdx.
    auto ForwardPath = findShortestPath(Func.Entry, BlockIdx);
    // A path from BlockIdx to an exit block.
    auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock);

    // Concatenate the two paths.
    std::vector<FlowJump *> Result;
    Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end());
    Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end());
    return Result;
  }

  /// Apply the Dijkstra algorithm to find the shortest path from a given
  /// Source to a given Target block.
  /// If Target == AnyExitBlock (-1), then the path ends at an exit block.
  std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) {
    // Quit early, if possible.
    if (Source == Target)
      return std::vector<FlowJump *>();
    if (Func.Blocks[Source].isExit() && Target == AnyExitBlock)
      return std::vector<FlowJump *>();

    // Initialize data structures.
    auto Distance = std::vector<int64_t>(NumBlocks(), INF);
    auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr);
    Distance[Source] = 0;
    // An ordered set of (distance, block) pairs serves as the priority queue;
    // erase+insert implements the decrease-key operation.
    std::set<std::pair<uint64_t, uint64_t>> Queue;
    Queue.insert(std::make_pair(Distance[Source], Source));

    // Run the Dijkstra algorithm.
    while (!Queue.empty()) {
      uint64_t Src = Queue.begin()->second;
      Queue.erase(Queue.begin());
      // If we found a solution, quit early.
      if (Src == Target ||
          (Func.Blocks[Src].isExit() && Target == AnyExitBlock))
        break;

      for (auto Jump : Func.Blocks[Src].SuccJumps) {
        uint64_t Dst = Jump->Target;
        int64_t JumpDist = jumpDistance(Jump);
        if (Distance[Dst] > Distance[Src] + JumpDist) {
          Queue.erase(std::make_pair(Distance[Dst], Dst));

          Distance[Dst] = Distance[Src] + JumpDist;
          Parent[Dst] = Jump;

          Queue.insert(std::make_pair(Distance[Dst], Dst));
        }
      }
    }

    // If Target is not provided, find the closest reachable exit block.
    if (Target == AnyExitBlock) {
      for (uint64_t I = 0; I < NumBlocks(); I++) {
        if (Func.Blocks[I].isExit() && Parent[I] != nullptr) {
          if (Target == AnyExitBlock || Distance[Target] > Distance[I]) {
            Target = I;
          }
        }
      }
    }
    assert(Parent[Target] != nullptr && "a path does not exist");

    // Extract the constructed path by walking parent jumps backwards.
    std::vector<FlowJump *> Result;
    uint64_t Now = Target;
    while (Now != Source) {
      assert(Now == Parent[Now]->Target && "incorrect parent jump");
      Result.push_back(Parent[Now]);
      Now = Parent[Now]->Source;
    }
    // Reverse the path, since it is extracted from Target to Source.
    std::reverse(Result.begin(), Result.end());
    return Result;
  }

  /// A distance of a path for a given jump.
  /// In order to incite the path to use blocks/jumps with large positive flow,
  /// and avoid changing branch probability of outgoing edges drastically,
  /// set the distance as follows:
  ///   if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0)
  ///   if Block.Weight > 0, then distance = 1
  ///   otherwise distance >> 1
  int64_t jumpDistance(FlowJump *Jump) const {
    int64_t BaseDistance = 100;
    // Unlikely jumps get a prohibitively large distance.
    if (Jump->IsUnlikely)
      return MinCostMaxFlow::AuxCostUnlikely;
    if (Jump->Flow > 0)
      return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0);
    if (Func.Blocks[Jump->Target].Weight > 0)
      return BaseDistance;
    // Zero-weight targets: scale by block count so such jumps are avoided
    // whenever any alternative exists.
    return BaseDistance * (NumBlocks() + 1);
  };

  /// The number of basic blocks in the function.
  uint64_t NumBlocks() const { return Func.Blocks.size(); }

  /// Rebalance unknown subgraphs so that the flow is split evenly across the
  /// outgoing branches of every block of the subgraph. The method iterates over
  /// blocks with known weight and identifies unknown subgraphs rooted at the
  /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
  void rebalanceUnknownSubgraphs() {
    // Try to find unknown subgraphs from each block.
    for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
      auto SrcBlock = &Func.Blocks[I];
      // Verify if rebalancing rooted at SrcBlock is feasible.
      if (!canRebalanceAtRoot(SrcBlock))
        continue;

      // Find an unknown subgraph starting at SrcBlock. Along the way,
      // fill in known destinations and intermediate unknown blocks.
      std::vector<FlowBlock *> UnknownBlocks;
      std::vector<FlowBlock *> KnownDstBlocks;
      findUnknownSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks);

      // Verify if rebalancing of the subgraph is feasible. If the search is
      // successful, find the unique destination block (which can be null).
      FlowBlock *DstBlock = nullptr;
      if (!canRebalanceSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks,
                                DstBlock))
        continue;

      // We cannot rebalance subgraphs containing cycles among unknown blocks.
      if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownBlocks))
        continue;

      // Rebalance the flow.
      rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownBlocks);
    }
  }

  /// Verify if rebalancing rooted at a given block is possible.
  bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
    // Do not attempt to find unknown subgraphs from an unknown or a
    // zero-flow block.
    if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
      return false;

    // Do not attempt to process subgraphs from a block w/o unknown successors.
    bool HasUnknownSuccs = false;
    for (auto Jump : SrcBlock->SuccJumps) {
      if (Func.Blocks[Jump->Target].UnknownWeight) {
        HasUnknownSuccs = true;
        break;
      }
    }
    if (!HasUnknownSuccs)
      return false;

    return true;
  }

  /// Find an unknown subgraph starting at block SrcBlock. The method sets
  /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
  void findUnknownSubgraph(const FlowBlock *SrcBlock,
                           std::vector<FlowBlock *> &KnownDstBlocks,
                           std::vector<FlowBlock *> &UnknownBlocks) {
    // Run BFS from SrcBlock and make sure all paths are going through unknown
    // blocks and end at a non-unknown DstBlock.
    auto Visited = BitVector(NumBlocks(), false);
    std::queue<uint64_t> Queue;

    Queue.push(SrcBlock->Index);
    Visited[SrcBlock->Index] = true;
    while (!Queue.empty()) {
      auto &Block = Func.Blocks[Queue.front()];
      Queue.pop();
      // Process blocks reachable from Block.
      for (auto Jump : Block.SuccJumps) {
        // If Jump can be ignored, skip it.
        if (ignoreJump(SrcBlock, nullptr, Jump))
          continue;

        uint64_t Dst = Jump->Target;
        // If Dst has been visited, skip Jump.
        if (Visited[Dst])
          continue;

        // Process block Dst: known-weight blocks terminate the search,
        // unknown ones are explored further.
        Visited[Dst] = true;
        if (!Func.Blocks[Dst].UnknownWeight) {
          KnownDstBlocks.push_back(&Func.Blocks[Dst]);
        } else {
          Queue.push(Dst);
          UnknownBlocks.push_back(&Func.Blocks[Dst]);
        }
      }
    }
  }

  /// Verify if rebalancing of the subgraph is feasible. If the checks are
  /// successful, set the unique destination block, DstBlock (can be null).
  bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
                            const std::vector<FlowBlock *> &KnownDstBlocks,
                            const std::vector<FlowBlock *> &UnknownBlocks,
                            FlowBlock *&DstBlock) {
    // If the list of unknown blocks is empty, we don't need rebalancing.
    if (UnknownBlocks.empty())
      return false;

    // If there are multiple known sinks, we can't rebalance.
    if (KnownDstBlocks.size() > 1)
      return false;
    DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();

    // Verify sinks of the subgraph.
    for (auto Block : UnknownBlocks) {
      if (Block->SuccJumps.empty()) {
        // If there are multiple (known and unknown) sinks, we can't rebalance.
        if (DstBlock != nullptr)
          return false;
        continue;
      }
      size_t NumIgnoredJumps = 0;
      for (auto Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          NumIgnoredJumps++;
      }
      // If there is a non-sink block in UnknownBlocks with all jumps ignored,
      // then we can't rebalance.
      if (NumIgnoredJumps == Block->SuccJumps.size())
        return false;
    }

    return true;
  }

  /// Decide whether the Jump is ignored while processing an unknown subgraph
  /// rooted at basic block SrcBlock with the destination block, DstBlock.
  bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                  const FlowJump *Jump) {
    // Ignore unlikely jumps with zero flow.
    if (Jump->IsUnlikely && Jump->Flow == 0)
      return true;

    auto JumpSource = &Func.Blocks[Jump->Source];
    auto JumpTarget = &Func.Blocks[Jump->Target];

    // Do not ignore jumps coming into DstBlock.
    if (DstBlock != nullptr && JumpTarget == DstBlock)
      return false;

    // Ignore jumps out of SrcBlock to known blocks.
    if (!JumpTarget->UnknownWeight && JumpSource == SrcBlock)
      return true;

    // Ignore jumps to known blocks with zero flow.
    if (!JumpTarget->UnknownWeight && JumpTarget->Flow == 0)
      return true;

    return false;
  }

  /// Verify if the given unknown subgraph is acyclic, and if yes, reorder
  /// UnknownBlocks in the topological order (so that all jumps are "forward").
  bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                         std::vector<FlowBlock *> &UnknownBlocks) {
    // Extract local in-degrees in the considered subgraph.
    auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
    auto fillInDegree = [&](const FlowBlock *Block) {
      for (auto Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        LocalInDegree[Jump->Target]++;
      }
    };
    fillInDegree(SrcBlock);
    for (auto Block : UnknownBlocks) {
      fillInDegree(Block);
    }
    // A loop containing SrcBlock.
    if (LocalInDegree[SrcBlock->Index] > 0)
      return false;

    // Kahn's algorithm: peel off zero in-degree blocks; a cycle leaves some
    // blocks with positive in-degree that never enter the queue.
    std::vector<FlowBlock *> AcyclicOrder;
    std::queue<uint64_t> Queue;
    Queue.push(SrcBlock->Index);
    while (!Queue.empty()) {
      FlowBlock *Block = &Func.Blocks[Queue.front()];
      Queue.pop();
      // Stop propagation once we reach DstBlock, if any.
      if (DstBlock != nullptr && Block == DstBlock)
        break;

      // Keep an acyclic order of unknown blocks.
      if (Block->UnknownWeight && Block != SrcBlock)
        AcyclicOrder.push_back(Block);

      // Add to the queue all successors with zero local in-degree.
      for (auto Jump : Block->SuccJumps) {
        if (ignoreJump(SrcBlock, DstBlock, Jump))
          continue;
        uint64_t Dst = Jump->Target;
        LocalInDegree[Dst]--;
        if (LocalInDegree[Dst] == 0) {
          Queue.push(Dst);
        }
      }
    }

    // If there is a cycle in the subgraph, AcyclicOrder contains only a subset
    // of all blocks.
    if (UnknownBlocks.size() != AcyclicOrder.size())
      return false;
    UnknownBlocks = AcyclicOrder;
    return true;
  }

  /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
  /// having UnknownBlocks intermediate blocks.
  void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
                                const FlowBlock *DstBlock,
                                const std::vector<FlowBlock *> &UnknownBlocks) {
    assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");

    // Distribute flow from the source block.
    uint64_t BlockFlow = 0;
    // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps.
    for (auto Jump : SrcBlock->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      BlockFlow += Jump->Flow;
    }
    rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);

    // Distribute flow from the remaining blocks (in topological order, so
    // incoming flows are already final).
    for (auto Block : UnknownBlocks) {
      assert(Block->UnknownWeight && "incorrect unknown subgraph");
      uint64_t BlockFlow = 0;
      // Block's flow is the sum of incoming flows.
      for (auto Jump : Block->PredJumps) {
        BlockFlow += Jump->Flow;
      }
      Block->Flow = BlockFlow;
      rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
    }
  }

  /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
  /// and ending at DstBlock.
  void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
                      const FlowBlock *Block, uint64_t BlockFlow) {
    // Process all successor jumps and update corresponding flow values.
    size_t BlockDegree = 0;
    for (auto Jump : Block->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      BlockDegree++;
    }
    // If all successor jumps of the block are ignored, skip it.
    if (DstBlock == nullptr && BlockDegree == 0)
      return;
    assert(BlockDegree > 0 && "all outgoing jumps are ignored");

    // Each of the Block's successors gets the following amount of flow.
    // Rounding the value up so that all flow is propagated.
    uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
    for (auto Jump : Block->SuccJumps) {
      if (ignoreJump(SrcBlock, DstBlock, Jump))
        continue;
      // Clamp so the sum over all successors equals BlockFlow exactly.
      uint64_t Flow = std::min(SuccFlow, BlockFlow);
      Jump->Flow = Flow;
      BlockFlow -= Flow;
    }
    assert(BlockFlow == 0 && "not all flow is propagated");
  }

  /// A constant indicating an arbitrary exit block of a function.
  static constexpr uint64_t AnyExitBlock = uint64_t(-1);

  /// The function.
  FlowFunction &Func;
};
  651. /// Initializing flow network for a given function.
  652. ///
  653. /// Every block is split into three nodes that are responsible for (i) an
  654. /// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or
  655. /// reduction of the block weight.
  656. void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
  657. uint64_t NumBlocks = Func.Blocks.size();
  658. assert(NumBlocks > 1 && "Too few blocks in a function");
  659. LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n");
  660. // Pre-process data: make sure the entry weight is at least 1
  661. if (Func.Blocks[Func.Entry].Weight == 0) {
  662. Func.Blocks[Func.Entry].Weight = 1;
  663. }
  664. // Introducing dummy source/sink pairs to allow flow circulation.
  665. // The nodes corresponding to blocks of Func have indicies in the range
  666. // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values.
  667. uint64_t S = 3 * NumBlocks;
  668. uint64_t T = S + 1;
  669. uint64_t S1 = S + 2;
  670. uint64_t T1 = S + 3;
  671. Network.initialize(3 * NumBlocks + 4, S1, T1);
  672. // Create three nodes for every block of the function
  673. for (uint64_t B = 0; B < NumBlocks; B++) {
  674. auto &Block = Func.Blocks[B];
  675. assert((!Block.UnknownWeight || Block.Weight == 0 || Block.isEntry()) &&
  676. "non-zero weight of a block w/o weight except for an entry");
  677. // Split every block into two nodes
  678. uint64_t Bin = 3 * B;
  679. uint64_t Bout = 3 * B + 1;
  680. uint64_t Baux = 3 * B + 2;
  681. if (Block.Weight > 0) {
  682. Network.addEdge(S1, Bout, Block.Weight, 0);
  683. Network.addEdge(Bin, T1, Block.Weight, 0);
  684. }
  685. // Edges from S and to T
  686. assert((!Block.isEntry() || !Block.isExit()) &&
  687. "a block cannot be an entry and an exit");
  688. if (Block.isEntry()) {
  689. Network.addEdge(S, Bin, 0);
  690. } else if (Block.isExit()) {
  691. Network.addEdge(Bout, T, 0);
  692. }
  693. // An auxiliary node to allow increase/reduction of block counts:
  694. // We assume that decreasing block counts is more expensive than increasing,
  695. // and thus, setting separate costs here. In the future we may want to tune
  696. // the relative costs so as to maximize the quality of generated profiles.
  697. int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
  698. int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
  699. if (Block.UnknownWeight) {
  700. // Do not penalize changing weights of blocks w/o known profile count
  701. AuxCostInc = 0;
  702. AuxCostDec = 0;
  703. } else {
  704. // Increasing the count for "cold" blocks with zero initial count is more
  705. // expensive than for "hot" ones
  706. if (Block.Weight == 0) {
  707. AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
  708. }
  709. // Modifying the count of the entry block is expensive
  710. if (Block.isEntry()) {
  711. AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
  712. AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
  713. }
  714. }
  715. // For blocks with self-edges, do not penalize a reduction of the count,
  716. // as all of the increase can be attributed to the self-edge
  717. if (Block.HasSelfEdge) {
  718. AuxCostDec = 0;
  719. }
  720. Network.addEdge(Bin, Baux, AuxCostInc);
  721. Network.addEdge(Baux, Bout, AuxCostInc);
  722. if (Block.Weight > 0) {
  723. Network.addEdge(Bout, Baux, AuxCostDec);
  724. Network.addEdge(Baux, Bin, AuxCostDec);
  725. }
  726. }
  727. // Creating edges for every jump
  728. for (auto &Jump : Func.Jumps) {
  729. uint64_t Src = Jump.Source;
  730. uint64_t Dst = Jump.Target;
  731. if (Src != Dst) {
  732. uint64_t SrcOut = 3 * Src + 1;
  733. uint64_t DstIn = 3 * Dst;
  734. uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0;
  735. Network.addEdge(SrcOut, DstIn, Cost);
  736. }
  737. }
  738. // Make sure we have a valid flow circulation
  739. Network.addEdge(T, S, 0);
  740. }
  741. /// Extract resulting block and edge counts from the flow network.
  742. void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) {
  743. uint64_t NumBlocks = Func.Blocks.size();
  744. // Extract resulting block counts
  745. for (uint64_t Src = 0; Src < NumBlocks; Src++) {
  746. auto &Block = Func.Blocks[Src];
  747. uint64_t SrcOut = 3 * Src + 1;
  748. int64_t Flow = 0;
  749. for (auto &Adj : Network.getFlow(SrcOut)) {
  750. uint64_t DstIn = Adj.first;
  751. int64_t DstFlow = Adj.second;
  752. bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2);
  753. if (!IsAuxNode || Block.HasSelfEdge) {
  754. Flow += DstFlow;
  755. }
  756. }
  757. Block.Flow = Flow;
  758. assert(Flow >= 0 && "negative block flow");
  759. }
  760. // Extract resulting jump counts
  761. for (auto &Jump : Func.Jumps) {
  762. uint64_t Src = Jump.Source;
  763. uint64_t Dst = Jump.Target;
  764. int64_t Flow = 0;
  765. if (Src != Dst) {
  766. uint64_t SrcOut = 3 * Src + 1;
  767. uint64_t DstIn = 3 * Dst;
  768. Flow = Network.getFlow(SrcOut, DstIn);
  769. } else {
  770. uint64_t SrcOut = 3 * Src + 1;
  771. uint64_t SrcAux = 3 * Src + 2;
  772. int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux);
  773. if (AuxFlow > 0)
  774. Flow = AuxFlow;
  775. }
  776. Jump.Flow = Flow;
  777. assert(Flow >= 0 && "negative jump flow");
  778. }
  779. }
  780. #ifndef NDEBUG
  781. /// Verify that the computed flow values satisfy flow conservation rules
  782. void verifyWeights(const FlowFunction &Func) {
  783. const uint64_t NumBlocks = Func.Blocks.size();
  784. auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
  785. auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
  786. for (auto &Jump : Func.Jumps) {
  787. InFlow[Jump.Target] += Jump.Flow;
  788. OutFlow[Jump.Source] += Jump.Flow;
  789. }
  790. uint64_t TotalInFlow = 0;
  791. uint64_t TotalOutFlow = 0;
  792. for (uint64_t I = 0; I < NumBlocks; I++) {
  793. auto &Block = Func.Blocks[I];
  794. if (Block.isEntry()) {
  795. TotalInFlow += Block.Flow;
  796. assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
  797. } else if (Block.isExit()) {
  798. TotalOutFlow += Block.Flow;
  799. assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
  800. } else {
  801. assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
  802. assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
  803. }
  804. }
  805. assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
  806. // Verify that there are no isolated flow components
  807. // One could modify FlowFunction to hold edges indexed by the sources, which
  808. // will avoid a creation of the object
  809. auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks);
  810. for (auto &Jump : Func.Jumps) {
  811. if (Jump.Flow > 0) {
  812. PositiveFlowEdges[Jump.Source].push_back(Jump.Target);
  813. }
  814. }
  815. // Run BFS from the source along edges with positive flow
  816. std::queue<uint64_t> Queue;
  817. auto Visited = BitVector(NumBlocks, false);
  818. Queue.push(Func.Entry);
  819. Visited[Func.Entry] = true;
  820. while (!Queue.empty()) {
  821. uint64_t Src = Queue.front();
  822. Queue.pop();
  823. for (uint64_t Dst : PositiveFlowEdges[Src]) {
  824. if (!Visited[Dst]) {
  825. Queue.push(Dst);
  826. Visited[Dst] = true;
  827. }
  828. }
  829. }
  830. // Verify that every block that has a positive flow is reached from the source
  831. // along edges with a positive flow
  832. for (uint64_t I = 0; I < NumBlocks; I++) {
  833. auto &Block = Func.Blocks[I];
  834. assert((Visited[I] || Block.Flow == 0) && "an isolated flow component");
  835. }
  836. }
  837. #endif
  838. } // end of anonymous namespace
  839. /// Apply the profile inference algorithm for a given flow function
  840. void llvm::applyFlowInference(FlowFunction &Func) {
  841. // Create and apply an inference network model
  842. auto InferenceNetwork = MinCostMaxFlow();
  843. initializeNetwork(InferenceNetwork, Func);
  844. InferenceNetwork.run();
  845. // Extract flow values for every block and every edge
  846. extractWeights(InferenceNetwork, Func);
  847. // Post-processing adjustments to the flow
  848. auto Adjuster = FlowAdjuster(Func);
  849. Adjuster.run();
  850. #ifndef NDEBUG
  851. // Verify the result
  852. verifyWeights(Func);
  853. #endif
  854. }