CodeLayout.cpp 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942
  1. //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // ExtTSP - layout of basic blocks with i-cache optimization.
  10. //
  11. // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
  12. // optimizing jump locality and thus processor I-cache utilization. This is
  13. // achieved via increasing the number of fall-through jumps and co-locating
  14. // frequently executed nodes together. The name follows the underlying
  15. // optimization problem, Extended-TSP, which is a generalization of classical
  16. // (maximum) Traveling Salesmen Problem.
  17. //
  18. // The algorithm is a greedy heuristic that works with chains (ordered lists)
  19. // of basic blocks. Initially all chains are isolated basic blocks. On every
  20. // iteration, we pick a pair of chains whose merging yields the biggest increase
  21. // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
  22. // A pair of chains giving the maximum gain is merged into a new chain. The
  23. // procedure stops when there is only one chain left, or when merging does not
  24. // increase ExtTSP. In the latter case, the remaining chains are sorted by
  25. // density in the decreasing order.
  26. //
  27. // An important aspect is the way two chains are merged. Unlike earlier
  28. // algorithms (e.g., based on the approach of Pettis-Hansen), two
  29. // chains, X and Y, are first split into three, X1, X2, and Y. Then we
  30. // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
  31. // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
  32. // This improves the quality of the final result (the search space is larger)
  33. // while keeping the implementation sufficiently fast.
  34. //
  35. // Reference:
  36. // * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
  37. // IEEE Transactions on Computers, 2020
  38. //
  39. //===----------------------------------------------------------------------===//
  40. #include "llvm/Transforms/Utils/CodeLayout.h"
  41. #include "llvm/Support/CommandLine.h"
  42. #include "llvm/Support/Debug.h"
  43. using namespace llvm;
  44. #define DEBUG_TYPE "code-layout"
  45. // Algorithm-specific constants. The values are tuned for the best performance
  46. // of large-scale front-end bound binaries.
  47. static cl::opt<double>
  48. ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
  49. cl::desc("The weight of forward jumps for ExtTSP value"));
  50. static cl::opt<double>
  51. BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
  52. cl::desc("The weight of backward jumps for ExtTSP value"));
  53. static cl::opt<unsigned> ForwardDistance(
  54. "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
  55. cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
  56. static cl::opt<unsigned> BackwardDistance(
  57. "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
  58. cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
  59. // The maximum size of a chain for splitting. Larger values of the threshold
  60. // may yield better quality at the cost of worsen run-time.
  61. static cl::opt<unsigned> ChainSplitThreshold(
  62. "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
  63. cl::desc("The maximum size of a chain to apply splitting"));
  64. // The option enables splitting (large) chains along in-coming and out-going
  65. // jumps. This typically results in a better quality.
  66. static cl::opt<bool> EnableChainSplitAlongJumps(
  67. "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
  68. cl::desc("The maximum size of a chain to apply splitting"));
  69. namespace {
  70. // Epsilon for comparison of doubles.
  71. constexpr double EPS = 1e-8;
  72. // Compute the Ext-TSP score for a jump between a given pair of blocks,
  73. // using their sizes, (estimated) addresses and the jump execution count.
  74. double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
  75. uint64_t Count) {
  76. // Fallthrough
  77. if (SrcAddr + SrcSize == DstAddr) {
  78. // Assume that FallthroughWeight = 1.0 after normalization
  79. return static_cast<double>(Count);
  80. }
  81. // Forward
  82. if (SrcAddr + SrcSize < DstAddr) {
  83. const auto Dist = DstAddr - (SrcAddr + SrcSize);
  84. if (Dist <= ForwardDistance) {
  85. double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
  86. return ForwardWeight * Prob * Count;
  87. }
  88. return 0;
  89. }
  90. // Backward
  91. const auto Dist = SrcAddr + SrcSize - DstAddr;
  92. if (Dist <= BackwardDistance) {
  93. double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
  94. return BackwardWeight * Prob * Count;
  95. }
  96. return 0;
  97. }
  98. /// A type of merging two chains, X and Y. The former chain is split into
  99. /// X1 and X2 and then concatenated with Y in the order specified by the type.
  100. enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
  101. /// The gain of merging two chains, that is, the Ext-TSP score of the merge
  102. /// together with the corresponfiding merge 'type' and 'offset'.
  103. class MergeGainTy {
  104. public:
  105. explicit MergeGainTy() {}
  106. explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
  107. : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
  108. double score() const { return Score; }
  109. size_t mergeOffset() const { return MergeOffset; }
  110. MergeTypeTy mergeType() const { return MergeType; }
  111. // Returns 'true' iff Other is preferred over this.
  112. bool operator<(const MergeGainTy &Other) const {
  113. return (Other.Score > EPS && Other.Score > Score + EPS);
  114. }
  115. // Update the current gain if Other is preferred over this.
  116. void updateIfLessThan(const MergeGainTy &Other) {
  117. if (*this < Other)
  118. *this = Other;
  119. }
  120. private:
  121. double Score{-1.0};
  122. size_t MergeOffset{0};
  123. MergeTypeTy MergeType{MergeTypeTy::X_Y};
  124. };
  125. class Block;
  126. class Jump;
  127. class Chain;
  128. class ChainEdge;
  129. /// A node in the graph, typically corresponding to a basic block in CFG.
  130. class Block {
  131. public:
  132. Block(const Block &) = delete;
  133. Block(Block &&) = default;
  134. Block &operator=(const Block &) = delete;
  135. Block &operator=(Block &&) = default;
  136. // The original index of the block in CFG.
  137. size_t Index{0};
  138. // The index of the block in the current chain.
  139. size_t CurIndex{0};
  140. // Size of the block in the binary.
  141. uint64_t Size{0};
  142. // Execution count of the block in the profile data.
  143. uint64_t ExecutionCount{0};
  144. // Current chain of the node.
  145. Chain *CurChain{nullptr};
  146. // An offset of the block in the current chain.
  147. mutable uint64_t EstimatedAddr{0};
  148. // Forced successor of the block in CFG.
  149. Block *ForcedSucc{nullptr};
  150. // Forced predecessor of the block in CFG.
  151. Block *ForcedPred{nullptr};
  152. // Outgoing jumps from the block.
  153. std::vector<Jump *> OutJumps;
  154. // Incoming jumps to the block.
  155. std::vector<Jump *> InJumps;
  156. public:
  157. explicit Block(size_t Index, uint64_t Size_, uint64_t EC)
  158. : Index(Index), Size(Size_), ExecutionCount(EC) {}
  159. bool isEntry() const { return Index == 0; }
  160. };
  161. /// An arc in the graph, typically corresponding to a jump between two blocks.
  162. class Jump {
  163. public:
  164. Jump(const Jump &) = delete;
  165. Jump(Jump &&) = default;
  166. Jump &operator=(const Jump &) = delete;
  167. Jump &operator=(Jump &&) = default;
  168. // Source block of the jump.
  169. Block *Source;
  170. // Target block of the jump.
  171. Block *Target;
  172. // Execution count of the arc in the profile data.
  173. uint64_t ExecutionCount{0};
  174. public:
  175. explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount)
  176. : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
  177. };
  178. /// A chain (ordered sequence) of blocks.
  179. class Chain {
  180. public:
  181. Chain(const Chain &) = delete;
  182. Chain(Chain &&) = default;
  183. Chain &operator=(const Chain &) = delete;
  184. Chain &operator=(Chain &&) = default;
  185. explicit Chain(uint64_t Id, Block *Block)
  186. : Id(Id), Score(0), Blocks(1, Block) {}
  187. uint64_t id() const { return Id; }
  188. bool isEntry() const { return Blocks[0]->Index == 0; }
  189. double score() const { return Score; }
  190. void setScore(double NewScore) { Score = NewScore; }
  191. const std::vector<Block *> &blocks() const { return Blocks; }
  192. const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const {
  193. return Edges;
  194. }
  195. ChainEdge *getEdge(Chain *Other) const {
  196. for (auto It : Edges) {
  197. if (It.first == Other)
  198. return It.second;
  199. }
  200. return nullptr;
  201. }
  202. void removeEdge(Chain *Other) {
  203. auto It = Edges.begin();
  204. while (It != Edges.end()) {
  205. if (It->first == Other) {
  206. Edges.erase(It);
  207. return;
  208. }
  209. It++;
  210. }
  211. }
  212. void addEdge(Chain *Other, ChainEdge *Edge) {
  213. Edges.push_back(std::make_pair(Other, Edge));
  214. }
  215. void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) {
  216. Blocks = MergedBlocks;
  217. // Update the block's chains
  218. for (size_t Idx = 0; Idx < Blocks.size(); Idx++) {
  219. Blocks[Idx]->CurChain = this;
  220. Blocks[Idx]->CurIndex = Idx;
  221. }
  222. }
  223. void mergeEdges(Chain *Other);
  224. void clear() {
  225. Blocks.clear();
  226. Blocks.shrink_to_fit();
  227. Edges.clear();
  228. Edges.shrink_to_fit();
  229. }
  230. private:
  231. // Unique chain identifier.
  232. uint64_t Id;
  233. // Cached ext-tsp score for the chain.
  234. double Score;
  235. // Blocks of the chain.
  236. std::vector<Block *> Blocks;
  237. // Adjacent chains and corresponding edges (lists of jumps).
  238. std::vector<std::pair<Chain *, ChainEdge *>> Edges;
  239. };
  240. /// An edge in CFG representing jumps between two chains.
  241. /// When blocks are merged into chains, the edges are combined too so that
  242. /// there is always at most one edge between a pair of chains
  243. class ChainEdge {
  244. public:
  245. ChainEdge(const ChainEdge &) = delete;
  246. ChainEdge(ChainEdge &&) = default;
  247. ChainEdge &operator=(const ChainEdge &) = delete;
  248. ChainEdge &operator=(ChainEdge &&) = default;
  249. explicit ChainEdge(Jump *Jump)
  250. : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
  251. Jumps(1, Jump) {}
  252. const std::vector<Jump *> &jumps() const { return Jumps; }
  253. void changeEndpoint(Chain *From, Chain *To) {
  254. if (From == SrcChain)
  255. SrcChain = To;
  256. if (From == DstChain)
  257. DstChain = To;
  258. }
  259. void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
  260. void moveJumps(ChainEdge *Other) {
  261. Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
  262. Other->Jumps.clear();
  263. Other->Jumps.shrink_to_fit();
  264. }
  265. bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
  266. return Src == SrcChain ? CacheValidForward : CacheValidBackward;
  267. }
  268. MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
  269. return Src == SrcChain ? CachedGainForward : CachedGainBackward;
  270. }
  271. void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
  272. if (Src == SrcChain) {
  273. CachedGainForward = MergeGain;
  274. CacheValidForward = true;
  275. } else {
  276. CachedGainBackward = MergeGain;
  277. CacheValidBackward = true;
  278. }
  279. }
  280. void invalidateCache() {
  281. CacheValidForward = false;
  282. CacheValidBackward = false;
  283. }
  284. private:
  285. // Source chain.
  286. Chain *SrcChain{nullptr};
  287. // Destination chain.
  288. Chain *DstChain{nullptr};
  289. // Original jumps in the binary with correspinding execution counts.
  290. std::vector<Jump *> Jumps;
  291. // Cached ext-tsp value for merging the pair of chains.
  292. // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
  293. // we store both values here.
  294. MergeGainTy CachedGainForward;
  295. MergeGainTy CachedGainBackward;
  296. // Whether the cached value must be recomputed.
  297. bool CacheValidForward{false};
  298. bool CacheValidBackward{false};
  299. };
  300. void Chain::mergeEdges(Chain *Other) {
  301. assert(this != Other && "cannot merge a chain with itself");
  302. // Update edges adjacent to chain Other
  303. for (auto EdgeIt : Other->Edges) {
  304. const auto DstChain = EdgeIt.first;
  305. const auto DstEdge = EdgeIt.second;
  306. const auto TargetChain = DstChain == Other ? this : DstChain;
  307. auto CurEdge = getEdge(TargetChain);
  308. if (CurEdge == nullptr) {
  309. DstEdge->changeEndpoint(Other, this);
  310. this->addEdge(TargetChain, DstEdge);
  311. if (DstChain != this && DstChain != Other) {
  312. DstChain->addEdge(this, DstEdge);
  313. }
  314. } else {
  315. CurEdge->moveJumps(DstEdge);
  316. }
  317. // Cleanup leftover edge
  318. if (DstChain != Other) {
  319. DstChain->removeEdge(Other);
  320. }
  321. }
  322. }
  323. using BlockIter = std::vector<Block *>::const_iterator;
  324. /// A wrapper around three chains of blocks; it is used to avoid extra
  325. /// instantiation of the vectors.
  326. class MergedChain {
  327. public:
  328. MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
  329. BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
  330. BlockIter End3 = BlockIter())
  331. : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
  332. End3(End3) {}
  333. template <typename F> void forEach(const F &Func) const {
  334. for (auto It = Begin1; It != End1; It++)
  335. Func(*It);
  336. for (auto It = Begin2; It != End2; It++)
  337. Func(*It);
  338. for (auto It = Begin3; It != End3; It++)
  339. Func(*It);
  340. }
  341. std::vector<Block *> getBlocks() const {
  342. std::vector<Block *> Result;
  343. Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
  344. std::distance(Begin3, End3));
  345. Result.insert(Result.end(), Begin1, End1);
  346. Result.insert(Result.end(), Begin2, End2);
  347. Result.insert(Result.end(), Begin3, End3);
  348. return Result;
  349. }
  350. const Block *getFirstBlock() const { return *Begin1; }
  351. private:
  352. BlockIter Begin1;
  353. BlockIter End1;
  354. BlockIter Begin2;
  355. BlockIter End2;
  356. BlockIter Begin3;
  357. BlockIter End3;
  358. };
  359. /// The implementation of the ExtTSP algorithm.
  360. class ExtTSPImpl {
  361. using EdgeT = std::pair<uint64_t, uint64_t>;
  362. using EdgeCountMap = DenseMap<EdgeT, uint64_t>;
  363. public:
  364. ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes,
  365. const std::vector<uint64_t> &NodeCounts,
  366. const EdgeCountMap &EdgeCounts)
  367. : NumNodes(NumNodes) {
  368. initialize(NodeSizes, NodeCounts, EdgeCounts);
  369. }
  370. /// Run the algorithm and return an optimized ordering of blocks.
  371. void run(std::vector<uint64_t> &Result) {
  372. // Pass 1: Merge blocks with their mutually forced successors
  373. mergeForcedPairs();
  374. // Pass 2: Merge pairs of chains while improving the ExtTSP objective
  375. mergeChainPairs();
  376. // Pass 3: Merge cold blocks to reduce code size
  377. mergeColdChains();
  378. // Collect blocks from all chains
  379. concatChains(Result);
  380. }
  381. private:
  382. /// Initialize the algorithm's data structures.
  383. void initialize(const std::vector<uint64_t> &NodeSizes,
  384. const std::vector<uint64_t> &NodeCounts,
  385. const EdgeCountMap &EdgeCounts) {
  386. // Initialize blocks
  387. AllBlocks.reserve(NumNodes);
  388. for (uint64_t Node = 0; Node < NumNodes; Node++) {
  389. uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
  390. uint64_t ExecutionCount = NodeCounts[Node];
  391. // The execution count of the entry block is set to at least 1
  392. if (Node == 0 && ExecutionCount == 0)
  393. ExecutionCount = 1;
  394. AllBlocks.emplace_back(Node, Size, ExecutionCount);
  395. }
  396. // Initialize jumps between blocks
  397. SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes);
  398. PredNodes = std::vector<std::vector<uint64_t>>(NumNodes);
  399. AllJumps.reserve(EdgeCounts.size());
  400. for (auto It : EdgeCounts) {
  401. auto Pred = It.first.first;
  402. auto Succ = It.first.second;
  403. // Ignore self-edges
  404. if (Pred == Succ)
  405. continue;
  406. SuccNodes[Pred].push_back(Succ);
  407. PredNodes[Succ].push_back(Pred);
  408. auto ExecutionCount = It.second;
  409. if (ExecutionCount > 0) {
  410. auto &Block = AllBlocks[Pred];
  411. auto &SuccBlock = AllBlocks[Succ];
  412. AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
  413. SuccBlock.InJumps.push_back(&AllJumps.back());
  414. Block.OutJumps.push_back(&AllJumps.back());
  415. }
  416. }
  417. // Initialize chains
  418. AllChains.reserve(NumNodes);
  419. HotChains.reserve(NumNodes);
  420. for (auto &Block : AllBlocks) {
  421. AllChains.emplace_back(Block.Index, &Block);
  422. Block.CurChain = &AllChains.back();
  423. if (Block.ExecutionCount > 0) {
  424. HotChains.push_back(&AllChains.back());
  425. }
  426. }
  427. // Initialize chain edges
  428. AllEdges.reserve(AllJumps.size());
  429. for (auto &Block : AllBlocks) {
  430. for (auto &Jump : Block.OutJumps) {
  431. const auto SuccBlock = Jump->Target;
  432. auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain);
  433. // this edge is already present in the graph
  434. if (CurEdge != nullptr) {
  435. assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr);
  436. CurEdge->appendJump(Jump);
  437. continue;
  438. }
  439. // this is a new edge
  440. AllEdges.emplace_back(Jump);
  441. Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
  442. SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back());
  443. }
  444. }
  445. }
  446. /// For a pair of blocks, A and B, block B is the forced successor of A,
  447. /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
  448. /// to B are from A. Such blocks should be adjacent in the optimal ordering;
  449. /// the method finds and merges such pairs of blocks.
  450. void mergeForcedPairs() {
  451. // Find fallthroughs based on edge weights
  452. for (auto &Block : AllBlocks) {
  453. if (SuccNodes[Block.Index].size() == 1 &&
  454. PredNodes[SuccNodes[Block.Index][0]].size() == 1 &&
  455. SuccNodes[Block.Index][0] != 0) {
  456. size_t SuccIndex = SuccNodes[Block.Index][0];
  457. Block.ForcedSucc = &AllBlocks[SuccIndex];
  458. AllBlocks[SuccIndex].ForcedPred = &Block;
  459. }
  460. }
  461. // There might be 'cycles' in the forced dependencies, since profile
  462. // data isn't 100% accurate. Typically this is observed in loops, when the
  463. // loop edges are the hottest successors for the basic blocks of the loop.
  464. // Break the cycles by choosing the block with the smallest index as the
  465. // head. This helps to keep the original order of the loops, which likely
  466. // have already been rotated in the optimized manner.
  467. for (auto &Block : AllBlocks) {
  468. if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr)
  469. continue;
  470. auto SuccBlock = Block.ForcedSucc;
  471. while (SuccBlock != nullptr && SuccBlock != &Block) {
  472. SuccBlock = SuccBlock->ForcedSucc;
  473. }
  474. if (SuccBlock == nullptr)
  475. continue;
  476. // Break the cycle
  477. AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr;
  478. Block.ForcedPred = nullptr;
  479. }
  480. // Merge blocks with their fallthrough successors
  481. for (auto &Block : AllBlocks) {
  482. if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) {
  483. auto CurBlock = &Block;
  484. while (CurBlock->ForcedSucc != nullptr) {
  485. const auto NextBlock = CurBlock->ForcedSucc;
  486. mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
  487. CurBlock = NextBlock;
  488. }
  489. }
  490. }
  491. }
  492. /// Merge pairs of chains while improving the ExtTSP objective.
  493. void mergeChainPairs() {
  494. /// Deterministically compare pairs of chains
  495. auto compareChainPairs = [](const Chain *A1, const Chain *B1,
  496. const Chain *A2, const Chain *B2) {
  497. if (A1 != A2)
  498. return A1->id() < A2->id();
  499. return B1->id() < B2->id();
  500. };
  501. while (HotChains.size() > 1) {
  502. Chain *BestChainPred = nullptr;
  503. Chain *BestChainSucc = nullptr;
  504. auto BestGain = MergeGainTy();
  505. // Iterate over all pairs of chains
  506. for (auto ChainPred : HotChains) {
  507. // Get candidates for merging with the current chain
  508. for (auto EdgeIter : ChainPred->edges()) {
  509. auto ChainSucc = EdgeIter.first;
  510. auto ChainEdge = EdgeIter.second;
  511. // Ignore loop edges
  512. if (ChainPred == ChainSucc)
  513. continue;
  514. // Compute the gain of merging the two chains
  515. auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
  516. if (CurGain.score() <= EPS)
  517. continue;
  518. if (BestGain < CurGain ||
  519. (std::abs(CurGain.score() - BestGain.score()) < EPS &&
  520. compareChainPairs(ChainPred, ChainSucc, BestChainPred,
  521. BestChainSucc))) {
  522. BestGain = CurGain;
  523. BestChainPred = ChainPred;
  524. BestChainSucc = ChainSucc;
  525. }
  526. }
  527. }
  528. // Stop merging when there is no improvement
  529. if (BestGain.score() <= EPS)
  530. break;
  531. // Merge the best pair of chains
  532. mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
  533. BestGain.mergeType());
  534. }
  535. }
  536. /// Merge cold blocks to reduce code size.
  537. void mergeColdChains() {
  538. for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
  539. // Iterating over neighbors in the reverse order to make sure original
  540. // fallthrough jumps are merged first
  541. size_t NumSuccs = SuccNodes[SrcBB].size();
  542. for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
  543. auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
  544. auto SrcChain = AllBlocks[SrcBB].CurChain;
  545. auto DstChain = AllBlocks[DstBB].CurChain;
  546. if (SrcChain != DstChain && !DstChain->isEntry() &&
  547. SrcChain->blocks().back()->Index == SrcBB &&
  548. DstChain->blocks().front()->Index == DstBB) {
  549. mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
  550. }
  551. }
  552. }
  553. }
  554. /// Compute the Ext-TSP score for a given block order and a list of jumps.
  555. double extTSPScore(const MergedChain &MergedBlocks,
  556. const std::vector<Jump *> &Jumps) const {
  557. if (Jumps.empty())
  558. return 0.0;
  559. uint64_t CurAddr = 0;
  560. MergedBlocks.forEach([&](const Block *BB) {
  561. BB->EstimatedAddr = CurAddr;
  562. CurAddr += BB->Size;
  563. });
  564. double Score = 0;
  565. for (auto &Jump : Jumps) {
  566. const auto SrcBlock = Jump->Source;
  567. const auto DstBlock = Jump->Target;
  568. Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
  569. DstBlock->EstimatedAddr, Jump->ExecutionCount);
  570. }
  571. return Score;
  572. }
  573. /// Compute the gain of merging two chains.
  574. ///
  575. /// The function considers all possible ways of merging two chains and
  576. /// computes the one having the largest increase in ExtTSP objective. The
  577. /// result is a pair with the first element being the gain and the second
  578. /// element being the corresponding merging type.
  579. MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
  580. ChainEdge *Edge) const {
  581. if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
  582. return Edge->getCachedMergeGain(ChainPred, ChainSucc);
  583. }
  584. // Precompute jumps between ChainPred and ChainSucc
  585. auto Jumps = Edge->jumps();
  586. auto EdgePP = ChainPred->getEdge(ChainPred);
  587. if (EdgePP != nullptr) {
  588. Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
  589. }
  590. assert(!Jumps.empty() && "trying to merge chains w/o jumps");
  591. // The object holds the best currently chosen gain of merging the two chains
  592. MergeGainTy Gain = MergeGainTy();
  593. /// Given a merge offset and a list of merge types, try to merge two chains
  594. /// and update Gain with a better alternative
  595. auto tryChainMerging = [&](size_t Offset,
  596. const std::vector<MergeTypeTy> &MergeTypes) {
  597. // Skip merging corresponding to concatenation w/o splitting
  598. if (Offset == 0 || Offset == ChainPred->blocks().size())
  599. return;
  600. // Skip merging if it breaks Forced successors
  601. auto BB = ChainPred->blocks()[Offset - 1];
  602. if (BB->ForcedSucc != nullptr)
  603. return;
  604. // Apply the merge, compute the corresponding gain, and update the best
  605. // value, if the merge is beneficial
  606. for (auto &MergeType : MergeTypes) {
  607. Gain.updateIfLessThan(
  608. computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
  609. }
  610. };
  611. // Try to concatenate two chains w/o splitting
  612. Gain.updateIfLessThan(
  613. computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
  614. if (EnableChainSplitAlongJumps) {
  615. // Attach (a part of) ChainPred before the first block of ChainSucc
  616. for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
  617. const auto SrcBlock = Jump->Source;
  618. if (SrcBlock->CurChain != ChainPred)
  619. continue;
  620. size_t Offset = SrcBlock->CurIndex + 1;
  621. tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
  622. }
  623. // Attach (a part of) ChainPred after the last block of ChainSucc
  624. for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
  625. const auto DstBlock = Jump->Source;
  626. if (DstBlock->CurChain != ChainPred)
  627. continue;
  628. size_t Offset = DstBlock->CurIndex;
  629. tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
  630. }
  631. }
  632. // Try to break ChainPred in various ways and concatenate with ChainSucc
  633. if (ChainPred->blocks().size() <= ChainSplitThreshold) {
  634. for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
  635. // Try to split the chain in different ways. In practice, applying
  636. // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
  637. // it from consideration to reduce the search space
  638. tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
  639. MergeTypeTy::X2_X1_Y});
  640. }
  641. }
  642. Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
  643. return Gain;
  644. }
  645. /// Compute the score gain of merging two chains, respecting a given
  646. /// merge 'type' and 'offset'.
  647. ///
  648. /// The two chains are not modified in the method.
  649. MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
  650. const std::vector<Jump *> &Jumps,
  651. size_t MergeOffset,
  652. MergeTypeTy MergeType) const {
  653. auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
  654. MergeOffset, MergeType);
  655. // Do not allow a merge that does not preserve the original entry block
  656. if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
  657. !MergedBlocks.getFirstBlock()->isEntry())
  658. return MergeGainTy();
  659. // The gain for the new chain
  660. auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
  661. return MergeGainTy(NewGainScore, MergeOffset, MergeType);
  662. }
  663. /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
  664. ///
  665. /// If MergeType == 0, then the result is a concatentation of two chains.
  666. /// Otherwise, the first chain is cut into two sub-chains at the offset,
  667. /// and merged using all possible ways of concatenating three chains.
  668. MergedChain mergeBlocks(const std::vector<Block *> &X,
  669. const std::vector<Block *> &Y, size_t MergeOffset,
  670. MergeTypeTy MergeType) const {
  671. // Split the first chain, X, into X1 and X2
  672. BlockIter BeginX1 = X.begin();
  673. BlockIter EndX1 = X.begin() + MergeOffset;
  674. BlockIter BeginX2 = X.begin() + MergeOffset;
  675. BlockIter EndX2 = X.end();
  676. BlockIter BeginY = Y.begin();
  677. BlockIter EndY = Y.end();
  678. // Construct a new chain from the three existing ones
  679. switch (MergeType) {
  680. case MergeTypeTy::X_Y:
  681. return MergedChain(BeginX1, EndX2, BeginY, EndY);
  682. case MergeTypeTy::X1_Y_X2:
  683. return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
  684. case MergeTypeTy::Y_X2_X1:
  685. return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
  686. case MergeTypeTy::X2_X1_Y:
  687. return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
  688. }
  689. llvm_unreachable("unexpected chain merge type");
  690. }
  691. /// Merge chain From into chain Into, update the list of active chains,
  692. /// adjacency information, and the corresponding cached values.
  693. void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
  694. MergeTypeTy MergeType) {
  695. assert(Into != From && "a chain cannot be merged with itself");
  696. // Merge the blocks
  697. auto MergedBlocks =
  698. mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
  699. Into->merge(From, MergedBlocks.getBlocks());
  700. Into->mergeEdges(From);
  701. From->clear();
  702. // Update cached ext-tsp score for the new chain
  703. auto SelfEdge = Into->getEdge(Into);
  704. if (SelfEdge != nullptr) {
  705. MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
  706. Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
  707. }
  708. // Remove chain From from the list of active chains
  709. auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
  710. HotChains.erase(Iter, HotChains.end());
  711. // Invalidate caches
  712. for (auto EdgeIter : Into->edges()) {
  713. EdgeIter.second->invalidateCache();
  714. }
  715. }
  716. /// Concatenate all chains into a final order of blocks.
  717. void concatChains(std::vector<uint64_t> &Order) {
  718. // Collect chains and calculate some stats for their sorting
  719. std::vector<Chain *> SortedChains;
  720. DenseMap<const Chain *, double> ChainDensity;
  721. for (auto &Chain : AllChains) {
  722. if (!Chain.blocks().empty()) {
  723. SortedChains.push_back(&Chain);
  724. // Using doubles to avoid overflow of ExecutionCount
  725. double Size = 0;
  726. double ExecutionCount = 0;
  727. for (auto Block : Chain.blocks()) {
  728. Size += static_cast<double>(Block->Size);
  729. ExecutionCount += static_cast<double>(Block->ExecutionCount);
  730. }
  731. assert(Size > 0 && "a chain of zero size");
  732. ChainDensity[&Chain] = ExecutionCount / Size;
  733. }
  734. }
  735. // Sorting chains by density in the decreasing order
  736. std::stable_sort(SortedChains.begin(), SortedChains.end(),
  737. [&](const Chain *C1, const Chain *C2) {
  738. // Makre sure the original entry block is at the
  739. // beginning of the order
  740. if (C1->isEntry() != C2->isEntry()) {
  741. return C1->isEntry();
  742. }
  743. const double D1 = ChainDensity[C1];
  744. const double D2 = ChainDensity[C2];
  745. // Compare by density and break ties by chain identifiers
  746. return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
  747. });
  748. // Collect the blocks in the order specified by their chains
  749. Order.reserve(NumNodes);
  750. for (auto Chain : SortedChains) {
  751. for (auto Block : Chain->blocks()) {
  752. Order.push_back(Block->Index);
  753. }
  754. }
  755. }
  756. private:
  757. /// The number of nodes in the graph.
  758. const size_t NumNodes;
  759. /// Successors of each node.
  760. std::vector<std::vector<uint64_t>> SuccNodes;
  761. /// Predecessors of each node.
  762. std::vector<std::vector<uint64_t>> PredNodes;
  763. /// All basic blocks.
  764. std::vector<Block> AllBlocks;
  765. /// All jumps between blocks.
  766. std::vector<Jump> AllJumps;
  767. /// All chains of basic blocks.
  768. std::vector<Chain> AllChains;
  769. /// All edges between chains.
  770. std::vector<ChainEdge> AllEdges;
  771. /// Active chains. The vector gets updated at runtime when chains are merged.
  772. std::vector<Chain *> HotChains;
  773. };
  774. } // end of anonymous namespace
  775. std::vector<uint64_t> llvm::applyExtTspLayout(
  776. const std::vector<uint64_t> &NodeSizes,
  777. const std::vector<uint64_t> &NodeCounts,
  778. const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
  779. size_t NumNodes = NodeSizes.size();
  780. // Verify correctness of the input data.
  781. assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
  782. assert(NumNodes > 2 && "Incorrect input");
  783. // Apply the reordering algorithm.
  784. auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
  785. std::vector<uint64_t> Result;
  786. Alg.run(Result);
  787. // Verify correctness of the output.
  788. assert(Result.front() == 0 && "Original entry point is not preserved");
  789. assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
  790. return Result;
  791. }
  792. double llvm::calcExtTspScore(
  793. const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
  794. const std::vector<uint64_t> &NodeCounts,
  795. const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
  796. // Estimate addresses of the blocks in memory
  797. auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0);
  798. for (size_t Idx = 1; Idx < Order.size(); Idx++) {
  799. Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
  800. }
  801. // Increase the score for each jump
  802. double Score = 0;
  803. for (auto It : EdgeCounts) {
  804. auto Pred = It.first.first;
  805. auto Succ = It.first.second;
  806. uint64_t Count = It.second;
  807. Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count);
  808. }
  809. return Score;
  810. }
  811. double llvm::calcExtTspScore(
  812. const std::vector<uint64_t> &NodeSizes,
  813. const std::vector<uint64_t> &NodeCounts,
  814. const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) {
  815. auto Order = std::vector<uint64_t>(NodeSizes.size());
  816. for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
  817. Order[Idx] = Idx;
  818. }
  819. return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
  820. }