BottleneckAnalysis.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. ///
  10. /// This file implements the bottleneck analysis view.
  11. ///
  12. /// This view internally observes backend pressure increase events in order to
  13. /// identify problematic data dependencies and processor resource interferences.
  14. ///
  15. /// Example of bottleneck analysis report for a dot-product on X86 btver2:
  16. ///
  17. /// Cycles with backend pressure increase [ 40.76% ]
  18. /// Throughput Bottlenecks:
  19. /// Resource Pressure [ 39.34% ]
  20. /// - JFPA [ 39.34% ]
  21. /// - JFPU0 [ 39.34% ]
  22. /// Data Dependencies: [ 1.42% ]
  23. /// - Register Dependencies [ 1.42% ]
  24. /// - Memory Dependencies [ 0.00% ]
  25. ///
  26. /// According to the example, backend pressure increased during the 40.76% of
  27. /// the simulated cycles. In particular, the major cause of backend pressure
  28. /// increases was the contention on floating point adder JFPA accessible from
  29. /// pipeline resource JFPU0.
  30. ///
  31. /// At the end of each cycle, if pressure on the simulated out-of-order buffers
  32. /// has increased, a backend pressure event is reported.
  33. /// In particular, this occurs when there is a delta between the number of uOps
  34. /// dispatched and the number of uOps issued to the underlying pipelines.
  35. ///
  36. /// The bottleneck analysis view is also responsible for identifying and
  37. /// printing the most "critical" sequence of dependent instructions according to
  38. /// the simulated run.
  39. ///
  40. /// Below is the critical sequence computed for the dot-product example on
  41. /// btver2:
  42. ///
  43. /// Instruction Dependency Information
  44. /// +----< 2. vhaddps %xmm3, %xmm3, %xmm4
  45. /// |
  46. /// | < loop carried >
  47. /// |
  48. /// | 0. vmulps %xmm0, %xmm0, %xmm2
  49. /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
  50. /// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
  51. /// |
  52. /// | < loop carried >
  53. /// |
  54. /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
  55. ///
  56. ///
  57. /// The algorithm that computes the critical sequence is very similar to a
  58. /// critical path analysis.
  59. ///
  60. /// A dependency graph is used internally to track dependencies between nodes.
  61. /// Nodes of the graph represent instructions from the input assembly sequence,
  62. /// and edges of the graph represent data dependencies or processor resource
  63. /// interferences.
  64. ///
  65. /// Edges are dynamically 'discovered' by observing instruction state
  66. /// transitions and backend pressure increase events. Edges are internally
  67. /// ranked based on their "criticality". A dependency is considered to be
  68. /// critical if it takes a long time to execute, and if it contributes to
  69. /// backend pressure increases. Criticality is internally measured in terms of
  70. /// cycles; it is computed for every edge in the graph as a function of the edge
  71. /// latency and the number of backend pressure increase cycles contributed by
  72. /// that edge.
  73. ///
  74. /// At the end of simulation, costs are propagated to nodes through the edges of
  75. /// the graph, and the most expensive path connecting the root-set (a
  76. /// set of nodes with no predecessors) to a leaf node is reported as critical
  77. /// sequence.
  78. //
  79. //===----------------------------------------------------------------------===//
  80. #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
  81. #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
  82. #include "Views/InstructionView.h"
  83. #include "llvm/ADT/DenseMap.h"
  84. #include "llvm/ADT/SmallVector.h"
  85. #include "llvm/MC/MCInstPrinter.h"
  86. #include "llvm/MC/MCSchedule.h"
  87. #include "llvm/MC/MCSubtargetInfo.h"
  88. #include "llvm/Support/FormattedStream.h"
  89. #include "llvm/Support/raw_ostream.h"
  90. namespace llvm {
  91. namespace mca {
  92. class PressureTracker {
  93. const MCSchedModel &SM;
  94. // Resource pressure distribution. There is an element for every processor
  95. // resource declared by the scheduling model. Quantities are number of cycles.
  96. SmallVector<unsigned, 4> ResourcePressureDistribution;
  97. // Each processor resource is associated with a so-called processor resource
  98. // mask. This vector allows to correlate processor resource IDs with processor
  99. // resource masks. There is exactly one element per each processor resource
  100. // declared by the scheduling model.
  101. SmallVector<uint64_t, 4> ProcResID2Mask;
  102. // Maps processor resource state indices (returned by calls to
  103. // `getResourceStateIndex(Mask)` to processor resource identifiers.
  104. SmallVector<unsigned, 4> ResIdx2ProcResID;
  105. // Maps Processor Resource identifiers to ResourceUsers indices.
  106. SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
  107. // Identifies the last user of a processor resource unit.
  108. // This vector is updated on every instruction issued event.
  109. // There is one entry for every processor resource unit declared by the
  110. // processor model. An all_ones value is treated like an invalid instruction
  111. // identifier.
  112. using User = std::pair<unsigned, unsigned>;
  113. SmallVector<User, 4> ResourceUsers;
  114. struct InstructionPressureInfo {
  115. unsigned RegisterPressureCycles;
  116. unsigned MemoryPressureCycles;
  117. unsigned ResourcePressureCycles;
  118. };
  119. DenseMap<unsigned, InstructionPressureInfo> IPI;
  120. void updateResourcePressureDistribution(uint64_t CumulativeMask);
  121. User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
  122. unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
  123. return ResourceUsers[Index + UnitID];
  124. }
  125. public:
  126. PressureTracker(const MCSchedModel &Model);
  127. ArrayRef<unsigned> getResourcePressureDistribution() const {
  128. return ResourcePressureDistribution;
  129. }
  130. void getResourceUsers(uint64_t ResourceMask,
  131. SmallVectorImpl<User> &Users) const;
  132. unsigned getRegisterPressureCycles(unsigned IID) const {
  133. assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
  134. const InstructionPressureInfo &Info = IPI.find(IID)->second;
  135. return Info.RegisterPressureCycles;
  136. }
  137. unsigned getMemoryPressureCycles(unsigned IID) const {
  138. assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
  139. const InstructionPressureInfo &Info = IPI.find(IID)->second;
  140. return Info.MemoryPressureCycles;
  141. }
  142. unsigned getResourcePressureCycles(unsigned IID) const {
  143. assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
  144. const InstructionPressureInfo &Info = IPI.find(IID)->second;
  145. return Info.ResourcePressureCycles;
  146. }
  147. const char *resolveResourceName(uint64_t ResourceMask) const {
  148. unsigned Index = getResourceStateIndex(ResourceMask);
  149. unsigned ProcResID = ResIdx2ProcResID[Index];
  150. const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
  151. return PRDesc.Name;
  152. }
  153. void onInstructionDispatched(unsigned IID);
  154. void onInstructionExecuted(unsigned IID);
  155. void handlePressureEvent(const HWPressureEvent &Event);
  156. void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
  157. };
  158. // A dependency edge.
  159. struct DependencyEdge {
  160. enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
  161. // Dependency edge descriptor.
  162. //
  163. // It specifies the dependency type, as well as the edge cost in cycles.
  164. struct Dependency {
  165. DependencyType Type;
  166. uint64_t ResourceOrRegID;
  167. uint64_t Cost;
  168. };
  169. Dependency Dep;
  170. unsigned FromIID;
  171. unsigned ToIID;
  172. // Used by the bottleneck analysis to compute the interference
  173. // probability for processor resources.
  174. unsigned Frequency;
  175. };
  176. // A dependency graph used by the bottleneck analysis to describe data
  177. // dependencies and processor resource interferences between instructions.
  178. //
  179. // There is a node (an instance of struct DGNode) for every instruction in the
  180. // input assembly sequence. Edges of the graph represent dependencies between
  181. // instructions.
  182. //
  183. // Each edge of the graph is associated with a cost value which is used
  184. // internally to rank dependency based on their impact on the runtime
  185. // performance (see field DependencyEdge::Dependency::Cost). In general, the
  186. // higher the cost of an edge, the higher the impact on performance.
  187. //
  188. // The cost of a dependency is a function of both the latency and the number of
  189. // cycles where the dependency has been seen as critical (i.e. contributing to
  190. // back-pressure increases).
  191. //
  192. // Loop carried dependencies are carefully expanded by the bottleneck analysis
  193. // to guarantee that the graph stays acyclic. To this end, extra nodes are
  194. // pre-allocated at construction time to describe instructions from "past and
  195. // future" iterations. The graph is kept acyclic mainly because it simplifies
  196. // the complexity of the algorithm that computes the critical sequence.
  197. class DependencyGraph {
  198. struct DGNode {
  199. unsigned NumPredecessors;
  200. unsigned NumVisitedPredecessors;
  201. uint64_t Cost;
  202. unsigned Depth;
  203. DependencyEdge CriticalPredecessor;
  204. SmallVector<DependencyEdge, 8> OutgoingEdges;
  205. };
  206. SmallVector<DGNode, 16> Nodes;
  207. DependencyGraph(const DependencyGraph &) = delete;
  208. DependencyGraph &operator=(const DependencyGraph &) = delete;
  209. void addDependency(unsigned From, unsigned To,
  210. DependencyEdge::Dependency &&DE);
  211. void pruneEdges(unsigned Iterations);
  212. void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
  213. void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet,
  214. unsigned Iterations);
  215. #ifndef NDEBUG
  216. void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
  217. MCInstPrinter &MCIP) const;
  218. #endif
  219. public:
  220. DependencyGraph(unsigned Size) : Nodes(Size) {}
  221. void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
  222. unsigned Cost) {
  223. addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
  224. }
  225. void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
  226. addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
  227. }
  228. void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
  229. unsigned Cost) {
  230. addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
  231. }
  232. // Called by the bottleneck analysis at the end of simulation to propagate
  233. // costs through the edges of the graph, and compute a critical path.
  234. void finalizeGraph(unsigned Iterations) {
  235. SmallVector<unsigned, 16> RootSet;
  236. pruneEdges(Iterations);
  237. initializeRootSet(RootSet);
  238. propagateThroughEdges(RootSet, Iterations);
  239. }
  240. // Returns a sequence of edges representing the critical sequence based on the
  241. // simulated run. It assumes that the graph has already been finalized (i.e.
  242. // method `finalizeGraph()` has already been called on this graph).
  243. void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
  244. #ifndef NDEBUG
  245. void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
  246. #endif
  247. };
  248. /// A view that collects and prints a few performance numbers.
  249. class BottleneckAnalysis : public InstructionView {
  250. PressureTracker Tracker;
  251. DependencyGraph DG;
  252. unsigned Iterations;
  253. unsigned TotalCycles;
  254. bool PressureIncreasedBecauseOfResources;
  255. bool PressureIncreasedBecauseOfRegisterDependencies;
  256. bool PressureIncreasedBecauseOfMemoryDependencies;
  257. // True if throughput was affected by dispatch stalls.
  258. bool SeenStallCycles;
  259. struct BackPressureInfo {
  260. // Cycles where backpressure increased.
  261. unsigned PressureIncreaseCycles;
  262. // Cycles where backpressure increased because of pipeline pressure.
  263. unsigned ResourcePressureCycles;
  264. // Cycles where backpressure increased because of data dependencies.
  265. unsigned DataDependencyCycles;
  266. // Cycles where backpressure increased because of register dependencies.
  267. unsigned RegisterDependencyCycles;
  268. // Cycles where backpressure increased because of memory dependencies.
  269. unsigned MemoryDependencyCycles;
  270. };
  271. BackPressureInfo BPI;
  272. // Used to populate the dependency graph DG.
  273. void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
  274. void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
  275. void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
  276. void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
  277. bool UseDifferentColor = false) const;
  278. // Prints a bottleneck message to OS.
  279. void printBottleneckHints(raw_ostream &OS) const;
  280. void printCriticalSequence(raw_ostream &OS) const;
  281. public:
  282. BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
  283. ArrayRef<MCInst> Sequence, unsigned Iterations);
  284. void onCycleEnd() override;
  285. void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
  286. void onEvent(const HWPressureEvent &Event) override;
  287. void onEvent(const HWInstructionEvent &Event) override;
  288. void printView(raw_ostream &OS) const override;
  289. StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
  290. bool isSerializable() const override { return false; }
  291. #ifndef NDEBUG
  292. void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
  293. #endif
  294. };
  295. } // namespace mca
  296. } // namespace llvm
  297. #endif