#pragma once
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

#include <memory>

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
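///
/// The following is a minimal illustrative sketch (it is not part of the
/// LSUnit implementation) showing how two groups can be chained through an
/// order dependency using the interface declared below:
///
/// \code
///   MemoryGroup Older, Younger;
///   Older.addInstruction();              // Older now tracks one instruction.
///   Older.addSuccessor(&Younger, false); // Younger is order-dependent on
///                                        // Older.
///   assert(Younger.isWaiting() &&
///          "Younger waits until Older starts executing.");
/// \endcode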
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor() {}
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // There is no need to add a dependency if there is no data dependency and
    // all instructions from this group have already been issued.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }
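
  // A MemoryGroup progresses through the following states, as described by
  // the predicates below:
  //   WAITING   - at least one predecessor group has not started executing.
  //   PENDING   - every predecessor has at least started executing, but at
  //               least one of them has not finished yet.
  //   READY     - all predecessor groups have fully executed.
  //   EXECUTING - every instruction in this group has been issued, and at
  //               least one of them is still in flight.
  //   EXECUTED  - every instruction in this group has finished executing.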
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the CriticalMemDep.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
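
  // A minimal sketch of how a simulated pipeline stage is typically expected
  // to drive this interface. `LSU` is a reference to this unit; the
  // surrounding stage logic and the setLSUTokenID() setter on class
  // Instruction are assumptions made for the sake of the example, not
  // declarations provided by this class:
  //
  //   if (LSU.isAvailable(IR) != LSUnitBase::LSU_AVAILABLE)
  //     return; // Stall: not enough load/store queue entries.
  //   unsigned TokenID = LSU.dispatch(IR);
  //   IR.getInstruction()->setLSUTokenID(TokenID);
  //   // Later, once the group the instruction belongs to is ready:
  //   if (LSU.isReady(IR))
  //     LSU.onInstructionIssued(IR);
  //   // And when the instruction completes / retires:
  //   LSU.onInstructionExecuted(IR);
  //   LSU.onInstructionRetired(IR);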

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }
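
  // An illustrative sketch of how a derived implementation of isAvailable()
  // might combine the occupancy helpers above. The MayLoad/MayStore flags and
  // the getDesc() accessor are assumptions about llvm::mca::Instruction and
  // its InstrDesc; they are not provided by this class:
  //
  //   const InstrDesc &Desc = IR.getInstruction()->getDesc();
  //   if (Desc.MayLoad && isLQFull())
  //     return LSUnitBase::LSU_LQUEUE_FULL;
  //   if (Desc.MayStore && isSQFull())
  //     return LSUnitBase::LSU_SQUEUE_FULL;
  //   return LSUnitBase::LSU_AVAILABLE;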

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }
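
  // Illustrative sketch of how a derived unit could model a younger load that
  // has to wait for an older store, using only the helpers declared in this
  // class. `PrevStoreGroupID` is a hypothetical variable tracking the group of
  // the most recently dispatched store; it is not a member of LSUnitBase:
  //
  //   unsigned NewGID = createMemoryGroup();
  //   MemoryGroup &NewGroup = getGroup(NewGID);
  //   NewGroup.addInstruction();
  //   if (isValidGroupID(PrevStoreGroupID))
  //     getGroup(PrevStoreGroupID).addSuccessor(&NewGroup,
  //                                             /*IsDataDependent=*/true);
  //   return NewGID; // Becomes the LSUTokenID of the dispatched load.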

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// for providing different sets of rules for loads and stores by overriding
/// method `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about the
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means, all the older loads/stores have
/// already been executed.
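///
/// As an illustrative example of the rules above (the mnemonics below are
/// hypothetical and only meant to label a load/store sequence):
///
/// \code
///   (1) ld r0, [a]   ; older load
///   (2) st r1, [b]   ; store
///   (3) ld r2, [c]   ; younger load
/// \endcode
///
/// Rule 3 prevents (2) from being reordered before (1). Rule 4 allows (3) to
/// pass (2) only under the no-alias assumption; if `AssumeNoAlias` is false,
/// (3) has to wait for (2).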
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, the rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif