123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491 |
- #pragma once
- #ifdef __GNUC__
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wunused-parameter"
- #endif
- //===------------------------- LSUnit.h --------------------------*- C++-*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- /// \file
- ///
- /// A Load/Store unit class that models load/store queues and that implements
- /// a simple weak memory consistency model.
- ///
- //===----------------------------------------------------------------------===//
- #ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
- #define LLVM_MCA_HARDWAREUNITS_LSUNIT_H
- #include "llvm/ADT/DenseMap.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/MC/MCSchedule.h"
- #include "llvm/MCA/HardwareUnits/HardwareUnit.h"
- #include "llvm/MCA/Instruction.h"
- namespace llvm {
- namespace mca {
- /// A node of a memory dependency graph. A MemoryGroup describes a set of
- /// instructions with same memory dependencies.
- ///
- /// By construction, instructions of a MemoryGroup don't depend on each other.
- /// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
- /// A Memory group identifier is then stored as a "token" in field
- /// Instruction::LSUTokenID of each dispatched instructions. That token is used
- /// internally by the LSUnit to track memory dependencies.
- class MemoryGroup {
- unsigned NumPredecessors;
- unsigned NumExecutingPredecessors;
- unsigned NumExecutedPredecessors;
- unsigned NumInstructions;
- unsigned NumExecuting;
- unsigned NumExecuted;
- // Successors that are in a order dependency with this group.
- SmallVector<MemoryGroup *, 4> OrderSucc;
- // Successors that are in a data dependency with this group.
- SmallVector<MemoryGroup *, 4> DataSucc;
- CriticalDependency CriticalPredecessor;
- InstRef CriticalMemoryInstruction;
- MemoryGroup(const MemoryGroup &) = delete;
- MemoryGroup &operator=(const MemoryGroup &) = delete;
- public:
- MemoryGroup()
- : NumPredecessors(0), NumExecutingPredecessors(0),
- NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
- NumExecuted(0), CriticalPredecessor() {}
- MemoryGroup(MemoryGroup &&) = default;
- size_t getNumSuccessors() const {
- return OrderSucc.size() + DataSucc.size();
- }
- unsigned getNumPredecessors() const { return NumPredecessors; }
- unsigned getNumExecutingPredecessors() const {
- return NumExecutingPredecessors;
- }
- unsigned getNumExecutedPredecessors() const {
- return NumExecutedPredecessors;
- }
- unsigned getNumInstructions() const { return NumInstructions; }
- unsigned getNumExecuting() const { return NumExecuting; }
- unsigned getNumExecuted() const { return NumExecuted; }
- const InstRef &getCriticalMemoryInstruction() const {
- return CriticalMemoryInstruction;
- }
- const CriticalDependency &getCriticalPredecessor() const {
- return CriticalPredecessor;
- }
- void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
- // Do not need to add a dependency if there is no data
- // dependency and all instructions from this group have been
- // issued already.
- if (!IsDataDependent && isExecuting())
- return;
- Group->NumPredecessors++;
- assert(!isExecuted() && "Should have been removed!");
- if (isExecuting())
- Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);
- if (IsDataDependent)
- DataSucc.emplace_back(Group);
- else
- OrderSucc.emplace_back(Group);
- }
- bool isWaiting() const {
- return NumPredecessors >
- (NumExecutingPredecessors + NumExecutedPredecessors);
- }
- bool isPending() const {
- return NumExecutingPredecessors &&
- ((NumExecutedPredecessors + NumExecutingPredecessors) ==
- NumPredecessors);
- }
- bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
- bool isExecuting() const {
- return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
- }
- bool isExecuted() const { return NumInstructions == NumExecuted; }
- void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
- assert(!isReady() && "Unexpected group-start event!");
- NumExecutingPredecessors++;
- if (!ShouldUpdateCriticalDep)
- return;
- unsigned Cycles = IR.getInstruction()->getCyclesLeft();
- if (CriticalPredecessor.Cycles < Cycles) {
- CriticalPredecessor.IID = IR.getSourceIndex();
- CriticalPredecessor.Cycles = Cycles;
- }
- }
- void onGroupExecuted() {
- assert(!isReady() && "Inconsistent state found!");
- NumExecutingPredecessors--;
- NumExecutedPredecessors++;
- }
- void onInstructionIssued(const InstRef &IR) {
- assert(!isExecuting() && "Invalid internal state!");
- ++NumExecuting;
- // update the CriticalMemDep.
- const Instruction &IS = *IR.getInstruction();
- if ((bool)CriticalMemoryInstruction) {
- const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
- if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
- CriticalMemoryInstruction = IR;
- } else {
- CriticalMemoryInstruction = IR;
- }
- if (!isExecuting())
- return;
- // Notify successors that this group started execution.
- for (MemoryGroup *MG : OrderSucc) {
- MG->onGroupIssued(CriticalMemoryInstruction, false);
- // Release the order dependency with this group.
- MG->onGroupExecuted();
- }
- for (MemoryGroup *MG : DataSucc)
- MG->onGroupIssued(CriticalMemoryInstruction, true);
- }
- void onInstructionExecuted(const InstRef &IR) {
- assert(isReady() && !isExecuted() && "Invalid internal state!");
- --NumExecuting;
- ++NumExecuted;
- if (CriticalMemoryInstruction &&
- CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
- CriticalMemoryInstruction.invalidate();
- }
- if (!isExecuted())
- return;
- // Notify data dependent successors that this group has finished execution.
- for (MemoryGroup *MG : DataSucc)
- MG->onGroupExecuted();
- }
- void addInstruction() {
- assert(!getNumSuccessors() && "Cannot add instructions to this group!");
- ++NumInstructions;
- }
- void cycleEvent() {
- if (isWaiting() && CriticalPredecessor.Cycles)
- CriticalPredecessor.Cycles--;
- }
- };
- /// Abstract base interface for LS (load/store) units in llvm-mca.
- class LSUnitBase : public HardwareUnit {
- /// Load queue size.
- ///
- /// A value of zero for this field means that the load queue is unbounded.
- /// Processor models can declare the size of a load queue via tablegen (see
- /// the definition of tablegen class LoadQueue in
- /// llvm/Target/TargetSchedule.td).
- unsigned LQSize;
- /// Load queue size.
- ///
- /// A value of zero for this field means that the store queue is unbounded.
- /// Processor models can declare the size of a store queue via tablegen (see
- /// the definition of tablegen class StoreQueue in
- /// llvm/Target/TargetSchedule.td).
- unsigned SQSize;
- unsigned UsedLQEntries;
- unsigned UsedSQEntries;
- /// True if loads don't alias with stores.
- ///
- /// By default, the LS unit assumes that loads and stores don't alias with
- /// eachother. If this field is set to false, then loads are always assumed to
- /// alias with stores.
- const bool NoAlias;
- /// Used to map group identifiers to MemoryGroups.
- DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
- unsigned NextGroupID;
- public:
- LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
- unsigned StoreQueueSize, bool AssumeNoAlias);
- virtual ~LSUnitBase();
- /// Returns the total number of entries in the load queue.
- unsigned getLoadQueueSize() const { return LQSize; }
- /// Returns the total number of entries in the store queue.
- unsigned getStoreQueueSize() const { return SQSize; }
- unsigned getUsedLQEntries() const { return UsedLQEntries; }
- unsigned getUsedSQEntries() const { return UsedSQEntries; }
- void acquireLQSlot() { ++UsedLQEntries; }
- void acquireSQSlot() { ++UsedSQEntries; }
- void releaseLQSlot() { --UsedLQEntries; }
- void releaseSQSlot() { --UsedSQEntries; }
- bool assumeNoAlias() const { return NoAlias; }
- enum Status {
- LSU_AVAILABLE = 0,
- LSU_LQUEUE_FULL, // Load Queue unavailable
- LSU_SQUEUE_FULL // Store Queue unavailable
- };
- /// This method checks the availability of the load/store buffers.
- ///
- /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
- /// accomodate instruction IR. By default, LSU_AVAILABLE is returned if IR is
- /// not a memory operation.
- virtual Status isAvailable(const InstRef &IR) const = 0;
- /// Allocates LS resources for instruction IR.
- ///
- /// This method assumes that a previous call to `isAvailable(IR)` succeeded
- /// with a LSUnitBase::Status value of LSU_AVAILABLE.
- /// Returns the GroupID associated with this instruction. That value will be
- /// used to set the LSUTokenID field in class Instruction.
- virtual unsigned dispatch(const InstRef &IR) = 0;
- bool isSQEmpty() const { return !UsedSQEntries; }
- bool isLQEmpty() const { return !UsedLQEntries; }
- bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
- bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }
- bool isValidGroupID(unsigned Index) const {
- return Index && (Groups.find(Index) != Groups.end());
- }
- /// Check if a peviously dispatched instruction IR is now ready for execution.
- bool isReady(const InstRef &IR) const {
- unsigned GroupID = IR.getInstruction()->getLSUTokenID();
- const MemoryGroup &Group = getGroup(GroupID);
- return Group.isReady();
- }
- /// Check if instruction IR only depends on memory instructions that are
- /// currently executing.
- bool isPending(const InstRef &IR) const {
- unsigned GroupID = IR.getInstruction()->getLSUTokenID();
- const MemoryGroup &Group = getGroup(GroupID);
- return Group.isPending();
- }
- /// Check if instruction IR is still waiting on memory operations, and the
- /// wait time is still unknown.
- bool isWaiting(const InstRef &IR) const {
- unsigned GroupID = IR.getInstruction()->getLSUTokenID();
- const MemoryGroup &Group = getGroup(GroupID);
- return Group.isWaiting();
- }
- bool hasDependentUsers(const InstRef &IR) const {
- unsigned GroupID = IR.getInstruction()->getLSUTokenID();
- const MemoryGroup &Group = getGroup(GroupID);
- return !Group.isExecuted() && Group.getNumSuccessors();
- }
- const MemoryGroup &getGroup(unsigned Index) const {
- assert(isValidGroupID(Index) && "Group doesn't exist!");
- return *Groups.find(Index)->second;
- }
- MemoryGroup &getGroup(unsigned Index) {
- assert(isValidGroupID(Index) && "Group doesn't exist!");
- return *Groups.find(Index)->second;
- }
- unsigned createMemoryGroup() {
- Groups.insert(
- std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
- return NextGroupID++;
- }
- virtual void onInstructionExecuted(const InstRef &IR);
- // Loads are tracked by the LDQ (load queue) from dispatch until completion.
- // Stores are tracked by the STQ (store queue) from dispatch until commitment.
- // By default we conservatively assume that the LDQ receives a load at
- // dispatch. Loads leave the LDQ at retirement stage.
- virtual void onInstructionRetired(const InstRef &IR);
- virtual void onInstructionIssued(const InstRef &IR) {
- unsigned GroupID = IR.getInstruction()->getLSUTokenID();
- Groups[GroupID]->onInstructionIssued(IR);
- }
- virtual void cycleEvent();
- #ifndef NDEBUG
- void dump() const;
- #endif
- };
- /// Default Load/Store Unit (LS Unit) for simulated processors.
- ///
- /// Each load (or store) consumes one entry in the load (or store) queue.
- ///
- /// Rules are:
- /// 1) A younger load is allowed to pass an older load only if there are no
- /// stores nor barriers in between the two loads.
- /// 2) An younger store is not allowed to pass an older store.
- /// 3) A younger store is not allowed to pass an older load.
- /// 4) A younger load is allowed to pass an older store only if the load does
- /// not alias with the store.
- ///
- /// This class optimistically assumes that loads don't alias store operations.
- /// Under this assumption, younger loads are always allowed to pass older
- /// stores (this would only affects rule 4).
- /// Essentially, this class doesn't perform any sort alias analysis to
- /// identify aliasing loads and stores.
- ///
- /// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
- /// set to `false` by the constructor of LSUnit.
- ///
- /// Note that this class doesn't know about the existence of different memory
- /// types for memory operations (example: write-through, write-combining, etc.).
- /// Derived classes are responsible for implementing that extra knowledge, and
- /// provide different sets of rules for loads and stores by overriding method
- /// `isReady()`.
- /// To emulate a write-combining memory type, rule 2. must be relaxed in a
- /// derived class to enable the reordering of non-aliasing store operations.
- ///
- /// No assumptions are made by this class on the size of the store buffer. This
- /// class doesn't know how to identify cases where store-to-load forwarding may
- /// occur.
- ///
- /// LSUnit doesn't attempt to predict whether a load or store hits or misses
- /// the L1 cache. To be more specific, LSUnit doesn't know anything about
- /// cache hierarchy and memory types.
- /// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
- /// scheduling model provides an "optimistic" load-to-use latency (which usually
- /// matches the load-to-use latency for when there is a hit in the L1D).
- /// Derived classes may expand this knowledge.
- ///
- /// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
- /// memory-barrier like instructions.
- /// LSUnit conservatively assumes that an instruction which `mayLoad` and has
- /// `unmodeled side effects` behave like a "soft" load-barrier. That means, it
- /// serializes loads without forcing a flush of the load queue.
- /// Similarly, instructions that both `mayStore` and have `unmodeled side
- /// effects` are treated like store barriers. A full memory
- /// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
- /// effects. This is obviously inaccurate, but this is the best that we can do
- /// at the moment.
- ///
- /// Each load/store barrier consumes one entry in the load/store queue. A
- /// load/store barrier enforces ordering of loads/stores:
- /// - A younger load cannot pass a load barrier.
- /// - A younger store cannot pass a store barrier.
- ///
- /// A younger load has to wait for the memory load barrier to execute.
- /// A load/store barrier is "executed" when it becomes the oldest entry in
- /// the load/store queue(s). That also means, all the older loads/stores have
- /// already been executed.
- class LSUnit : public LSUnitBase {
- // This class doesn't know about the latency of a load instruction. So, it
- // conservatively/pessimistically assumes that the latency of a load opcode
- // matches the instruction latency.
- //
- // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
- // and load/store conflicts, the latency of a load is determined by the depth
- // of the load pipeline. So, we could use field `LoadLatency` in the
- // MCSchedModel to model that latency.
- // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
- // L1D, and it usually already accounts for any extra latency due to data
- // forwarding.
- // When doing throughput analysis, `LoadLatency` is likely to
- // be a better predictor of load latency than instruction latency. This is
- // particularly true when simulating code with temporal/spatial locality of
- // memory accesses.
- // Using `LoadLatency` (instead of the instruction latency) is also expected
- // to improve the load queue allocation for long latency instructions with
- // folded memory operands (See PR39829).
- //
- // FIXME: On some processors, load/store operations are split into multiple
- // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
- // not 256-bit data types. So, a 256-bit load is effectively split into two
- // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
- // simplicity, this class optimistically assumes that a load instruction only
- // consumes one entry in the LoadQueue. Similarly, store instructions only
- // consume a single entry in the StoreQueue.
- // In future, we should reassess the quality of this design, and consider
- // alternative approaches that let instructions specify the number of
- // load/store queue entries which they consume at dispatch stage (See
- // PR39830).
- //
- // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
- // conservatively treated as a store barrier. It forces older store to be
- // executed before newer stores are issued.
- //
- // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
- // conservatively treated as a load barrier. It forces older loads to execute
- // before newer loads are issued.
- unsigned CurrentLoadGroupID;
- unsigned CurrentLoadBarrierGroupID;
- unsigned CurrentStoreGroupID;
- unsigned CurrentStoreBarrierGroupID;
- public:
- LSUnit(const MCSchedModel &SM)
- : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
- LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
- : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
- LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
- : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
- CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
- CurrentStoreBarrierGroupID(0) {}
- /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
- /// accomodate instruction IR.
- Status isAvailable(const InstRef &IR) const override;
- /// Allocates LS resources for instruction IR.
- ///
- /// This method assumes that a previous call to `isAvailable(IR)` succeeded
- /// returning LSU_AVAILABLE.
- ///
- /// Rules are:
- /// By default, rules are:
- /// 1. A store may not pass a previous store.
- /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
- /// 3. A load may pass a previous load.
- /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
- /// 5. A load has to wait until an older load barrier is fully executed.
- /// 6. A store has to wait until an older store barrier is fully executed.
- unsigned dispatch(const InstRef &IR) override;
- void onInstructionExecuted(const InstRef &IR) override;
- };
- } // namespace mca
- } // namespace llvm
- #endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H
- #ifdef __GNUC__
- #pragma GCC diagnostic pop
- #endif
|