123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346 |
- //===--------------------- BottleneckAnalysis.h -----------------*- C++ -*-===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- /// \file
- ///
- /// This file implements the bottleneck analysis view.
- ///
- /// This view internally observes backend pressure increase events in order to
- /// identify problematic data dependencies and processor resource interferences.
- ///
- /// Example of bottleneck analysis report for a dot-product on X86 btver2:
- ///
- /// Cycles with backend pressure increase [ 40.76% ]
- /// Throughput Bottlenecks:
- /// Resource Pressure [ 39.34% ]
- /// - JFPA [ 39.34% ]
- /// - JFPU0 [ 39.34% ]
- /// Data Dependencies: [ 1.42% ]
- /// - Register Dependencies [ 1.42% ]
- /// - Memory Dependencies [ 0.00% ]
- ///
- /// According to the example, backend pressure increased during the 40.76% of
- /// the simulated cycles. In particular, the major cause of backend pressure
- /// increases was the contention on floating point adder JFPA accessible from
- /// pipeline resource JFPU0.
- ///
- /// At the end of each cycle, if pressure on the simulated out-of-order buffers
- /// has increased, a backend pressure event is reported.
- /// In particular, this occurs when there is a delta between the number of uOps
- /// dispatched and the number of uOps issued to the underlying pipelines.
- ///
- /// The bottleneck analysis view is also responsible for identifying and printing
- /// the most "critical" sequence of dependent instructions according to the
- /// simulated run.
- ///
- /// Below is the critical sequence computed for the dot-product example on
- /// btver2:
- ///
- /// Instruction Dependency Information
- /// +----< 2. vhaddps %xmm3, %xmm3, %xmm4
- /// |
- /// | < loop carried >
- /// |
- /// | 0. vmulps %xmm0, %xmm0, %xmm2
- /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
- /// +----> 2. vhaddps %xmm3, %xmm3, %xmm4 ## REGISTER dependency: %xmm3
- /// |
- /// | < loop carried >
- /// |
- /// +----> 1. vhaddps %xmm2, %xmm2, %xmm3 ## RESOURCE interference: JFPA [ probability: 73% ]
- ///
- ///
- /// The algorithm that computes the critical sequence is very similar to a
- /// critical path analysis.
- ///
- /// A dependency graph is used internally to track dependencies between nodes.
- /// Nodes of the graph represent instructions from the input assembly sequence,
- /// and edges of the graph represent data dependencies or processor resource
- /// interferences.
- ///
- /// Edges are dynamically 'discovered' by observing instruction state transitions
- /// and backend pressure increase events. Edges are internally ranked based on
- /// their "criticality". A dependency is considered to be critical if it takes a
- /// long time to execute, and if it contributes to backend pressure increases.
- /// Criticality is internally measured in terms of cycles; it is computed for
- /// every edge in the graph as a function of the edge latency and the number of
- /// backend pressure increase cycles contributed by that edge.
- ///
- /// At the end of simulation, costs are propagated to nodes through the edges of
- /// the graph, and the most expensive path connecting the root-set (a
- /// set of nodes with no predecessors) to a leaf node is reported as critical
- /// sequence.
- //
- //===----------------------------------------------------------------------===//
- #ifndef LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
- #define LLVM_TOOLS_LLVM_MCA_BOTTLENECK_ANALYSIS_H
- #include "Views/InstructionView.h"
- #include "llvm/ADT/DenseMap.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/MC/MCInstPrinter.h"
- #include "llvm/MC/MCSchedule.h"
- #include "llvm/MC/MCSubtargetInfo.h"
- #include "llvm/Support/FormattedStream.h"
- #include "llvm/Support/raw_ostream.h"
- namespace llvm {
- namespace mca {
- class PressureTracker {
- const MCSchedModel &SM;
- // Resource pressure distribution. There is an element for every processor
- // resource declared by the scheduling model. Quantities are number of cycles.
- SmallVector<unsigned, 4> ResourcePressureDistribution;
- // Each processor resource is associated with a so-called processor resource
- // mask. This vector allows to correlate processor resource IDs with processor
- // resource masks. There is exactly one element per each processor resource
- // declared by the scheduling model.
- SmallVector<uint64_t, 4> ProcResID2Mask;
- // Maps processor resource state indices (returned by calls to
- // `getResourceStateIndex(Mask)` to processor resource identifiers.
- SmallVector<unsigned, 4> ResIdx2ProcResID;
- // Maps Processor Resource identifiers to ResourceUsers indices.
- SmallVector<unsigned, 4> ProcResID2ResourceUsersIndex;
- // Identifies the last user of a processor resource unit.
- // This vector is updated on every instruction issued event.
- // There is one entry for every processor resource unit declared by the
- // processor model. An all_ones value is treated like an invalid instruction
- // identifier.
- using User = std::pair<unsigned, unsigned>;
- SmallVector<User, 4> ResourceUsers;
- struct InstructionPressureInfo {
- unsigned RegisterPressureCycles;
- unsigned MemoryPressureCycles;
- unsigned ResourcePressureCycles;
- };
- DenseMap<unsigned, InstructionPressureInfo> IPI;
- void updateResourcePressureDistribution(uint64_t CumulativeMask);
- User getResourceUser(unsigned ProcResID, unsigned UnitID) const {
- unsigned Index = ProcResID2ResourceUsersIndex[ProcResID];
- return ResourceUsers[Index + UnitID];
- }
- public:
- PressureTracker(const MCSchedModel &Model);
- ArrayRef<unsigned> getResourcePressureDistribution() const {
- return ResourcePressureDistribution;
- }
- void getResourceUsers(uint64_t ResourceMask,
- SmallVectorImpl<User> &Users) const;
- unsigned getRegisterPressureCycles(unsigned IID) const {
- assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
- const InstructionPressureInfo &Info = IPI.find(IID)->second;
- return Info.RegisterPressureCycles;
- }
- unsigned getMemoryPressureCycles(unsigned IID) const {
- assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
- const InstructionPressureInfo &Info = IPI.find(IID)->second;
- return Info.MemoryPressureCycles;
- }
- unsigned getResourcePressureCycles(unsigned IID) const {
- assert(IPI.find(IID) != IPI.end() && "Instruction is not tracked!");
- const InstructionPressureInfo &Info = IPI.find(IID)->second;
- return Info.ResourcePressureCycles;
- }
- const char *resolveResourceName(uint64_t ResourceMask) const {
- unsigned Index = getResourceStateIndex(ResourceMask);
- unsigned ProcResID = ResIdx2ProcResID[Index];
- const MCProcResourceDesc &PRDesc = *SM.getProcResource(ProcResID);
- return PRDesc.Name;
- }
- void onInstructionDispatched(unsigned IID);
- void onInstructionExecuted(unsigned IID);
- void handlePressureEvent(const HWPressureEvent &Event);
- void handleInstructionIssuedEvent(const HWInstructionIssuedEvent &Event);
- };
- // A dependency edge.
- struct DependencyEdge {
- enum DependencyType { DT_INVALID, DT_REGISTER, DT_MEMORY, DT_RESOURCE };
- // Dependency edge descriptor.
- //
- // It specifies the dependency type, as well as the edge cost in cycles.
- struct Dependency {
- DependencyType Type;
- uint64_t ResourceOrRegID;
- uint64_t Cost;
- };
- Dependency Dep;
- unsigned FromIID;
- unsigned ToIID;
- // Used by the bottleneck analysis to compute the interference
- // probability for processor resources.
- unsigned Frequency;
- };
- // A dependency graph used by the bottleneck analysis to describe data
- // dependencies and processor resource interferences between instructions.
- //
- // There is a node (an instance of struct DGNode) for every instruction in the
- // input assembly sequence. Edges of the graph represent dependencies between
- // instructions.
- //
- // Each edge of the graph is associated with a cost value which is used
- // internally to rank dependency based on their impact on the runtime
- // performance (see field DependencyEdge::Dependency::Cost). In general, the
- // higher the cost of an edge, the higher the impact on performance.
- //
- // The cost of a dependency is a function of both the latency and the number of
- // cycles where the dependency has been seen as critical (i.e. contributing to
- // back-pressure increases).
- //
- // Loop carried dependencies are carefully expanded by the bottleneck analysis
- // to guarantee that the graph stays acyclic. To this end, extra nodes are
- // pre-allocated at construction time to describe instructions from "past and
- // future" iterations. The graph is kept acyclic mainly because it simplifies the
- // complexity of the algorithm that computes the critical sequence.
- class DependencyGraph {
- struct DGNode {
- unsigned NumPredecessors;
- unsigned NumVisitedPredecessors;
- uint64_t Cost;
- unsigned Depth;
- DependencyEdge CriticalPredecessor;
- SmallVector<DependencyEdge, 8> OutgoingEdges;
- };
- SmallVector<DGNode, 16> Nodes;
- DependencyGraph(const DependencyGraph &) = delete;
- DependencyGraph &operator=(const DependencyGraph &) = delete;
- void addDependency(unsigned From, unsigned To,
- DependencyEdge::Dependency &&DE);
- void pruneEdges(unsigned Iterations);
- void initializeRootSet(SmallVectorImpl<unsigned> &RootSet) const;
- void propagateThroughEdges(SmallVectorImpl<unsigned> &RootSet, unsigned Iterations);
- #ifndef NDEBUG
- void dumpDependencyEdge(raw_ostream &OS, const DependencyEdge &DE,
- MCInstPrinter &MCIP) const;
- #endif
- public:
- DependencyGraph(unsigned Size) : Nodes(Size) {}
- void addRegisterDep(unsigned From, unsigned To, unsigned RegID,
- unsigned Cost) {
- addDependency(From, To, {DependencyEdge::DT_REGISTER, RegID, Cost});
- }
- void addMemoryDep(unsigned From, unsigned To, unsigned Cost) {
- addDependency(From, To, {DependencyEdge::DT_MEMORY, /* unused */ 0, Cost});
- }
- void addResourceDep(unsigned From, unsigned To, uint64_t Mask,
- unsigned Cost) {
- addDependency(From, To, {DependencyEdge::DT_RESOURCE, Mask, Cost});
- }
- // Called by the bottleneck analysis at the end of simulation to propagate
- // costs through the edges of the graph, and compute a critical path.
- void finalizeGraph(unsigned Iterations) {
- SmallVector<unsigned, 16> RootSet;
- pruneEdges(Iterations);
- initializeRootSet(RootSet);
- propagateThroughEdges(RootSet, Iterations);
- }
- // Returns a sequence of edges representing the critical sequence based on the
- // simulated run. It assumes that the graph has already been finalized (i.e.
- // method `finalizeGraph()` has already been called on this graph).
- void getCriticalSequence(SmallVectorImpl<const DependencyEdge *> &Seq) const;
- #ifndef NDEBUG
- void dump(raw_ostream &OS, MCInstPrinter &MCIP) const;
- #endif
- };
- /// A view that collects and prints a few performance numbers.
- class BottleneckAnalysis : public InstructionView {
- PressureTracker Tracker;
- DependencyGraph DG;
- unsigned Iterations;
- unsigned TotalCycles;
- bool PressureIncreasedBecauseOfResources;
- bool PressureIncreasedBecauseOfRegisterDependencies;
- bool PressureIncreasedBecauseOfMemoryDependencies;
- // True if throughput was affected by dispatch stalls.
- bool SeenStallCycles;
- struct BackPressureInfo {
- // Cycles where backpressure increased.
- unsigned PressureIncreaseCycles;
- // Cycles where backpressure increased because of pipeline pressure.
- unsigned ResourcePressureCycles;
- // Cycles where backpressure increased because of data dependencies.
- unsigned DataDependencyCycles;
- // Cycles where backpressure increased because of register dependencies.
- unsigned RegisterDependencyCycles;
- // Cycles where backpressure increased because of memory dependencies.
- unsigned MemoryDependencyCycles;
- };
- BackPressureInfo BPI;
- // Used to populate the dependency graph DG.
- void addRegisterDep(unsigned From, unsigned To, unsigned RegID, unsigned Cy);
- void addMemoryDep(unsigned From, unsigned To, unsigned Cy);
- void addResourceDep(unsigned From, unsigned To, uint64_t Mask, unsigned Cy);
- void printInstruction(formatted_raw_ostream &FOS, const MCInst &MCI,
- bool UseDifferentColor = false) const;
- // Prints a bottleneck message to OS.
- void printBottleneckHints(raw_ostream &OS) const;
- void printCriticalSequence(raw_ostream &OS) const;
- public:
- BottleneckAnalysis(const MCSubtargetInfo &STI, MCInstPrinter &MCIP,
- ArrayRef<MCInst> Sequence, unsigned Iterations);
- void onCycleEnd() override;
- void onEvent(const HWStallEvent &Event) override { SeenStallCycles = true; }
- void onEvent(const HWPressureEvent &Event) override;
- void onEvent(const HWInstructionEvent &Event) override;
- void printView(raw_ostream &OS) const override;
- StringRef getNameAsString() const override { return "BottleneckAnalysis"; }
- json::Value toJSON() const override { return "not implemented"; }
- #ifndef NDEBUG
- void dump(raw_ostream &OS, MCInstPrinter &MCIP) const { DG.dump(OS, MCIP); }
- #endif
- };
- } // namespace mca
- } // namespace llvm
- #endif
|