ModuloSchedule.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // Software pipelining (SWP) is an instruction scheduling technique for loops
  15. // that overlaps loop iterations and exploits ILP via compiler transformations.
  16. //
  17. // There are multiple methods for analyzing a loop and creating a schedule.
  18. // An example algorithm is Swing Modulo Scheduling (implemented by the
  19. // MachinePipeliner). The details of how a schedule is arrived at are irrelevant
  20. // for the task of actually rewriting a loop to adhere to the schedule, which
  21. // is what this file does.
  22. //
  23. // A schedule is, for every instruction in a block, a Cycle and a Stage. Note
  24. // that we only support single-block loops, so "block" and "loop" can be used
  25. // interchangably.
  26. //
  27. // The Cycle of an instruction defines a partial order of the instructions in
  28. // the remapped loop. Instructions within a cycle must not consume the output
  29. // of any instruction in the same cycle. Cycle information is assumed to have
  30. // been calculated such that the processor will execute instructions in
  31. // lock-step (for example in a VLIW ISA).
  32. //
  33. // The Stage of an instruction defines the mapping between logical loop
  34. // iterations and pipelined loop iterations. An example (unrolled) pipeline
  35. // may look something like:
  36. //
  37. // I0[0] Execute instruction I0 of iteration 0
  38. // I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1
  39. // I1[1], I0[2]
  40. // I1[2], I0[3]
  41. //
  42. // In the schedule for this unrolled sequence we would say that I0 was scheduled
  43. // in stage 0 and I1 in stage 1:
  44. //
  45. // loop:
  46. // [stage 0] x = I0
  47. // [stage 1] I1 x (from stage 0)
  48. //
  49. // And to actually generate valid code we must insert a phi:
  50. //
  51. // loop:
  52. // x' = phi(x)
  53. // x = I0
  54. // I1 x'
  55. //
  56. // This is a simple example; the rules for how to generate correct code given
  57. // an arbitrary schedule containing loop-carried values are complex.
  58. //
  59. // Note that these examples only mention the steady-state kernel of the
  60. // generated loop; prologs and epilogs must be generated also that prime and
  61. // flush the pipeline. Doing so is nontrivial.
  62. //
  63. //===----------------------------------------------------------------------===//
  64. #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
  65. #define LLVM_CODEGEN_MODULOSCHEDULE_H
  66. #include "llvm/CodeGen/MachineFunction.h"
  67. #include "llvm/CodeGen/MachineLoopUtils.h"
  68. #include "llvm/CodeGen/TargetInstrInfo.h"
  69. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  70. #include <deque>
  71. #include <vector>
  72. namespace llvm {
  73. class MachineBasicBlock;
  74. class MachineLoop;
  75. class MachineRegisterInfo;
  76. class MachineInstr;
  77. class LiveIntervals;
  78. /// Represents a schedule for a single-block loop. For every instruction we
  79. /// maintain a Cycle and Stage.
  80. class ModuloSchedule {
  81. private:
  82. /// The block containing the loop instructions.
  83. MachineLoop *Loop;
  84. /// The instructions to be generated, in total order. Cycle provides a partial
  85. /// order; the total order within cycles has been decided by the schedule
  86. /// producer.
  87. std::vector<MachineInstr *> ScheduledInstrs;
  88. /// The cycle for each instruction.
  89. DenseMap<MachineInstr *, int> Cycle;
  90. /// The stage for each instruction.
  91. DenseMap<MachineInstr *, int> Stage;
  92. /// The number of stages in this schedule (Max(Stage) + 1).
  93. int NumStages;
  94. public:
  95. /// Create a new ModuloSchedule.
  96. /// \arg ScheduledInstrs The new loop instructions, in total resequenced
  97. /// order.
  98. /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
  99. /// not need to start at zero. ScheduledInstrs must be partially ordered by
  100. /// Cycle.
  101. /// \arg Stage Stage index for all instructions in ScheduleInstrs.
  102. ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
  103. std::vector<MachineInstr *> ScheduledInstrs,
  104. DenseMap<MachineInstr *, int> Cycle,
  105. DenseMap<MachineInstr *, int> Stage)
  106. : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
  107. Stage(std::move(Stage)) {
  108. NumStages = 0;
  109. for (auto &KV : this->Stage)
  110. NumStages = std::max(NumStages, KV.second);
  111. ++NumStages;
  112. }
  113. /// Return the single-block loop being scheduled.
  114. MachineLoop *getLoop() const { return Loop; }
  115. /// Return the number of stages contained in this schedule, which is the
  116. /// largest stage index + 1.
  117. int getNumStages() const { return NumStages; }
  118. /// Return the first cycle in the schedule, which is the cycle index of the
  119. /// first instruction.
  120. int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
  121. /// Return the final cycle in the schedule, which is the cycle index of the
  122. /// last instruction.
  123. int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
  124. /// Return the stage that MI is scheduled in, or -1.
  125. int getStage(MachineInstr *MI) {
  126. auto I = Stage.find(MI);
  127. return I == Stage.end() ? -1 : I->second;
  128. }
  129. /// Return the cycle that MI is scheduled at, or -1.
  130. int getCycle(MachineInstr *MI) {
  131. auto I = Cycle.find(MI);
  132. return I == Cycle.end() ? -1 : I->second;
  133. }
  134. /// Set the stage of a newly created instruction.
  135. void setStage(MachineInstr *MI, int MIStage) {
  136. assert(Stage.count(MI) == 0);
  137. Stage[MI] = MIStage;
  138. }
  139. /// Return the rescheduled instructions in order.
  140. ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
  141. void dump() { print(dbgs()); }
  142. void print(raw_ostream &OS);
  143. };
  144. /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
  145. /// rewriting the old loop and inserting prologs and epilogs as required.
  146. class ModuloScheduleExpander {
  147. public:
  148. using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
  149. private:
  150. using ValueMapTy = DenseMap<unsigned, unsigned>;
  151. using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
  152. using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
  153. ModuloSchedule &Schedule;
  154. MachineFunction &MF;
  155. const TargetSubtargetInfo &ST;
  156. MachineRegisterInfo &MRI;
  157. const TargetInstrInfo *TII;
  158. LiveIntervals &LIS;
  159. MachineBasicBlock *BB;
  160. MachineBasicBlock *Preheader;
  161. MachineBasicBlock *NewKernel = nullptr;
  162. std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
  163. /// Map for each register and the max difference between its uses and def.
  164. /// The first element in the pair is the max difference in stages. The
  165. /// second is true if the register defines a Phi value and loop value is
  166. /// scheduled before the Phi.
  167. std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
  168. /// Instructions to change when emitting the final schedule.
  169. InstrChangesTy InstrChanges;
  170. void generatePipelinedLoop();
  171. void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
  172. ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
  173. void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
  174. MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
  175. ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
  176. MBBVectorTy &PrologBBs);
  177. void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
  178. MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
  179. ValueMapTy *VRMap, InstrMapTy &InstrMap,
  180. unsigned LastStageNum, unsigned CurStageNum,
  181. bool IsLast);
  182. void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
  183. MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
  184. ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
  185. InstrMapTy &InstrMap, unsigned LastStageNum,
  186. unsigned CurStageNum, bool IsLast);
  187. void removeDeadInstructions(MachineBasicBlock *KernelBB,
  188. MBBVectorTy &EpilogBBs);
  189. void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
  190. void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
  191. MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
  192. ValueMapTy *VRMap);
  193. bool computeDelta(MachineInstr &MI, unsigned &Delta);
  194. void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
  195. unsigned Num);
  196. MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
  197. unsigned InstStageNum);
  198. MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
  199. unsigned InstStageNum);
  200. void updateInstruction(MachineInstr *NewMI, bool LastDef,
  201. unsigned CurStageNum, unsigned InstrStageNum,
  202. ValueMapTy *VRMap);
  203. MachineInstr *findDefInLoop(unsigned Reg);
  204. unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
  205. unsigned LoopStage, ValueMapTy *VRMap,
  206. MachineBasicBlock *BB);
  207. void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
  208. ValueMapTy *VRMap, InstrMapTy &InstrMap);
  209. void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
  210. unsigned CurStageNum, unsigned PhiNum,
  211. MachineInstr *Phi, unsigned OldReg,
  212. unsigned NewReg, unsigned PrevReg = 0);
  213. bool isLoopCarried(MachineInstr &Phi);
  214. /// Return the max. number of stages/iterations that can occur between a
  215. /// register definition and its uses.
  216. unsigned getStagesForReg(int Reg, unsigned CurStage) {
  217. std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
  218. if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
  219. Stages.second)
  220. return 1;
  221. return Stages.first;
  222. }
  223. /// The number of stages for a Phi is a little different than other
  224. /// instructions. The minimum value computed in RegToStageDiff is 1
  225. /// because we assume the Phi is needed for at least 1 iteration.
  226. /// This is not the case if the loop value is scheduled prior to the
  227. /// Phi in the same stage. This function returns the number of stages
  228. /// or iterations needed between the Phi definition and any uses.
  229. unsigned getStagesForPhi(int Reg) {
  230. std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
  231. if (Stages.second)
  232. return Stages.first;
  233. return Stages.first - 1;
  234. }
  235. public:
  236. /// Create a new ModuloScheduleExpander.
  237. /// \arg InstrChanges Modifications to make to instructions with memory
  238. /// operands.
  239. /// FIXME: InstrChanges is opaque and is an implementation detail of an
  240. /// optimization in MachinePipeliner that crosses abstraction boundaries.
  241. ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
  242. LiveIntervals &LIS, InstrChangesTy InstrChanges)
  243. : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
  244. TII(ST.getInstrInfo()), LIS(LIS),
  245. InstrChanges(std::move(InstrChanges)) {}
  246. /// Performs the actual expansion.
  247. void expand();
  248. /// Performs final cleanup after expansion.
  249. void cleanup();
  250. /// Returns the newly rewritten kernel block, or nullptr if this was
  251. /// optimized away.
  252. MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
  253. };
  254. /// A reimplementation of ModuloScheduleExpander. It works by generating a
  255. /// standalone kernel loop and peeling out the prologs and epilogs.
  256. class PeelingModuloScheduleExpander {
  257. public:
  258. PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
  259. LiveIntervals *LIS)
  260. : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
  261. TII(ST.getInstrInfo()), LIS(LIS) {}
  262. void expand();
  263. /// Runs ModuloScheduleExpander and treats it as a golden input to validate
  264. /// aspects of the code generated by PeelingModuloScheduleExpander.
  265. void validateAgainstModuloScheduleExpander();
  266. protected:
  267. ModuloSchedule &Schedule;
  268. MachineFunction &MF;
  269. const TargetSubtargetInfo &ST;
  270. MachineRegisterInfo &MRI;
  271. const TargetInstrInfo *TII;
  272. LiveIntervals *LIS;
  273. /// The original loop block that gets rewritten in-place.
  274. MachineBasicBlock *BB;
  275. /// The original loop preheader.
  276. MachineBasicBlock *Preheader;
  277. /// All prolog and epilog blocks.
  278. SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
  279. /// For every block, the stages that are produced.
  280. DenseMap<MachineBasicBlock *, BitVector> LiveStages;
  281. /// For every block, the stages that are available. A stage can be available
  282. /// but not produced (in the epilog) or produced but not available (in the
  283. /// prolog).
  284. DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
  285. /// When peeling the epilogue keep track of the distance between the phi
  286. /// nodes and the kernel.
  287. DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
  288. /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
  289. /// loop kernel clones.
  290. DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
  291. DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
  292. BlockMIs;
  293. /// State passed from peelKernel to peelPrologAndEpilogs().
  294. std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
  295. /// Illegal phis that need to be deleted once we re-link stages.
  296. SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
  297. /// Converts BB from the original loop body to the rewritten, pipelined
  298. /// steady-state.
  299. void rewriteKernel();
  300. /// Peels one iteration of the rewritten kernel (BB) in the specified
  301. /// direction.
  302. MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
  303. // Delete instructions whose stage is less than MinStage in the given basic
  304. // block.
  305. void filterInstructions(MachineBasicBlock *MB, int MinStage);
  306. // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
  307. // instructions to keep a valid IR.
  308. void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
  309. MachineBasicBlock *SourceBB, unsigned Stage);
  310. /// Peel the kernel forwards and backwards to produce prologs and epilogs,
  311. /// and stitch them together.
  312. void peelPrologAndEpilogs();
  313. /// All prolog and epilog blocks are clones of the kernel, so any produced
  314. /// register in one block has an corollary in all other blocks.
  315. Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
  316. /// Change all users of MI, if MI is predicated out
  317. /// (LiveStages[MI->getParent()] == false).
  318. void rewriteUsesOf(MachineInstr *MI);
  319. /// Insert branches between prologs, kernel and epilogs.
  320. void fixupBranches();
  321. /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
  322. /// to a block dominated by all prologs and epilogs. This allows us to treat
  323. /// the loop exiting block as any other kernel clone.
  324. MachineBasicBlock *CreateLCSSAExitingBlock();
  325. /// Helper to get the stage of an instruction in the schedule.
  326. unsigned getStage(MachineInstr *MI) {
  327. if (CanonicalMIs.count(MI))
  328. MI = CanonicalMIs[MI];
  329. return Schedule.getStage(MI);
  330. }
  331. /// Helper function to find the right canonical register for a phi instruction
  332. /// coming from a peeled out prologue.
  333. Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
  334. /// Target loop info before kernel peeling.
  335. std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
  336. };
  337. /// Expander that simply annotates each scheduled instruction with a post-instr
  338. /// symbol that can be consumed by the ModuloScheduleTest pass.
  339. ///
  340. /// The post-instr symbol is a way of annotating an instruction that can be
  341. /// roundtripped in MIR. The syntax is:
  342. /// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
  343. class ModuloScheduleTestAnnotater {
  344. MachineFunction &MF;
  345. ModuloSchedule &S;
  346. public:
  347. ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
  348. : MF(MF), S(S) {}
  349. /// Performs the annotation.
  350. void annotate();
  351. };
  352. } // end namespace llvm
  353. #endif // LLVM_CODEGEN_MODULOSCHEDULE_H
  354. #ifdef __GNUC__
  355. #pragma GCC diagnostic pop
  356. #endif