AArch64PostSelectOptimize.cpp 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. //=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass does post-instruction-selection optimizations in the GlobalISel
  10. // pipeline, before the rest of codegen runs.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "AArch64.h"
  14. #include "AArch64TargetMachine.h"
  15. #include "MCTargetDesc/AArch64MCTargetDesc.h"
  16. #include "llvm/ADT/STLExtras.h"
  17. #include "llvm/CodeGen/GlobalISel/Utils.h"
  18. #include "llvm/CodeGen/MachineBasicBlock.h"
  19. #include "llvm/CodeGen/MachineFunctionPass.h"
  20. #include "llvm/CodeGen/MachineInstr.h"
  21. #include "llvm/CodeGen/MachineOperand.h"
  22. #include "llvm/CodeGen/TargetPassConfig.h"
  23. #include "llvm/Support/Debug.h"
  24. #include "llvm/Support/ErrorHandling.h"
  25. #define DEBUG_TYPE "aarch64-post-select-optimize"
  26. using namespace llvm;
  27. namespace {
  28. class AArch64PostSelectOptimize : public MachineFunctionPass {
  29. public:
  30. static char ID;
  31. AArch64PostSelectOptimize();
  32. StringRef getPassName() const override {
  33. return "AArch64 Post Select Optimizer";
  34. }
  35. bool runOnMachineFunction(MachineFunction &MF) override;
  36. void getAnalysisUsage(AnalysisUsage &AU) const override;
  37. private:
  38. bool optimizeNZCVDefs(MachineBasicBlock &MBB);
  39. bool doPeepholeOpts(MachineBasicBlock &MBB);
  40. /// Look for cross regclass copies that can be trivially eliminated.
  41. bool foldSimpleCrossClassCopies(MachineInstr &MI);
  42. };
  43. } // end anonymous namespace
  44. void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
  45. AU.addRequired<TargetPassConfig>();
  46. AU.setPreservesCFG();
  47. getSelectionDAGFallbackAnalysisUsage(AU);
  48. MachineFunctionPass::getAnalysisUsage(AU);
  49. }
  50. AArch64PostSelectOptimize::AArch64PostSelectOptimize()
  51. : MachineFunctionPass(ID) {
  52. initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
  53. }
  54. unsigned getNonFlagSettingVariant(unsigned Opc) {
  55. switch (Opc) {
  56. default:
  57. return 0;
  58. case AArch64::SUBSXrr:
  59. return AArch64::SUBXrr;
  60. case AArch64::SUBSWrr:
  61. return AArch64::SUBWrr;
  62. case AArch64::SUBSXrs:
  63. return AArch64::SUBXrs;
  64. case AArch64::SUBSXri:
  65. return AArch64::SUBXri;
  66. case AArch64::SUBSWri:
  67. return AArch64::SUBWri;
  68. }
  69. }
  70. bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  71. bool Changed = false;
  72. for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
  73. Changed |= foldSimpleCrossClassCopies(MI);
  74. }
  75. return Changed;
  76. }
  77. bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
  78. auto *MF = MI.getMF();
  79. auto &MRI = MF->getRegInfo();
  80. if (!MI.isCopy())
  81. return false;
  82. if (MI.getOperand(1).getSubReg())
  83. return false; // Don't deal with subreg copies
  84. Register Src = MI.getOperand(1).getReg();
  85. Register Dst = MI.getOperand(0).getReg();
  86. if (Src.isPhysical() || Dst.isPhysical())
  87. return false;
  88. const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
  89. const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
  90. if (SrcRC == DstRC)
  91. return false;
  92. if (SrcRC->hasSubClass(DstRC)) {
  93. // This is the case where the source class is a superclass of the dest, so
  94. // if the copy is the only user of the source, we can just constrain the
  95. // source reg to the dest class.
  96. if (!MRI.hasOneNonDBGUse(Src))
  97. return false; // Only constrain single uses of the source.
  98. // Constrain to dst reg class as long as it's not a weird class that only
  99. // has a few registers.
  100. if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
  101. return false;
  102. } else if (DstRC->hasSubClass(SrcRC)) {
  103. // This is the inverse case, where the destination class is a superclass of
  104. // the source. Here, if the copy is the only user, we can just constrain
  105. // the user of the copy to use the smaller class of the source.
  106. } else {
  107. return false;
  108. }
  109. MRI.replaceRegWith(Dst, Src);
  110. MI.eraseFromParent();
  111. return true;
  112. }
  113. bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
  114. // Consider the following code:
  115. // FCMPSrr %0, %1, implicit-def $nzcv
  116. // %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  117. // %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
  118. // FCMPSrr %0, %1, implicit-def $nzcv
  119. // %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  120. // This kind of code where we have 2 FCMPs each feeding a CSEL can happen
  121. // when we have a single IR fcmp being used by two selects. During selection,
  122. // to ensure that there can be no clobbering of nzcv between the fcmp and the
  123. // csel, we have to generate an fcmp immediately before each csel is
  124. // selected.
  125. // However, often we can essentially CSE these together later in MachineCSE.
  126. // This doesn't work though if there are unrelated flag-setting instructions
  127. // in between the two FCMPs. In this case, the SUBS defines NZCV
  128. // but it doesn't have any users, being overwritten by the second FCMP.
  129. //
  130. // Our solution here is to try to convert flag setting operations between
  131. // a interval of identical FCMPs, so that CSE will be able to eliminate one.
  132. bool Changed = false;
  133. auto &MF = *MBB.getParent();
  134. auto &Subtarget = MF.getSubtarget();
  135. const auto &TII = Subtarget.getInstrInfo();
  136. auto TRI = Subtarget.getRegisterInfo();
  137. auto RBI = Subtarget.getRegBankInfo();
  138. auto &MRI = MF.getRegInfo();
  139. // The first step is to find the first and last FCMPs. If we have found
  140. // at least two, then set the limit of the bottom-up walk to the first FCMP
  141. // found since we're only interested in dealing with instructions between
  142. // them.
  143. MachineInstr *FirstCmp = nullptr, *LastCmp = nullptr;
  144. for (auto &MI : instructionsWithoutDebug(MBB.begin(), MBB.end())) {
  145. if (MI.getOpcode() == AArch64::FCMPSrr ||
  146. MI.getOpcode() == AArch64::FCMPDrr) {
  147. if (!FirstCmp)
  148. FirstCmp = &MI;
  149. else
  150. LastCmp = &MI;
  151. }
  152. }
  153. // In addition to converting flag-setting ops in fcmp ranges into non-flag
  154. // setting ops, across the whole basic block we also detect when nzcv
  155. // implicit-defs are dead, and mark them as dead. Peephole optimizations need
  156. // this information later.
  157. LiveRegUnits LRU(*MBB.getParent()->getSubtarget().getRegisterInfo());
  158. LRU.addLiveOuts(MBB);
  159. bool NZCVDead = LRU.available(AArch64::NZCV);
  160. bool InsideCmpRange = false;
  161. for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
  162. LRU.stepBackward(II);
  163. if (LastCmp) { // There's a range present in this block.
  164. // If we're inside an fcmp range, look for begin instruction.
  165. if (InsideCmpRange && &II == FirstCmp)
  166. InsideCmpRange = false;
  167. else if (&II == LastCmp)
  168. InsideCmpRange = true;
  169. }
  170. // Did this instruction define NZCV?
  171. bool NZCVDeadAtCurrInstr = LRU.available(AArch64::NZCV);
  172. if (NZCVDead && NZCVDeadAtCurrInstr && II.definesRegister(AArch64::NZCV)) {
  173. // If we have a def and NZCV is dead, then we may convert this op.
  174. unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
  175. int DeadNZCVIdx = II.findRegisterDefOperandIdx(AArch64::NZCV);
  176. if (DeadNZCVIdx != -1) {
  177. // If we're inside an fcmp range, then convert flag setting ops.
  178. if (InsideCmpRange && NewOpc) {
  179. LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
  180. "op in fcmp range: "
  181. << II);
  182. II.setDesc(TII->get(NewOpc));
  183. II.removeOperand(DeadNZCVIdx);
  184. // Changing the opcode can result in differing regclass requirements,
  185. // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
  186. // Constrain the regclasses, possibly introducing a copy.
  187. constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
  188. II.getOperand(0), 0);
  189. Changed |= true;
  190. } else {
  191. // Otherwise, we just set the nzcv imp-def operand to be dead, so the
  192. // peephole optimizations can optimize them further.
  193. II.getOperand(DeadNZCVIdx).setIsDead();
  194. }
  195. }
  196. }
  197. NZCVDead = NZCVDeadAtCurrInstr;
  198. }
  199. return Changed;
  200. }
  201. bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
  202. if (MF.getProperties().hasProperty(
  203. MachineFunctionProperties::Property::FailedISel))
  204. return false;
  205. assert(MF.getProperties().hasProperty(
  206. MachineFunctionProperties::Property::Selected) &&
  207. "Expected a selected MF");
  208. bool Changed = false;
  209. for (auto &BB : MF) {
  210. Changed |= optimizeNZCVDefs(BB);
  211. Changed |= doPeepholeOpts(BB);
  212. }
  213. return Changed;
  214. }
  215. char AArch64PostSelectOptimize::ID = 0;
  216. INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
  217. "Optimize AArch64 selected instructions",
  218. false, false)
  219. INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
  220. "Optimize AArch64 selected instructions", false,
  221. false)
  222. namespace llvm {
  223. FunctionPass *createAArch64PostSelectOptimize() {
  224. return new AArch64PostSelectOptimize();
  225. }
  226. } // end namespace llvm