//===-- MVEVPTOptimisationsPass.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <cassert>

using namespace llvm;

#define DEBUG_TYPE "arm-mve-vpt-opts"

static cl::opt<bool>
    MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
                cl::desc("Enable merging Loop End and Dec instructions."),
                cl::init(true));
namespace {
class MVEVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  const Thumb2InstrInfo *TII;
  MachineRegisterInfo *MRI;

  MVEVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass";
  }

private:
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
};

char MVEVPTOptimisations::ID = 0;

} // end anonymous namespace
INITIALIZE_PASS_BEGIN(MVEVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MVEVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) {
  while (MI && MI->getOpcode() == TargetOpcode::COPY &&
         MI->getOperand(1).getReg().isVirtual())
    MI = MRI->getVRegDef(MI->getOperand(1).getReg());
  return MI;
}
// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
// corresponding PHI that make up a low overhead loop. Only handles 'do' loops
// at the moment, returning a t2DoLoopStart in LoopStart.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
  MachineBasicBlock *Header = ML->getHeader();
  MachineBasicBlock *Latch = ML->getLoopLatch();
  if (!Header || !Latch) {
    LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
    return false;
  }

  // Find the loop end from the terminators.
  LoopEnd = nullptr;
  for (auto &T : Latch->terminators()) {
    if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
    if (T.getOpcode() == ARM::t2LoopEndDec &&
        T.getOperand(2).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
  }
  if (!LoopEnd) {
    LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);

  // Find the dec from the use of the end. There may be copies between
  // instructions. We expect the loop to look like:
  //   $vs = t2DoLoopStart ...
  // loop:
  //   $vp = phi [ $vs ], [ $vd ]
  //   ...
  //   $vd = t2LoopDec $vp
  //   ...
  //   t2LoopEnd $vd, loop
  if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
    LoopDec = LoopEnd;
  else {
    LoopDec =
        LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
      LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);

  LoopPhi =
      LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
  if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
      LoopPhi->getNumOperands() != 5 ||
      (LoopPhi->getOperand(2).getMBB() != Latch &&
       LoopPhi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);

  Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
                          ? LoopPhi->getOperand(3).getReg()
                          : LoopPhi->getOperand(1).getReg();
  LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
  if (!LoopStart || LoopStart->getOpcode() != ARM::t2DoLoopStart) {
    LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);

  return true;
}
// This function converts loops with t2LoopEnd and t2LoopDec instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that it
// will be valid to allocate without any spilling.
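//
// For illustration (register names invented), a latch such as:
//   %dec:gprlr = t2LoopDec %phi:gprlr, 1
//   t2LoopEnd %dec, %bb.header
// becomes the single value-producing terminator:
//   %dec:gprlr = t2LoopEndDec %phi:gprlr, %bb.header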
bool MVEVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
  if (!MergeEndDec)
    return false;

  LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
                    << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  // Check if there is an illegal instruction (a call) in the low overhead loop
  // and if so revert it now before we get any further.
  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (MI.isCall()) {
        LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI);
        RevertDoLoopStart(LoopStart, TII);
        RevertLoopDec(LoopDec, TII);
        RevertLoopEnd(LoopEnd, TII);
        return true;
      }
    }
  }

  // Remove any copies from the loop, to ensure the phi that remains is both
  // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
  // that cannot spill, we need to be careful what remains in the loop.
  Register PhiReg = LoopPhi->getOperand(0).getReg();
  Register DecReg = LoopDec->getOperand(0).getReg();
  Register StartReg = LoopStart->getOperand(0).getReg();
  // Ensure the uses are expected, and collect any copies we want to remove.
  SmallVector<MachineInstr *, 4> Copies;
  auto CheckUsers = [&Copies](Register BaseReg,
                              ArrayRef<MachineInstr *> ExpectedUsers,
                              MachineRegisterInfo *MRI) {
    SmallVector<Register, 4> Worklist;
    Worklist.push_back(BaseReg);
    while (!Worklist.empty()) {
      Register Reg = Worklist.pop_back_val();
      for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
        if (count(ExpectedUsers, &MI))
          continue;
        if (MI.getOpcode() != TargetOpcode::COPY ||
            !MI.getOperand(0).getReg().isVirtual()) {
          LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
          return false;
        }
        Worklist.push_back(MI.getOperand(0).getReg());
        Copies.push_back(&MI);
      }
    }
    return true;
  };
  if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
      !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
      !CheckUsers(StartReg, {LoopPhi}, MRI))
    return false;

  MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);

  if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
    LoopPhi->getOperand(3).setReg(StartReg);
    LoopPhi->getOperand(1).setReg(DecReg);
  } else {
    LoopPhi->getOperand(1).setReg(StartReg);
    LoopPhi->getOperand(3).setReg(DecReg);
  }

  // Replace the loop dec and loop end as a single instruction.
  MachineInstrBuilder MI =
      BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
              TII->get(ARM::t2LoopEndDec), DecReg)
          .addReg(PhiReg)
          .add(LoopEnd->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());

  LoopDec->eraseFromParent();
  LoopEnd->eraseFromParent();
  for (auto *MI : Copies)
    MI->eraseFromParent();
  return true;
}
// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding
// the VCTP operand much simpler.
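//
// Illustrative before/after (register names invented): given a preheader with
//   %lr = t2DoLoopStart %tripcount
// and a loop body containing MVE_VCTP32 %elements, the start instruction is
// rewritten at the end of the preheader as
//   %lr = t2DoLoopStartTP %tripcount, %elements
// so the element count feeding the VCTP stays attached to the loop start.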
bool MVEVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                              MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Find some loop components including the LoopEnd/Dec/Start, and any VCTP's
  // in the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  if (LoopDec != LoopEnd)
    return false;

  SmallVector<MachineInstr *, 4> VCTPs;
  for (MachineBasicBlock *BB : ML->blocks())
    for (MachineInstr &MI : *BB)
      if (isVCTP(&MI))
        VCTPs.push_back(&MI);

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << "  no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  MachineInstr *FirstVCTP = *VCTPs.begin();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
    if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
        VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
      LLVM_DEBUG(dbgs() << "  VCTP's are not identical\n");
      return false;
    }
  }

  // Check that the register being used can be set up before the loop. We
  // expect this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ..
  //   $vpr = VCTP $vp
  //   ..
  //   $vd = t2SUBri $vp, #n
  //   ..
  Register CountReg = FirstVCTP->getOperand(1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
  if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
      Phi->getNumOperands() != 5 ||
      (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
       Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
    return false;
  }
  CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
                 ? Phi->getOperand(3).getReg()
                 : Phi->getOperand(1).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *MBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
  for (MachineInstr &Use :
       MRI->use_instructions(LoopStart->getOperand(0).getReg()))
    if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
        !DT->dominates(ML->getHeader(), Use.getParent())) {
      LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
      return false;
    }

  MachineInstrBuilder MI = BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(),
                                   TII->get(ARM::t2DoLoopStartTP))
                               .add(LoopStart->getOperand(0))
                               .add(LoopStart->getOperand(1))
                               .addReg(CountReg);
  (void)MI;
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << "  with "
                    << *MI.getInstr());
  MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  return true;
}
// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }

// Returns true if a VCMP with this Opcode can have its operands swapped.
// There are two kinds of VCMP that can't have their operands swapped: Float
// VCMPs, and VCMPr instructions (since the r is always on the right).
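// For example, MVE_VCMPi32r compares each vector lane against a scalar held
// in a GPR; the scalar operand is fixed on the right, so the two operands
// cannot simply be exchanged.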
static bool CanHaveSwappedOperands(unsigned Opcode) {
  switch (Opcode) {
  default:
    return true;
  case ARM::MVE_VCMPf32:
  case ARM::MVE_VCMPf16:
  case ARM::MVE_VCMPf32r:
  case ARM::MVE_VCMPf16r:
  case ARM::MVE_VCMPi8r:
  case ARM::MVE_VCMPi16r:
  case ARM::MVE_VCMPi32r:
  case ARM::MVE_VCMPu8r:
  case ARM::MVE_VCMPu16r:
  case ARM::MVE_VCMPu32r:
  case ARM::MVE_VCMPs8r:
  case ARM::MVE_VCMPs16r:
  case ARM::MVE_VCMPs32r:
    return false;
  }
}
// Returns the CondCode of a VCMP Instruction.
static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
  return ARMCC::CondCodes(Instr.getOperand(3).getImm());
}

// Returns true if Cond is equivalent to a VPNOT instruction on the result of
// Prev. Cond and Prev must be VCMPs.
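// For example (invented registers), "VCMPs32 %a, %b, ne" is the VPNOT of
// "VCMPs32 %a, %b, eq", and, because integer compares can have their operands
// swapped, "VCMPs32 %a, %b, gt" is the VPNOT of "VCMPs32 %b, %a, ge".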
static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));

  // Opcodes must match.
  if (Cond.getOpcode() != Prev.getOpcode())
    return false;

  MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
  MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);

  // If the VCMP has the opposite condition with the same operands, we can
  // replace it with a VPNOT
  ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
  ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
  if (ExpectedCode == GetCondCode(Prev))
    if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
      return true;
  // Check again with operands swapped if possible
  if (!CanHaveSwappedOperands(Cond.getOpcode()))
    return false;
  ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
  return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
         CondOP2.isIdenticalTo(PrevOP1);
}
// Returns true if Instr writes to VCCR.
static bool IsWritingToVCCR(MachineInstr &Instr) {
  if (Instr.getNumOperands() == 0)
    return false;
  MachineOperand &Dst = Instr.getOperand(0);
  if (!Dst.isReg())
    return false;
  Register DstReg = Dst.getReg();
  if (!DstReg.isVirtual())
    return false;
  MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
  const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
  return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
}
// Transforms
//    <Instr that uses %A ('User' Operand)>
// Into
//    %K = VPNOT %Target
//    <Instr that uses %K ('User' Operand)>
// And returns the newly inserted VPNOT.
// This optimization is done in the hopes of preventing spills/reloads of VPR by
// reducing the number of VCCR values with overlapping lifetimes.
MachineInstr &MVEVPTOptimisations::ReplaceRegisterUseWithVPNOT(
    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
    Register Target) {
  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
  MachineInstrBuilder MIBuilder =
      BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
          .addDef(NewResult)
          .addReg(Target);
  addUnpredicatedMveVpredNOp(MIBuilder);

  // Make the user use NewResult instead, and clear its kill flag.
  User.setReg(NewResult);
  User.setIsKill(false);

  LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
             MIBuilder.getInstr()->dump());

  return *MIBuilder.getInstr();
}
// Moves a VPNOT before its first user if an instruction that uses Reg is found
// in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
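// For example (invented registers), in
//    %b = MVE_VPNOT %a
//    (instr using %a)
//    (instr using %b)
// the VPNOT is spliced down to just before its first user, so that the live
// ranges of %a and %b overlap as little as possible.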
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of the
  // VPNOT.
  bool MustMove = false, HasUser = false;
  MachineOperand *VPNOTOperandKiller = nullptr;
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
      // If we find the operand that kills the VPNOTOperand's result, save it.
      VPNOTOperandKiller = MO;
    }

    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
      MustMove = true;
      continue;
    }

    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
      continue;

    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << "  Before: ";
               Iter->dump());
    MBB.splice(Iter, &MBB, VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the kill
    // flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }
  return HasUser;
}
// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// With
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr)     ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A    ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to VCCR.
      if (!IsWritingToVCCR(*Iter) ||
          getVPTInstrPredicate(*Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
      // found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore if it
        // has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      bool IsInteresting = false;

      if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(0).getReg();

          MRI->replaceRegWith(Result, LastVPNOTResult);
          DeadInstructions.push_back(&*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
          Modified = true;

          LastVPNOTResult = VPNOT.getOperand(0).getReg();
          std::swap(VCCRValue, OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO =
                Iter->findRegisterUseOperand(OppositeVCCRValue)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << " for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(*Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            std::swap(VCCRValue, OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(*Iter))
        break;
    }
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}
// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
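// For example (invented registers):
//    %a:vccr = MVE_VCMPs32 %q0, %q1, eq
//    ...
//    %b:vccr = MVE_VCMPs32 %q0, %q1, ne
// becomes
//    %a:vccr = MVE_VCMPs32 %q0, %q1, eq
//    ...
//    %b:vccr = MVE_VPNOT %a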
bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 4> DeadInstructions;

  // The last VCMP that we have seen and that couldn't be replaced.
  // This is reset when an instruction that writes to VCCR/VPR is found, or when
  // a VCMP is replaced with a VPNOT.
  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
  // current VCMP is the opposite of PrevVCMP.
  MachineInstr *PrevVCMP = nullptr;
  // If we find an instruction that kills the result of PrevVCMP, we save the
  // operand here to remove the kill flag in case we need to use PrevVCMP's
  // result.
  MachineOperand *PrevVCMPResultKiller = nullptr;

  for (MachineInstr &Instr : MBB.instrs()) {
    if (PrevVCMP) {
      if (MachineOperand *MO = Instr.findRegisterUseOperand(
              PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
        // If we come across the instr that kills PrevVCMP's result, record it
        // so we can remove the kill flag later if we need to.
        PrevVCMPResultKiller = MO;
      }
    }

    // Ignore predicated instructions.
    if (getVPTInstrPredicate(Instr) != ARMVCC::None)
      continue;

    // Only look at VCMPs
    if (!IsVCMP(Instr.getOpcode())) {
      // If the instruction writes to VCCR, forget the previous VCMP.
      if (IsWritingToVCCR(Instr))
        PrevVCMP = nullptr;
      continue;
    }

    if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
      PrevVCMP = &Instr;
      continue;
    }

    // The register containing the result of the VCMP that we're going to
    // replace.
    Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();

    // Build a VPNOT to replace the VCMP, reusing its operands.
    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
            .add(Instr.getOperand(0))
            .addReg(PrevVCMPResultReg);
    addUnpredicatedMveVpredNOp(MIBuilder);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               MIBuilder.getInstr()->dump(); dbgs() << "  Removed VCMP: ";
               Instr.dump());

    // If we found an instruction that uses, and kills PrevVCMP's result,
    // remove the kill flag.
    if (PrevVCMPResultKiller)
      PrevVCMPResultKiller->setIsKill(false);

    // Finally, mark the old VCMP for removal and reset
    // PrevVCMP/PrevVCMPResultKiller.
    DeadInstructions.push_back(&Instr);
    PrevVCMP = nullptr;
    PrevVCMPResultKiller = nullptr;
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}
bool MVEVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                               MachineDominatorTree *DT) {
  // Scan through the block, looking for instructions that use constants moved
  // into VPR that are the negative of one another. These are expected to be
  // COPY's to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
  // mask is kept, and VPNOTs of it are added or reused as we scan through the
  // function.
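  //
  // For example (invented values), given two predicated uses of masks that are
  // bitwise inverses of one another:
  //   %g1:rgpr = t2MOVi16 21845          ; 0x5555
  //   %p1:vccr = COPY %g1
  //   %g2:rgpr = t2MOVi16 43690          ; 0xaaaa == ~0x5555 & 0xffff
  //   %p2:vccr = COPY %g2
  // the second mask can be rewritten as %p2:vccr = MVE_VPNOT %p1, letting the
  // second t2MOVi16/COPY pair become dead.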
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
    if (PIdx == -1)
      continue;
    Register VPR = Instr.getOperand(PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(1).getReg().isVirtual() ||
        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(1).getReg();

    // Find the Immediate used by the copy.
    auto getImm = [&](Register GPR) -> unsigned {
      MachineInstr *Def = MRI->getVRegDef(GPR);
      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
                  Def->getOpcode() == ARM::t2MOVi16))
        return Def->getOperand(1).getImm();
      return -1U;
    };
    unsigned Imm = getImm(GPR);
    if (Imm == -1U) {
      LastVPTReg = 0;
      continue;
    }

    unsigned NotImm = ~Imm & 0xffff;
    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
      Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
      // We have found the not of a previous constant. Create a VPNot of the
      // earlier predicate reg and use it instead of the copy.
      Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
      auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
                           TII->get(ARM::MVE_VPNOT), NewVPR)
                       .addReg(LastVPTReg);
      addUnpredicatedMveVpredNOp(VPNot);

      // Use the new register and check if the def is now dead.
      Instr.getOperand(PIdx + 1).setReg(NewVPR);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << "  to replace use at "
                        << Instr);
      VPR = NewVPR;
    }

    LastVPTImm = Imm;
    LastVPTReg = VPR;
  }

  for (MachineInstr *DI : DeadInstructions)
    DI->eraseFromParent();
  return !DeadInstructions.empty();
}
// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
// somewhat blunt approximation to allow tail predication with vpsel
// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
// different semantics under tail predication. Until that is modelled we just
// convert to a VMOVT (via a predicated VORR) instead.
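//
// Roughly (invented registers), once a VCTP has been seen in the block,
//    %d = MVE_VPSEL %t, %f, ..., %mask
// becomes a Then-predicated "orr %t, %t" (i.e. a VMOVT) that takes its
// inactive lanes from %f:
//    %d = MVE_VORR %t, %t, ARMVCC::Then, %mask, %f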
bool MVEVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
  bool HasVCTP = false;
  SmallVector<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &MI : MBB.instrs()) {
    if (isVCTP(&MI)) {
      HasVCTP = true;
      continue;
    }

    if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
      continue;

    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
            .add(MI.getOperand(0))
            .add(MI.getOperand(1))
            .add(MI.getOperand(1))
            .addImm(ARMVCC::Then)
            .add(MI.getOperand(4))
            .add(MI.getOperand(2));
    // Silence unused variable warning in release builds.
    (void)MIBuilder;
    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
               dbgs() << "     with VMOVT: "; MIBuilder.getInstr()->dump());
    DeadInstructions.push_back(&MI);
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}
bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
  const ARMSubtarget &STI =
      static_cast<const ARMSubtarget &>(Fn.getSubtarget());

  if (!STI.isThumb2() || !STI.hasLOB())
    return false;

  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
  MRI = &Fn.getRegInfo();
  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();

  LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
                    << "********** Function: " << Fn.getName() << '\n');

  bool Modified = false;
  for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
    Modified |= MergeLoopEnd(ML);
    Modified |= ConvertTailPredLoop(ML, DT);
  }

  for (MachineBasicBlock &MBB : Fn) {
    Modified |= ReplaceConstByVPNOTs(MBB, DT);
    Modified |= ReplaceVCMPsByVPNOTs(MBB);
    Modified |= ReduceOldVCCRValueUses(MBB);
    Modified |= ConvertVPSEL(MBB);
  }

  LLVM_DEBUG(dbgs() << "**************************************\n");
  return Modified;
}
/// createMVEVPTOptimisationsPass
FunctionPass *llvm::createMVEVPTOptimisationsPass() {
  return new MVEVPTOptimisations();
}