//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <cassert>

using namespace llvm;

#define DEBUG_TYPE "arm-mve-vpt-opts"

static cl::opt<bool>
MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
    cl::desc("Enable merging Loop End and Dec instructions."),
    cl::init(true));

static cl::opt<bool>
SetLRPredicate("arm-set-lr-predicate", cl::Hidden,
    cl::desc("Enable setting lr as a predicate in tail predication regions."),
    cl::init(true));

namespace {
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  const Thumb2InstrInfo *TII;
  MachineRegisterInfo *MRI;

  MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass";
  }

private:
  bool LowerWhileLoopStart(MachineLoop *ML);
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
  bool HintDoLoopStartReg(MachineBasicBlock &MBB);
  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
                                            MachineInstr *LoopStart);
};

char MVETPAndVPTOptimisations::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) {
  while (MI && MI->getOpcode() == TargetOpcode::COPY &&
         MI->getOperand(1).getReg().isVirtual())
    MI = MRI->getVRegDef(MI->getOperand(1).getReg());
  return MI;
}

// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
// corresponding PHI that make up a low overhead loop, returning the loop
// start (a t2DoLoopStart, t2WhileLoopSetup or t2WhileLoopStartLR) in
// LoopStart.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
  MachineBasicBlock *Header = ML->getHeader();
  MachineBasicBlock *Latch = ML->getLoopLatch();
  if (!Header || !Latch) {
    LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
    return false;
  }

  // Find the loop end from the terminators.
  LoopEnd = nullptr;
  for (auto &T : Latch->terminators()) {
    if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
    if (T.getOpcode() == ARM::t2LoopEndDec &&
        T.getOperand(2).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
  }
  if (!LoopEnd) {
    LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);

  // Find the dec from the use of the end. There may be copies between
  // instructions. We expect the loop to look like:
  //   $vs = t2DoLoopStart ...
  // loop:
  //   $vp = phi [ $vs ], [ $vd ]
  //   ...
  //   $vd = t2LoopDec $vp
  //   ...
  //   t2LoopEnd $vd, loop
  if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
    LoopDec = LoopEnd;
  else {
    LoopDec =
        LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
      LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);

  LoopPhi =
      LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
  if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
      LoopPhi->getNumOperands() != 5 ||
      (LoopPhi->getOperand(2).getMBB() != Latch &&
       LoopPhi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);

  Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
                          ? LoopPhi->getOperand(3).getReg()
                          : LoopPhi->getOperand(1).getReg();
  LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
  if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
    LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);

  return true;
}
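
// Revert a t2WhileLoopSetup to an equivalent SUBS (a t2SUBri that defines
// CPSR), and turn any accompanying t2WhileLoopStart terminator in the same
// block back into a conditional branch on the flags. Roughly:
//   %lr = t2WhileLoopSetup %cnt
//   t2WhileLoopStart %lr, %exit
// becomes
//   %lr = t2SUBri %cnt, 0 (defining CPSR)
//   t2Bcc %exit, eq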
static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = MI->getParent();
  assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
         "Only expected a t2WhileLoopSetup in RevertWhileLoopSetup!");

  // Subs
  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.add(MI->getOperand(0));
  MIB.add(MI->getOperand(1));
  MIB.addImm(0);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR, RegState::Define);

  // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
  for (MachineInstr &I : MBB->terminators()) {
    if (I.getOpcode() == ARM::t2WhileLoopStart) {
      MachineInstrBuilder MIB =
          BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
      MIB.add(I.getOperand(1)); // branch target
      MIB.addImm(ARMCC::EQ);
      MIB.addReg(ARM::CPSR);
      I.eraseFromParent();
      break;
    }
  }

  MI->eraseFromParent();
}

// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
// start of a while loop:
//   %a:gprlr = t2WhileLoopSetup %Cnt
//   t2WhileLoopStart %a, %BB
// We want to convert those to a single instruction which, like t2LoopEndDec
// and t2DoLoopStartTP, is both a terminator and produces a value:
//   %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
//
// Otherwise, if we can't, we revert the loop. t2WhileLoopSetup and
// t2WhileLoopStart are not valid past regalloc.
bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
  LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
                    << ML->getHeader()->getName() << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
    return false;

  Register LR = LoopStart->getOperand(0).getReg();
  auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
    return MI.getOpcode() == ARM::t2WhileLoopStart;
  });
  if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
    // RevertWhileLoopSetup erases LoopStart, so revert the dec and end from
    // their own instructions.
    RevertWhileLoopSetup(LoopStart, TII);
    RevertLoopDec(LoopDec, TII);
    RevertLoopEnd(LoopEnd, TII);
    return true;
  }

  MachineInstrBuilder MI =
      BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
              TII->get(ARM::t2WhileLoopStartLR), LR)
          .add(LoopStart->getOperand(1))
          .add(WLSIt->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());

  WLSIt->eraseFromParent();
  LoopStart->eraseFromParent();
  return true;
}

// Return true if this instruction is invalid in a low overhead loop, usually
// because it clobbers LR.
static bool IsInvalidTPInstruction(MachineInstr &MI) {
  return MI.isCall() || isLoopStart(MI);
}

// Starting from PreHeader, search for invalid instructions back until the
// LoopStart block is reached. If invalid instructions are found, the loop
// start is reverted from a WhileLoopStart to a DoLoopStart on the same loop.
// Returns the new DLS LoopStart if updated.
MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
  SmallVector<MachineBasicBlock *> Worklist;
  SmallPtrSet<MachineBasicBlock *, 4> Visited;
  Worklist.push_back(PreHeader);
  Visited.insert(LoopStart->getParent());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();
    if (Visited.count(MBB))
      continue;

    for (MachineInstr &MI : *MBB) {
      if (!IsInvalidTPInstruction(MI))
        continue;

      LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);

      // Create a t2DoLoopStart at the end of the preheader.
      MachineInstrBuilder MIB =
          BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
                  LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
      MIB.add(LoopStart->getOperand(0));
      MIB.add(LoopStart->getOperand(1));

      // Make sure to remove the kill flags, to prevent them from being
      // invalid.
      LoopStart->getOperand(1).setIsKill(false);

      // Revert the t2WhileLoopStartLR to a CMP and Br.
      RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
      return MIB;
    }

    Visited.insert(MBB);
    for (auto *Pred : MBB->predecessors())
      Worklist.push_back(Pred);
  }
  return LoopStart;
}

// This function converts loops with t2LoopEnd and t2LoopDec instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that it
// will be valid to allocate without any spilling.
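// For example (register names illustrative):
//   %phi = PHI [ %start ], [ %dec ]
//   ...
//   %dec = t2LoopDec %phi, 1
//   t2LoopEnd %dec, %header
// becomes
//   %dec = t2LoopEndDec %phi, %header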
bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
  if (!MergeEndDec)
    return false;

  LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
                    << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  // Check if there is an illegal instruction (a call) in the low overhead loop
  // and if so revert it now before we get any further. While loops also need
  // to check the preheaders, but can be reverted to a DLS loop if needed.
  auto *PreHeader = ML->getLoopPreheader();
  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
    LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (IsInvalidTPInstruction(MI)) {
        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
        if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
          RevertDoLoopStart(LoopStart, TII);
        else
          RevertWhileLoopStartLR(LoopStart, TII);
        RevertLoopDec(LoopDec, TII);
        RevertLoopEnd(LoopEnd, TII);
        return true;
      }
    }
  }

  // Remove any copies from the loop, to ensure the phi that remains is both
  // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
  // that cannot spill, we need to be careful what remains in the loop.
  Register PhiReg = LoopPhi->getOperand(0).getReg();
  Register DecReg = LoopDec->getOperand(0).getReg();
  Register StartReg = LoopStart->getOperand(0).getReg();
  // Ensure the uses are expected, and collect any copies we want to remove.
  SmallVector<MachineInstr *, 4> Copies;
  auto CheckUsers = [&Copies](Register BaseReg,
                              ArrayRef<MachineInstr *> ExpectedUsers,
                              MachineRegisterInfo *MRI) {
    SmallVector<Register, 4> Worklist;
    Worklist.push_back(BaseReg);
    while (!Worklist.empty()) {
      Register Reg = Worklist.pop_back_val();
      for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
        if (llvm::is_contained(ExpectedUsers, &MI))
          continue;
        if (MI.getOpcode() != TargetOpcode::COPY ||
            !MI.getOperand(0).getReg().isVirtual()) {
          LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
          return false;
        }
        Worklist.push_back(MI.getOperand(0).getReg());
        Copies.push_back(&MI);
      }
    }
    return true;
  };
  if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
      !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
      !CheckUsers(StartReg, {LoopPhi}, MRI)) {
    // Don't leave a t2WhileLoopStartLR without the t2LoopEndDec.
    if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
      RevertWhileLoopStartLR(LoopStart, TII);
      RevertLoopDec(LoopDec, TII);
      RevertLoopEnd(LoopEnd, TII);
      return true;
    }
    return false;
  }

  MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);

  if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
    LoopPhi->getOperand(3).setReg(StartReg);
    LoopPhi->getOperand(1).setReg(DecReg);
  } else {
    LoopPhi->getOperand(1).setReg(StartReg);
    LoopPhi->getOperand(3).setReg(DecReg);
  }

  // Replace the loop dec and loop end with a single t2LoopEndDec instruction.
  MachineInstrBuilder MI =
      BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
              TII->get(ARM::t2LoopEndDec), DecReg)
          .addReg(PhiReg)
          .add(LoopEnd->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());

  LoopDec->eraseFromParent();
  LoopEnd->eraseFromParent();
  for (auto *MI : Copies)
    MI->eraseFromParent();
  return true;
}

// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding
// the VCTP operand much simpler.
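// For example (register names illustrative):
//   %lr = t2DoLoopStart %iter.count
// becomes
//   %lr = t2DoLoopStartTP %iter.count, %elem.count
// where %elem.count is the number of elements the loop's VCTPs predicate on.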
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                                   MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Find some loop components including the LoopEnd/Dec/Start, and any VCTPs
  // in the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                             LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
    return false;

  SmallVector<MachineInstr *, 4> VCTPs;
  SmallVector<MachineInstr *, 4> MVEInstrs;
  for (MachineBasicBlock *BB : ML->blocks()) {
    for (MachineInstr &MI : *BB)
      if (isVCTP(&MI))
        VCTPs.push_back(&MI);
      else if (findFirstVPTPredOperandIdx(MI) != -1)
        MVEInstrs.push_back(&MI);
  }

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << "  no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  MachineInstr *FirstVCTP = *VCTPs.begin();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
    if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
        VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
      LLVM_DEBUG(dbgs() << "  VCTPs are not identical\n");
      return false;
    }
  }

  // Check that the register being used can be set up before the loop. We
  // expect this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ..
  //   $vpr = VCTP $vp
  //   ..
  //   $vd = t2SUBri $vp, #n
  //   ..
  Register CountReg = FirstVCTP->getOperand(1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
  if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
      Phi->getNumOperands() != 5 ||
      (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
       Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
    return false;
  }
  CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
                 ? Phi->getOperand(3).getReg()
                 : Phi->getOperand(1).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *MBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
  for (MachineInstr &Use :
       MRI->use_instructions(LoopStart->getOperand(0).getReg()))
    if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
        !DT->dominates(ML->getHeader(), Use.getParent())) {
      LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
      return false;
    }

  unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
                        ? ARM::t2DoLoopStartTP
                        : ARM::t2WhileLoopStartTP;
  MachineInstrBuilder MI =
      BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
          .add(LoopStart->getOperand(0))
          .add(LoopStart->getOperand(1))
          .addReg(CountReg);
  if (NewOpc == ARM::t2WhileLoopStartTP)
    MI.add(LoopStart->getOperand(2));
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << " with "
                    << *MI.getInstr());
  MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  if (SetLRPredicate) {
    // Make each MVE instruction in the loop use LR (the value produced by the
    // Phi) as its predicate register.
    Register LR = LoopPhi->getOperand(0).getReg();
    for (MachineInstr *MI : MVEInstrs) {
      int Idx = findFirstVPTPredOperandIdx(*MI);
      MI->getOperand(Idx + 2).setReg(LR);
    }
  }

  return true;
}

// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }

// Returns true if a VCMP with this Opcode can have its operands swapped.
// There are two kinds of VCMP that can't have their operands swapped: float
// VCMPs, and VCMPr instructions (since the 'r' operand is always on the
// right).
static bool CanHaveSwappedOperands(unsigned Opcode) {
  switch (Opcode) {
  default:
    return true;
  case ARM::MVE_VCMPf32:
  case ARM::MVE_VCMPf16:
  case ARM::MVE_VCMPf32r:
  case ARM::MVE_VCMPf16r:
  case ARM::MVE_VCMPi8r:
  case ARM::MVE_VCMPi16r:
  case ARM::MVE_VCMPi32r:
  case ARM::MVE_VCMPu8r:
  case ARM::MVE_VCMPu16r:
  case ARM::MVE_VCMPu32r:
  case ARM::MVE_VCMPs8r:
  case ARM::MVE_VCMPs16r:
  case ARM::MVE_VCMPs32r:
    return false;
  }
}

// Returns the CondCode of a VCMP Instruction.
static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
  return ARMCC::CondCodes(Instr.getOperand(3).getImm());
}

// Returns true if Cond is equivalent to a VPNOT instruction on the result of
// Prev. Cond and Prev must be VCMPs.
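// For example, these two compares always produce opposite masks, so the
// second is equivalent to a VPNOT on the result of the first:
//   %p1:vccr = MVE_VCMPs32 %a, %b, ARMCC::eq, ...
//   %p2:vccr = MVE_VCMPs32 %a, %b, ARMCC::ne, ...
// The same holds with the operands swapped when that is legal, e.g. an
// MVE_VCMPs32 'gt %a, %b' is the inverse of an MVE_VCMPs32 'ge %b, %a'.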
static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));

  // Opcodes must match.
  if (Cond.getOpcode() != Prev.getOpcode())
    return false;

  MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
  MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);

  // If the VCMP has the opposite condition with the same operands, we can
  // replace it with a VPNOT.
  ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
  ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
  if (ExpectedCode == GetCondCode(Prev))
    if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
      return true;
  // Check again with operands swapped if possible.
  if (!CanHaveSwappedOperands(Cond.getOpcode()))
    return false;
  ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
  return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
         CondOP2.isIdenticalTo(PrevOP1);
}

// Returns true if Instr writes to VCCR.
static bool IsWritingToVCCR(MachineInstr &Instr) {
  if (Instr.getNumOperands() == 0)
    return false;
  MachineOperand &Dst = Instr.getOperand(0);
  if (!Dst.isReg())
    return false;
  Register DstReg = Dst.getReg();
  if (!DstReg.isVirtual())
    return false;
  MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
  const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
  return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
}

// Transforms
//    <Instr that uses %A ('User' Operand)>
// Into
//    %K = VPNOT %Target
//    <Instr that uses %K ('User' Operand)>
// And returns the newly inserted VPNOT.
// This optimization is done in the hopes of preventing spills/reloads of VPR
// by reducing the number of VCCR values with overlapping lifetimes.
MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
    Register Target) {
  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
  MachineInstrBuilder MIBuilder =
      BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
          .addDef(NewResult)
          .addReg(Target);
  addUnpredicatedMveVpredNOp(MIBuilder);

  // Make the user use NewResult instead, and clear its kill flag.
  User.setReg(NewResult);
  User.setIsKill(false);

  LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
             MIBuilder.getInstr()->dump());

  return *MIBuilder.getInstr();
}

// Moves a VPNOT before its first user if an instruction that uses Reg is
// found in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of
  // the VPNOT.
  bool MustMove = false, HasUser = false;
  MachineOperand *VPNOTOperandKiller = nullptr;
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
      // If we find the operand that kills VPNOTOperand, save it.
      VPNOTOperandKiller = MO;
    }

    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
      MustMove = true;
      continue;
    }

    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
      continue;

    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter.
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: ";
               Iter->dump());
    MBB.splice(Iter, &MBB, VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the
    // kill flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }
  return HasUser;
}

// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// With
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr)       ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A      ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to
      // VCCR.
      if (!IsWritingToVCCR(*Iter) ||
          getVPTInstrPredicate(*Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue,
      // we've found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore if it
        // has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      bool IsInteresting = false;

      if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute
        //   VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(0).getReg();

          MRI->replaceRegWith(Result, LastVPNOTResult);
          DeadInstructions.push_back(&*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
          Modified = true;

          LastVPNOTResult = VPNOT.getOperand(0).getReg();
          std::swap(VCCRValue, OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO =
                Iter->findRegisterUseOperand(OppositeVCCRValue)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << " for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(*Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            std::swap(VCCRValue, OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(*Iter))
        break;
    }
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}

// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
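// For example (register names illustrative):
//   %p1:vccr = MVE_VCMPs32 %a, %b, ARMCC::eq, ...
//   %p2:vccr = MVE_VCMPs32 %a, %b, ARMCC::ne, ...
// becomes
//   %p1:vccr = MVE_VCMPs32 %a, %b, ARMCC::eq, ...
//   %p2:vccr = MVE_VPNOT %p1, ...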
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 4> DeadInstructions;

  // The last VCMP that we have seen and that couldn't be replaced.
  // This is reset when an instruction that writes to VCCR/VPR is found, or
  // when a VCMP is replaced with a VPNOT.
  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
  // current VCMP is the opposite of PrevVCMP.
  MachineInstr *PrevVCMP = nullptr;
  // If we find an instruction that kills the result of PrevVCMP, we save the
  // operand here to remove the kill flag in case we need to use PrevVCMP's
  // result.
  MachineOperand *PrevVCMPResultKiller = nullptr;

  for (MachineInstr &Instr : MBB.instrs()) {
    if (PrevVCMP) {
      if (MachineOperand *MO = Instr.findRegisterUseOperand(
              PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
        // If we come across the instr that kills PrevVCMP's result, record it
        // so we can remove the kill flag later if we need to.
        PrevVCMPResultKiller = MO;
      }
    }

    // Ignore predicated instructions.
    if (getVPTInstrPredicate(Instr) != ARMVCC::None)
      continue;

    // Only look at VCMPs.
    if (!IsVCMP(Instr.getOpcode())) {
      // If the instruction writes to VCCR, forget the previous VCMP.
      if (IsWritingToVCCR(Instr))
        PrevVCMP = nullptr;
      continue;
    }

    if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
      PrevVCMP = &Instr;
      continue;
    }

    // The register containing the result of the VCMP that we're going to
    // replace.
    Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();

    // Build a VPNOT to replace the VCMP, reusing its operands.
    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
            .add(Instr.getOperand(0))
            .addReg(PrevVCMPResultReg);
    addUnpredicatedMveVpredNOp(MIBuilder);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: ";
               Instr.dump());

    // If we found an instruction that uses, and kills PrevVCMP's result,
    // remove the kill flag.
    if (PrevVCMPResultKiller)
      PrevVCMPResultKiller->setIsKill(false);

    // Finally, mark the old VCMP for removal and reset
    // PrevVCMP/PrevVCMPResultKiller.
    DeadInstructions.push_back(&Instr);
    PrevVCMP = nullptr;
    PrevVCMPResultKiller = nullptr;
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                                    MachineDominatorTree *DT) {
  // Scan through the block, looking for instructions that use constants moved
  // into VPR that are the bitwise-negations of one another. These are expected
  // to be COPYs to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen
  // constant mask is kept; it, or VPNOTs of it, are reused as we scan through
  // the function.
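  // For example (register names illustrative):
  //   %r1:rgpr = t2MOVi 21845       ; 0x5555
  //   %p1:vccr = COPY %r1
  //   <instr predicated on %p1>
  //   %r2:rgpr = t2MOVi 43690       ; 0xaaaa = ~0x5555 & 0xffff
  //   %p2:vccr = COPY %r2
  //   <instr predicated on %p2>
  // The second mov/copy pair can be replaced with %p2:vccr = MVE_VPNOT %p1,
  // after which the mov and copy may be dead and removable.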
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
    if (PIdx == -1)
      continue;
    Register VPR = Instr.getOperand(PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY
    // %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(1).getReg().isVirtual() ||
        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(1).getReg();

    // Find the Immediate used by the copy.
    auto getImm = [&](Register GPR) -> unsigned {
      MachineInstr *Def = MRI->getVRegDef(GPR);
      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
                  Def->getOpcode() == ARM::t2MOVi16))
        return Def->getOperand(1).getImm();
      return -1U;
    };
    unsigned Imm = getImm(GPR);
    if (Imm == -1U) {
      LastVPTReg = 0;
      continue;
    }

    unsigned NotImm = ~Imm & 0xffff;
    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
      Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Reusing predicate: in " << Instr);
    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
      // We have found the not of a previous constant. Create a VPNot of the
      // earlier predicate reg and use it instead of the copy.
      Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
      auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
                           TII->get(ARM::MVE_VPNOT), NewVPR)
                       .addReg(LastVPTReg);
      addUnpredicatedMveVpredNOp(VPNot);

      // Use the new register and check if the def is now dead.
      Instr.getOperand(PIdx + 1).setReg(NewVPR);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << " to replace use at "
                        << Instr);
      VPR = NewVPR;
    }

    LastVPTImm = Imm;
    LastVPTReg = VPR;
  }

  for (MachineInstr *DI : DeadInstructions)
    DI->eraseFromParent();
  return !DeadInstructions.empty();
}

// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
// somewhat blunt approximation that allows tail predicated loops to contain
// vpsel instructions. We turn a vselect into a VPSEL in ISel, but they have
// slightly different semantics under tail predication. Until that is modelled
// we just convert to a VMOVT (via a predicated VORR) instead.
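// In effect (predication operand details elided):
//   %dst = MVE_VPSEL %a, %b, %pred
// becomes
//   %dst = MVE_VORR %a, %a, ARMVCC::Then, %pred, (inactive lanes from) %b
// i.e. a predicated move of %a where the inactive lanes take their value
// from %b.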
bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
  bool HasVCTP = false;
  SmallVector<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &MI : MBB.instrs()) {
    if (isVCTP(&MI)) {
      HasVCTP = true;
      continue;
    }

    if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
      continue;

    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
            .add(MI.getOperand(0))
            .add(MI.getOperand(1))
            .add(MI.getOperand(1))
            .addImm(ARMVCC::Then)
            .add(MI.getOperand(4))
            .add(MI.getOperand(5))
            .add(MI.getOperand(2));
    // Silence unused variable warning in release builds.
    (void)MIBuilder;
    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
               dbgs() << " with VMOVT: "; MIBuilder.getInstr()->dump());
    DeadInstructions.push_back(&MI);
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

// Add a register allocation hint for t2DoLoopStart to hint it towards LR, as
// the instruction may be removable as a noop.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (MachineInstr &MI : MBB.instrs()) {
    if (MI.getOpcode() != ARM::t2DoLoopStart)
      continue;
    Register R = MI.getOperand(1).getReg();
    MachineFunction *MF = MI.getParent()->getParent();
    MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
    Changed = true;
  }
  return Changed;
}

bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
  const ARMSubtarget &STI =
      static_cast<const ARMSubtarget &>(Fn.getSubtarget());

  if (!STI.isThumb2() || !STI.hasLOB())
    return false;

  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
  MRI = &Fn.getRegInfo();
  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();

  LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
                    << "********** Function: " << Fn.getName() << '\n');

  bool Modified = false;
  for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
    Modified |= LowerWhileLoopStart(ML);
    Modified |= MergeLoopEnd(ML);
    Modified |= ConvertTailPredLoop(ML, DT);
  }

  for (MachineBasicBlock &MBB : Fn) {
    Modified |= HintDoLoopStartReg(MBB);
    Modified |= ReplaceConstByVPNOTs(MBB, DT);
    Modified |= ReplaceVCMPsByVPNOTs(MBB);
    Modified |= ReduceOldVCCRValueUses(MBB);
    Modified |= ConvertVPSEL(MBB);
  }

  LLVM_DEBUG(dbgs() << "**************************************\n");
  return Modified;
}

/// createMVETPAndVPTOptimisationsPass
FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
  return new MVETPAndVPTOptimisations();
}