ARMLowOverheadLoops.cpp 64 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731
  1. //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
  10. /// instructions into machine operations.
  11. /// The expectation is that the loop contains three pseudo instructions:
  12. /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
  13. /// form should be in the preheader, whereas the while form should be in the
  14. /// preheaders only predecessor.
  15. /// - t2LoopDec - placed within in the loop body.
  16. /// - t2LoopEnd - the loop latch terminator.
  17. ///
  18. /// In addition to this, we also look for the presence of the VCTP instruction,
  19. /// which determines whether we can generated the tail-predicated low-overhead
  20. /// loop form.
  21. ///
  22. /// Assumptions and Dependencies:
  23. /// Low-overhead loops are constructed and executed using a setup instruction:
  24. /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
  25. /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
  26. /// but fixed polarity: WLS can only branch forwards and LE can only branch
  27. /// backwards. These restrictions mean that this pass is dependent upon block
  28. /// layout and block sizes, which is why it's the last pass to run. The same is
  29. /// true for ConstantIslands, but this pass does not increase the size of the
  30. /// basic blocks, nor does it change the CFG. Instructions are mainly removed
  31. /// during the transform and pseudo instructions are replaced by real ones. In
  32. /// some cases, when we have to revert to a 'normal' loop, we have to introduce
  33. /// multiple instructions for a single pseudo (see RevertWhile and
  34. /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
  35. /// are defined to be as large as this maximum sequence of replacement
  36. /// instructions.
  37. ///
  38. /// A note on VPR.P0 (the lane mask):
  39. /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
  40. /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
  41. /// They will simply "and" the result of their calculation with the current
  42. /// value of VPR.P0. You can think of it like this:
  43. /// \verbatim
  44. /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
  45. /// VPR.P0 &= Value
  46. /// else
  47. /// VPR.P0 = Value
  48. /// \endverbatim
  49. /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
  50. /// fall in the "VPT active" case, so we can consider that all VPR writes by
  51. /// one of those instruction is actually a "and".
  52. //===----------------------------------------------------------------------===//
  53. #include "ARM.h"
  54. #include "ARMBaseInstrInfo.h"
  55. #include "ARMBaseRegisterInfo.h"
  56. #include "ARMBasicBlockInfo.h"
  57. #include "ARMSubtarget.h"
  58. #include "MVETailPredUtils.h"
  59. #include "Thumb2InstrInfo.h"
  60. #include "llvm/ADT/SetOperations.h"
  61. #include "llvm/ADT/SmallSet.h"
  62. #include "llvm/CodeGen/LivePhysRegs.h"
  63. #include "llvm/CodeGen/MachineFunctionPass.h"
  64. #include "llvm/CodeGen/MachineLoopInfo.h"
  65. #include "llvm/CodeGen/MachineLoopUtils.h"
  66. #include "llvm/CodeGen/MachineRegisterInfo.h"
  67. #include "llvm/CodeGen/Passes.h"
  68. #include "llvm/CodeGen/ReachingDefAnalysis.h"
  69. #include "llvm/MC/MCInstrDesc.h"
  70. using namespace llvm;
  71. #define DEBUG_TYPE "arm-low-overhead-loops"
  72. #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
  73. static cl::opt<bool>
  74. DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
  75. cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
  76. cl::init(false));
  77. static bool isVectorPredicated(MachineInstr *MI) {
  78. int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
  79. return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
  80. }
  81. static bool isVectorPredicate(MachineInstr *MI) {
  82. return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
  83. }
  84. static bool hasVPRUse(MachineInstr &MI) {
  85. return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
  86. }
  87. static bool isDomainMVE(MachineInstr *MI) {
  88. uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
  89. return Domain == ARMII::DomainMVE;
  90. }
  91. static bool shouldInspect(MachineInstr &MI) {
  92. return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
  93. }
  94. static bool isDo(MachineInstr *MI) {
  95. return MI->getOpcode() != ARM::t2WhileLoopStart;
  96. }
  97. namespace {
  98. using InstSet = SmallPtrSetImpl<MachineInstr *>;
  99. class PostOrderLoopTraversal {
  100. MachineLoop &ML;
  101. MachineLoopInfo &MLI;
  102. SmallPtrSet<MachineBasicBlock*, 4> Visited;
  103. SmallVector<MachineBasicBlock*, 4> Order;
  104. public:
  105. PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
  106. : ML(ML), MLI(MLI) { }
  107. const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
  108. return Order;
  109. }
  110. // Visit all the blocks within the loop, as well as exit blocks and any
  111. // blocks properly dominating the header.
  112. void ProcessLoop() {
  113. std::function<void(MachineBasicBlock*)> Search = [this, &Search]
  114. (MachineBasicBlock *MBB) -> void {
  115. if (Visited.count(MBB))
  116. return;
  117. Visited.insert(MBB);
  118. for (auto *Succ : MBB->successors()) {
  119. if (!ML.contains(Succ))
  120. continue;
  121. Search(Succ);
  122. }
  123. Order.push_back(MBB);
  124. };
  125. // Insert exit blocks.
  126. SmallVector<MachineBasicBlock*, 2> ExitBlocks;
  127. ML.getExitBlocks(ExitBlocks);
  128. append_range(Order, ExitBlocks);
  129. // Then add the loop body.
  130. Search(ML.getHeader());
  131. // Then try the preheader and its predecessors.
  132. std::function<void(MachineBasicBlock*)> GetPredecessor =
  133. [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
  134. Order.push_back(MBB);
  135. if (MBB->pred_size() == 1)
  136. GetPredecessor(*MBB->pred_begin());
  137. };
  138. if (auto *Preheader = ML.getLoopPreheader())
  139. GetPredecessor(Preheader);
  140. else if (auto *Preheader = MLI.findLoopPreheader(&ML, true))
  141. GetPredecessor(Preheader);
  142. }
  143. };
  144. struct PredicatedMI {
  145. MachineInstr *MI = nullptr;
  146. SetVector<MachineInstr*> Predicates;
  147. public:
  148. PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
  149. assert(I && "Instruction must not be null!");
  150. Predicates.insert(Preds.begin(), Preds.end());
  151. }
  152. };
  153. // Represent the current state of the VPR and hold all instances which
  154. // represent a VPT block, which is a list of instructions that begins with a
  155. // VPT/VPST and has a maximum of four proceeding instructions. All
  156. // instructions within the block are predicated upon the vpr and we allow
  157. // instructions to define the vpr within in the block too.
  158. class VPTState {
  159. friend struct LowOverheadLoop;
  160. SmallVector<MachineInstr *, 4> Insts;
  161. static SmallVector<VPTState, 4> Blocks;
  162. static SetVector<MachineInstr *> CurrentPredicates;
  163. static std::map<MachineInstr *,
  164. std::unique_ptr<PredicatedMI>> PredicatedInsts;
  165. static void CreateVPTBlock(MachineInstr *MI) {
  166. assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
  167. && "Can't begin VPT without predicate");
  168. Blocks.emplace_back(MI);
  169. // The execution of MI is predicated upon the current set of instructions
  170. // that are AND'ed together to form the VPR predicate value. In the case
  171. // that MI is a VPT, CurrentPredicates will also just be MI.
  172. PredicatedInsts.emplace(
  173. MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
  174. }
  175. static void reset() {
  176. Blocks.clear();
  177. PredicatedInsts.clear();
  178. CurrentPredicates.clear();
  179. }
  180. static void addInst(MachineInstr *MI) {
  181. Blocks.back().insert(MI);
  182. PredicatedInsts.emplace(
  183. MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
  184. }
  185. static void addPredicate(MachineInstr *MI) {
  186. LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
  187. CurrentPredicates.insert(MI);
  188. }
  189. static void resetPredicate(MachineInstr *MI) {
  190. LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
  191. CurrentPredicates.clear();
  192. CurrentPredicates.insert(MI);
  193. }
  194. public:
  195. // Have we found an instruction within the block which defines the vpr? If
  196. // so, not all the instructions in the block will have the same predicate.
  197. static bool hasUniformPredicate(VPTState &Block) {
  198. return getDivergent(Block) == nullptr;
  199. }
  200. // If it exists, return the first internal instruction which modifies the
  201. // VPR.
  202. static MachineInstr *getDivergent(VPTState &Block) {
  203. SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
  204. for (unsigned i = 1; i < Insts.size(); ++i) {
  205. MachineInstr *Next = Insts[i];
  206. if (isVectorPredicate(Next))
  207. return Next; // Found an instruction altering the vpr.
  208. }
  209. return nullptr;
  210. }
  211. // Return whether the given instruction is predicated upon a VCTP.
  212. static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
  213. SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
  214. if (Exclusive && Predicates.size() != 1)
  215. return false;
  216. for (auto *PredMI : Predicates)
  217. if (isVCTP(PredMI))
  218. return true;
  219. return false;
  220. }
  221. // Is the VPST, controlling the block entry, predicated upon a VCTP.
  222. static bool isEntryPredicatedOnVCTP(VPTState &Block,
  223. bool Exclusive = false) {
  224. SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
  225. return isPredicatedOnVCTP(Insts.front(), Exclusive);
  226. }
  227. // If this block begins with a VPT, we can check whether it's using
  228. // at least one predicated input(s), as well as possible loop invariant
  229. // which would result in it being implicitly predicated.
  230. static bool hasImplicitlyValidVPT(VPTState &Block,
  231. ReachingDefAnalysis &RDA) {
  232. SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
  233. MachineInstr *VPT = Insts.front();
  234. assert(isVPTOpcode(VPT->getOpcode()) &&
  235. "Expected VPT block to begin with VPT/VPST");
  236. if (VPT->getOpcode() == ARM::MVE_VPST)
  237. return false;
  238. auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
  239. MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
  240. return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
  241. };
  242. auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
  243. MachineOperand &MO = MI->getOperand(Idx);
  244. if (!MO.isReg() || !MO.getReg())
  245. return true;
  246. SmallPtrSet<MachineInstr *, 2> Defs;
  247. RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
  248. if (Defs.empty())
  249. return true;
  250. for (auto *Def : Defs)
  251. if (Def->getParent() == VPT->getParent())
  252. return false;
  253. return true;
  254. };
  255. // Check that at least one of the operands is directly predicated on a
  256. // vctp and allow an invariant value too.
  257. return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
  258. (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
  259. (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
  260. }
  261. static bool isValid(ReachingDefAnalysis &RDA) {
  262. // All predication within the loop should be based on vctp. If the block
  263. // isn't predicated on entry, check whether the vctp is within the block
  264. // and that all other instructions are then predicated on it.
  265. for (auto &Block : Blocks) {
  266. if (isEntryPredicatedOnVCTP(Block, false) ||
  267. hasImplicitlyValidVPT(Block, RDA))
  268. continue;
  269. SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
  270. // We don't know how to convert a block with just a VPT;VCTP into
  271. // anything valid once we remove the VCTP. For now just bail out.
  272. assert(isVPTOpcode(Insts.front()->getOpcode()) &&
  273. "Expected VPT block to start with a VPST or VPT!");
  274. if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
  275. isVCTP(Insts.back()))
  276. return false;
  277. for (auto *MI : Insts) {
  278. // Check that any internal VCTPs are 'Then' predicated.
  279. if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
  280. return false;
  281. // Skip other instructions that build up the predicate.
  282. if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
  283. continue;
  284. // Check that any other instructions are predicated upon a vctp.
  285. // TODO: We could infer when VPTs are implicitly predicated on the
  286. // vctp (when the operands are predicated).
  287. if (!isPredicatedOnVCTP(MI)) {
  288. LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
  289. return false;
  290. }
  291. }
  292. }
  293. return true;
  294. }
  295. VPTState(MachineInstr *MI) { Insts.push_back(MI); }
  296. void insert(MachineInstr *MI) {
  297. Insts.push_back(MI);
  298. // VPT/VPST + 4 predicated instructions.
  299. assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
  300. }
  301. bool containsVCTP() const {
  302. for (auto *MI : Insts)
  303. if (isVCTP(MI))
  304. return true;
  305. return false;
  306. }
  307. unsigned size() const { return Insts.size(); }
  308. SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
  309. };
  310. struct LowOverheadLoop {
  311. MachineLoop &ML;
  312. MachineBasicBlock *Preheader = nullptr;
  313. MachineLoopInfo &MLI;
  314. ReachingDefAnalysis &RDA;
  315. const TargetRegisterInfo &TRI;
  316. const ARMBaseInstrInfo &TII;
  317. MachineFunction *MF = nullptr;
  318. MachineBasicBlock::iterator StartInsertPt;
  319. MachineBasicBlock *StartInsertBB = nullptr;
  320. MachineInstr *Start = nullptr;
  321. MachineInstr *Dec = nullptr;
  322. MachineInstr *End = nullptr;
  323. MachineOperand TPNumElements;
  324. SmallVector<MachineInstr*, 4> VCTPs;
  325. SmallPtrSet<MachineInstr*, 4> ToRemove;
  326. SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
  327. bool Revert = false;
  328. bool CannotTailPredicate = false;
  329. LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
  330. ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
  331. const ARMBaseInstrInfo &TII)
  332. : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
  333. TPNumElements(MachineOperand::CreateImm(0)) {
  334. MF = ML.getHeader()->getParent();
  335. if (auto *MBB = ML.getLoopPreheader())
  336. Preheader = MBB;
  337. else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
  338. Preheader = MBB;
  339. VPTState::reset();
  340. }
  341. // If this is an MVE instruction, check that we know how to use tail
  342. // predication with it. Record VPT blocks and return whether the
  343. // instruction is valid for tail predication.
  344. bool ValidateMVEInst(MachineInstr *MI);
  345. void AnalyseMVEInst(MachineInstr *MI) {
  346. CannotTailPredicate = !ValidateMVEInst(MI);
  347. }
  348. bool IsTailPredicationLegal() const {
  349. // For now, let's keep things really simple and only support a single
  350. // block for tail predication.
  351. return !Revert && FoundAllComponents() && !VCTPs.empty() &&
  352. !CannotTailPredicate && ML.getNumBlocks() == 1;
  353. }
  354. // Given that MI is a VCTP, check that is equivalent to any other VCTPs
  355. // found.
  356. bool AddVCTP(MachineInstr *MI);
  357. // Check that the predication in the loop will be equivalent once we
  358. // perform the conversion. Also ensure that we can provide the number
  359. // of elements to the loop start instruction.
  360. bool ValidateTailPredicate();
  361. // Check that any values available outside of the loop will be the same
  362. // after tail predication conversion.
  363. bool ValidateLiveOuts();
  364. // Is it safe to define LR with DLS/WLS?
  365. // LR can be defined if it is the operand to start, because it's the same
  366. // value, or if it's going to be equivalent to the operand to Start.
  367. MachineInstr *isSafeToDefineLR();
  368. // Check the branch targets are within range and we satisfy our
  369. // restrictions.
  370. void Validate(ARMBasicBlockUtils *BBUtils);
  371. bool FoundAllComponents() const {
  372. return Start && Dec && End;
  373. }
  374. SmallVectorImpl<VPTState> &getVPTBlocks() {
  375. return VPTState::Blocks;
  376. }
  377. // Return the operand for the loop start instruction. This will be the loop
  378. // iteration count, or the number of elements if we're tail predicating.
  379. MachineOperand &getLoopStartOperand() {
  380. if (IsTailPredicationLegal())
  381. return TPNumElements;
  382. return isDo(Start) ? Start->getOperand(1) : Start->getOperand(0);
  383. }
  384. unsigned getStartOpcode() const {
  385. bool IsDo = isDo(Start);
  386. if (!IsTailPredicationLegal())
  387. return IsDo ? ARM::t2DLS : ARM::t2WLS;
  388. return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
  389. }
  390. void dump() const {
  391. if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
  392. if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
  393. if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
  394. if (!VCTPs.empty()) {
  395. dbgs() << "ARM Loops: Found VCTP(s):\n";
  396. for (auto *MI : VCTPs)
  397. dbgs() << " - " << *MI;
  398. }
  399. if (!FoundAllComponents())
  400. dbgs() << "ARM Loops: Not a low-overhead loop.\n";
  401. else if (!(Start && Dec && End))
  402. dbgs() << "ARM Loops: Failed to find all loop components.\n";
  403. }
  404. };
  405. class ARMLowOverheadLoops : public MachineFunctionPass {
  406. MachineFunction *MF = nullptr;
  407. MachineLoopInfo *MLI = nullptr;
  408. ReachingDefAnalysis *RDA = nullptr;
  409. const ARMBaseInstrInfo *TII = nullptr;
  410. MachineRegisterInfo *MRI = nullptr;
  411. const TargetRegisterInfo *TRI = nullptr;
  412. std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
  413. public:
  414. static char ID;
  415. ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
  416. void getAnalysisUsage(AnalysisUsage &AU) const override {
  417. AU.setPreservesCFG();
  418. AU.addRequired<MachineLoopInfo>();
  419. AU.addRequired<ReachingDefAnalysis>();
  420. MachineFunctionPass::getAnalysisUsage(AU);
  421. }
  422. bool runOnMachineFunction(MachineFunction &MF) override;
  423. MachineFunctionProperties getRequiredProperties() const override {
  424. return MachineFunctionProperties().set(
  425. MachineFunctionProperties::Property::NoVRegs).set(
  426. MachineFunctionProperties::Property::TracksLiveness);
  427. }
  428. StringRef getPassName() const override {
  429. return ARM_LOW_OVERHEAD_LOOPS_NAME;
  430. }
  431. private:
  432. bool ProcessLoop(MachineLoop *ML);
  433. bool RevertNonLoops();
  434. void RevertWhile(MachineInstr *MI) const;
  435. void RevertDo(MachineInstr *MI) const;
  436. bool RevertLoopDec(MachineInstr *MI) const;
  437. void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
  438. void RevertLoopEndDec(MachineInstr *MI) const;
  439. void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
  440. MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
  441. void Expand(LowOverheadLoop &LoLoop);
  442. void IterationCountDCE(LowOverheadLoop &LoLoop);
  443. };
  444. }
  445. char ARMLowOverheadLoops::ID = 0;
  446. SmallVector<VPTState, 4> VPTState::Blocks;
  447. SetVector<MachineInstr *> VPTState::CurrentPredicates;
  448. std::map<MachineInstr *,
  449. std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
  450. INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
  451. false, false)
  452. static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
  453. InstSet &ToRemove, InstSet &Ignore) {
  454. // Check that we can remove all of Killed without having to modify any IT
  455. // blocks.
  456. auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
  457. // Collect the dead code and the MBBs in which they reside.
  458. SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
  459. for (auto *Dead : Killed)
  460. BasicBlocks.insert(Dead->getParent());
  461. // Collect IT blocks in all affected basic blocks.
  462. std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
  463. for (auto *MBB : BasicBlocks) {
  464. for (auto &IT : *MBB) {
  465. if (IT.getOpcode() != ARM::t2IT)
  466. continue;
  467. RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
  468. ITBlocks[&IT]);
  469. }
  470. }
  471. // If we're removing all of the instructions within an IT block, then
  472. // also remove the IT instruction.
  473. SmallPtrSet<MachineInstr *, 2> ModifiedITs;
  474. SmallPtrSet<MachineInstr *, 2> RemoveITs;
  475. for (auto *Dead : Killed) {
  476. if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
  477. MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
  478. RemoveITs.insert(IT);
  479. auto &CurrentBlock = ITBlocks[IT];
  480. CurrentBlock.erase(Dead);
  481. if (CurrentBlock.empty())
  482. ModifiedITs.erase(IT);
  483. else
  484. ModifiedITs.insert(IT);
  485. }
  486. }
  487. if (!ModifiedITs.empty())
  488. return false;
  489. Killed.insert(RemoveITs.begin(), RemoveITs.end());
  490. return true;
  491. };
  492. SmallPtrSet<MachineInstr *, 2> Uses;
  493. if (!RDA.isSafeToRemove(MI, Uses, Ignore))
  494. return false;
  495. if (WontCorruptITs(Uses, RDA)) {
  496. ToRemove.insert(Uses.begin(), Uses.end());
  497. LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
  498. << " - can also remove:\n";
  499. for (auto *Use : Uses)
  500. dbgs() << " - " << *Use);
  501. SmallPtrSet<MachineInstr*, 4> Killed;
  502. RDA.collectKilledOperands(MI, Killed);
  503. if (WontCorruptITs(Killed, RDA)) {
  504. ToRemove.insert(Killed.begin(), Killed.end());
  505. LLVM_DEBUG(for (auto *Dead : Killed)
  506. dbgs() << " - " << *Dead);
  507. }
  508. return true;
  509. }
  510. return false;
  511. }
  512. bool LowOverheadLoop::ValidateTailPredicate() {
  513. if (!IsTailPredicationLegal()) {
  514. LLVM_DEBUG(if (VCTPs.empty())
  515. dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
  516. dbgs() << "ARM Loops: Tail-predication is not valid.\n");
  517. return false;
  518. }
  519. assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
  520. assert(ML.getBlocks().size() == 1 &&
  521. "Shouldn't be processing a loop with more than one block");
  522. if (DisableTailPredication) {
  523. LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
  524. return false;
  525. }
  526. if (!VPTState::isValid(RDA)) {
  527. LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
  528. return false;
  529. }
  530. if (!ValidateLiveOuts()) {
  531. LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
  532. return false;
  533. }
  534. // Check that creating a [W|D]LSTP, which will define LR with an element
  535. // count instead of iteration count, won't affect any other instructions
  536. // than the LoopStart and LoopDec.
  537. // TODO: We should try to insert the [W|D]LSTP after any of the other uses.
  538. Register StartReg = isDo(Start) ? Start->getOperand(1).getReg()
  539. : Start->getOperand(0).getReg();
  540. if (StartInsertPt == Start && StartReg == ARM::LR) {
  541. if (auto *IterCount = RDA.getMIOperand(Start, isDo(Start) ? 1 : 0)) {
  542. SmallPtrSet<MachineInstr *, 2> Uses;
  543. RDA.getGlobalUses(IterCount, MCRegister::from(ARM::LR), Uses);
  544. for (auto *Use : Uses) {
  545. if (Use != Start && Use != Dec) {
  546. LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use);
  547. return false;
  548. }
  549. }
  550. }
  551. }
  552. // For tail predication, we need to provide the number of elements, instead
  553. // of the iteration count, to the loop start instruction. The number of
  554. // elements is provided to the vctp instruction, so we need to check that
  555. // we can use this register at InsertPt.
  556. MachineInstr *VCTP = VCTPs.back();
  557. if (Start->getOpcode() == ARM::t2DoLoopStartTP) {
  558. TPNumElements = Start->getOperand(2);
  559. StartInsertPt = Start;
  560. StartInsertBB = Start->getParent();
  561. } else {
  562. TPNumElements = VCTP->getOperand(1);
  563. MCRegister NumElements = TPNumElements.getReg().asMCReg();
  564. // If the register is defined within loop, then we can't perform TP.
  565. // TODO: Check whether this is just a mov of a register that would be
  566. // available.
  567. if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
  568. LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
  569. return false;
  570. }
  571. // The element count register maybe defined after InsertPt, in which case we
  572. // need to try to move either InsertPt or the def so that the [w|d]lstp can
  573. // use the value.
  574. if (StartInsertPt != StartInsertBB->end() &&
  575. !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
  576. if (auto *ElemDef =
  577. RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
  578. if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
  579. ElemDef->removeFromParent();
  580. StartInsertBB->insert(StartInsertPt, ElemDef);
  581. LLVM_DEBUG(dbgs()
  582. << "ARM Loops: Moved element count def: " << *ElemDef);
  583. } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
  584. StartInsertPt->removeFromParent();
  585. StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
  586. &*StartInsertPt);
  587. LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
  588. } else {
  589. // If we fail to move an instruction and the element count is provided
  590. // by a mov, use the mov operand if it will have the same value at the
  591. // insertion point
  592. MachineOperand Operand = ElemDef->getOperand(1);
  593. if (isMovRegOpcode(ElemDef->getOpcode()) &&
  594. RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
  595. RDA.getUniqueReachingMIDef(&*StartInsertPt,
  596. Operand.getReg().asMCReg())) {
  597. TPNumElements = Operand;
  598. NumElements = TPNumElements.getReg();
  599. } else {
  600. LLVM_DEBUG(dbgs()
  601. << "ARM Loops: Unable to move element count to loop "
  602. << "start instruction.\n");
  603. return false;
  604. }
  605. }
  606. }
  607. }
  608. // Especially in the case of while loops, InsertBB may not be the
  609. // preheader, so we need to check that the register isn't redefined
  610. // before entering the loop.
  611. auto CannotProvideElements = [this](MachineBasicBlock *MBB,
  612. MCRegister NumElements) {
  613. if (MBB->empty())
  614. return false;
  615. // NumElements is redefined in this block.
  616. if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
  617. return true;
  618. // Don't continue searching up through multiple predecessors.
  619. if (MBB->pred_size() > 1)
  620. return true;
  621. return false;
  622. };
  623. // Search backwards for a def, until we get to InsertBB.
  624. MachineBasicBlock *MBB = Preheader;
  625. while (MBB && MBB != StartInsertBB) {
  626. if (CannotProvideElements(MBB, NumElements)) {
  627. LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
  628. return false;
  629. }
  630. MBB = *MBB->pred_begin();
  631. }
  632. }
  633. // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
  634. // world the [w|d]lstp instruction would be last instruction in the preheader
  635. // and so it would only affect instructions within the loop body. But due to
  636. // scheduling, and/or the logic in this pass (above), the insertion point can
  637. // be moved earlier. So if the Loop Start isn't the last instruction in the
  638. // preheader, and if the initial element count is smaller than the vector
  639. // width, the Loop Start instruction will immediately generate one or more
  640. // false lane mask which can, incorrectly, affect the proceeding MVE
  641. // instructions in the preheader.
  642. if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
  643. LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
  644. return false;
  645. }
  646. // Check that the value change of the element count is what we expect and
  647. // that the predication will be equivalent. For this we need:
  648. // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
  649. // and we can also allow register copies within the chain too.
  650. auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
  651. return -getAddSubImmediate(*MI) == ExpectedVecWidth;
  652. };
  653. MachineBasicBlock *MBB = VCTP->getParent();
  654. // Remove modifications to the element count since they have no purpose in a
  655. // tail predicated loop. Explicitly refer to the vctp operand no matter which
  656. // register NumElements has been assigned to, since that is what the
  657. // modifications will be using
  658. if (auto *Def = RDA.getUniqueReachingMIDef(
  659. &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
  660. SmallPtrSet<MachineInstr*, 2> ElementChain;
  661. SmallPtrSet<MachineInstr*, 2> Ignore;
  662. unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
  663. Ignore.insert(VCTPs.begin(), VCTPs.end());
  664. if (TryRemove(Def, RDA, ElementChain, Ignore)) {
  665. bool FoundSub = false;
  666. for (auto *MI : ElementChain) {
  667. if (isMovRegOpcode(MI->getOpcode()))
  668. continue;
  669. if (isSubImmOpcode(MI->getOpcode())) {
  670. if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
  671. LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
  672. " count: " << *MI);
  673. return false;
  674. }
  675. FoundSub = true;
  676. } else {
  677. LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
  678. " count: " << *MI);
  679. return false;
  680. }
  681. }
  682. ToRemove.insert(ElementChain.begin(), ElementChain.end());
  683. }
  684. }
  685. return true;
  686. }
  687. static bool isRegInClass(const MachineOperand &MO,
  688. const TargetRegisterClass *Class) {
  689. return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
  690. }
  691. // MVE 'narrowing' operate on half a lane, reading from half and writing
  692. // to half, which are referred to has the top and bottom half. The other
  693. // half retains its previous value.
  694. static bool retainsPreviousHalfElement(const MachineInstr &MI) {
  695. const MCInstrDesc &MCID = MI.getDesc();
  696. uint64_t Flags = MCID.TSFlags;
  697. return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
  698. }
  699. // Some MVE instructions read from the top/bottom halves of their operand(s)
  700. // and generate a vector result with result elements that are double the
  701. // width of the input.
  702. static bool producesDoubleWidthResult(const MachineInstr &MI) {
  703. const MCInstrDesc &MCID = MI.getDesc();
  704. uint64_t Flags = MCID.TSFlags;
  705. return (Flags & ARMII::DoubleWidthResult) != 0;
  706. }
  707. static bool isHorizontalReduction(const MachineInstr &MI) {
  708. const MCInstrDesc &MCID = MI.getDesc();
  709. uint64_t Flags = MCID.TSFlags;
  710. return (Flags & ARMII::HorizontalReduction) != 0;
  711. }
  712. // Can this instruction generate a non-zero result when given only zeroed
  713. // operands? This allows us to know that, given operands with false bytes
  714. // zeroed by masked loads, that the result will also contain zeros in those
  715. // bytes.
  716. static bool canGenerateNonZeros(const MachineInstr &MI) {
  717. // Check for instructions which can write into a larger element size,
  718. // possibly writing into a previous zero'd lane.
  719. if (producesDoubleWidthResult(MI))
  720. return true;
  721. switch (MI.getOpcode()) {
  722. default:
  723. break;
  724. // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
  725. // fp16 -> fp32 vector conversions.
  726. // Instructions that perform a NOT will generate 1s from 0s.
  727. case ARM::MVE_VMVN:
  728. case ARM::MVE_VORN:
  729. // Count leading zeros will do just that!
  730. case ARM::MVE_VCLZs8:
  731. case ARM::MVE_VCLZs16:
  732. case ARM::MVE_VCLZs32:
  733. return true;
  734. }
  735. return false;
  736. }
  737. // Look at its register uses to see if it only can only receive zeros
  738. // into its false lanes which would then produce zeros. Also check that
  739. // the output register is also defined by an FalseLanesZero instruction
  740. // so that if tail-predication happens, the lanes that aren't updated will
  741. // still be zeros.
  742. static bool producesFalseLanesZero(MachineInstr &MI,
  743. const TargetRegisterClass *QPRs,
  744. const ReachingDefAnalysis &RDA,
  745. InstSet &FalseLanesZero) {
  746. if (canGenerateNonZeros(MI))
  747. return false;
  748. bool isPredicated = isVectorPredicated(&MI);
  749. // Predicated loads will write zeros to the falsely predicated bytes of the
  750. // destination register.
  751. if (MI.mayLoad())
  752. return isPredicated;
  753. auto IsZeroInit = [](MachineInstr *Def) {
  754. return !isVectorPredicated(Def) &&
  755. Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
  756. Def->getOperand(1).getImm() == 0;
  757. };
  758. bool AllowScalars = isHorizontalReduction(MI);
  759. for (auto &MO : MI.operands()) {
  760. if (!MO.isReg() || !MO.getReg())
  761. continue;
  762. if (!isRegInClass(MO, QPRs) && AllowScalars)
  763. continue;
  764. // Check that this instruction will produce zeros in its false lanes:
  765. // - If it only consumes false lanes zero or constant 0 (vmov #0)
  766. // - If it's predicated, it only matters that it's def register already has
  767. // false lane zeros, so we can ignore the uses.
  768. SmallPtrSet<MachineInstr *, 2> Defs;
  769. RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
  770. for (auto *Def : Defs) {
  771. if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
  772. continue;
  773. if (MO.isUse() && isPredicated)
  774. continue;
  775. return false;
  776. }
  777. }
  778. LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
  779. return true;
  780. }
  781. bool LowOverheadLoop::ValidateLiveOuts() {
  782. // We want to find out if the tail-predicated version of this loop will
  783. // produce the same values as the loop in its original form. For this to
  784. // be true, the newly inserted implicit predication must not change the
  785. // the (observable) results.
  786. // We're doing this because many instructions in the loop will not be
  787. // predicated and so the conversion from VPT predication to tail-predication
  788. // can result in different values being produced; due to the tail-predication
  789. // preventing many instructions from updating their falsely predicated
  790. // lanes. This analysis assumes that all the instructions perform lane-wise
  791. // operations and don't perform any exchanges.
  792. // A masked load, whether through VPT or tail predication, will write zeros
  793. // to any of the falsely predicated bytes. So, from the loads, we know that
  794. // the false lanes are zeroed and here we're trying to track that those false
  795. // lanes remain zero, or where they change, the differences are masked away
  796. // by their user(s).
  797. // All MVE stores have to be predicated, so we know that any predicate load
  798. // operands, or stored results are equivalent already. Other explicitly
  799. // predicated instructions will perform the same operation in the original
  800. // loop and the tail-predicated form too. Because of this, we can insert
  801. // loads, stores and other predicated instructions into our Predicated
  802. // set and build from there.
  803. const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
  804. SetVector<MachineInstr *> FalseLanesUnknown;
  805. SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
  806. SmallPtrSet<MachineInstr *, 4> Predicated;
  807. MachineBasicBlock *Header = ML.getHeader();
  808. for (auto &MI : *Header) {
  809. if (!shouldInspect(MI))
  810. continue;
  811. if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
  812. continue;
  813. bool isPredicated = isVectorPredicated(&MI);
  814. bool retainsOrReduces =
  815. retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
  816. if (isPredicated)
  817. Predicated.insert(&MI);
  818. if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
  819. FalseLanesZero.insert(&MI);
  820. else if (MI.getNumDefs() == 0)
  821. continue;
  822. else if (!isPredicated && retainsOrReduces)
  823. return false;
  824. else if (!isPredicated)
  825. FalseLanesUnknown.insert(&MI);
  826. }
  827. auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
  828. SmallPtrSetImpl<MachineInstr *> &Predicated) {
  829. SmallPtrSet<MachineInstr *, 2> Uses;
  830. RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
  831. for (auto *Use : Uses) {
  832. if (Use != MI && !Predicated.count(Use))
  833. return false;
  834. }
  835. return true;
  836. };
  837. // Visit the unknowns in reverse so that we can start at the values being
  838. // stored and then we can work towards the leaves, hopefully adding more
  839. // instructions to Predicated. Successfully terminating the loop means that
  840. // all the unknown values have to found to be masked by predicated user(s).
  841. // For any unpredicated values, we store them in NonPredicated so that we
  842. // can later check whether these form a reduction.
  843. SmallPtrSet<MachineInstr*, 2> NonPredicated;
  844. for (auto *MI : reverse(FalseLanesUnknown)) {
  845. for (auto &MO : MI->operands()) {
  846. if (!isRegInClass(MO, QPRs) || !MO.isDef())
  847. continue;
  848. if (!HasPredicatedUsers(MI, MO, Predicated)) {
  849. LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
  850. << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
  851. NonPredicated.insert(MI);
  852. break;
  853. }
  854. }
  855. // Any unknown false lanes have been masked away by the user(s).
  856. if (!NonPredicated.contains(MI))
  857. Predicated.insert(MI);
  858. }
  859. SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
  860. SmallVector<MachineBasicBlock *, 2> ExitBlocks;
  861. ML.getExitBlocks(ExitBlocks);
  862. assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
  863. assert(ExitBlocks.size() == 1 && "Expected a single exit block");
  864. MachineBasicBlock *ExitBB = ExitBlocks.front();
  865. for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
  866. // TODO: Instead of blocking predication, we could move the vctp to the exit
  867. // block and calculate it's operand there in or the preheader.
  868. if (RegMask.PhysReg == ARM::VPR)
  869. return false;
  870. // Check Q-regs that are live in the exit blocks. We don't collect scalars
  871. // because they won't be affected by lane predication.
  872. if (QPRs->contains(RegMask.PhysReg))
  873. if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
  874. LiveOutMIs.insert(MI);
  875. }
  876. // We've already validated that any VPT predication within the loop will be
  877. // equivalent when we perform the predication transformation; so we know that
  878. // any VPT predicated instruction is predicated upon VCTP. Any live-out
  879. // instruction needs to be predicated, so check this here. The instructions
  880. // in NonPredicated have been found to be a reduction that we can ensure its
  881. // legality.
  882. for (auto *MI : LiveOutMIs) {
  883. if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
  884. LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI);
  885. return false;
  886. }
  887. }
  888. return true;
  889. }
  890. void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
  891. if (Revert)
  892. return;
  893. // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
  894. // can only jump back.
  895. auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
  896. ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
  897. MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
  898. ? End->getOperand(1).getMBB()
  899. : End->getOperand(2).getMBB();
  900. // TODO Maybe there's cases where the target doesn't have to be the header,
  901. // but for now be safe and revert.
  902. if (TgtBB != ML.getHeader()) {
  903. LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
  904. return false;
  905. }
  906. // The WLS and LE instructions have 12-bits for the label offset. WLS
  907. // requires a positive offset, while LE uses negative.
  908. if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
  909. !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
  910. LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
  911. return false;
  912. }
  913. if (Start->getOpcode() == ARM::t2WhileLoopStart &&
  914. (BBUtils->getOffsetOf(Start) >
  915. BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
  916. !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
  917. LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
  918. return false;
  919. }
  920. return true;
  921. };
  922. // Find a suitable position to insert the loop start instruction. It needs to
  923. // be able to safely define LR.
  924. auto FindStartInsertionPoint = [](MachineInstr *Start, MachineInstr *Dec,
  925. MachineBasicBlock::iterator &InsertPt,
  926. MachineBasicBlock *&InsertBB,
  927. ReachingDefAnalysis &RDA,
  928. InstSet &ToRemove) {
  929. // For a t2DoLoopStart it is always valid to use the start insertion point.
  930. // For WLS we can define LR if LR already contains the same value.
  931. if (isDo(Start) || Start->getOperand(0).getReg() == ARM::LR) {
  932. InsertPt = MachineBasicBlock::iterator(Start);
  933. InsertBB = Start->getParent();
  934. return true;
  935. }
  936. // We've found no suitable LR def and Start doesn't use LR directly. Can we
  937. // just define LR anyway?
  938. if (!RDA.isSafeToDefRegAt(Start, MCRegister::from(ARM::LR)))
  939. return false;
  940. InsertPt = MachineBasicBlock::iterator(Start);
  941. InsertBB = Start->getParent();
  942. return true;
  943. };
  944. if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA,
  945. ToRemove)) {
  946. LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
  947. Revert = true;
  948. return;
  949. }
  950. LLVM_DEBUG(if (StartInsertPt == StartInsertBB->end())
  951. dbgs() << "ARM Loops: Will insert LoopStart at end of block\n";
  952. else
  953. dbgs() << "ARM Loops: Will insert LoopStart at "
  954. << *StartInsertPt
  955. );
  956. Revert = !ValidateRanges(Start, End, BBUtils, ML);
  957. CannotTailPredicate = !ValidateTailPredicate();
  958. }
  959. bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
  960. LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
  961. if (VCTPs.empty()) {
  962. VCTPs.push_back(MI);
  963. return true;
  964. }
  965. // If we find another VCTP, check whether it uses the same value as the main VCTP.
  966. // If it does, store it in the VCTPs set, else refuse it.
  967. MachineInstr *Prev = VCTPs.back();
  968. if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
  969. !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
  970. LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
  971. "definition from the main VCTP");
  972. return false;
  973. }
  974. VCTPs.push_back(MI);
  975. return true;
  976. }
  977. bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
  978. if (CannotTailPredicate)
  979. return false;
  980. if (!shouldInspect(*MI))
  981. return true;
  982. if (MI->getOpcode() == ARM::MVE_VPSEL ||
  983. MI->getOpcode() == ARM::MVE_VPNOT) {
  984. // TODO: Allow VPSEL and VPNOT, we currently cannot because:
  985. // 1) It will use the VPR as a predicate operand, but doesn't have to be
  986. // instead a VPT block, which means we can assert while building up
  987. // the VPT block because we don't find another VPT or VPST to being a new
  988. // one.
  989. // 2) VPSEL still requires a VPR operand even after tail predicating,
  990. // which means we can't remove it unless there is another
  991. // instruction, such as vcmp, that can provide the VPR def.
  992. return false;
  993. }
  994. // Record all VCTPs and check that they're equivalent to one another.
  995. if (isVCTP(MI) && !AddVCTP(MI))
  996. return false;
  997. // Inspect uses first so that any instructions that alter the VPR don't
  998. // alter the predicate upon themselves.
  999. const MCInstrDesc &MCID = MI->getDesc();
  1000. bool IsUse = false;
  1001. unsigned LastOpIdx = MI->getNumOperands() - 1;
  1002. for (auto &Op : enumerate(reverse(MCID.operands()))) {
  1003. const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
  1004. if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
  1005. continue;
  1006. if (ARM::isVpred(Op.value().OperandType)) {
  1007. VPTState::addInst(MI);
  1008. IsUse = true;
  1009. } else if (MI->getOpcode() != ARM::MVE_VPST) {
  1010. LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
  1011. return false;
  1012. }
  1013. }
  1014. // If we find an instruction that has been marked as not valid for tail
  1015. // predication, only allow the instruction if it's contained within a valid
  1016. // VPT block.
  1017. bool RequiresExplicitPredication =
  1018. (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
  1019. if (isDomainMVE(MI) && RequiresExplicitPredication) {
  1020. LLVM_DEBUG(if (!IsUse)
  1021. dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
  1022. return IsUse;
  1023. }
  1024. // If the instruction is already explicitly predicated, then the conversion
  1025. // will be fine, but ensure that all store operations are predicated.
  1026. if (MI->mayStore())
  1027. return IsUse;
  1028. // If this instruction defines the VPR, update the predicate for the
  1029. // proceeding instructions.
  1030. if (isVectorPredicate(MI)) {
  1031. // Clear the existing predicate when we're not in VPT Active state,
  1032. // otherwise we add to it.
  1033. if (!isVectorPredicated(MI))
  1034. VPTState::resetPredicate(MI);
  1035. else
  1036. VPTState::addPredicate(MI);
  1037. }
  1038. // Finally once the predicate has been modified, we can start a new VPT
  1039. // block if necessary.
  1040. if (isVPTOpcode(MI->getOpcode()))
  1041. VPTState::CreateVPTBlock(MI);
  1042. return true;
  1043. }
  1044. bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
  1045. const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
  1046. if (!ST.hasLOB())
  1047. return false;
  1048. MF = &mf;
  1049. LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
  1050. MLI = &getAnalysis<MachineLoopInfo>();
  1051. RDA = &getAnalysis<ReachingDefAnalysis>();
  1052. MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
  1053. MRI = &MF->getRegInfo();
  1054. TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
  1055. TRI = ST.getRegisterInfo();
  1056. BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
  1057. BBUtils->computeAllBlockSizes();
  1058. BBUtils->adjustBBOffsetsAfter(&MF->front());
  1059. bool Changed = false;
  1060. for (auto ML : *MLI) {
  1061. if (ML->isOutermost())
  1062. Changed |= ProcessLoop(ML);
  1063. }
  1064. Changed |= RevertNonLoops();
  1065. return Changed;
  1066. }
  1067. bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
  1068. bool Changed = false;
  1069. // Process inner loops first.
  1070. for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
  1071. Changed |= ProcessLoop(*I);
  1072. LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
  1073. if (auto *Preheader = ML->getLoopPreheader())
  1074. dbgs() << " - " << Preheader->getName() << "\n";
  1075. else if (auto *Preheader = MLI->findLoopPreheader(ML))
  1076. dbgs() << " - " << Preheader->getName() << "\n";
  1077. else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
  1078. dbgs() << " - " << Preheader->getName() << "\n";
  1079. for (auto *MBB : ML->getBlocks())
  1080. dbgs() << " - " << MBB->getName() << "\n";
  1081. );
  1082. // Search the given block for a loop start instruction. If one isn't found,
  1083. // and there's only one predecessor block, search that one too.
  1084. std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
  1085. [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
  1086. for (auto &MI : *MBB) {
  1087. if (isLoopStart(MI))
  1088. return &MI;
  1089. }
  1090. if (MBB->pred_size() == 1)
  1091. return SearchForStart(*MBB->pred_begin());
  1092. return nullptr;
  1093. };
  1094. LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
  1095. // Search the preheader for the start intrinsic.
  1096. // FIXME: I don't see why we shouldn't be supporting multiple predecessors
  1097. // with potentially multiple set.loop.iterations, so we need to enable this.
  1098. if (LoLoop.Preheader)
  1099. LoLoop.Start = SearchForStart(LoLoop.Preheader);
  1100. else
  1101. return false;
  1102. // Find the low-overhead loop components and decide whether or not to fall
  1103. // back to a normal loop. Also look for a vctp instructions and decide
  1104. // whether we can convert that predicate using tail predication.
  1105. for (auto *MBB : reverse(ML->getBlocks())) {
  1106. for (auto &MI : *MBB) {
  1107. if (MI.isDebugValue())
  1108. continue;
  1109. else if (MI.getOpcode() == ARM::t2LoopDec)
  1110. LoLoop.Dec = &MI;
  1111. else if (MI.getOpcode() == ARM::t2LoopEnd)
  1112. LoLoop.End = &MI;
  1113. else if (MI.getOpcode() == ARM::t2LoopEndDec)
  1114. LoLoop.End = LoLoop.Dec = &MI;
  1115. else if (isLoopStart(MI))
  1116. LoLoop.Start = &MI;
  1117. else if (MI.getDesc().isCall()) {
  1118. // TODO: Though the call will require LE to execute again, does this
  1119. // mean we should revert? Always executing LE hopefully should be
  1120. // faster than performing a sub,cmp,br or even subs,br.
  1121. LoLoop.Revert = true;
  1122. LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
  1123. } else {
  1124. // Record VPR defs and build up their corresponding vpt blocks.
  1125. // Check we know how to tail predicate any mve instructions.
  1126. LoLoop.AnalyseMVEInst(&MI);
  1127. }
  1128. }
  1129. }
  1130. LLVM_DEBUG(LoLoop.dump());
  1131. if (!LoLoop.FoundAllComponents()) {
  1132. LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
  1133. return false;
  1134. }
  1135. // Check that the only instruction using LoopDec is LoopEnd. This can only
  1136. // happen when the Dec and End are separate, not a single t2LoopEndDec.
  1137. // TODO: Check for copy chains that really have no effect.
  1138. if (LoLoop.Dec != LoLoop.End) {
  1139. SmallPtrSet<MachineInstr *, 2> Uses;
  1140. RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
  1141. if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
  1142. LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
  1143. LoLoop.Revert = true;
  1144. }
  1145. }
  1146. LoLoop.Validate(BBUtils.get());
  1147. Expand(LoLoop);
  1148. return true;
  1149. }
  1150. // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
  1151. // beq that branches to the exit branch.
  1152. // TODO: We could also try to generate a cbz if the value in LR is also in
  1153. // another low register.
  1154. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
  1155. LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
  1156. MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
  1157. unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
  1158. ARM::tBcc : ARM::t2Bcc;
  1159. RevertWhileLoopStart(MI, TII, BrOpc);
  1160. }
  1161. void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
  1162. LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
  1163. RevertDoLoopStart(MI, TII);
  1164. }
  1165. bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
  1166. LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
  1167. MachineBasicBlock *MBB = MI->getParent();
  1168. SmallPtrSet<MachineInstr*, 1> Ignore;
  1169. for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
  1170. if (I->getOpcode() == ARM::t2LoopEnd) {
  1171. Ignore.insert(&*I);
  1172. break;
  1173. }
  1174. }
  1175. // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
  1176. bool SetFlags =
  1177. RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
  1178. llvm::RevertLoopDec(MI, TII, SetFlags);
  1179. return SetFlags;
  1180. }
  1181. // Generate a subs, or sub and cmp, and a branch instead of an LE.
  1182. void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
  1183. LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
  1184. MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
  1185. unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
  1186. ARM::tBcc : ARM::t2Bcc;
  1187. llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
  1188. }
  1189. // Generate a subs, or sub and cmp, and a branch instead of an LE.
  1190. void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
  1191. LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
  1192. assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
  1193. MachineBasicBlock *MBB = MI->getParent();
  1194. MachineInstrBuilder MIB =
  1195. BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  1196. MIB.addDef(ARM::LR);
  1197. MIB.add(MI->getOperand(1));
  1198. MIB.addImm(1);
  1199. MIB.addImm(ARMCC::AL);
  1200. MIB.addReg(ARM::NoRegister);
  1201. MIB.addReg(ARM::CPSR);
  1202. MIB->getOperand(5).setIsDef(true);
  1203. MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
  1204. unsigned BrOpc =
  1205. BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
  1206. // Create bne
  1207. MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
  1208. MIB.add(MI->getOperand(2)); // branch target
  1209. MIB.addImm(ARMCC::NE); // condition code
  1210. MIB.addReg(ARM::CPSR);
  1211. MI->eraseFromParent();
  1212. }
  1213. // Perform dead code elimation on the loop iteration count setup expression.
  1214. // If we are tail-predicating, the number of elements to be processed is the
  1215. // operand of the VCTP instruction in the vector body, see getCount(), which is
  1216. // register $r3 in this example:
  1217. //
  1218. // $lr = big-itercount-expression
  1219. // ..
  1220. // $lr = t2DoLoopStart renamable $lr
  1221. // vector.body:
  1222. // ..
  1223. // $vpr = MVE_VCTP32 renamable $r3
  1224. // renamable $lr = t2LoopDec killed renamable $lr, 1
  1225. // t2LoopEnd renamable $lr, %vector.body
  1226. // tB %end
  1227. //
  1228. // What we would like achieve here is to replace the do-loop start pseudo
  1229. // instruction t2DoLoopStart with:
  1230. //
  1231. // $lr = MVE_DLSTP_32 killed renamable $r3
  1232. //
  1233. // Thus, $r3 which defines the number of elements, is written to $lr,
  1234. // and then we want to delete the whole chain that used to define $lr,
  1235. // see the comment below how this chain could look like.
  1236. //
  1237. void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
  1238. if (!LoLoop.IsTailPredicationLegal())
  1239. return;
  1240. LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
  1241. MachineInstr *Def =
  1242. RDA->getMIOperand(LoLoop.Start, isDo(LoLoop.Start) ? 1 : 0);
  1243. if (!Def) {
  1244. LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
  1245. return;
  1246. }
  1247. // Collect and remove the users of iteration count.
  1248. SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
  1249. LoLoop.End };
  1250. if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
  1251. LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
  1252. }
  1253. MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
  1254. LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
  1255. // When using tail-predication, try to delete the dead code that was used to
  1256. // calculate the number of loop iterations.
  1257. IterationCountDCE(LoLoop);
  1258. MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
  1259. MachineInstr *Start = LoLoop.Start;
  1260. MachineBasicBlock *MBB = LoLoop.StartInsertBB;
  1261. unsigned Opc = LoLoop.getStartOpcode();
  1262. MachineOperand &Count = LoLoop.getLoopStartOperand();
  1263. MachineInstrBuilder MIB =
  1264. BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
  1265. MIB.addDef(ARM::LR);
  1266. MIB.add(Count);
  1267. if (!isDo(Start))
  1268. MIB.add(Start->getOperand(1));
  1269. LoLoop.ToRemove.insert(Start);
  1270. LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
  1271. return &*MIB;
  1272. }
  1273. void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
  1274. auto RemovePredicate = [](MachineInstr *MI) {
  1275. if (MI->isDebugInstr())
  1276. return;
  1277. LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
  1278. int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
  1279. assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
  1280. assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
  1281. "Expected Then predicate!");
  1282. MI->getOperand(PIdx).setImm(ARMVCC::None);
  1283. MI->getOperand(PIdx + 1).setReg(0);
  1284. };
  1285. for (auto &Block : LoLoop.getVPTBlocks()) {
  1286. SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
  1287. auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
  1288. assert(TheVCMP && "Replacing a removed or non-existent VCMP");
  1289. // Replace the VCMP with a VPT
  1290. MachineInstrBuilder MIB =
  1291. BuildMI(*At->getParent(), At, At->getDebugLoc(),
  1292. TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
  1293. MIB.addImm(ARMVCC::Then);
  1294. // Register one
  1295. MIB.add(TheVCMP->getOperand(1));
  1296. // Register two
  1297. MIB.add(TheVCMP->getOperand(2));
  1298. // The comparison code, e.g. ge, eq, lt
  1299. MIB.add(TheVCMP->getOperand(3));
  1300. LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
  1301. LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
  1302. LoLoop.ToRemove.insert(TheVCMP);
  1303. TheVCMP = nullptr;
  1304. };
  1305. if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
  1306. MachineInstr *VPST = Insts.front();
  1307. if (VPTState::hasUniformPredicate(Block)) {
  1308. // A vpt block starting with VPST, is only predicated upon vctp and has no
  1309. // internal vpr defs:
  1310. // - Remove vpst.
  1311. // - Unpredicate the remaining instructions.
  1312. LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
  1313. for (unsigned i = 1; i < Insts.size(); ++i)
  1314. RemovePredicate(Insts[i]);
  1315. } else {
  1316. // The VPT block has a non-uniform predicate but it uses a vpst and its
  1317. // entry is guarded only by a vctp, which means we:
  1318. // - Need to remove the original vpst.
  1319. // - Then need to unpredicate any following instructions, until
  1320. // we come across the divergent vpr def.
  1321. // - Insert a new vpst to predicate the instruction(s) that following
  1322. // the divergent vpr def.
  1323. MachineInstr *Divergent = VPTState::getDivergent(Block);
  1324. MachineBasicBlock *MBB = Divergent->getParent();
  1325. auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
  1326. while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
  1327. ++DivergentNext;
  1328. bool DivergentNextIsPredicated =
  1329. DivergentNext != MBB->end() &&
  1330. getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
  1331. for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
  1332. I != E; ++I)
  1333. RemovePredicate(&*I);
  1334. // Check if the instruction defining vpr is a vcmp so it can be combined
  1335. // with the VPST This should be the divergent instruction
  1336. MachineInstr *VCMP =
  1337. VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
  1338. if (DivergentNextIsPredicated) {
  1339. // Insert a VPST at the divergent only if the next instruction
  1340. // would actually use it. A VCMP following a VPST can be
  1341. // merged into a VPT so do that instead if the VCMP exists.
  1342. if (!VCMP) {
  1343. // Create a VPST (with a null mask for now, we'll recompute it
  1344. // later)
  1345. MachineInstrBuilder MIB =
  1346. BuildMI(*Divergent->getParent(), Divergent,
  1347. Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
  1348. MIB.addImm(0);
  1349. LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
  1350. LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
  1351. } else {
  1352. // No RDA checks are necessary here since the VPST would have been
  1353. // directly after the VCMP
  1354. ReplaceVCMPWithVPT(VCMP, VCMP);
  1355. }
  1356. }
  1357. }
  1358. LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
  1359. LoLoop.ToRemove.insert(VPST);
  1360. } else if (Block.containsVCTP()) {
  1361. // The vctp will be removed, so either the entire block will be dead or
  1362. // the block mask of the vp(s)t will need to be recomputed.
  1363. MachineInstr *VPST = Insts.front();
  1364. if (Block.size() == 2) {
  1365. assert(VPST->getOpcode() == ARM::MVE_VPST &&
  1366. "Found a VPST in an otherwise empty vpt block");
  1367. LoLoop.ToRemove.insert(VPST);
  1368. } else
  1369. LoLoop.BlockMasksToRecompute.insert(VPST);
  1370. } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
  1371. // If this block starts with a VPST then attempt to merge it with the
  1372. // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
  1373. // block that no longer exists
  1374. MachineInstr *VPST = Insts.front();
  1375. auto Next = ++MachineBasicBlock::iterator(VPST);
  1376. assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
  1377. "The instruction after a VPST must be predicated");
  1378. (void)Next;
  1379. MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
  1380. if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
  1381. !LoLoop.ToRemove.contains(VprDef)) {
  1382. MachineInstr *VCMP = VprDef;
  1383. // The VCMP and VPST can only be merged if the VCMP's operands will have
  1384. // the same values at the VPST.
  1385. // If any of the instructions between the VCMP and VPST are predicated
  1386. // then a different code path is expected to have merged the VCMP and
  1387. // VPST already.
  1388. if (!std::any_of(++MachineBasicBlock::iterator(VCMP),
  1389. MachineBasicBlock::iterator(VPST), hasVPRUse) &&
  1390. RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
  1391. RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
  1392. ReplaceVCMPWithVPT(VCMP, VPST);
  1393. LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
  1394. LoLoop.ToRemove.insert(VPST);
  1395. }
  1396. }
  1397. }
  1398. }
  1399. LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
  1400. }
  1401. void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
  1402. // Combine the LoopDec and LoopEnd instructions into LE(TP).
  1403. auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
  1404. MachineInstr *End = LoLoop.End;
  1405. MachineBasicBlock *MBB = End->getParent();
  1406. unsigned Opc = LoLoop.IsTailPredicationLegal() ?
  1407. ARM::MVE_LETP : ARM::t2LEUpdate;
  1408. MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
  1409. TII->get(Opc));
  1410. MIB.addDef(ARM::LR);
  1411. unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
  1412. MIB.add(End->getOperand(Off + 0));
  1413. MIB.add(End->getOperand(Off + 1));
  1414. LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
  1415. LoLoop.ToRemove.insert(LoLoop.Dec);
  1416. LoLoop.ToRemove.insert(End);
  1417. return &*MIB;
  1418. };
  1419. // TODO: We should be able to automatically remove these branches before we
  1420. // get here - probably by teaching analyzeBranch about the pseudo
  1421. // instructions.
  1422. // If there is an unconditional branch, after I, that just branches to the
  1423. // next block, remove it.
  1424. auto RemoveDeadBranch = [](MachineInstr *I) {
  1425. MachineBasicBlock *BB = I->getParent();
  1426. MachineInstr *Terminator = &BB->instr_back();
  1427. if (Terminator->isUnconditionalBranch() && I != Terminator) {
  1428. MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
  1429. if (BB->isLayoutSuccessor(Succ)) {
  1430. LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
  1431. Terminator->eraseFromParent();
  1432. }
  1433. }
  1434. };
  1435. if (LoLoop.Revert) {
  1436. if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
  1437. RevertWhile(LoLoop.Start);
  1438. else
  1439. RevertDo(LoLoop.Start);
  1440. if (LoLoop.Dec == LoLoop.End)
  1441. RevertLoopEndDec(LoLoop.End);
  1442. else
  1443. RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
  1444. } else {
  1445. LoLoop.Start = ExpandLoopStart(LoLoop);
  1446. RemoveDeadBranch(LoLoop.Start);
  1447. LoLoop.End = ExpandLoopEnd(LoLoop);
  1448. RemoveDeadBranch(LoLoop.End);
  1449. if (LoLoop.IsTailPredicationLegal())
  1450. ConvertVPTBlocks(LoLoop);
  1451. for (auto *I : LoLoop.ToRemove) {
  1452. LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
  1453. I->eraseFromParent();
  1454. }
  1455. for (auto *I : LoLoop.BlockMasksToRecompute) {
  1456. LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
  1457. recomputeVPTBlockMask(*I);
  1458. LLVM_DEBUG(dbgs() << " ... done: " << *I);
  1459. }
  1460. }
  1461. PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
  1462. DFS.ProcessLoop();
  1463. const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
  1464. for (auto *MBB : PostOrder) {
  1465. recomputeLiveIns(*MBB);
  1466. // FIXME: For some reason, the live-in print order is non-deterministic for
  1467. // our tests and I can't out why... So just sort them.
  1468. MBB->sortUniqueLiveIns();
  1469. }
  1470. for (auto *MBB : reverse(PostOrder))
  1471. recomputeLivenessFlags(*MBB);
  1472. // We've moved, removed and inserted new instructions, so update RDA.
  1473. RDA->reset();
  1474. }
  1475. bool ARMLowOverheadLoops::RevertNonLoops() {
  1476. LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
  1477. bool Changed = false;
  1478. for (auto &MBB : *MF) {
  1479. SmallVector<MachineInstr*, 4> Starts;
  1480. SmallVector<MachineInstr*, 4> Decs;
  1481. SmallVector<MachineInstr*, 4> Ends;
  1482. SmallVector<MachineInstr *, 4> EndDecs;
  1483. for (auto &I : MBB) {
  1484. if (isLoopStart(I))
  1485. Starts.push_back(&I);
  1486. else if (I.getOpcode() == ARM::t2LoopDec)
  1487. Decs.push_back(&I);
  1488. else if (I.getOpcode() == ARM::t2LoopEnd)
  1489. Ends.push_back(&I);
  1490. else if (I.getOpcode() == ARM::t2LoopEndDec)
  1491. EndDecs.push_back(&I);
  1492. }
  1493. if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty())
  1494. continue;
  1495. Changed = true;
  1496. for (auto *Start : Starts) {
  1497. if (Start->getOpcode() == ARM::t2WhileLoopStart)
  1498. RevertWhile(Start);
  1499. else
  1500. RevertDo(Start);
  1501. }
  1502. for (auto *Dec : Decs)
  1503. RevertLoopDec(Dec);
  1504. for (auto *End : Ends)
  1505. RevertLoopEnd(End);
  1506. for (auto *End : EndDecs)
  1507. RevertLoopEndDec(End);
  1508. }
  1509. return Changed;
  1510. }
  1511. FunctionPass *llvm::createARMLowOverheadLoopsPass() {
  1512. return new ARMLowOverheadLoops();
  1513. }