//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
/// instructions into machine operations.
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
///   form should be in the preheader, whereas the while form should be in the
///   preheader's only predecessor.
/// - t2LoopDec - placed within the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
/// In addition to this, we also look for the presence of the VCTP instruction,
/// which determines whether we can generate the tail-predicated low-overhead
/// loop form.
///
/// Assumptions and Dependencies:
/// Low-overhead loops are constructed and executed using a setup instruction:
/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
/// but fixed polarity: WLS can only branch forwards and LE can only branch
/// backwards. These restrictions mean that this pass is dependent upon block
/// layout and block sizes, which is why it's the last pass to run. The same is
/// true for ConstantIslands, but this pass does not increase the size of the
/// basic blocks, nor does it change the CFG. Instructions are mainly removed
/// during the transform and pseudo instructions are replaced by real ones. In
/// some cases, when we have to revert to a 'normal' loop, we have to introduce
/// multiple instructions for a single pseudo (see RevertWhile and
/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
///
/// A note on VPR.P0 (the lane mask):
/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
/// They will simply "and" the result of their calculation with the current
/// value of VPR.P0. You can think of it like this:
/// \verbatim
/// if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
///   VPR.P0 &= Value
/// else
///   VPR.P0 = Value
/// \endverbatim
/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
/// fall in the "VPT active" case, so we can consider that any VPR write by
/// one of those instructions is actually an "and".
//===----------------------------------------------------------------------===//
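//
// For illustration only (a hand-written sketch, not output captured from this
// pass): a memcpy-style loop lowered to the tail-predicated form could look
// like:
//
//       dlstp.32 lr, r2           ; start loop, r2 = element count
//     loop:
//       vldrw.u32 q0, [r0], #16
//       vstrw.32  q0, [r1], #16
//       letp lr, loop             ; decrement element count, branch back
//
// The DLSTP/LETP pair implicitly predicates the loop body via VPR.P0, so no
// explicit VCTP or scalar iteration-count arithmetic remains in the loop.
//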
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"

using namespace llvm;

#define DEBUG_TYPE "arm-low-overhead-loops"
#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"

static cl::opt<bool>
DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
    cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
    cl::init(false));

static cl::opt<bool>
DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
               cl::desc("Disable omitting 'dls lr, lr' instructions"),
               cl::init(false));

static bool isVectorPredicated(MachineInstr *MI) {
  int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
  return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
}

static bool isVectorPredicate(MachineInstr *MI) {
  return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
}

static bool hasVPRUse(MachineInstr &MI) {
  return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
}

static bool isDomainMVE(MachineInstr *MI) {
  uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
  return Domain == ARMII::DomainMVE;
}

static int getVecSize(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
}

static bool shouldInspect(MachineInstr &MI) {
  if (MI.isDebugInstr())
    return false;
  return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
}

namespace {

using InstSet = SmallPtrSetImpl<MachineInstr *>;

class PostOrderLoopTraversal {
  MachineLoop &ML;
  MachineLoopInfo &MLI;
  SmallPtrSet<MachineBasicBlock*, 4> Visited;
  SmallVector<MachineBasicBlock*, 4> Order;

public:
  PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
    : ML(ML), MLI(MLI) { }

  const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
    return Order;
  }

  // Visit all the blocks within the loop, as well as exit blocks and any
  // blocks properly dominating the header.
  void ProcessLoop() {
    std::function<void(MachineBasicBlock*)> Search = [this, &Search]
      (MachineBasicBlock *MBB) -> void {
      if (Visited.count(MBB))
        return;

      Visited.insert(MBB);
      for (auto *Succ : MBB->successors()) {
        if (!ML.contains(Succ))
          continue;
        Search(Succ);
      }
      Order.push_back(MBB);
    };

    // Insert exit blocks.
    SmallVector<MachineBasicBlock*, 2> ExitBlocks;
    ML.getExitBlocks(ExitBlocks);
    append_range(Order, ExitBlocks);

    // Then add the loop body.
    Search(ML.getHeader());

    // Then try the preheader and its predecessors.
    std::function<void(MachineBasicBlock*)> GetPredecessor =
      [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
      Order.push_back(MBB);
      if (MBB->pred_size() == 1)
        GetPredecessor(*MBB->pred_begin());
    };

    if (auto *Preheader = ML.getLoopPreheader())
      GetPredecessor(Preheader);
    else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
      GetPredecessor(Preheader);
  }
};

struct PredicatedMI {
  MachineInstr *MI = nullptr;
  SetVector<MachineInstr*> Predicates;

public:
  PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
    assert(I && "Instruction must not be null!");
    Predicates.insert(Preds.begin(), Preds.end());
  }
};

// Represent the current state of the VPR and hold all instances which
// represent a VPT block, which is a list of instructions that begins with a
// VPT/VPST and has a maximum of four following, predicated instructions. All
// instructions within the block are predicated upon the vpr and we allow
// instructions to define the vpr within the block too.
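//
// For example (an illustrative, hand-written sketch of MVE assembly, not
// produced by this file), a VPT block with two 'then'-predicated
// instructions:
//
//   vptt.i32 eq, q0, q1   ; computes VPR.P0 and opens the block
//   vaddt.i32 q2, q2, q3  ; predicated on the block's mask
//   vsubt.i32 q4, q4, q5  ; predicated on the block's mask
//
// A VPST instead opens a block that reuses the current value of VPR.P0
// rather than computing a new predicate.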
class VPTState {
  friend struct LowOverheadLoop;

  SmallVector<MachineInstr *, 4> Insts;

  static SmallVector<VPTState, 4> Blocks;
  static SetVector<MachineInstr *> CurrentPredicates;
  static std::map<MachineInstr *,
    std::unique_ptr<PredicatedMI>> PredicatedInsts;

  static void CreateVPTBlock(MachineInstr *MI) {
    assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
           && "Can't begin VPT without predicate");
    Blocks.emplace_back(MI);
    // The execution of MI is predicated upon the current set of instructions
    // that are AND'ed together to form the VPR predicate value. In the case
    // that MI is a VPT, CurrentPredicates will also just be MI.
    PredicatedInsts.emplace(
        MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
  }

  static void reset() {
    Blocks.clear();
    PredicatedInsts.clear();
    CurrentPredicates.clear();
  }

  static void addInst(MachineInstr *MI) {
    Blocks.back().insert(MI);
    PredicatedInsts.emplace(
        MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
  }

  static void addPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
    CurrentPredicates.insert(MI);
  }

  static void resetPredicate(MachineInstr *MI) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
    CurrentPredicates.clear();
    CurrentPredicates.insert(MI);
  }

public:
  // Have we found an instruction within the block which defines the vpr? If
  // so, not all the instructions in the block will have the same predicate.
  static bool hasUniformPredicate(VPTState &Block) {
    return getDivergent(Block) == nullptr;
  }

  // If it exists, return the first internal instruction which modifies the
  // VPR.
  static MachineInstr *getDivergent(VPTState &Block) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    for (unsigned i = 1; i < Insts.size(); ++i) {
      MachineInstr *Next = Insts[i];
      if (isVectorPredicate(Next))
        return Next; // Found an instruction altering the vpr.
    }
    return nullptr;
  }

  // Return whether the given instruction is predicated upon a VCTP.
  static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
    SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
    if (Exclusive && Predicates.size() != 1)
      return false;
    return llvm::any_of(Predicates, isVCTP);
  }

  // Is the VPST, controlling the block entry, predicated upon a VCTP?
  static bool isEntryPredicatedOnVCTP(VPTState &Block,
                                      bool Exclusive = false) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    return isPredicatedOnVCTP(Insts.front(), Exclusive);
  }

  // If this block begins with a VPT, we can check whether it's using
  // at least one predicated input, as well as possibly loop-invariant
  // values, which would result in it being implicitly predicated.
  static bool hasImplicitlyValidVPT(VPTState &Block,
                                    ReachingDefAnalysis &RDA) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
    MachineInstr *VPT = Insts.front();
    assert(isVPTOpcode(VPT->getOpcode()) &&
           "Expected VPT block to begin with VPT/VPST");

    if (VPT->getOpcode() == ARM::MVE_VPST)
      return false;

    auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
      MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
      return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
    };

    auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
      MachineOperand &MO = MI->getOperand(Idx);
      if (!MO.isReg() || !MO.getReg())
        return true;

      SmallPtrSet<MachineInstr *, 2> Defs;
      RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
      if (Defs.empty())
        return true;

      for (auto *Def : Defs)
        if (Def->getParent() == VPT->getParent())
          return false;
      return true;
    };

    // Check that at least one of the operands is directly predicated on a
    // vctp and allow an invariant value too.
    return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
           (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
           (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
  }
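
  // As an illustrative example (hand-written, not taken from this file):
  //
  //   %q0 = load predicated on a VCTP       ; false lanes are zeroed
  //   %q1 = <def outside the VPT's block>   ; invariant here
  //   VPT.s32 gt, %q0, %q1                  ; compares a predicated input
  //
  // The VPT's result in the lanes beyond the VCTP mask only depends on
  // zeroed or invariant data, so the block it opens behaves as if it were
  // predicated on the VCTP.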
  static bool isValid(ReachingDefAnalysis &RDA) {
    // All predication within the loop should be based on vctp. If the block
    // isn't predicated on entry, check whether the vctp is within the block
    // and that all other instructions are then predicated on it.
    for (auto &Block : Blocks) {
      if (isEntryPredicatedOnVCTP(Block, false) ||
          hasImplicitlyValidVPT(Block, RDA))
        continue;

      SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();

      // We don't know how to convert a block with just a VPT;VCTP into
      // anything valid once we remove the VCTP. For now just bail out.
      assert(isVPTOpcode(Insts.front()->getOpcode()) &&
             "Expected VPT block to start with a VPST or VPT!");
      if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
          isVCTP(Insts.back()))
        return false;

      for (auto *MI : Insts) {
        // Check that any internal VCTPs are 'Then' predicated.
        if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
          return false;
        // Skip other instructions that build up the predicate.
        if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
          continue;
        // Check that any other instructions are predicated upon a vctp.
        // TODO: We could infer when VPTs are implicitly predicated on the
        // vctp (when the operands are predicated).
        if (!isPredicatedOnVCTP(MI)) {
          LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
          return false;
        }
      }
    }
    return true;
  }

  VPTState(MachineInstr *MI) { Insts.push_back(MI); }

  void insert(MachineInstr *MI) {
    Insts.push_back(MI);
    // VPT/VPST + 4 predicated instructions.
    assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
  }

  bool containsVCTP() const {
    return llvm::any_of(Insts, isVCTP);
  }

  unsigned size() const { return Insts.size(); }
  SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
};

struct LowOverheadLoop {

  MachineLoop &ML;
  MachineBasicBlock *Preheader = nullptr;
  MachineLoopInfo &MLI;
  ReachingDefAnalysis &RDA;
  const TargetRegisterInfo &TRI;
  const ARMBaseInstrInfo &TII;
  MachineFunction *MF = nullptr;
  MachineBasicBlock::iterator StartInsertPt;
  MachineBasicBlock *StartInsertBB = nullptr;
  MachineInstr *Start = nullptr;
  MachineInstr *Dec = nullptr;
  MachineInstr *End = nullptr;
  MachineOperand TPNumElements;
  SmallVector<MachineInstr *, 4> VCTPs;
  SmallPtrSet<MachineInstr *, 4> ToRemove;
  SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
  SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
  SmallPtrSet<MachineInstr *, 4> VMOVCopies;
  bool Revert = false;
  bool CannotTailPredicate = false;

  LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
                  ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
                  const ARMBaseInstrInfo &TII)
      : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
        TPNumElements(MachineOperand::CreateImm(0)) {
    MF = ML.getHeader()->getParent();
    if (auto *MBB = ML.getLoopPreheader())
      Preheader = MBB;
    else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
      Preheader = MBB;
    VPTState::reset();
  }

  // If this is an MVE instruction, check that we know how to use tail
  // predication with it. Record VPT blocks and return whether the
  // instruction is valid for tail predication.
  bool ValidateMVEInst(MachineInstr *MI);

  void AnalyseMVEInst(MachineInstr *MI) {
    CannotTailPredicate = !ValidateMVEInst(MI);
  }

  bool IsTailPredicationLegal() const {
    // For now, let's keep things really simple and only support a single
    // block for tail predication.
    return !Revert && FoundAllComponents() && !VCTPs.empty() &&
           !CannotTailPredicate && ML.getNumBlocks() == 1;
  }

  // Given that MI is a VCTP, check that it is equivalent to any other VCTPs
  // found.
  bool AddVCTP(MachineInstr *MI);

  // Check that the predication in the loop will be equivalent once we
  // perform the conversion. Also ensure that we can provide the number
  // of elements to the loop start instruction.
  bool ValidateTailPredicate();

  // Check that any values available outside of the loop will be the same
  // after tail predication conversion.
  bool ValidateLiveOuts();

  // Is it safe to define LR with DLS/WLS?
  // LR can be defined if it is the operand to start, because it's the same
  // value, or if it's going to be equivalent to the operand to Start.
  MachineInstr *isSafeToDefineLR();

  // Check the branch targets are within range and we satisfy our
  // restrictions.
  void Validate(ARMBasicBlockUtils *BBUtils);

  bool FoundAllComponents() const {
    return Start && Dec && End;
  }

  SmallVectorImpl<VPTState> &getVPTBlocks() {
    return VPTState::Blocks;
  }

  // Return the operand for the loop start instruction. This will be the loop
  // iteration count, or the number of elements if we're tail predicating.
  MachineOperand &getLoopStartOperand() {
    if (IsTailPredicationLegal())
      return TPNumElements;
    return Start->getOperand(1);
  }
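
  // Illustrative mapping (spelled out here for exposition): a do-loop start
  // paired with a 32-bit VCTP becomes DLSTP.32, a while-loop start with a
  // 16-bit VCTP becomes WLSTP.16, and when tail predication isn't legal we
  // fall back to the plain t2DLS/t2WLS forms, as selected below.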
  unsigned getStartOpcode() const {
    bool IsDo = isDoLoopStart(*Start);
    if (!IsTailPredicationLegal())
      return IsDo ? ARM::t2DLS : ARM::t2WLS;

    return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
  }

  void dump() const {
    if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
    if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
    if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
    if (!VCTPs.empty()) {
      dbgs() << "ARM Loops: Found VCTP(s):\n";
      for (auto *MI : VCTPs)
        dbgs() << " - " << *MI;
    }
    if (!FoundAllComponents())
      dbgs() << "ARM Loops: Not a low-overhead loop.\n";
    else if (!(Start && Dec && End))
      dbgs() << "ARM Loops: Failed to find all loop components.\n";
  }
};

class ARMLowOverheadLoops : public MachineFunctionPass {
  MachineFunction *MF = nullptr;
  MachineLoopInfo *MLI = nullptr;
  ReachingDefAnalysis *RDA = nullptr;
  const ARMBaseInstrInfo *TII = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;

public:
  static char ID;

  ARMLowOverheadLoops() : MachineFunctionPass(ID) { }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    AU.addRequired<ReachingDefAnalysis>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs).set(
        MachineFunctionProperties::Property::TracksLiveness);
  }

  StringRef getPassName() const override {
    return ARM_LOW_OVERHEAD_LOOPS_NAME;
  }

private:
  bool ProcessLoop(MachineLoop *ML);

  bool RevertNonLoops();

  void RevertWhile(MachineInstr *MI) const;
  void RevertDo(MachineInstr *MI) const;

  bool RevertLoopDec(MachineInstr *MI) const;

  void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;

  void RevertLoopEndDec(MachineInstr *MI) const;

  void ConvertVPTBlocks(LowOverheadLoop &LoLoop);

  MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);

  void Expand(LowOverheadLoop &LoLoop);

  void IterationCountDCE(LowOverheadLoop &LoLoop);
};
} // namespace

char ARMLowOverheadLoops::ID = 0;

SmallVector<VPTState, 4> VPTState::Blocks;
SetVector<MachineInstr *> VPTState::CurrentPredicates;
std::map<MachineInstr *,
         std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;

INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
                false, false)

static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
                      InstSet &ToRemove, InstSet &Ignore) {

  // Check that we can remove all of Killed without having to modify any IT
  // blocks.
  auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
    // Collect the dead code and the MBBs in which they reside.
    SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
    for (auto *Dead : Killed)
      BasicBlocks.insert(Dead->getParent());

    // Collect IT blocks in all affected basic blocks.
    std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
    for (auto *MBB : BasicBlocks) {
      for (auto &IT : *MBB) {
        if (IT.getOpcode() != ARM::t2IT)
          continue;
        RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
                                 ITBlocks[&IT]);
      }
    }

    // If we're removing all of the instructions within an IT block, then
    // also remove the IT instruction.
    SmallPtrSet<MachineInstr *, 2> ModifiedITs;
    SmallPtrSet<MachineInstr *, 2> RemoveITs;
    for (auto *Dead : Killed) {
      if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
        MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
        RemoveITs.insert(IT);
        auto &CurrentBlock = ITBlocks[IT];
        CurrentBlock.erase(Dead);
        if (CurrentBlock.empty())
          ModifiedITs.erase(IT);
        else
          ModifiedITs.insert(IT);
      }
    }
    if (!ModifiedITs.empty())
      return false;
    Killed.insert(RemoveITs.begin(), RemoveITs.end());
    return true;
  };

  SmallPtrSet<MachineInstr *, 2> Uses;
  if (!RDA.isSafeToRemove(MI, Uses, Ignore))
    return false;

  if (WontCorruptITs(Uses, RDA)) {
    ToRemove.insert(Uses.begin(), Uses.end());
    LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
               << " - can also remove:\n";
               for (auto *Use : Uses)
                 dbgs() << " - " << *Use);

    SmallPtrSet<MachineInstr*, 4> Killed;
    RDA.collectKilledOperands(MI, Killed);
    if (WontCorruptITs(Killed, RDA)) {
      ToRemove.insert(Killed.begin(), Killed.end());
      LLVM_DEBUG(for (auto *Dead : Killed)
                   dbgs() << " - " << *Dead);
    }
    return true;
  }
  return false;
}

bool LowOverheadLoop::ValidateTailPredicate() {
  if (!IsTailPredicationLegal()) {
    LLVM_DEBUG(if (VCTPs.empty())
                 dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
               dbgs() << "ARM Loops: Tail-predication is not valid.\n");
    return false;
  }

  assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
  assert(ML.getBlocks().size() == 1 &&
         "Shouldn't be processing a loop with more than one block");

  if (DisableTailPredication) {
    LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
    return false;
  }

  if (!VPTState::isValid(RDA)) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
    return false;
  }

  if (!ValidateLiveOuts()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
    return false;
  }

  // For tail predication, we need to provide the number of elements, instead
  // of the iteration count, to the loop start instruction. The number of
  // elements is provided to the vctp instruction, so we need to check that
  // we can use this register at InsertPt.
  MachineInstr *VCTP = VCTPs.back();
  if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
      Start->getOpcode() == ARM::t2WhileLoopStartTP) {
    TPNumElements = Start->getOperand(2);
    StartInsertPt = Start;
    StartInsertBB = Start->getParent();
  } else {
    TPNumElements = VCTP->getOperand(1);
    MCRegister NumElements = TPNumElements.getReg().asMCReg();

    // If the register is defined within the loop, then we can't perform TP.
    // TODO: Check whether this is just a mov of a register that would be
    // available.
    if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
      return false;
    }

    // The element count register may be defined after InsertPt, in which case
    // we need to try to move either InsertPt or the def so that the [w|d]lstp
    // can use the value.
    if (StartInsertPt != StartInsertBB->end() &&
        !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
      if (auto *ElemDef =
              RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
        if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
          ElemDef->removeFromParent();
          StartInsertBB->insert(StartInsertPt, ElemDef);
          LLVM_DEBUG(dbgs()
                     << "ARM Loops: Moved element count def: " << *ElemDef);
        } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
          StartInsertPt->removeFromParent();
          StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
                                     &*StartInsertPt);
          LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
        } else {
          // If we fail to move an instruction and the element count is
          // provided by a mov, use the mov operand if it will have the same
          // value at the insertion point.
          MachineOperand Operand = ElemDef->getOperand(1);
          if (isMovRegOpcode(ElemDef->getOpcode()) &&
              RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
                  RDA.getUniqueReachingMIDef(&*StartInsertPt,
                                             Operand.getReg().asMCReg())) {
            TPNumElements = Operand;
            NumElements = TPNumElements.getReg();
          } else {
            LLVM_DEBUG(dbgs()
                       << "ARM Loops: Unable to move element count to loop "
                       << "start instruction.\n");
            return false;
          }
        }
      }
    }

    // Especially in the case of while loops, InsertBB may not be the
    // preheader, so we need to check that the register isn't redefined
    // before entering the loop.
    auto CannotProvideElements = [this](MachineBasicBlock *MBB,
                                        MCRegister NumElements) {
      if (MBB->empty())
        return false;
      // NumElements is redefined in this block.
      if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
        return true;

      // Don't continue searching up through multiple predecessors.
      if (MBB->pred_size() > 1)
        return true;

      return false;
    };

    // Search backwards for a def, until we get to InsertBB.
    MachineBasicBlock *MBB = Preheader;
    while (MBB && MBB != StartInsertBB) {
      if (CannotProvideElements(MBB, NumElements)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
        return false;
      }
      MBB = *MBB->pred_begin();
    }
  }

  // Could inserting the [W|D]LSTP cause some unintended effects? In a perfect
  // world the [w|d]lstp instruction would be the last instruction in the
  // preheader and so it would only affect instructions within the loop body.
  // But due to scheduling, and/or the logic in this pass (above), the
  // insertion point can be moved earlier. So if the Loop Start isn't the last
  // instruction in the preheader, and if the initial element count is smaller
  // than the vector width, the Loop Start instruction will immediately
  // generate one or more false lane masks which can, incorrectly, affect the
  // following MVE instructions in the preheader.
  if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
    return false;
  }

  // Any DoubleWidthResultInstrs found whilst scanning instructions need to
  // compute an output size that is smaller than the size the VCTP mask
  // operates on. The VecSize of the DoubleWidthResult is the larger vector
  // size - the size it extends into - so any VecSize <= the VCTP's VecSize
  // is valid.
  unsigned VCTPVecSize = getVecSize(*VCTP);
  for (MachineInstr *MI : DoubleWidthResultInstrs) {
    unsigned InstrVecSize = getVecSize(*MI);
    if (InstrVecSize > VCTPVecSize) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
                        << "VecSize:\n" << *MI);
      return false;
    }
  }

  // Check that the value change of the element count is what we expect and
  // that the predication will be equivalent. For this we need:
  // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
  // and we can also allow register copies within the chain too.
  auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
    return -getAddSubImmediate(*MI) == ExpectedVecWidth;
  };
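
  // For example (an illustrative, hand-written MIR sketch of the expected
  // chain for a 32-bit VCTP, where the vector width is 4 elements):
  //
  //   $r3 = tMOVr $r2, ...       ; register copies are allowed ...
  //   $r2 = t2SUBri $r3, 4, ...  ; ... around a single sub of the width
  //
  // Exactly one sub, whose immediate matches ExpectedVecWidth, may appear
  // in the chain.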
  MachineBasicBlock *MBB = VCTP->getParent();
  // Remove modifications to the element count since they have no purpose in
  // a tail predicated loop. Explicitly refer to the vctp operand no matter
  // which register NumElements has been assigned to, since that is what the
  // modifications will be using.
  if (auto *Def = RDA.getUniqueReachingMIDef(
          &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
    SmallPtrSet<MachineInstr*, 2> ElementChain;
    SmallPtrSet<MachineInstr*, 2> Ignore;
    unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());

    Ignore.insert(VCTPs.begin(), VCTPs.end());

    if (TryRemove(Def, RDA, ElementChain, Ignore)) {
      bool FoundSub = false;

      for (auto *MI : ElementChain) {
        if (isMovRegOpcode(MI->getOpcode()))
          continue;

        if (isSubImmOpcode(MI->getOpcode())) {
          if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
            LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
                       " count: " << *MI);
            return false;
          }
          FoundSub = true;
        } else {
          LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
                     " count: " << *MI);
          return false;
        }
      }
      ToRemove.insert(ElementChain.begin(), ElementChain.end());
    }
  }

  // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
  // can also remove any extra instructions in the preheader, which often
  // includes a now unused MOV.
  if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
       Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
      Preheader && !Preheader->empty() &&
      !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
    if (auto *Def = RDA.getUniqueReachingMIDef(
            &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
      SmallPtrSet<MachineInstr*, 2> Ignore;
      Ignore.insert(VCTPs.begin(), VCTPs.end());
      TryRemove(Def, RDA, ToRemove, Ignore);
    }
  }

  return true;
}

static bool isRegInClass(const MachineOperand &MO,
                         const TargetRegisterClass *Class) {
  return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
}

// MVE 'narrowing' operations operate on half a lane, reading from half and
// writing to half, which are referred to as the top and bottom halves. The
// other half retains its previous value.
static bool retainsPreviousHalfElement(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
}

// Some MVE instructions read from the top/bottom halves of their operand(s)
// and generate a vector result with result elements that are double the
// width of the input.
static bool producesDoubleWidthResult(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::DoubleWidthResult) != 0;
}

static bool isHorizontalReduction(const MachineInstr &MI) {
  const MCInstrDesc &MCID = MI.getDesc();
  uint64_t Flags = MCID.TSFlags;
  return (Flags & ARMII::HorizontalReduction) != 0;
}

// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, that the result will also contain zeros in those
// bytes.
static bool canGenerateNonZeros(const MachineInstr &MI) {

  // Check for instructions which can write into a larger element size,
  // possibly writing into a previous zero'd lane.
  if (producesDoubleWidthResult(MI))
    return true;

  switch (MI.getOpcode()) {
  default:
    break;
  // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
  // fp16 -> fp32 vector conversions.
  // Instructions that perform a NOT will generate 1s from 0s.
  case ARM::MVE_VMVN:
  case ARM::MVE_VORN:
  // Count leading zeros will do just that!
  case ARM::MVE_VCLZs8:
  case ARM::MVE_VCLZs16:
  case ARM::MVE_VCLZs32:
    return true;
  }
  return false;
}
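
// For example, a bitwise NOT maps a zeroed lane 0x00000000 to 0xFFFFFFFF,
// and a count-leading-zeros maps it to 32, so neither VMVN nor VCLZ can be
// trusted to preserve zeros in falsely predicated lanes.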
// Look at its register uses to see if it can only receive zeros into its
// false lanes which would then produce zeros. Also check that the output
// register is also defined by a FalseLanesZero instruction so that if
// tail-predication happens, the lanes that aren't updated will still be
// zeros.
static bool producesFalseLanesZero(MachineInstr &MI,
                                   const TargetRegisterClass *QPRs,
                                   const ReachingDefAnalysis &RDA,
                                   InstSet &FalseLanesZero) {
  if (canGenerateNonZeros(MI))
    return false;

  bool isPredicated = isVectorPredicated(&MI);
  // Predicated loads will write zeros to the falsely predicated bytes of the
  // destination register.
  if (MI.mayLoad())
    return isPredicated;

  auto IsZeroInit = [](MachineInstr *Def) {
    return !isVectorPredicated(Def) &&
           Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
           Def->getOperand(1).getImm() == 0;
  };

  bool AllowScalars = isHorizontalReduction(MI);
  for (auto &MO : MI.operands()) {
    if (!MO.isReg() || !MO.getReg())
      continue;
    if (!isRegInClass(MO, QPRs) && AllowScalars)
      continue;
    // Skip the lr predicate reg
    int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
    if (PIdx != -1 && (int)MI.getOperandNo(&MO) == PIdx + 2)
      continue;

    // Check that this instruction will produce zeros in its false lanes:
    // - If it only consumes false lanes zero or constant 0 (vmov #0)
    // - If it's predicated, it only matters that its def register already has
    //   false lane zeros, so we can ignore the uses.
    SmallPtrSet<MachineInstr *, 2> Defs;
    RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
    if (Defs.empty())
      return false;
    for (auto *Def : Defs) {
      if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
        continue;
      if (MO.isUse() && isPredicated)
        continue;
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
  return true;
}

bool LowOverheadLoop::ValidateLiveOuts() {
  // We want to find out if the tail-predicated version of this loop will
  // produce the same values as the loop in its original form. For this to
  // be true, the newly inserted implicit predication must not change the
  // (observable) results.
  // We're doing this because many instructions in the loop will not be
  // predicated and so the conversion from VPT predication to tail-predication
  // can result in different values being produced; due to the tail-predication
  // preventing many instructions from updating their falsely predicated
  // lanes. This analysis assumes that all the instructions perform lane-wise
  // operations and don't perform any exchanges.
  // A masked load, whether through VPT or tail predication, will write zeros
  // to any of the falsely predicated bytes. So, from the loads, we know that
  // the false lanes are zeroed and here we're trying to track that those false
  // lanes remain zero, or where they change, the differences are masked away
  // by their user(s).
  // All MVE stores have to be predicated, so we know that any predicated load
  // operands, or stored results are equivalent already. Other explicitly
  // predicated instructions will perform the same operation in the original
  // loop and the tail-predicated form too. Because of this, we can insert
  // loads, stores and other predicated instructions into our Predicated
  // set and build from there.
  const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
  SetVector<MachineInstr *> FalseLanesUnknown;
  SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
  SmallPtrSet<MachineInstr *, 4> Predicated;
  MachineBasicBlock *Header = ML.getHeader();

  LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");

  for (auto &MI : *Header) {
    if (!shouldInspect(MI))
      continue;

    if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
      continue;

    bool isPredicated = isVectorPredicated(&MI);
    bool retainsOrReduces =
      retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);

    if (isPredicated)
      Predicated.insert(&MI);
    if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
      FalseLanesZero.insert(&MI);
    else if (MI.getNumDefs() == 0)
      continue;
    else if (!isPredicated && retainsOrReduces) {
      LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
      return false;
    } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
      FalseLanesUnknown.insert(&MI);
  }

  LLVM_DEBUG({
    dbgs() << " Predicated:\n";
    for (auto *I : Predicated)
      dbgs() << " " << *I;
    dbgs() << " FalseLanesZero:\n";
    for (auto *I : FalseLanesZero)
      dbgs() << " " << *I;
    dbgs() << " FalseLanesUnknown:\n";
    for (auto *I : FalseLanesUnknown)
      dbgs() << " " << *I;
  });

  auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
                                   SmallPtrSetImpl<MachineInstr *> &Predicated) {
    SmallPtrSet<MachineInstr *, 2> Uses;
    RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
    for (auto *Use : Uses) {
      if (Use != MI && !Predicated.count(Use))
        return false;
    }
    return true;
  };

  // Visit the unknowns in reverse so that we can start at the values being
  // stored and then we can work towards the leaves, hopefully adding more
  // instructions to Predicated. Successfully terminating the loop means that
  // all the unknown values have to be found to be masked by predicated
  // user(s). For any unpredicated values, we store them in NonPredicated so
  // that we can later check whether these form a reduction.
  SmallPtrSet<MachineInstr*, 2> NonPredicated;
  for (auto *MI : reverse(FalseLanesUnknown)) {
    for (auto &MO : MI->operands()) {
      if (!isRegInClass(MO, QPRs) || !MO.isDef())
        continue;
      if (!HasPredicatedUsers(MI, MO, Predicated)) {
        LLVM_DEBUG(dbgs() << " Found an unknown def of : "
                          << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
        NonPredicated.insert(MI);
        break;
      }
    }
    // Any unknown false lanes have been masked away by the user(s).
    if (!NonPredicated.contains(MI))
      Predicated.insert(MI);
  }

  SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
  SmallVector<MachineBasicBlock *, 2> ExitBlocks;
  ML.getExitBlocks(ExitBlocks);
  assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
  assert(ExitBlocks.size() == 1 && "Expected a single exit block");
  MachineBasicBlock *ExitBB = ExitBlocks.front();
  for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
    // TODO: Instead of blocking predication, we could move the vctp to the
    // exit block and calculate its operand there or in the preheader.
    if (RegMask.PhysReg == ARM::VPR) {
      LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
      return false;
    }
    // Check Q-regs that are live in the exit blocks. We don't collect scalars
    // because they won't be affected by lane predication.
    if (QPRs->contains(RegMask.PhysReg))
      if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
        LiveOutMIs.insert(MI);
  }

  // We've already validated that any VPT predication within the loop will be
  // equivalent when we perform the predication transformation; so we know that
  // any VPT predicated instruction is predicated upon VCTP. Any live-out
  // instruction needs to be predicated, so check this here. The instructions
  // in NonPredicated have been found to form a reduction whose legality we
  // can ensure. Any MQPRCopy found will need to validate its input as if it
  // was live out.
  SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
  while (!Worklist.empty()) {
    MachineInstr *MI = Worklist.pop_back_val();
    if (MI->getOpcode() == ARM::MQPRCopy) {
      VMOVCopies.insert(MI);
      MachineInstr *CopySrc =
          RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
      if (CopySrc)
        Worklist.push_back(CopySrc);
    } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
      LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
      VMOVCopies.clear();
      return false;
    }
  }

  return true;
}

void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
  if (Revert)
    return;

  // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
  // can only jump back.
  auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
                           ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
    MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
                                   ? End->getOperand(1).getMBB()
                                   : End->getOperand(2).getMBB();
    // TODO: Maybe there are cases where the target doesn't have to be the
    // header, but for now be safe and revert.
    if (TgtBB != ML.getHeader()) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
      return false;
    }

    // The WLS and LE instructions have 12 bits for the label offset. WLS
    // requires a positive offset, while LE uses negative.
    if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
        !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
      return false;
    }

    if (isWhileLoopStart(*Start)) {
      MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
      if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
          !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
        return false;
      }
    }
    return true;
  };

  StartInsertPt = MachineBasicBlock::iterator(Start);
  StartInsertBB = Start->getParent();
  LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
                    << *StartInsertPt);

  Revert = !ValidateRanges(Start, End, BBUtils, ML);
  CannotTailPredicate = !ValidateTailPredicate();
}

bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
  LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
  if (VCTPs.empty()) {
    VCTPs.push_back(MI);
    return true;
  }

  // If we find another VCTP, check whether it uses the same value as the main
  // VCTP. If it does, store it in the VCTPs set, else refuse it.
  MachineInstr *Prev = VCTPs.back();
  if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
      !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
                         "definition from the main VCTP");
    return false;
  }
  VCTPs.push_back(MI);
  return true;
}
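
// Decide whether an unpredicated MVE store to a stack spill slot is safe in
// a tail-predicated loop. For illustration (a hypothetical layout, not taken
// from this file), a spill/reload pair such as:
//
//   vstrw.32  q0, [sp, #8]   ; unpredicated spill inside the loop
//   ...
//   vldrw.u32 q0, [sp, #8]   ; reload inside the loop
//
// is acceptable when no block reachable from the loop's exits reloads from
// the same slot before overwriting it, since then only in-loop reloads could
// observe the (possibly stale) falsely predicated lanes.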
  995. static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
  996. auto GetFrameIndex = [](MachineMemOperand *Operand) {
  997. const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
  998. if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
  999. if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
  1000. return FS->getFrameIndex();
  1001. }
  1002. }
  1003. return -1;
  1004. };
  1005. auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
  1006. switch (I->getOpcode()) {
  1007. case ARM::MVE_VSTRWU32:
  1008. case ARM::MVE_VLDRWU32: {
  1009. return I->getOperand(1).getReg() == ARM::SP &&
  1010. I->memoperands().size() == 1 &&
  1011. GetFrameIndex(I->memoperands().front()) >= 0;
  1012. }
  1013. default:
  1014. return false;
  1015. }
  1016. };
  1017. // An unpredicated vector register spill is allowed if all of the uses of the
  1018. // stack slot are within the loop
  1019. if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
  1020. return false;
  1021. // Search all blocks after the loop for accesses to the same stack slot.
  1022. // ReachingDefAnalysis doesn't work for sp as it relies on registers being
  1023. // live-out (which sp never is) to know what blocks to look in
  1024. if (MI->memoperands().size() == 0)
  1025. return false;
  1026. int FI = GetFrameIndex(MI->memoperands().front());
  1027. auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
  1028. if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
  1029. return false;
  1030. SmallVector<MachineBasicBlock *> Frontier;
  1031. ML->getExitBlocks(Frontier);
  1032. SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
  1033. unsigned Idx = 0;
  1034. while (Idx < Frontier.size()) {
  1035. MachineBasicBlock *BB = Frontier[Idx];
  1036. bool LookAtSuccessors = true;
  1037. for (auto &I : *BB) {
  1038. if (!IsStackOp(&I) || I.memoperands().size() == 0)
  1039. continue;
  1040. if (GetFrameIndex(I.memoperands().front()) != FI)
  1041. continue;
  1042. // If this block has a store to the stack slot before any loads then we
  1043. // can ignore the block
  1044. if (I.getOpcode() == ARM::MVE_VSTRWU32) {
  1045. LookAtSuccessors = false;
  1046. break;
  1047. }
  1048. // If the store and the load are using the same stack slot then the
  1049. // store isn't valid for tail predication
  1050. if (I.getOpcode() == ARM::MVE_VLDRWU32)
  1051. return false;
  1052. }
  1053. if (LookAtSuccessors) {
  1054. for (auto *Succ : BB->successors()) {
  1055. if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
  1056. Frontier.push_back(Succ);
  1057. }
  1058. }
  1059. Visited.insert(BB);
  1060. Idx++;
  1061. }
  1062. return true;
  1063. }
  1064. bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
  1065. if (CannotTailPredicate)
  1066. return false;
  1067. if (!shouldInspect(*MI))
  1068. return true;
  1069. if (MI->getOpcode() == ARM::MVE_VPSEL ||
  1070. MI->getOpcode() == ARM::MVE_VPNOT) {
  1071. // TODO: Allow VPSEL and VPNOT, we currently cannot because:
  1072. // 1) It will use the VPR as a predicate operand, but doesn't have to be
  1073. // instead a VPT block, which means we can assert while building up
  1074. // the VPT block because we don't find another VPT or VPST to being a new
  1075. // one.
  1076. // 2) VPSEL still requires a VPR operand even after tail predicating,
  1077. // which means we can't remove it unless there is another
  1078. // instruction, such as vcmp, that can provide the VPR def.
  1079. return false;
  1080. }
  1081. // Record all VCTPs and check that they're equivalent to one another.
  1082. if (isVCTP(MI) && !AddVCTP(MI))
  1083. return false;
  1084. // Inspect uses first so that any instructions that alter the VPR don't
  1085. // alter the predicate upon themselves.
  1086. const MCInstrDesc &MCID = MI->getDesc();
  1087. bool IsUse = false;
  1088. unsigned LastOpIdx = MI->getNumOperands() - 1;
  for (auto &Op : enumerate(reverse(MCID.operands()))) {
    const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
    if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
      continue;

    if (ARM::isVpred(Op.value().OperandType)) {
      VPTState::addInst(MI);
      IsUse = true;
    } else if (MI->getOpcode() != ARM::MVE_VPST) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
      return false;
    }
  }

  // If we find an instruction that has been marked as not valid for tail
  // predication, only allow the instruction if it's contained within a valid
  // VPT block.
  bool RequiresExplicitPredication =
      (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
  if (isDomainMVE(MI) && RequiresExplicitPredication) {
    if (MI->getOpcode() == ARM::MQPRCopy)
      return true;
    if (!IsUse && producesDoubleWidthResult(*MI)) {
      DoubleWidthResultInstrs.insert(MI);
      return true;
    }

    LLVM_DEBUG(if (!IsUse) dbgs()
               << "ARM Loops: Can't tail predicate: " << *MI);
    return IsUse;
  }

  // If the instruction is already explicitly predicated, then the conversion
  // will be fine, but ensure that all store operations are predicated.
  if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
    return IsUse;

  // If this instruction defines the VPR, update the predicate for the
  // following instructions.
  if (isVectorPredicate(MI)) {
    // Clear the existing predicate when we're not in VPT Active state,
    // otherwise we add to it.
    if (!isVectorPredicated(MI))
      VPTState::resetPredicate(MI);
    else
      VPTState::addPredicate(MI);
  }

  // Finally once the predicate has been modified, we can start a new VPT
  // block if necessary.
  if (isVPTOpcode(MI->getOpcode()))
    VPTState::CreateVPTBlock(MI);

  return true;
}
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
  const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
  if (!ST.hasLOB())
    return false;

  MF = &mf;
  LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");

  MLI = &getAnalysis<MachineLoopInfo>();
  RDA = &getAnalysis<ReachingDefAnalysis>();
  MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
  MRI = &MF->getRegInfo();
  TII = static_cast<const ARMBaseInstrInfo *>(ST.getInstrInfo());
  TRI = ST.getRegisterInfo();
  BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF);
  BBUtils->computeAllBlockSizes();
  BBUtils->adjustBBOffsetsAfter(&MF->front());

  bool Changed = false;
  for (auto *ML : *MLI) {
    if (ML->isOutermost())
      Changed |= ProcessLoop(ML);
  }
  Changed |= RevertNonLoops();
  return Changed;
}
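
// Process a loop nest innermost-first: find the LoopStart/Dec/End pseudos and
// analyse the loop's MVE instructions, then either expand the pseudos into
// real DLS/LE (possibly tail-predicated) instructions or revert them to plain
// compare-and-branch code.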
bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
  bool Changed = false;

  // Process inner loops first.
  for (MachineLoop *L : *ML)
    Changed |= ProcessLoop(L);

  LLVM_DEBUG({
    dbgs() << "ARM Loops: Processing loop containing:\n";
    if (auto *Preheader = ML->getLoopPreheader())
      dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
    else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
      dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
    for (auto *MBB : ML->getBlocks())
      dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
  });

  // Search the given block for a loop start instruction. If one isn't found,
  // and there's only one predecessor block, search that one too.
  std::function<MachineInstr *(MachineBasicBlock *)> SearchForStart =
      [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr * {
    for (auto &MI : *MBB) {
      if (isLoopStart(MI))
        return &MI;
    }
    if (MBB->pred_size() == 1)
      return SearchForStart(*MBB->pred_begin());
    return nullptr;
  };

  LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
  // Search the preheader for the start intrinsic.
  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
  // with potentially multiple set.loop.iterations, so we need to enable this.
  if (LoLoop.Preheader)
    LoLoop.Start = SearchForStart(LoLoop.Preheader);
  else
    return Changed;

  // Find the low-overhead loop components and decide whether or not to fall
  // back to a normal loop. Also look for a vctp instruction and decide
  // whether we can convert that predicate using tail predication.
  for (auto *MBB : reverse(ML->getBlocks())) {
    for (auto &MI : *MBB) {
      if (MI.isDebugValue())
        continue;
      else if (MI.getOpcode() == ARM::t2LoopDec)
        LoLoop.Dec = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEnd)
        LoLoop.End = &MI;
      else if (MI.getOpcode() == ARM::t2LoopEndDec)
        LoLoop.End = LoLoop.Dec = &MI;
      else if (isLoopStart(MI))
        LoLoop.Start = &MI;
      else if (MI.getDesc().isCall()) {
        // TODO: Though the call will require LE to execute again, does this
        // mean we should revert? Always executing LE hopefully should be
        // faster than performing a sub,cmp,br or even subs,br.
        LoLoop.Revert = true;
        LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
      } else {
        // Record VPR defs and build up their corresponding vpt blocks.
        // Check we know how to tail predicate any mve instructions.
        LoLoop.AnalyseMVEInst(&MI);
      }
    }
  }

  LLVM_DEBUG(LoLoop.dump());
  if (!LoLoop.FoundAllComponents()) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
    return Changed;
  }

  assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
         "Expected t2WhileLoopStart to be removed before regalloc!");

  // Check that the only instruction using LoopDec is LoopEnd. This can only
  // happen when the Dec and End are separate, not a single t2LoopEndDec.
  // TODO: Check for copy chains that really have no effect.
  if (LoLoop.Dec != LoLoop.End) {
    SmallPtrSet<MachineInstr *, 2> Uses;
    RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
    if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
      LoLoop.Revert = true;
    }
  }
  LoLoop.Validate(BBUtils.get());
  Expand(LoLoop);
  return true;
}
// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
// beq that branches to the exit block.
// TODO: We could also try to generate a cbz if the value in LR is also in
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
  MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
  unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
    ARM::tBcc : ARM::t2Bcc;

  RevertWhileLoopStartLR(MI, TII, BrOpc);
}
void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
  RevertDoLoopStart(MI, TII);
}
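
// Revert a t2LoopDec to a plain subtract. When nothing between the LoopDec
// and the LoopEnd writes CPSR, a flag-setting subs is used so the reverted
// LoopEnd can skip its cmp; the return value reports whether flags were set.
// Roughly (illustrative):
//   renamable $lr = t2LoopDec killed renamable $lr, 1
// becomes:
//   subs lr, lr, #1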
bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
  MachineBasicBlock *MBB = MI->getParent();
  SmallPtrSet<MachineInstr *, 1> Ignore;
  for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
    if (I->getOpcode() == ARM::t2LoopEnd) {
      Ignore.insert(&*I);
      break;
    }
  }

  // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
  bool SetFlags =
      RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);

  llvm::RevertLoopDec(MI, TII, SetFlags);
  return SetFlags;
}
// Generate a subs, or sub and cmp, and a branch instead of an LE.
void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);

  MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
  unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
    ARM::tBcc : ARM::t2Bcc;

  llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
}
// Generate a subs, or sub and cmp, and a branch instead of an LE.
void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
  assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
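  // The pseudo both decrements the counter and branches, so it is expanded to
  // an explicit flag-setting subtract followed by a conditional branch,
  // roughly (illustrative):
  //   subs lr, lr, #1
  //   bne  <loop header>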
  MachineBasicBlock *MBB = MI->getParent();

  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.addDef(ARM::LR);
  MIB.add(MI->getOperand(1));
  MIB.addImm(1);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR);
  // The CPSR operand added above defaults to a use; mark it as a def so the
  // subtract sets the flags for the branch below.
  MIB->getOperand(5).setIsDef(true);

  MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
  unsigned BrOpc =
      BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;

  // Create bne
  MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
  MIB.add(MI->getOperand(2)); // branch target
  MIB.addImm(ARMCC::NE);      // condition code
  MIB.addReg(ARM::CPSR);

  MI->eraseFromParent();
}
// Perform dead code elimination on the loop iteration count setup expression.
// If we are tail-predicating, the number of elements to be processed is the
// operand of the VCTP instruction in the vector body, see getCount(), which is
// register $r3 in this example:
//
//   $lr = big-itercount-expression
//   ..
//   $lr = t2DoLoopStart renamable $lr
//   vector.body:
//     ..
//     $vpr = MVE_VCTP32 renamable $r3
//     renamable $lr = t2LoopDec killed renamable $lr, 1
//     t2LoopEnd renamable $lr, %vector.body
//     tB %end
//
// What we would like to achieve here is to replace the do-loop start pseudo
// instruction t2DoLoopStart with:
//
//   $lr = MVE_DLSTP_32 killed renamable $r3
//
// Thus, $r3, which defines the number of elements, is written to $lr,
// and then we want to delete the whole chain that used to define $lr,
// see the comment below for what this chain could look like.
//
void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
  if (!LoLoop.IsTailPredicationLegal())
    return;

  LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");

  MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
  if (!Def) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
    return;
  }

  // Collect and remove the users of iteration count.
  SmallPtrSet<MachineInstr *, 4> Killed = {LoLoop.Start, LoLoop.Dec,
                                           LoLoop.End};
  if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
    LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
}
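
// Lower the LoopStart pseudo into the concrete loop-start instruction at the
// insertion point chosen during validation; LoLoop.getStartOpcode() picks
// between the plain t2DLS/t2WLS forms and their tail-predicated DLSTP/WLSTP
// counterparts.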
MachineInstr *ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
  LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
  // When using tail-predication, try to delete the dead code that was used to
  // calculate the number of loop iterations.
  IterationCountDCE(LoLoop);

  MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
  MachineInstr *Start = LoLoop.Start;
  MachineBasicBlock *MBB = LoLoop.StartInsertBB;
  unsigned Opc = LoLoop.getStartOpcode();
  MachineOperand &Count = LoLoop.getLoopStartOperand();

  // A 'DLS lr, lr' is a no-op, so we needn't emit it.
  MachineInstr *NewStart;
  if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&
      Count.getReg() == ARM::LR) {
    LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
    NewStart = nullptr;
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));

    MIB.addDef(ARM::LR);
    MIB.add(Count);
    if (isWhileLoopStart(*Start))
      MIB.addMBB(getWhileLoopStartTargetBB(*Start));

    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
    NewStart = &*MIB;
  }

  LoLoop.ToRemove.insert(Start);
  return NewStart;
}
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
  auto RemovePredicate = [](MachineInstr *MI) {
    if (MI->isDebugInstr())
      return;
    LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
    int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
    assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
    assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
           "Expected Then predicate!");
    MI->getOperand(PIdx).setImm(ARMVCC::None);
    MI->getOperand(PIdx + 1).setReg(0);
  };

  for (auto &Block : LoLoop.getVPTBlocks()) {
    SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();

    auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
      assert(TheVCMP && "Replacing a removed or non-existent VCMP");
      // Replace the VCMP with a VPT.
      MachineInstrBuilder MIB =
          BuildMI(*At->getParent(), At, At->getDebugLoc(),
                  TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
      MIB.addImm(ARMVCC::Then);
      // Register one.
      MIB.add(TheVCMP->getOperand(1));
      // Register two.
      MIB.add(TheVCMP->getOperand(2));
      // The comparison code, e.g. ge, eq, lt.
      MIB.add(TheVCMP->getOperand(3));
      LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
      LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
      LoLoop.ToRemove.insert(TheVCMP);
      TheVCMP = nullptr;
    };

    if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
      MachineInstr *VPST = Insts.front();
      if (VPTState::hasUniformPredicate(Block)) {
        // A vpt block starting with VPST that is only predicated upon vctp
        // and has no internal vpr defs:
        // - Remove the vpst.
        // - Unpredicate the remaining instructions.
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
        for (unsigned i = 1; i < Insts.size(); ++i)
          RemovePredicate(Insts[i]);
      } else {
        // The VPT block has a non-uniform predicate but it uses a vpst and its
        // entry is guarded only by a vctp, which means we:
        // - Need to remove the original vpst.
        // - Then need to unpredicate any following instructions, until
        //   we come across the divergent vpr def.
        // - Insert a new vpst to predicate the instruction(s) that follow
        //   the divergent vpr def.
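        // Roughly (illustrative), for a block such as:
        //   vpst
        //   vaddt.i32 q0, q1, q2    ; predicated only on the vctp
        //   vcmpt.s32 gt, q1, q2    ; divergent vpr def
        //   vsubt.i32 q0, q0, q3    ; predicated on the vcmp
        // the vpst is removed, the vadd is unpredicated, and a new vpst (or a
        // vpt folded from the vcmp) guards the vsub.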
        MachineInstr *Divergent = VPTState::getDivergent(Block);
        MachineBasicBlock *MBB = Divergent->getParent();
        auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
        while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
          ++DivergentNext;

        bool DivergentNextIsPredicated =
            DivergentNext != MBB->end() &&
            getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;

        for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
             I != E; ++I)
          RemovePredicate(&*I);

        // Check if the instruction defining vpr is a vcmp so it can be
        // combined with the VPST; this should be the divergent instruction.
        MachineInstr *VCMP =
            VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;

        if (DivergentNextIsPredicated) {
          // Insert a VPST at the divergent only if the next instruction
          // would actually use it. A VCMP following a VPST can be
          // merged into a VPT so do that instead if the VCMP exists.
          if (!VCMP) {
            // Create a VPST (with a null mask for now, we'll recompute it
            // later).
            MachineInstrBuilder MIB =
                BuildMI(*Divergent->getParent(), Divergent,
                        Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
            MIB.addImm(0);
            LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
            LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
          } else {
            // No RDA checks are necessary here since the VPST would have been
            // directly after the VCMP.
            ReplaceVCMPWithVPT(VCMP, VCMP);
          }
        }
      }
      LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
      LoLoop.ToRemove.insert(VPST);
    } else if (Block.containsVCTP()) {
      // The vctp will be removed, so either the entire block will be dead or
      // the block mask of the vp(s)t will need to be recomputed.
      MachineInstr *VPST = Insts.front();
      if (Block.size() == 2) {
        assert(VPST->getOpcode() == ARM::MVE_VPST &&
               "Found a VPST in an otherwise empty vpt block");
        LoLoop.ToRemove.insert(VPST);
      } else
        LoLoop.BlockMasksToRecompute.insert(VPST);
    } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
      // If this block starts with a VPST then attempt to merge it with the
      // preceding un-merged VCMP into a VPT. This VCMP comes from a VPT
      // block that no longer exists.
      MachineInstr *VPST = Insts.front();
      auto Next = ++MachineBasicBlock::iterator(VPST);
      assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
             "The instruction after a VPST must be predicated");
      (void)Next;
      MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
      if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
          !LoLoop.ToRemove.contains(VprDef)) {
        MachineInstr *VCMP = VprDef;
        // The VCMP and VPST can only be merged if the VCMP's operands will
        // have the same values at the VPST.
        // If any of the instructions between the VCMP and VPST are predicated
        // then a different code path is expected to have merged the VCMP and
        // VPST already.
        if (std::none_of(++MachineBasicBlock::iterator(VCMP),
                         MachineBasicBlock::iterator(VPST), hasVPRUse) &&
            RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
            RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
          ReplaceVCMPWithVPT(VCMP, VPST);
          LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
          LoLoop.ToRemove.insert(VPST);
        }
      }
    }
  }

  LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
}
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
  // Combine the LoopDec and LoopEnd instructions into LE(TP).
  auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
    MachineInstr *End = LoLoop.End;
    MachineBasicBlock *MBB = End->getParent();
    unsigned Opc = LoLoop.IsTailPredicationLegal() ?
      ARM::MVE_LETP : ARM::t2LEUpdate;
    MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
                                      TII->get(Opc));
    MIB.addDef(ARM::LR);
    unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
    MIB.add(End->getOperand(Off + 0));
    MIB.add(End->getOperand(Off + 1));
    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
    LoLoop.ToRemove.insert(LoLoop.Dec);
    LoLoop.ToRemove.insert(End);
    return &*MIB;
  };
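
  // The result is the hardware loop-end instruction, roughly (illustrative):
  //   le   lr, <loop header>      ; t2LEUpdate
  //   letp lr, <loop header>      ; MVE_LETP, when tail-predicating
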
  // TODO: We should be able to automatically remove these branches before we
  // get here - probably by teaching analyzeBranch about the pseudo
  // instructions.
  // If there is an unconditional branch, after I, that just branches to the
  // next block, remove it.
  auto RemoveDeadBranch = [](MachineInstr *I) {
    MachineBasicBlock *BB = I->getParent();
    MachineInstr *Terminator = &BB->instr_back();
    if (Terminator->isUnconditionalBranch() && I != Terminator) {
      MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
      if (BB->isLayoutSuccessor(Succ)) {
        LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
        Terminator->eraseFromParent();
      }
    }
  };
  // VMOVCopies need to become 2x VMOVD for tail predication to be valid.
  // Any other MQPRCopy can be converted to an MVE_VORR later on.
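  // Each Q register aliases a pair of consecutive D registers, so, roughly
  // (illustrative), copying q1 into q0 becomes:
  //   vmov d0, d2
  //   vmov d1, d3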
  auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
    for (auto *MI : VMOVCopies) {
      LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
      assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
      MachineBasicBlock *MBB = MI->getParent();
      Register Dst = MI->getOperand(0).getReg();
      Register Src = MI->getOperand(1).getReg();
      auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
                          ARM::D0 + (Dst - ARM::Q0) * 2)
                      .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
                      .add(predOps(ARMCC::AL));
      (void)MIB1;
      LLVM_DEBUG(dbgs() << " into " << *MIB1);
      auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
                          ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
                      .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
                      .add(predOps(ARMCC::AL));
      LLVM_DEBUG(dbgs() << " and " << *MIB2);
      (void)MIB2;
      MI->eraseFromParent();
    }
  };
  if (LoLoop.Revert) {
    if (isWhileLoopStart(*LoLoop.Start))
      RevertWhile(LoLoop.Start);
    else
      RevertDo(LoLoop.Start);
    if (LoLoop.Dec == LoLoop.End)
      RevertLoopEndDec(LoLoop.End);
    else
      RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
  } else {
    ExpandVMOVCopies(LoLoop.VMOVCopies);
    LoLoop.Start = ExpandLoopStart(LoLoop);
    if (LoLoop.Start)
      RemoveDeadBranch(LoLoop.Start);
    LoLoop.End = ExpandLoopEnd(LoLoop);
    RemoveDeadBranch(LoLoop.End);
    if (LoLoop.IsTailPredicationLegal())
      ConvertVPTBlocks(LoLoop);
    for (auto *I : LoLoop.ToRemove) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
      I->eraseFromParent();
    }
    for (auto *I : LoLoop.BlockMasksToRecompute) {
      LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
      recomputeVPTBlockMask(*I);
      LLVM_DEBUG(dbgs() << " ... done: " << *I);
    }
  }

  PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
  DFS.ProcessLoop();
  const SmallVectorImpl<MachineBasicBlock *> &PostOrder = DFS.getOrder();
  for (auto *MBB : PostOrder) {
    recomputeLiveIns(*MBB);
    // FIXME: For some reason, the live-in print order is non-deterministic for
    // our tests and I can't figure out why... So just sort them.
    MBB->sortUniqueLiveIns();
  }

  for (auto *MBB : reverse(PostOrder))
    recomputeLivenessFlags(*MBB);

  // We've moved, removed and inserted new instructions, so update RDA.
  RDA->reset();
}
bool ARMLowOverheadLoops::RevertNonLoops() {
  LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
  bool Changed = false;

  for (auto &MBB : *MF) {
    SmallVector<MachineInstr *, 4> Starts;
    SmallVector<MachineInstr *, 4> Decs;
    SmallVector<MachineInstr *, 4> Ends;
    SmallVector<MachineInstr *, 4> EndDecs;
    SmallVector<MachineInstr *, 4> MQPRCopies;

    for (auto &I : MBB) {
      if (isLoopStart(I))
        Starts.push_back(&I);
      else if (I.getOpcode() == ARM::t2LoopDec)
        Decs.push_back(&I);
      else if (I.getOpcode() == ARM::t2LoopEnd)
        Ends.push_back(&I);
      else if (I.getOpcode() == ARM::t2LoopEndDec)
        EndDecs.push_back(&I);
      else if (I.getOpcode() == ARM::MQPRCopy)
        MQPRCopies.push_back(&I);
    }

    if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
        MQPRCopies.empty())
      continue;

    Changed = true;

    for (auto *Start : Starts) {
      if (isWhileLoopStart(*Start))
        RevertWhile(Start);
      else
        RevertDo(Start);
    }
    for (auto *Dec : Decs)
      RevertLoopDec(Dec);

    for (auto *End : Ends)
      RevertLoopEnd(End);
    for (auto *End : EndDecs)
      RevertLoopEndDec(End);
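    // Any remaining MQPRCopy is lowered to the canonical unpredicated MVE
    // register move, roughly (illustrative): vorr q0, q1, q1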
    for (auto *MI : MQPRCopies) {
      LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
      assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
      MachineBasicBlock *MBB = MI->getParent();
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
                         MI->getOperand(0).getReg())
                     .add(MI->getOperand(1))
                     .add(MI->getOperand(1));
      addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
      MI->eraseFromParent();
    }
  }
  return Changed;
}

FunctionPass *llvm::createARMLowOverheadLoopsPass() {
  return new ARMLowOverheadLoops();
}