  1. //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file contains a pass that performs load / store related peephole
  10. // optimizations. This pass should be run after register allocation.
  11. //
  12. // The pass runs after the PrologEpilogInserter where we emit the CFI
  13. // instructions. In order to preserve the correctness of the unwind information,
  14. // the pass should not change the order of any two instructions, one of which
  15. // has the FrameSetup/FrameDestroy flag, or, alternatively, it must apply an
  16. // ad-hoc fix to the unwind information.
  17. //
  18. //===----------------------------------------------------------------------===//
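//
// Illustrative examples (added for clarity; not part of the upstream source).
// Typical rewrites this pass performs, using standard AArch64 mnemonics:
//   ldr  x0, [x2]  ; ldr  x1, [x2, #8]    -->  ldp x0, x1, [x2]     (pair adjacent loads)
//   strh wzr, [x0] ; strh wzr, [x0, #2]   -->  str wzr, [x0]        (merge narrow zero stores)
//   str  w0, [sp, #4] ; ldr w1, [sp, #4]  -->  str w0, [sp, #4] ; mov w1, w0
//                                              (promote load from store)
//   ldr  x0, [x2]  ; add  x2, x2, #8      -->  ldr x0, [x2], #8     (fold base update, post-index)
//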
  19. #include "AArch64InstrInfo.h"
  20. #include "AArch64MachineFunctionInfo.h"
  21. #include "AArch64Subtarget.h"
  22. #include "MCTargetDesc/AArch64AddressingModes.h"
  23. #include "llvm/ADT/BitVector.h"
  24. #include "llvm/ADT/SmallVector.h"
  25. #include "llvm/ADT/Statistic.h"
  26. #include "llvm/ADT/StringRef.h"
  27. #include "llvm/ADT/iterator_range.h"
  28. #include "llvm/Analysis/AliasAnalysis.h"
  29. #include "llvm/CodeGen/MachineBasicBlock.h"
  30. #include "llvm/CodeGen/MachineFunction.h"
  31. #include "llvm/CodeGen/MachineFunctionPass.h"
  32. #include "llvm/CodeGen/MachineInstr.h"
  33. #include "llvm/CodeGen/MachineInstrBuilder.h"
  34. #include "llvm/CodeGen/MachineOperand.h"
  35. #include "llvm/CodeGen/MachineRegisterInfo.h"
  36. #include "llvm/CodeGen/TargetRegisterInfo.h"
  37. #include "llvm/IR/DebugLoc.h"
  38. #include "llvm/MC/MCAsmInfo.h"
  39. #include "llvm/MC/MCDwarf.h"
  40. #include "llvm/MC/MCRegisterInfo.h"
  41. #include "llvm/Pass.h"
  42. #include "llvm/Support/CommandLine.h"
  43. #include "llvm/Support/Debug.h"
  44. #include "llvm/Support/DebugCounter.h"
  45. #include "llvm/Support/ErrorHandling.h"
  46. #include "llvm/Support/raw_ostream.h"
  47. #include <cassert>
  48. #include <cstdint>
  49. #include <functional>
  50. #include <iterator>
  51. #include <limits>
  52. #include <optional>
  53. using namespace llvm;
  54. #define DEBUG_TYPE "aarch64-ldst-opt"
  55. STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
  56. STATISTIC(NumPostFolded, "Number of post-index updates folded");
  57. STATISTIC(NumPreFolded, "Number of pre-index updates folded");
  58. STATISTIC(NumUnscaledPairCreated,
  59. "Number of load/store from unscaled generated");
  60. STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
  61. STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
  62. DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
  63. "Controls which pairs are considered for renaming");
  64. // The LdStLimit limits how far we search for load/store pairs.
  65. static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
  66. cl::init(20), cl::Hidden);
  67. // The UpdateLimit limits how far we search for update instructions when we form
  68. // pre-/post-index instructions.
  69. static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
  70. cl::Hidden);
  71. // Enable register renaming to find additional store pairing opportunities.
  72. static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
  73. cl::init(true), cl::Hidden);
  74. #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
  75. namespace {
  76. using LdStPairFlags = struct LdStPairFlags {
  77. // If a matching instruction is found, MergeForward is set to true if the
  78. // merge is to remove the first instruction and replace the second with
  79. // a pair-wise insn, and false if the reverse is true.
  80. bool MergeForward = false;
  81. // SExtIdx gives the index of the result of the load pair that must be
  82. // extended. The value of SExtIdx assumes that the paired load produces the
  83. // value in this order: (I, returned iterator), i.e., -1 means no value has
  84. // to be extended, 0 means I, and 1 means the returned iterator.
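// Illustrative example (not from the upstream source): pairing
//   ldrsw x0, [sp]  with  ldr w1, [sp, #4]
// produces  ldp w0, w1, [sp]  followed by a separate sign extension of w0 into
// x0 (see the SBFMXri emitted in mergePairedInsns); SExtIdx records which of
// the two results needs that extension.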
  85. int SExtIdx = -1;
  86. // If not none, RenameReg can be used to rename the result register of the
  87. // first store in a pair. Currently this only works when merging stores
  88. // forward.
  89. std::optional<MCPhysReg> RenameReg;
  90. LdStPairFlags() = default;
  91. void setMergeForward(bool V = true) { MergeForward = V; }
  92. bool getMergeForward() const { return MergeForward; }
  93. void setSExtIdx(int V) { SExtIdx = V; }
  94. int getSExtIdx() const { return SExtIdx; }
  95. void setRenameReg(MCPhysReg R) { RenameReg = R; }
  96. void clearRenameReg() { RenameReg = std::nullopt; }
  97. std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
  98. };
  99. struct AArch64LoadStoreOpt : public MachineFunctionPass {
  100. static char ID;
  101. AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
  102. initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  103. }
  104. AliasAnalysis *AA;
  105. const AArch64InstrInfo *TII;
  106. const TargetRegisterInfo *TRI;
  107. const AArch64Subtarget *Subtarget;
  108. // Track which register units have been modified and used.
  109. LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  110. LiveRegUnits DefinedInBB;
  111. void getAnalysisUsage(AnalysisUsage &AU) const override {
  112. AU.addRequired<AAResultsWrapperPass>();
  113. MachineFunctionPass::getAnalysisUsage(AU);
  114. }
  115. // Scan the instructions looking for a load/store that can be combined
  116. // with the current instruction into a load/store pair.
  117. // Return the matching instruction if one is found, else MBB->end().
  118. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
  119. LdStPairFlags &Flags,
  120. unsigned Limit,
  121. bool FindNarrowMerge);
  122. // Scan the instructions looking for a store that writes to the address from
  123. // which the current load instruction reads. Return true if one is found.
  124. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
  125. MachineBasicBlock::iterator &StoreI);
  126. // Merge the two indicated narrow zero stores into a single wider store instruction.
  127. MachineBasicBlock::iterator
  128. mergeNarrowZeroStores(MachineBasicBlock::iterator I,
  129. MachineBasicBlock::iterator MergeMI,
  130. const LdStPairFlags &Flags);
  131. // Merge the two instructions indicated into a single pair-wise instruction.
  132. MachineBasicBlock::iterator
  133. mergePairedInsns(MachineBasicBlock::iterator I,
  134. MachineBasicBlock::iterator Paired,
  135. const LdStPairFlags &Flags);
  136. // Promote the load that reads directly from the address stored to.
  137. MachineBasicBlock::iterator
  138. promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
  139. MachineBasicBlock::iterator StoreI);
  140. // Scan the instruction list to find a base register update that can
  141. // be combined with the current instruction (a load or store) using
  142. // pre or post indexed addressing with writeback. Scan forwards.
  143. MachineBasicBlock::iterator
  144. findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
  145. int UnscaledOffset, unsigned Limit);
  146. // Scan the instruction list to find a base register update that can
  147. // be combined with the current instruction (a load or store) using
  148. // pre or post indexed addressing with writeback. Scan backwards.
  149. MachineBasicBlock::iterator
  150. findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
  151. // Find an instruction that updates the base register of the ld/st
  152. // instruction.
  153. bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
  154. unsigned BaseReg, int Offset);
  155. // Merge a pre- or post-index base register update into a ld/st instruction.
  156. MachineBasicBlock::iterator
  157. mergeUpdateInsn(MachineBasicBlock::iterator I,
  158. MachineBasicBlock::iterator Update, bool IsPreIdx);
  159. // Find and merge zero store instructions.
  160. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
  161. // Find and pair ldr/str instructions.
  162. bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
  163. // Find and promote load instructions which read directly from a store.
  164. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
  165. // Find and merge base register updates before or after a ld/st instruction.
  166. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
  167. bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
  168. bool runOnMachineFunction(MachineFunction &Fn) override;
  169. MachineFunctionProperties getRequiredProperties() const override {
  170. return MachineFunctionProperties().set(
  171. MachineFunctionProperties::Property::NoVRegs);
  172. }
  173. StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
  174. };
  175. char AArch64LoadStoreOpt::ID = 0;
  176. } // end anonymous namespace
  177. INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
  178. AARCH64_LOAD_STORE_OPT_NAME, false, false)
  179. static bool isNarrowStore(unsigned Opc) {
  180. switch (Opc) {
  181. default:
  182. return false;
  183. case AArch64::STRBBui:
  184. case AArch64::STURBBi:
  185. case AArch64::STRHHui:
  186. case AArch64::STURHHi:
  187. return true;
  188. }
  189. }
  190. // These instructions set the memory tag and either keep the memory contents
  191. // unchanged or set them to zero, ignoring the address part of the source register.
  192. static bool isTagStore(const MachineInstr &MI) {
  193. switch (MI.getOpcode()) {
  194. default:
  195. return false;
  196. case AArch64::STGOffset:
  197. case AArch64::STZGOffset:
  198. case AArch64::ST2GOffset:
  199. case AArch64::STZ2GOffset:
  200. return true;
  201. }
  202. }
  203. static unsigned getMatchingNonSExtOpcode(unsigned Opc,
  204. bool *IsValidLdStrOpc = nullptr) {
  205. if (IsValidLdStrOpc)
  206. *IsValidLdStrOpc = true;
  207. switch (Opc) {
  208. default:
  209. if (IsValidLdStrOpc)
  210. *IsValidLdStrOpc = false;
  211. return std::numeric_limits<unsigned>::max();
  212. case AArch64::STRDui:
  213. case AArch64::STURDi:
  214. case AArch64::STRDpre:
  215. case AArch64::STRQui:
  216. case AArch64::STURQi:
  217. case AArch64::STRQpre:
  218. case AArch64::STRBBui:
  219. case AArch64::STURBBi:
  220. case AArch64::STRHHui:
  221. case AArch64::STURHHi:
  222. case AArch64::STRWui:
  223. case AArch64::STRWpre:
  224. case AArch64::STURWi:
  225. case AArch64::STRXui:
  226. case AArch64::STRXpre:
  227. case AArch64::STURXi:
  228. case AArch64::LDRDui:
  229. case AArch64::LDURDi:
  230. case AArch64::LDRDpre:
  231. case AArch64::LDRQui:
  232. case AArch64::LDURQi:
  233. case AArch64::LDRQpre:
  234. case AArch64::LDRWui:
  235. case AArch64::LDURWi:
  236. case AArch64::LDRWpre:
  237. case AArch64::LDRXui:
  238. case AArch64::LDURXi:
  239. case AArch64::LDRXpre:
  240. case AArch64::STRSui:
  241. case AArch64::STURSi:
  242. case AArch64::STRSpre:
  243. case AArch64::LDRSui:
  244. case AArch64::LDURSi:
  245. case AArch64::LDRSpre:
  246. return Opc;
  247. case AArch64::LDRSWui:
  248. return AArch64::LDRWui;
  249. case AArch64::LDURSWi:
  250. return AArch64::LDURWi;
  251. }
  252. }
  253. static unsigned getMatchingWideOpcode(unsigned Opc) {
  254. switch (Opc) {
  255. default:
  256. llvm_unreachable("Opcode has no wide equivalent!");
  257. case AArch64::STRBBui:
  258. return AArch64::STRHHui;
  259. case AArch64::STRHHui:
  260. return AArch64::STRWui;
  261. case AArch64::STURBBi:
  262. return AArch64::STURHHi;
  263. case AArch64::STURHHi:
  264. return AArch64::STURWi;
  265. case AArch64::STURWi:
  266. return AArch64::STURXi;
  267. case AArch64::STRWui:
  268. return AArch64::STRXui;
  269. }
  270. }
  271. static unsigned getMatchingPairOpcode(unsigned Opc) {
  272. switch (Opc) {
  273. default:
  274. llvm_unreachable("Opcode has no pairwise equivalent!");
  275. case AArch64::STRSui:
  276. case AArch64::STURSi:
  277. return AArch64::STPSi;
  278. case AArch64::STRSpre:
  279. return AArch64::STPSpre;
  280. case AArch64::STRDui:
  281. case AArch64::STURDi:
  282. return AArch64::STPDi;
  283. case AArch64::STRDpre:
  284. return AArch64::STPDpre;
  285. case AArch64::STRQui:
  286. case AArch64::STURQi:
  287. return AArch64::STPQi;
  288. case AArch64::STRQpre:
  289. return AArch64::STPQpre;
  290. case AArch64::STRWui:
  291. case AArch64::STURWi:
  292. return AArch64::STPWi;
  293. case AArch64::STRWpre:
  294. return AArch64::STPWpre;
  295. case AArch64::STRXui:
  296. case AArch64::STURXi:
  297. return AArch64::STPXi;
  298. case AArch64::STRXpre:
  299. return AArch64::STPXpre;
  300. case AArch64::LDRSui:
  301. case AArch64::LDURSi:
  302. return AArch64::LDPSi;
  303. case AArch64::LDRSpre:
  304. return AArch64::LDPSpre;
  305. case AArch64::LDRDui:
  306. case AArch64::LDURDi:
  307. return AArch64::LDPDi;
  308. case AArch64::LDRDpre:
  309. return AArch64::LDPDpre;
  310. case AArch64::LDRQui:
  311. case AArch64::LDURQi:
  312. return AArch64::LDPQi;
  313. case AArch64::LDRQpre:
  314. return AArch64::LDPQpre;
  315. case AArch64::LDRWui:
  316. case AArch64::LDURWi:
  317. return AArch64::LDPWi;
  318. case AArch64::LDRWpre:
  319. return AArch64::LDPWpre;
  320. case AArch64::LDRXui:
  321. case AArch64::LDURXi:
  322. return AArch64::LDPXi;
  323. case AArch64::LDRXpre:
  324. return AArch64::LDPXpre;
  325. case AArch64::LDRSWui:
  326. case AArch64::LDURSWi:
  327. return AArch64::LDPSWi;
  328. }
  329. }
  330. static unsigned isMatchingStore(MachineInstr &LoadInst,
  331. MachineInstr &StoreInst) {
  332. unsigned LdOpc = LoadInst.getOpcode();
  333. unsigned StOpc = StoreInst.getOpcode();
  334. switch (LdOpc) {
  335. default:
  336. llvm_unreachable("Unsupported load instruction!");
  337. case AArch64::LDRBBui:
  338. return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
  339. StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  340. case AArch64::LDURBBi:
  341. return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
  342. StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  343. case AArch64::LDRHHui:
  344. return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
  345. StOpc == AArch64::STRXui;
  346. case AArch64::LDURHHi:
  347. return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
  348. StOpc == AArch64::STURXi;
  349. case AArch64::LDRWui:
  350. return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  351. case AArch64::LDURWi:
  352. return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  353. case AArch64::LDRXui:
  354. return StOpc == AArch64::STRXui;
  355. case AArch64::LDURXi:
  356. return StOpc == AArch64::STURXi;
  357. }
  358. }
  359. static unsigned getPreIndexedOpcode(unsigned Opc) {
  360. // FIXME: We don't currently support creating pre-indexed loads/stores when
  361. // the load or store is the unscaled version. If we decide to perform such an
  362. // optimization in the future the cases for the unscaled loads/stores will
  363. // need to be added here.
  364. switch (Opc) {
  365. default:
  366. llvm_unreachable("Opcode has no pre-indexed equivalent!");
  367. case AArch64::STRSui:
  368. return AArch64::STRSpre;
  369. case AArch64::STRDui:
  370. return AArch64::STRDpre;
  371. case AArch64::STRQui:
  372. return AArch64::STRQpre;
  373. case AArch64::STRBBui:
  374. return AArch64::STRBBpre;
  375. case AArch64::STRHHui:
  376. return AArch64::STRHHpre;
  377. case AArch64::STRWui:
  378. return AArch64::STRWpre;
  379. case AArch64::STRXui:
  380. return AArch64::STRXpre;
  381. case AArch64::LDRSui:
  382. return AArch64::LDRSpre;
  383. case AArch64::LDRDui:
  384. return AArch64::LDRDpre;
  385. case AArch64::LDRQui:
  386. return AArch64::LDRQpre;
  387. case AArch64::LDRBBui:
  388. return AArch64::LDRBBpre;
  389. case AArch64::LDRHHui:
  390. return AArch64::LDRHHpre;
  391. case AArch64::LDRWui:
  392. return AArch64::LDRWpre;
  393. case AArch64::LDRXui:
  394. return AArch64::LDRXpre;
  395. case AArch64::LDRSWui:
  396. return AArch64::LDRSWpre;
  397. case AArch64::LDPSi:
  398. return AArch64::LDPSpre;
  399. case AArch64::LDPSWi:
  400. return AArch64::LDPSWpre;
  401. case AArch64::LDPDi:
  402. return AArch64::LDPDpre;
  403. case AArch64::LDPQi:
  404. return AArch64::LDPQpre;
  405. case AArch64::LDPWi:
  406. return AArch64::LDPWpre;
  407. case AArch64::LDPXi:
  408. return AArch64::LDPXpre;
  409. case AArch64::STPSi:
  410. return AArch64::STPSpre;
  411. case AArch64::STPDi:
  412. return AArch64::STPDpre;
  413. case AArch64::STPQi:
  414. return AArch64::STPQpre;
  415. case AArch64::STPWi:
  416. return AArch64::STPWpre;
  417. case AArch64::STPXi:
  418. return AArch64::STPXpre;
  419. case AArch64::STGOffset:
  420. return AArch64::STGPreIndex;
  421. case AArch64::STZGOffset:
  422. return AArch64::STZGPreIndex;
  423. case AArch64::ST2GOffset:
  424. return AArch64::ST2GPreIndex;
  425. case AArch64::STZ2GOffset:
  426. return AArch64::STZ2GPreIndex;
  427. case AArch64::STGPi:
  428. return AArch64::STGPpre;
  429. }
  430. }
  431. static unsigned getPostIndexedOpcode(unsigned Opc) {
  432. switch (Opc) {
  433. default:
  434. llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  435. case AArch64::STRSui:
  436. case AArch64::STURSi:
  437. return AArch64::STRSpost;
  438. case AArch64::STRDui:
  439. case AArch64::STURDi:
  440. return AArch64::STRDpost;
  441. case AArch64::STRQui:
  442. case AArch64::STURQi:
  443. return AArch64::STRQpost;
  444. case AArch64::STRBBui:
  445. return AArch64::STRBBpost;
  446. case AArch64::STRHHui:
  447. return AArch64::STRHHpost;
  448. case AArch64::STRWui:
  449. case AArch64::STURWi:
  450. return AArch64::STRWpost;
  451. case AArch64::STRXui:
  452. case AArch64::STURXi:
  453. return AArch64::STRXpost;
  454. case AArch64::LDRSui:
  455. case AArch64::LDURSi:
  456. return AArch64::LDRSpost;
  457. case AArch64::LDRDui:
  458. case AArch64::LDURDi:
  459. return AArch64::LDRDpost;
  460. case AArch64::LDRQui:
  461. case AArch64::LDURQi:
  462. return AArch64::LDRQpost;
  463. case AArch64::LDRBBui:
  464. return AArch64::LDRBBpost;
  465. case AArch64::LDRHHui:
  466. return AArch64::LDRHHpost;
  467. case AArch64::LDRWui:
  468. case AArch64::LDURWi:
  469. return AArch64::LDRWpost;
  470. case AArch64::LDRXui:
  471. case AArch64::LDURXi:
  472. return AArch64::LDRXpost;
  473. case AArch64::LDRSWui:
  474. return AArch64::LDRSWpost;
  475. case AArch64::LDPSi:
  476. return AArch64::LDPSpost;
  477. case AArch64::LDPSWi:
  478. return AArch64::LDPSWpost;
  479. case AArch64::LDPDi:
  480. return AArch64::LDPDpost;
  481. case AArch64::LDPQi:
  482. return AArch64::LDPQpost;
  483. case AArch64::LDPWi:
  484. return AArch64::LDPWpost;
  485. case AArch64::LDPXi:
  486. return AArch64::LDPXpost;
  487. case AArch64::STPSi:
  488. return AArch64::STPSpost;
  489. case AArch64::STPDi:
  490. return AArch64::STPDpost;
  491. case AArch64::STPQi:
  492. return AArch64::STPQpost;
  493. case AArch64::STPWi:
  494. return AArch64::STPWpost;
  495. case AArch64::STPXi:
  496. return AArch64::STPXpost;
  497. case AArch64::STGOffset:
  498. return AArch64::STGPostIndex;
  499. case AArch64::STZGOffset:
  500. return AArch64::STZGPostIndex;
  501. case AArch64::ST2GOffset:
  502. return AArch64::ST2GPostIndex;
  503. case AArch64::STZ2GOffset:
  504. return AArch64::STZ2GPostIndex;
  505. case AArch64::STGPi:
  506. return AArch64::STGPpost;
  507. }
  508. }
  509. static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
  510. unsigned OpcA = FirstMI.getOpcode();
  511. unsigned OpcB = MI.getOpcode();
  512. switch (OpcA) {
  513. default:
  514. return false;
  515. case AArch64::STRSpre:
  516. return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  517. case AArch64::STRDpre:
  518. return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  519. case AArch64::STRQpre:
  520. return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  521. case AArch64::STRWpre:
  522. return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  523. case AArch64::STRXpre:
  524. return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  525. case AArch64::LDRSpre:
  526. return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  527. case AArch64::LDRDpre:
  528. return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  529. case AArch64::LDRQpre:
  530. return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  531. case AArch64::LDRWpre:
  532. return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  533. case AArch64::LDRXpre:
  534. return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  535. }
  536. }
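// Illustrative example (not from the upstream source): a pre-indexed store
// followed by a plain store to the adjacent slot, e.g.
//   str x0, [sp, #-16]!
//   str x1, [sp, #8]
// is a candidate for merging into the pre-indexed pair
//   stp x0, x1, [sp, #-16]!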
  537. // Returns the scale and offset range of pre/post indexed variants of MI.
  538. static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
  539. int &MinOffset, int &MaxOffset) {
  540. bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
  541. bool IsTagStore = isTagStore(MI);
  542. // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  543. // as in the "unsigned offset" variant.
  544. // All other pre/post indexed ldst instructions are unscaled.
  545. Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
  546. if (IsPaired) {
  547. MinOffset = -64;
  548. MaxOffset = 63;
  549. } else {
  550. MinOffset = -256;
  551. MaxOffset = 255;
  552. }
  553. }
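// For example (illustrative): a paired 8-byte access such as LDPXpre takes an
// element offset in [-64, 63], i.e. byte offsets in [-512, 504] in steps of 8,
// whereas a non-paired, non-tag-store pre/post-indexed access takes an
// unscaled byte offset in [-256, 255].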
  554. static MachineOperand &getLdStRegOp(MachineInstr &MI,
  555. unsigned PairedRegOp = 0) {
  556. assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  557. bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
  558. if (IsPreLdSt)
  559. PairedRegOp += 1;
  560. unsigned Idx =
  561. AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
  562. return MI.getOperand(Idx);
  563. }
  564. static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
  565. MachineInstr &StoreInst,
  566. const AArch64InstrInfo *TII) {
  567. assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  568. int LoadSize = TII->getMemScale(LoadInst);
  569. int StoreSize = TII->getMemScale(StoreInst);
  570. int UnscaledStOffset =
  571. TII->hasUnscaledLdStOffset(StoreInst)
  572. ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
  573. : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  574. int UnscaledLdOffset =
  575. TII->hasUnscaledLdStOffset(LoadInst)
  576. ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
  577. : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  578. return (UnscaledStOffset <= UnscaledLdOffset) &&
  579. (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
  580. }
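// Worked example (illustrative): for  str x0, [sp, #16]  (bytes 16..23) and
// ldrh w1, [sp, #18]  (bytes 18..19), the load lies entirely within the bytes
// written by the store, so this returns true.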
  581. static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  582. unsigned Opc = MI.getOpcode();
  583. return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
  584. isNarrowStore(Opc)) &&
  585. getLdStRegOp(MI).getReg() == AArch64::WZR;
  586. }
  587. static bool isPromotableLoadFromStore(MachineInstr &MI) {
  588. switch (MI.getOpcode()) {
  589. default:
  590. return false;
  591. // Scaled instructions.
  592. case AArch64::LDRBBui:
  593. case AArch64::LDRHHui:
  594. case AArch64::LDRWui:
  595. case AArch64::LDRXui:
  596. // Unscaled instructions.
  597. case AArch64::LDURBBi:
  598. case AArch64::LDURHHi:
  599. case AArch64::LDURWi:
  600. case AArch64::LDURXi:
  601. return true;
  602. }
  603. }
  604. static bool isMergeableLdStUpdate(MachineInstr &MI) {
  605. unsigned Opc = MI.getOpcode();
  606. switch (Opc) {
  607. default:
  608. return false;
  609. // Scaled instructions.
  610. case AArch64::STRSui:
  611. case AArch64::STRDui:
  612. case AArch64::STRQui:
  613. case AArch64::STRXui:
  614. case AArch64::STRWui:
  615. case AArch64::STRHHui:
  616. case AArch64::STRBBui:
  617. case AArch64::LDRSui:
  618. case AArch64::LDRDui:
  619. case AArch64::LDRQui:
  620. case AArch64::LDRXui:
  621. case AArch64::LDRWui:
  622. case AArch64::LDRHHui:
  623. case AArch64::LDRBBui:
  624. case AArch64::STGOffset:
  625. case AArch64::STZGOffset:
  626. case AArch64::ST2GOffset:
  627. case AArch64::STZ2GOffset:
  628. case AArch64::STGPi:
  629. // Unscaled instructions.
  630. case AArch64::STURSi:
  631. case AArch64::STURDi:
  632. case AArch64::STURQi:
  633. case AArch64::STURWi:
  634. case AArch64::STURXi:
  635. case AArch64::LDURSi:
  636. case AArch64::LDURDi:
  637. case AArch64::LDURQi:
  638. case AArch64::LDURWi:
  639. case AArch64::LDURXi:
  640. // Paired instructions.
  641. case AArch64::LDPSi:
  642. case AArch64::LDPSWi:
  643. case AArch64::LDPDi:
  644. case AArch64::LDPQi:
  645. case AArch64::LDPWi:
  646. case AArch64::LDPXi:
  647. case AArch64::STPSi:
  648. case AArch64::STPDi:
  649. case AArch64::STPQi:
  650. case AArch64::STPWi:
  651. case AArch64::STPXi:
  652. // Make sure this is a reg+imm (as opposed to an address reloc).
  653. if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
  654. return false;
  655. return true;
  656. }
  657. }
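// Illustrative example (not from the upstream source):
//   strh wzr, [x0]
//   strh wzr, [x0, #2]   -->   str wzr, [x0]
// and, one level up, two adjacent  str wzr  become a single  str xzr.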
  658. MachineBasicBlock::iterator
  659. AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
  660. MachineBasicBlock::iterator MergeMI,
  661. const LdStPairFlags &Flags) {
  662. assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
  663. "Expected promotable zero stores.");
  664. MachineBasicBlock::iterator E = I->getParent()->end();
  665. MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  666. // If NextI is the second of the two instructions to be merged, we need
  667. // to skip one further. Either way, the merge will invalidate the iterator,
  668. // and we don't need to scan the new instruction, as it's a pairwise
  669. // instruction, which we're not considering for further action anyway.
  670. if (NextI == MergeMI)
  671. NextI = next_nodbg(NextI, E);
  672. unsigned Opc = I->getOpcode();
  673. bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  674. int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
  675. bool MergeForward = Flags.getMergeForward();
  676. // Insert our new paired instruction after whichever of the paired
  677. // instructions MergeForward indicates.
  678. MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  679. // MergeForward also determines from which instruction we copy the base
  680. // register operand, so that the flags stay compatible with the input code.
  681. const MachineOperand &BaseRegOp =
  682. MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
  683. : AArch64InstrInfo::getLdStBaseOp(*I);
  684. // Which register is Rt and which is Rt2 depends on the offset order.
  685. MachineInstr *RtMI;
  686. if (AArch64InstrInfo::getLdStOffsetOp(*I).getImm() ==
  687. AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
  688. RtMI = &*MergeMI;
  689. else
  690. RtMI = &*I;
  691. int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
  692. // Rescale the offset from the narrow store's units to the wider store's units.
  693. if (IsScaled) {
  694. assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
  695. OffsetImm /= 2;
  696. }
  697. // Construct the new instruction.
  698. DebugLoc DL = I->getDebugLoc();
  699. MachineBasicBlock *MBB = I->getParent();
  700. MachineInstrBuilder MIB;
  701. MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
  702. .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
  703. .add(BaseRegOp)
  704. .addImm(OffsetImm)
  705. .cloneMergedMemRefs({&*I, &*MergeMI})
  706. .setMIFlags(I->mergeFlagsWith(*MergeMI));
  707. (void)MIB;
  708. LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
  709. LLVM_DEBUG(I->print(dbgs()));
  710. LLVM_DEBUG(dbgs() << " ");
  711. LLVM_DEBUG(MergeMI->print(dbgs()));
  712. LLVM_DEBUG(dbgs() << " with instruction:\n ");
  713. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  714. LLVM_DEBUG(dbgs() << "\n");
  715. // Erase the old instructions.
  716. I->eraseFromParent();
  717. MergeMI->eraseFromParent();
  718. return NextI;
  719. }
  720. // Apply Fn to all instructions between MI and the beginning of the block, until
  721. // a def for DefReg is reached. Returns true, iff Fn returns true for all
  722. // visited instructions. Stop after visiting Limit iterations.
  723. static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
  724. const TargetRegisterInfo *TRI, unsigned Limit,
  725. std::function<bool(MachineInstr &, bool)> &Fn) {
  726. auto MBB = MI.getParent();
  727. for (MachineInstr &I :
  728. instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
  729. if (!Limit)
  730. return false;
  731. --Limit;
  732. bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
  733. return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
  734. TRI->regsOverlap(MOP.getReg(), DefReg);
  735. });
  736. if (!Fn(I, isDef))
  737. return false;
  738. if (isDef)
  739. break;
  740. }
  741. return true;
  742. }
  743. static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
  744. const TargetRegisterInfo *TRI) {
  745. for (const MachineOperand &MOP : phys_regs_and_masks(MI))
  746. if (MOP.isReg() && MOP.isKill())
  747. Units.removeReg(MOP.getReg());
  748. for (const MachineOperand &MOP : phys_regs_and_masks(MI))
  749. if (MOP.isReg() && !MOP.isKill())
  750. Units.addReg(MOP.getReg());
  751. }
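// Illustrative example (not from the upstream source): merging
//   ldr x0, [x2]
//   ldr x1, [x2, #8]   -->   ldp x0, x1, [x2]
// When merging stores forward, the register stored by the first store may be
// renamed (RenameReg, see LdStPairFlags) so the merge is not blocked.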
  752. MachineBasicBlock::iterator
  753. AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
  754. MachineBasicBlock::iterator Paired,
  755. const LdStPairFlags &Flags) {
  756. MachineBasicBlock::iterator E = I->getParent()->end();
  757. MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  758. // If NextI is the second of the two instructions to be merged, we need
  759. // to skip one further. Either way, the merge will invalidate the iterator,
  760. // and we don't need to scan the new instruction, as it's a pairwise
  761. // instruction, which we're not considering for further action anyway.
  762. if (NextI == Paired)
  763. NextI = next_nodbg(NextI, E);
  764. int SExtIdx = Flags.getSExtIdx();
  765. unsigned Opc =
  766. SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  767. bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  768. int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
  769. bool MergeForward = Flags.getMergeForward();
  770. std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  771. if (MergeForward && RenameReg) {
  772. MCRegister RegToRename = getLdStRegOp(*I).getReg();
  773. DefinedInBB.addReg(*RenameReg);
  774. // Return the sub/super register for RenameReg, matching the size of
  775. // OriginalReg.
  776. auto GetMatchingSubReg = [this,
  777. RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
  778. for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
  779. if (TRI->getMinimalPhysRegClass(OriginalReg) ==
  780. TRI->getMinimalPhysRegClass(SubOrSuper))
  781. return SubOrSuper;
  782. llvm_unreachable("Should have found matching sub or super register!");
  783. };
  784. std::function<bool(MachineInstr &, bool)> UpdateMIs =
  785. [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
  786. if (IsDef) {
  787. bool SeenDef = false;
  788. for (auto &MOP : MI.operands()) {
  789. // Rename the first explicit definition and all implicit
  790. // definitions matching RegToRename.
  791. if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
  792. (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
  793. TRI->regsOverlap(MOP.getReg(), RegToRename)) {
  794. assert((MOP.isImplicit() ||
  795. (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
  796. "Need renamable operands");
  797. MOP.setReg(GetMatchingSubReg(MOP.getReg()));
  798. SeenDef = true;
  799. }
  800. }
  801. } else {
  802. for (auto &MOP : MI.operands()) {
  803. if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
  804. TRI->regsOverlap(MOP.getReg(), RegToRename)) {
  805. assert((MOP.isImplicit() ||
  806. (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
  807. "Need renamable operands");
  808. MOP.setReg(GetMatchingSubReg(MOP.getReg()));
  809. }
  810. }
  811. }
  812. LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
  813. return true;
  814. };
  815. forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
  816. #if !defined(NDEBUG)
  817. // Make sure the register used for renaming is not used between the paired
  818. // instructions. That would trash the content before the new paired
  819. // instruction.
  820. for (auto &MI :
  821. iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
  822. std::next(I), std::next(Paired)))
  823. assert(all_of(MI.operands(),
  824. [this, &RenameReg](const MachineOperand &MOP) {
  825. return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
  826. MOP.isUndef() ||
  827. !TRI->regsOverlap(MOP.getReg(), *RenameReg);
  828. }) &&
  829. "Rename register used between paired instruction, trashing the "
  830. "content");
  831. #endif
  832. }
  833. // Insert our new paired instruction after whichever of the paired
  834. // instructions MergeForward indicates.
  835. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  836. // MergeForward also determines from which instruction we copy the base
  837. // register operand, so that the flags stay compatible with the input code.
  838. const MachineOperand &BaseRegOp =
  839. MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
  840. : AArch64InstrInfo::getLdStBaseOp(*I);
  841. int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
  842. int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
  843. bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  844. if (IsUnscaled != PairedIsUnscaled) {
  845. // We're trying to pair instructions that differ in how they are scaled. If
  846. // I is scaled then scale the offset of Paired accordingly. Otherwise, do
  847. // the opposite (i.e., make Paired's offset unscaled).
  848. int MemSize = TII->getMemScale(*Paired);
  849. if (PairedIsUnscaled) {
  850. // If the unscaled offset isn't a multiple of the MemSize, we can't
  851. // pair the operations together.
  852. assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
  853. "Offset should be a multiple of the stride!");
  854. PairedOffset /= MemSize;
  855. } else {
  856. PairedOffset *= MemSize;
  857. }
  858. }
  859. // Which register is Rt and which is Rt2 depends on the offset order.
  860. // However, for pre-indexed load/stores, Rt should be the register of the
  861. // pre-indexed instruction.
  862. MachineInstr *RtMI, *Rt2MI;
  863. if (Offset == PairedOffset + OffsetStride &&
  864. !AArch64InstrInfo::isPreLdSt(*I)) {
  865. RtMI = &*Paired;
  866. Rt2MI = &*I;
  867. // Here we swapped the assumption made for SExtIdx.
  868. // I.e., we turn ldp I, Paired into ldp Paired, I.
  869. // Update the index accordingly.
  870. if (SExtIdx != -1)
  871. SExtIdx = (SExtIdx + 1) % 2;
  872. } else {
  873. RtMI = &*I;
  874. Rt2MI = &*Paired;
  875. }
  876. int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
  877. // Scale the immediate offset, if necessary.
  878. if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
  879. assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
  880. "Unscaled offset cannot be scaled.");
  881. OffsetImm /= TII->getMemScale(*RtMI);
  882. }
  883. // Construct the new instruction.
  884. MachineInstrBuilder MIB;
  885. DebugLoc DL = I->getDebugLoc();
  886. MachineBasicBlock *MBB = I->getParent();
  887. MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  888. MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  889. // Kill flags may become invalid when moving stores for pairing.
  890. if (RegOp0.isUse()) {
  891. if (!MergeForward) {
  892. // Clear kill flags on store if moving upwards. Example:
  893. // STRWui %w0, ...
  894. // USE %w1
  895. // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
  896. RegOp0.setIsKill(false);
  897. RegOp1.setIsKill(false);
  898. } else {
  899. // Clear kill flags of the first store's register. Example:
  900. // STRWui %w1, ...
  901. // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
  902. // STRW %w0
  903. Register Reg = getLdStRegOp(*I).getReg();
  904. for (MachineInstr &MI : make_range(std::next(I), Paired))
  905. MI.clearRegisterKills(Reg, TRI);
  906. }
  907. }
  908. unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  909. MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
  910. // Adds the pre-index operand for pre-indexed ld/st pairs.
  911. if (AArch64InstrInfo::isPreLdSt(*RtMI))
  912. MIB.addReg(BaseRegOp.getReg(), RegState::Define);
  913. MIB.add(RegOp0)
  914. .add(RegOp1)
  915. .add(BaseRegOp)
  916. .addImm(OffsetImm)
  917. .cloneMergedMemRefs({&*I, &*Paired})
  918. .setMIFlags(I->mergeFlagsWith(*Paired));
  919. (void)MIB;
  920. LLVM_DEBUG(
  921. dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  922. LLVM_DEBUG(I->print(dbgs()));
  923. LLVM_DEBUG(dbgs() << " ");
  924. LLVM_DEBUG(Paired->print(dbgs()));
  925. LLVM_DEBUG(dbgs() << " with instruction:\n ");
  926. if (SExtIdx != -1) {
  927. // Generate the sign extension for the proper result of the ldp.
  928. // I.e., with X1, that would be:
  929. // %w1 = KILL %w1, implicit-def %x1
  930. // %x1 = SBFMXri killed %x1, 0, 31
  931. MachineOperand &DstMO = MIB->getOperand(SExtIdx);
  932. // Right now, DstMO has the extended register, since it comes from an
  933. // extended opcode.
  934. Register DstRegX = DstMO.getReg();
  935. // Get the W variant of that register.
  936. Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
  937. // Update the result of LDP to use the W instead of the X variant.
  938. DstMO.setReg(DstRegW);
  939. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  940. LLVM_DEBUG(dbgs() << "\n");
  941. // Make the machine verifier happy by providing a definition for
  942. // the X register.
  943. // Insert this definition right after the generated LDP, i.e., before
  944. // InsertionPoint.
  945. MachineInstrBuilder MIBKill =
  946. BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
  947. .addReg(DstRegW)
  948. .addReg(DstRegX, RegState::Define);
  949. MIBKill->getOperand(2).setImplicit();
  950. // Create the sign extension.
  951. MachineInstrBuilder MIBSXTW =
  952. BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
  953. .addReg(DstRegX)
  954. .addImm(0)
  955. .addImm(31);
  956. (void)MIBSXTW;
  957. LLVM_DEBUG(dbgs() << " Extend operand:\n ");
  958. LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  959. } else {
  960. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  961. }
  962. LLVM_DEBUG(dbgs() << "\n");
  963. if (MergeForward)
  964. for (const MachineOperand &MOP : phys_regs_and_masks(*I))
  965. if (MOP.isReg() && MOP.isKill())
  966. DefinedInBB.addReg(MOP.getReg());
  967. // Erase the old instructions.
  968. I->eraseFromParent();
  969. Paired->eraseFromParent();
  970. return NextI;
  971. }
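// Illustrative example (not from the upstream source): a load that reads back
// a value just stored is replaced with a move or a bitfield extract, e.g.
//   str w0, [sp, #4] ; ldr  w1, [sp, #4]   -->  mov  w1, w0            (ORRWrs)
//   str x0, [sp]     ; ldrb w1, [sp, #2]   -->  ubfx x1, x0, #16, #8   (UBFMXri)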
  972. MachineBasicBlock::iterator
  973. AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
  974. MachineBasicBlock::iterator StoreI) {
  975. MachineBasicBlock::iterator NextI =
  976. next_nodbg(LoadI, LoadI->getParent()->end());
  977. int LoadSize = TII->getMemScale(*LoadI);
  978. int StoreSize = TII->getMemScale(*StoreI);
  979. Register LdRt = getLdStRegOp(*LoadI).getReg();
  980. const MachineOperand &StMO = getLdStRegOp(*StoreI);
  981. Register StRt = getLdStRegOp(*StoreI).getReg();
  982. bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
  983. assert((IsStoreXReg ||
  984. TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
  985. "Unexpected RegClass");
  986. MachineInstr *BitExtMI;
  987. if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
  988. // Remove the load if the destination register of the load is the same
  989. // register that holds the stored value.
  990. if (StRt == LdRt && LoadSize == 8) {
  991. for (MachineInstr &MI : make_range(StoreI->getIterator(),
  992. LoadI->getIterator())) {
  993. if (MI.killsRegister(StRt, TRI)) {
  994. MI.clearRegisterKills(StRt, TRI);
  995. break;
  996. }
  997. }
  998. LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
  999. LLVM_DEBUG(LoadI->print(dbgs()));
  1000. LLVM_DEBUG(dbgs() << "\n");
  1001. LoadI->eraseFromParent();
  1002. return NextI;
  1003. }
  1004. // Replace the load with a mov if the load and store have the same size.
  1005. BitExtMI =
  1006. BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
  1007. TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
  1008. .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
  1009. .add(StMO)
  1010. .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
  1011. .setMIFlags(LoadI->getFlags());
  1012. } else {
  1013. // FIXME: Currently we disable this transformation in big-endian targets as
  1014. // performance and correctness are verified only in little-endian.
  1015. if (!Subtarget->isLittleEndian())
  1016. return NextI;
  1017. bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
  1018. assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
  1019. "Unsupported ld/st match");
  1020. assert(LoadSize <= StoreSize && "Invalid load size");
  1021. int UnscaledLdOffset =
  1022. IsUnscaled
  1023. ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
  1024. : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
  1025. int UnscaledStOffset =
  1026. IsUnscaled
  1027. ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
  1028. : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
  1029. int Width = LoadSize * 8;
  1030. Register DestReg =
  1031. IsStoreXReg ? Register(TRI->getMatchingSuperReg(
  1032. LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
  1033. : LdRt;
  1034. assert((UnscaledLdOffset >= UnscaledStOffset &&
  1035. (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
  1036. "Invalid offset");
  1037. int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
  1038. int Imms = Immr + Width - 1;
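// Worked example (illustrative): loading the byte at offset 2 within an
// 8-byte stored value gives Width = 8, Immr = 8 * 2 = 16, Imms = 23, so the
// UBFMXri below extracts bits [23:16] of the stored register (little-endian).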
  1039. if (UnscaledLdOffset == UnscaledStOffset) {
  1040. uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
  1041. | ((Immr) << 6) // immr
  1042. | ((Imms) << 0) // imms
  1043. ;
  1044. BitExtMI =
  1045. BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
  1046. TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
  1047. DestReg)
  1048. .add(StMO)
  1049. .addImm(AndMaskEncoded)
  1050. .setMIFlags(LoadI->getFlags());
  1051. } else {
  1052. BitExtMI =
  1053. BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
  1054. TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
  1055. DestReg)
  1056. .add(StMO)
  1057. .addImm(Immr)
  1058. .addImm(Imms)
  1059. .setMIFlags(LoadI->getFlags());
  1060. }
  1061. }
  1062. // Clear kill flags between store and load.
  1063. for (MachineInstr &MI : make_range(StoreI->getIterator(),
  1064. BitExtMI->getIterator()))
  1065. if (MI.killsRegister(StRt, TRI)) {
  1066. MI.clearRegisterKills(StRt, TRI);
  1067. break;
  1068. }
  1069. LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
  1070. LLVM_DEBUG(StoreI->print(dbgs()));
  1071. LLVM_DEBUG(dbgs() << " ");
  1072. LLVM_DEBUG(LoadI->print(dbgs()));
  1073. LLVM_DEBUG(dbgs() << " with instructions:\n ");
  1074. LLVM_DEBUG(StoreI->print(dbgs()));
  1075. LLVM_DEBUG(dbgs() << " ");
  1076. LLVM_DEBUG((BitExtMI)->print(dbgs()));
  1077. LLVM_DEBUG(dbgs() << "\n");
  1078. // Erase the old instructions.
  1079. LoadI->eraseFromParent();
  1080. return NextI;
  1081. }
  1082. static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  1083. // Convert the byte-offset used by unscaled into an "element" offset used
  1084. // by the scaled pair load/store instructions.
  1085. if (IsUnscaled) {
  1086. // If the byte-offset isn't a multiple of the stride, there's no point
  1087. // trying to match it.
  1088. if (Offset % OffsetStride)
  1089. return false;
  1090. Offset /= OffsetStride;
  1091. }
  1092. return Offset <= 63 && Offset >= -64;
  1093. }
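// For example (illustrative): with 8-byte accesses an unscaled byte offset of
// 24 has stride 8, becomes element offset 3, and is within [-64, 63], so the
// pair is in range; a byte offset of 20 is not a multiple of 8 and is rejected.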
  1094. // Do alignment, specialized to power of 2 and for signed ints,
  1095. // avoiding having to do a C-style cast from uint64_t to int when
  1096. // using alignTo from include/llvm/Support/MathExtras.h.
  1097. // FIXME: Move this function to include/MathExtras.h?
  1098. static int alignTo(int Num, int PowOf2) {
  1099. return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
  1100. }
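// For example (illustrative): alignTo(13, 8) == 16 and alignTo(-5, 8) == 0.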
  1101. static bool mayAlias(MachineInstr &MIa,
  1102. SmallVectorImpl<MachineInstr *> &MemInsns,
  1103. AliasAnalysis *AA) {
  1104. for (MachineInstr *MIb : MemInsns)
  1105. if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
  1106. return true;
  1107. return false;
  1108. }
  1109. bool AArch64LoadStoreOpt::findMatchingStore(
  1110. MachineBasicBlock::iterator I, unsigned Limit,
  1111. MachineBasicBlock::iterator &StoreI) {
  1112. MachineBasicBlock::iterator B = I->getParent()->begin();
  1113. MachineBasicBlock::iterator MBBI = I;
  1114. MachineInstr &LoadMI = *I;
  1115. Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
  1116. // If the load is the first instruction in the block, there's obviously
  1117. // not any matching store.
  1118. if (MBBI == B)
  1119. return false;
  1120. // Track which register units have been modified and used between the first
  1121. // insn and the second insn.
  1122. ModifiedRegUnits.clear();
  1123. UsedRegUnits.clear();
  1124. unsigned Count = 0;
  1125. do {
  1126. MBBI = prev_nodbg(MBBI, B);
  1127. MachineInstr &MI = *MBBI;
  1128. // Don't count transient instructions towards the search limit since there
  1129. // may be different numbers of them if e.g. debug information is present.
  1130. if (!MI.isTransient())
  1131. ++Count;
  1132. // If the load instruction reads directly from the address to which the
  1133. // store instruction writes and the stored value is not modified, we can
  1134. // promote the load. Since we do not handle stores with pre-/post-index,
  1135. // it's unnecessary to check if BaseReg is modified by the store itself.
  1136. // Also, we can't handle stores without an immediate offset operand,
  1137. // since the operand might be the address of a global variable.
  1138. if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
  1139. BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
  1140. AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
  1141. isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
  1142. ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
  1143. StoreI = MBBI;
  1144. return true;
  1145. }
  1146. if (MI.isCall())
  1147. return false;
  1148. // Update modified / uses register units.
  1149. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
  1150. // Otherwise, if the base register is modified, we have no match, so
  1151. // return early.
  1152. if (!ModifiedRegUnits.available(BaseReg))
  1153. return false;
  1154. // If we encounter a store aliased with the load, return early.
  1155. if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
  1156. return false;
  1157. } while (MBBI != B && Count < Limit);
  1158. return false;
  1159. }
  1160. static bool needsWinCFI(const MachineFunction *MF) {
  1161. return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
  1162. MF->getFunction().needsUnwindTableEntry();
  1163. }
  1164. // Returns true if FirstMI and MI are candidates for merging or pairing.
  1165. // Otherwise, returns false.
  1166. static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
  1167. LdStPairFlags &Flags,
  1168. const AArch64InstrInfo *TII) {
  1169. // If this is volatile or if pairing is suppressed, not a candidate.
  1170. if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
  1171. return false;
  1172. // We should have already checked FirstMI for pair suppression and volatility.
  1173. assert(!FirstMI.hasOrderedMemoryRef() &&
  1174. !TII->isLdStPairSuppressed(FirstMI) &&
  1175. "FirstMI shouldn't get here if either of these checks are true.");
  1176. if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
  1177. MI.getFlag(MachineInstr::FrameDestroy)))
  1178. return false;
  1179. unsigned OpcA = FirstMI.getOpcode();
  1180. unsigned OpcB = MI.getOpcode();
1181. // Opcodes match: If the opcodes are pre ld/st, there is nothing more to check.
  1182. if (OpcA == OpcB)
  1183. return !AArch64InstrInfo::isPreLdSt(FirstMI);
  1184. // Try to match a sign-extended load/store with a zero-extended load/store.
  1185. bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  1186. unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  1187. assert(IsValidLdStrOpc &&
  1188. "Given Opc should be a Load or Store with an immediate");
  1189. // OpcA will be the first instruction in the pair.
  1190. if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
  1191. Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
  1192. return true;
  1193. }
  1194. // If the second instruction isn't even a mergable/pairable load/store, bail
  1195. // out.
  1196. if (!PairIsValidLdStrOpc)
  1197. return false;
  1198. // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  1199. // offsets.
  1200. if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
  1201. return false;
  1202. // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1203. // LDR<S,D,Q,W,X>pre - LDR<S,D,Q,W,X>ui
  1204. // are candidate pairs that can be merged.
  1205. if (isPreLdStPairCandidate(FirstMI, MI))
  1206. return true;
  1207. // Try to match an unscaled load/store with a scaled load/store.
  1208. return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
  1209. getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
  1210. // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
  1211. }
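// For illustration (not exhaustive): LDRSWui and LDRWui both map to LDRWui as
// their non-sign-extending opcode, so such a mixed pair is accepted above and
// SExtIdx records which side of the pair is the sign-extending load.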
  1212. static bool
  1213. canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
  1214. SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
  1215. const TargetRegisterInfo *TRI) {
  1216. if (!FirstMI.mayStore())
  1217. return false;
  1218. // Check if we can find an unused register which we can use to rename
  1219. // the register used by the first load/store.
  1220. auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  1221. MachineFunction &MF = *FirstMI.getParent()->getParent();
  1222. if (!RegClass || !MF.getRegInfo().tracksLiveness())
  1223. return false;
  1224. auto RegToRename = getLdStRegOp(FirstMI).getReg();
  1225. // For now, we only rename if the store operand gets killed at the store.
  1226. if (!getLdStRegOp(FirstMI).isKill() &&
  1227. !any_of(FirstMI.operands(),
  1228. [TRI, RegToRename](const MachineOperand &MOP) {
  1229. return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
  1230. MOP.isImplicit() && MOP.isKill() &&
  1231. TRI->regsOverlap(RegToRename, MOP.getReg());
  1232. })) {
  1233. LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n");
  1234. return false;
  1235. }
  1236. auto canRenameMOP = [TRI](const MachineOperand &MOP) {
  1237. if (MOP.isReg()) {
  1238. auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
  1239. // Renaming registers with multiple disjunct sub-registers (e.g. the
1240. // result of an LD3) means that all sub-registers are renamed, potentially
  1241. // impacting other instructions we did not check. Bail out.
  1242. // Note that this relies on the structure of the AArch64 register file. In
  1243. // particular, a subregister cannot be written without overwriting the
  1244. // whole register.
  1245. if (RegClass->HasDisjunctSubRegs) {
  1246. LLVM_DEBUG(
  1247. dbgs()
  1248. << " Cannot rename operands with multiple disjunct subregisters ("
  1249. << MOP << ")\n");
  1250. return false;
  1251. }
  1252. }
  1253. return MOP.isImplicit() ||
  1254. (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
  1255. };
  1256. bool FoundDef = false;
1257. // For each instruction between FirstMI and the previous def for RegToRename,
1258. // we:
1259. // * check if we can rename RegToRename in this instruction, and
1260. // * collect the registers used and the required register classes for RegToRename.
  1261. std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
  1262. bool IsDef) {
  1263. LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
  1264. // Currently we do not try to rename across frame-setup instructions.
  1265. if (MI.getFlag(MachineInstr::FrameSetup)) {
  1266. LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions currently ("
  1267. << MI << ")\n");
  1268. return false;
  1269. }
  1270. UsedInBetween.accumulate(MI);
  1271. // For a definition, check that we can rename the definition and exit the
  1272. // loop.
  1273. FoundDef = IsDef;
  1274. // For defs, check if we can rename the first def of RegToRename.
  1275. if (FoundDef) {
  1276. // For some pseudo instructions, we might not generate code in the end
  1277. // (e.g. KILL) and we would end up without a correct def for the rename
  1278. // register.
  1279. // TODO: This might be overly conservative and we could handle those cases
  1280. // in multiple ways:
  1281. // 1. Insert an extra copy, to materialize the def.
1282. // 2. Skip pseudo-defs until we find a non-pseudo def.
  1283. if (MI.isPseudo()) {
  1284. LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI
  1285. << "\n");
  1286. return false;
  1287. }
  1288. for (auto &MOP : MI.operands()) {
  1289. if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
  1290. !TRI->regsOverlap(MOP.getReg(), RegToRename))
  1291. continue;
  1292. if (!canRenameMOP(MOP)) {
  1293. LLVM_DEBUG(dbgs()
  1294. << " Cannot rename " << MOP << " in " << MI << "\n");
  1295. return false;
  1296. }
  1297. RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
  1298. }
  1299. return true;
  1300. } else {
  1301. for (auto &MOP : MI.operands()) {
  1302. if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
  1303. !TRI->regsOverlap(MOP.getReg(), RegToRename))
  1304. continue;
  1305. if (!canRenameMOP(MOP)) {
  1306. LLVM_DEBUG(dbgs()
  1307. << " Cannot rename " << MOP << " in " << MI << "\n");
  1308. return false;
  1309. }
  1310. RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
  1311. }
  1312. }
  1313. return true;
  1314. };
  1315. if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
  1316. return false;
  1317. if (!FoundDef) {
  1318. LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
  1319. return false;
  1320. }
  1321. return true;
  1322. }
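// A sketch of the situation the check above enables (register names and
// offsets are illustrative):
//   mov w8, #3          ; def of the value to be stored
//   str w8, [x0]        ; FirstMI, w8 killed here
//   mov w8, #5          ; w8 is reused, blocking a plain forward merge
//   str w8, [x0, #4]
// If w8 can be renamed (say to w9) from its def up to FirstMI, the two stores
// can still be combined into a stp at the second store's position.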
  1323. // Check if we can find a physical register for renaming \p Reg. This register
  1324. // must:
1325. // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1326. // defined registers up to the point where the renamed register will be used,
1327. // * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
1328. // registers in the range in which the rename register will be used,
1329. // * be available in all required register classes (checked using RequiredClasses).
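// For illustration: when looking for a rename register for w8, a candidate
// such as w12 is only taken if no overlapping unit of it is in DefinedInBB or
// UsedInBetween, it is not reserved or callee-saved (including x12), and w12
// or one of its sub/super-registers belongs to every class in RequiredClasses.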
  1330. static std::optional<MCPhysReg> tryToFindRegisterToRename(
  1331. const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
  1332. LiveRegUnits &UsedInBetween,
  1333. SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
  1334. const TargetRegisterInfo *TRI) {
  1335. const MachineRegisterInfo &RegInfo = MF.getRegInfo();
  1336. // Checks if any sub- or super-register of PR is callee saved.
  1337. auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
  1338. return any_of(TRI->sub_and_superregs_inclusive(PR),
  1339. [&MF, TRI](MCPhysReg SubOrSuper) {
  1340. return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
  1341. });
  1342. };
  1343. // Check if PR or one of its sub- or super-registers can be used for all
  1344. // required register classes.
  1345. auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
  1346. return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
  1347. return any_of(TRI->sub_and_superregs_inclusive(PR),
  1348. [C, TRI](MCPhysReg SubOrSuper) {
  1349. return C == TRI->getMinimalPhysRegClass(SubOrSuper);
  1350. });
  1351. });
  1352. };
  1353. auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  1354. for (const MCPhysReg &PR : *RegClass) {
  1355. if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
  1356. !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
  1357. CanBeUsedForAllClasses(PR)) {
  1358. DefinedInBB.addReg(PR);
  1359. LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
  1360. << "\n");
  1361. return {PR};
  1362. }
  1363. }
  1364. LLVM_DEBUG(dbgs() << "No rename register found from "
  1365. << TRI->getRegClassName(RegClass) << "\n");
  1366. return std::nullopt;
  1367. }
  1368. /// Scan the instructions looking for a load/store that can be combined with the
  1369. /// current instruction into a wider equivalent or a load/store pair.
  1370. MachineBasicBlock::iterator
  1371. AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
  1372. LdStPairFlags &Flags, unsigned Limit,
  1373. bool FindNarrowMerge) {
  1374. MachineBasicBlock::iterator E = I->getParent()->end();
  1375. MachineBasicBlock::iterator MBBI = I;
  1376. MachineBasicBlock::iterator MBBIWithRenameReg;
  1377. MachineInstr &FirstMI = *I;
  1378. MBBI = next_nodbg(MBBI, E);
  1379. bool MayLoad = FirstMI.mayLoad();
  1380. bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  1381. Register Reg = getLdStRegOp(FirstMI).getReg();
  1382. Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  1383. int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  1384. int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  1385. bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
  1386. std::optional<bool> MaybeCanRename;
  1387. if (!EnableRenaming)
  1388. MaybeCanRename = {false};
  1389. SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  1390. LiveRegUnits UsedInBetween;
  1391. UsedInBetween.init(*TRI);
  1392. Flags.clearRenameReg();
  1393. // Track which register units have been modified and used between the first
  1394. // insn (inclusive) and the second insn.
  1395. ModifiedRegUnits.clear();
  1396. UsedRegUnits.clear();
  1397. // Remember any instructions that read/write memory between FirstMI and MI.
  1398. SmallVector<MachineInstr *, 4> MemInsns;
  1399. for (unsigned Count = 0; MBBI != E && Count < Limit;
  1400. MBBI = next_nodbg(MBBI, E)) {
  1401. MachineInstr &MI = *MBBI;
  1402. UsedInBetween.accumulate(MI);
  1403. // Don't count transient instructions towards the search limit since there
  1404. // may be different numbers of them if e.g. debug information is present.
  1405. if (!MI.isTransient())
  1406. ++Count;
  1407. Flags.setSExtIdx(-1);
  1408. if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
  1409. AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
  1410. assert(MI.mayLoadOrStore() && "Expected memory operation.");
  1411. // If we've found another instruction with the same opcode, check to see
  1412. // if the base and offset are compatible with our starting instruction.
  1413. // These instructions all have scaled immediate operands, so we just
  1414. // check for +1/-1. Make sure to check the new instruction offset is
  1415. // actually an immediate and not a symbolic reference destined for
  1416. // a relocation.
  1417. Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
  1418. int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  1419. bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
  1420. if (IsUnscaled != MIIsUnscaled) {
  1421. // We're trying to pair instructions that differ in how they are scaled.
  1422. // If FirstMI is scaled then scale the offset of MI accordingly.
  1423. // Otherwise, do the opposite (i.e., make MI's offset unscaled).
  1424. int MemSize = TII->getMemScale(MI);
  1425. if (MIIsUnscaled) {
  1426. // If the unscaled offset isn't a multiple of the MemSize, we can't
  1427. // pair the operations together: bail and keep looking.
  1428. if (MIOffset % MemSize) {
  1429. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
  1430. UsedRegUnits, TRI);
  1431. MemInsns.push_back(&MI);
  1432. continue;
  1433. }
  1434. MIOffset /= MemSize;
  1435. } else {
  1436. MIOffset *= MemSize;
  1437. }
  1438. }
  1439. bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
  1440. if (BaseReg == MIBaseReg) {
  1441. // If the offset of the second ld/st is not equal to the size of the
1442. // destination register, it can't be paired with a pre-index ld/st
1443. // pair. Additionally, if the base reg is used or modified, the operations
  1444. // can't be paired: bail and keep looking.
  1445. if (IsPreLdSt) {
  1446. bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
  1447. bool IsBaseRegUsed = !UsedRegUnits.available(
  1448. AArch64InstrInfo::getLdStBaseOp(MI).getReg());
  1449. bool IsBaseRegModified = !ModifiedRegUnits.available(
  1450. AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1451. // If the stored value and the address of the second instruction are
1452. // the same, it needs to be using the updated register and therefore
1453. // it must not be folded.
  1454. bool IsMIRegTheSame =
  1455. TRI->regsOverlap(getLdStRegOp(MI).getReg(),
  1456. AArch64InstrInfo::getLdStBaseOp(MI).getReg());
  1457. if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
  1458. IsMIRegTheSame) {
  1459. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
  1460. UsedRegUnits, TRI);
  1461. MemInsns.push_back(&MI);
  1462. continue;
  1463. }
  1464. } else {
  1465. if ((Offset != MIOffset + OffsetStride) &&
  1466. (Offset + OffsetStride != MIOffset)) {
  1467. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
  1468. UsedRegUnits, TRI);
  1469. MemInsns.push_back(&MI);
  1470. continue;
  1471. }
  1472. }
  1473. int MinOffset = Offset < MIOffset ? Offset : MIOffset;
  1474. if (FindNarrowMerge) {
  1475. // If the alignment requirements of the scaled wide load/store
  1476. // instruction can't express the offset of the scaled narrow input,
  1477. // bail and keep looking. For promotable zero stores, allow only when
  1478. // the stored value is the same (i.e., WZR).
  1479. if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
  1480. (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
  1481. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
  1482. UsedRegUnits, TRI);
  1483. MemInsns.push_back(&MI);
  1484. continue;
  1485. }
  1486. } else {
  1487. // Pairwise instructions have a 7-bit signed offset field. Single
  1488. // insns have a 12-bit unsigned offset field. If the resultant
  1489. // immediate offset of merging these instructions is out of range for
  1490. // a pairwise instruction, bail and keep looking.
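// For illustration (64-bit GPR case): LDPXi takes a 7-bit signed immediate
// scaled by 8, so the pair can only address byte offsets in [-512, 504],
// whereas a single scaled LDRXui reaches [0, 32760].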
  1491. if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
  1492. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
  1493. UsedRegUnits, TRI);
  1494. MemInsns.push_back(&MI);
  1495. continue;
  1496. }
  1497. // If the alignment requirements of the paired (scaled) instruction
  1498. // can't express the offset of the unscaled input, bail and keep
  1499. // looking.
  1500. if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
  1501. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
  1502. UsedRegUnits, TRI);
  1503. MemInsns.push_back(&MI);
  1504. continue;
  1505. }
  1506. }
  1507. // If the destination register of one load is the same register or a
  1508. // sub/super register of the other load, bail and keep looking. A
  1509. // load-pair instruction with both destination registers the same is
  1510. // UNPREDICTABLE and will result in an exception.
  1511. if (MayLoad &&
  1512. TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
  1513. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
  1514. TRI);
  1515. MemInsns.push_back(&MI);
  1516. continue;
  1517. }
  1518. // If the BaseReg has been modified, then we cannot do the optimization.
  1519. // For example, in the following pattern
  1520. // ldr x1 [x2]
  1521. // ldr x2 [x3]
  1522. // ldr x4 [x2, #8],
  1523. // the first and third ldr cannot be converted to ldp x1, x4, [x2]
  1524. if (!ModifiedRegUnits.available(BaseReg))
  1525. return E;
  1526. // If the Rt of the second instruction was not modified or used between
  1527. // the two instructions and none of the instructions between the second
  1528. // and first alias with the second, we can combine the second into the
  1529. // first.
  1530. if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
  1531. !(MI.mayLoad() &&
  1532. !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
  1533. !mayAlias(MI, MemInsns, AA)) {
  1534. Flags.setMergeForward(false);
  1535. Flags.clearRenameReg();
  1536. return MBBI;
  1537. }
  1538. // Likewise, if the Rt of the first instruction is not modified or used
  1539. // between the two instructions and none of the instructions between the
  1540. // first and the second alias with the first, we can combine the first
  1541. // into the second.
  1542. if (!(MayLoad &&
  1543. !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
  1544. !mayAlias(FirstMI, MemInsns, AA)) {
  1545. if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
  1546. Flags.setMergeForward(true);
  1547. Flags.clearRenameReg();
  1548. return MBBI;
  1549. }
  1550. if (DebugCounter::shouldExecute(RegRenamingCounter)) {
  1551. if (!MaybeCanRename)
  1552. MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
  1553. RequiredClasses, TRI)};
  1554. if (*MaybeCanRename) {
  1555. std::optional<MCPhysReg> MaybeRenameReg =
  1556. tryToFindRegisterToRename(*FirstMI.getParent()->getParent(),
  1557. Reg, DefinedInBB, UsedInBetween,
  1558. RequiredClasses, TRI);
  1559. if (MaybeRenameReg) {
  1560. Flags.setRenameReg(*MaybeRenameReg);
  1561. Flags.setMergeForward(true);
  1562. MBBIWithRenameReg = MBBI;
  1563. }
  1564. }
  1565. }
  1566. }
  1567. // Unable to combine these instructions due to interference in between.
  1568. // Keep looking.
  1569. }
  1570. }
  1571. if (Flags.getRenameReg())
  1572. return MBBIWithRenameReg;
1573. // The instruction wasn't a matching load or store. Stop searching if we
1574. // encounter a call instruction that might modify memory.
  1575. if (MI.isCall())
  1576. return E;
  1577. // Update modified / uses register units.
  1578. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
  1579. // Otherwise, if the base register is modified, we have no match, so
  1580. // return early.
  1581. if (!ModifiedRegUnits.available(BaseReg))
  1582. return E;
  1583. // Update list of instructions that read/write memory.
  1584. if (MI.mayLoadOrStore())
  1585. MemInsns.push_back(&MI);
  1586. }
  1587. return E;
  1588. }
  1589. static MachineBasicBlock::iterator
  1590. maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  1591. auto End = MI.getParent()->end();
  1592. if (MaybeCFI == End ||
  1593. MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
  1594. !(MI.getFlag(MachineInstr::FrameSetup) ||
  1595. MI.getFlag(MachineInstr::FrameDestroy)) ||
  1596. AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
  1597. return End;
  1598. const MachineFunction &MF = *MI.getParent()->getParent();
  1599. unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
  1600. const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
  1601. switch (CFI.getOperation()) {
  1602. case MCCFIInstruction::OpDefCfa:
  1603. case MCCFIInstruction::OpDefCfaOffset:
  1604. return MaybeCFI;
  1605. default:
  1606. return End;
  1607. }
  1608. }
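// A sketch of the pre-index case handled above (offsets illustrative):
//   sub sp, sp, #16
//   .cfi_def_cfa_offset 16
//   str x0, [sp]
// becomes, after merging the update into the store,
//   str x0, [sp, #-16]!
//   .cfi_def_cfa_offset 16
// i.e. the CFA-offset CFI is moved so it still follows the SP adjustment.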
  1609. MachineBasicBlock::iterator
  1610. AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
  1611. MachineBasicBlock::iterator Update,
  1612. bool IsPreIdx) {
  1613. assert((Update->getOpcode() == AArch64::ADDXri ||
  1614. Update->getOpcode() == AArch64::SUBXri) &&
  1615. "Unexpected base register update instruction to merge!");
  1616. MachineBasicBlock::iterator E = I->getParent()->end();
  1617. MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1618. // If we are updating the SP and the following instruction is a CFA-offset-related
1619. // CFI instruction, move it after the merged instruction.
  1620. MachineBasicBlock::iterator CFI =
  1621. IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
  1622. // Return the instruction following the merged instruction, which is
  1623. // the instruction following our unmerged load. Unless that's the add/sub
  1624. // instruction we're merging, in which case it's the one after that.
  1625. if (NextI == Update)
  1626. NextI = next_nodbg(NextI, E);
  1627. int Value = Update->getOperand(2).getImm();
  1628. assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
  1629. "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  1630. if (Update->getOpcode() == AArch64::SUBXri)
  1631. Value = -Value;
  1632. unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
  1633. : getPostIndexedOpcode(I->getOpcode());
  1634. MachineInstrBuilder MIB;
  1635. int Scale, MinOffset, MaxOffset;
  1636. getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  1637. if (!AArch64InstrInfo::isPairedLdSt(*I)) {
  1638. // Non-paired instruction.
  1639. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
  1640. .add(getLdStRegOp(*Update))
  1641. .add(getLdStRegOp(*I))
  1642. .add(AArch64InstrInfo::getLdStBaseOp(*I))
  1643. .addImm(Value / Scale)
  1644. .setMemRefs(I->memoperands())
  1645. .setMIFlags(I->mergeFlagsWith(*Update));
  1646. } else {
  1647. // Paired instruction.
  1648. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
  1649. .add(getLdStRegOp(*Update))
  1650. .add(getLdStRegOp(*I, 0))
  1651. .add(getLdStRegOp(*I, 1))
  1652. .add(AArch64InstrInfo::getLdStBaseOp(*I))
  1653. .addImm(Value / Scale)
  1654. .setMemRefs(I->memoperands())
  1655. .setMIFlags(I->mergeFlagsWith(*Update));
  1656. }
  1657. if (CFI != E) {
  1658. MachineBasicBlock *MBB = I->getParent();
  1659. MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
  1660. }
  1661. if (IsPreIdx) {
  1662. ++NumPreFolded;
  1663. LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  1664. } else {
  1665. ++NumPostFolded;
  1666. LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  1667. }
  1668. LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
  1669. LLVM_DEBUG(I->print(dbgs()));
  1670. LLVM_DEBUG(dbgs() << " ");
  1671. LLVM_DEBUG(Update->print(dbgs()));
  1672. LLVM_DEBUG(dbgs() << " with instruction:\n ");
  1673. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  1674. LLVM_DEBUG(dbgs() << "\n");
  1675. // Erase the old instructions for the block.
  1676. I->eraseFromParent();
  1677. Update->eraseFromParent();
  1678. return NextI;
  1679. }
  1680. bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
  1681. MachineInstr &MI,
  1682. unsigned BaseReg, int Offset) {
  1683. switch (MI.getOpcode()) {
  1684. default:
  1685. break;
  1686. case AArch64::SUBXri:
  1687. case AArch64::ADDXri:
  1688. // Make sure it's a vanilla immediate operand, not a relocation or
  1689. // anything else we can't handle.
  1690. if (!MI.getOperand(2).isImm())
  1691. break;
  1692. // Watch out for 1 << 12 shifted value.
  1693. if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
  1694. break;
  1695. // The update instruction source and destination register must be the
  1696. // same as the load/store base register.
  1697. if (MI.getOperand(0).getReg() != BaseReg ||
  1698. MI.getOperand(1).getReg() != BaseReg)
  1699. break;
  1700. int UpdateOffset = MI.getOperand(2).getImm();
  1701. if (MI.getOpcode() == AArch64::SUBXri)
  1702. UpdateOffset = -UpdateOffset;
  1703. // The immediate must be a multiple of the scaling factor of the pre/post
  1704. // indexed instruction.
  1705. int Scale, MinOffset, MaxOffset;
  1706. getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
  1707. if (UpdateOffset % Scale != 0)
  1708. break;
  1709. // Scaled offset must fit in the instruction immediate.
  1710. int ScaledOffset = UpdateOffset / Scale;
  1711. if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
  1712. break;
  1713. // If we have a non-zero Offset, we check that it matches the amount
  1714. // we're adding to the register.
  1715. if (!Offset || Offset == UpdateOffset)
  1716. return true;
  1717. break;
  1718. }
  1719. return false;
  1720. }
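// For illustration, with BaseReg == x2 and Offset == 16 (and assuming the
// pre/post-indexed form of MemMI can encode 16): "add x2, x2, #16" matches,
// "add x2, x2, #8" (wrong amount) and "add x3, x2, #16" (wrong destination
// register) do not, and with Offset == 0 any suitably scaled, in-range update
// of x2 would match.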
  1721. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
  1722. MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  1723. MachineBasicBlock::iterator E = I->getParent()->end();
  1724. MachineInstr &MemMI = *I;
  1725. MachineBasicBlock::iterator MBBI = I;
  1726. Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  1727. int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
  1728. TII->getMemScale(MemMI);
  1729. // Scan forward looking for post-index opportunities. Updating instructions
  1730. // can't be formed if the memory instruction doesn't have the offset we're
  1731. // looking for.
  1732. if (MIUnscaledOffset != UnscaledOffset)
  1733. return E;
  1734. // If the base register overlaps a source/destination register, we can't
  1735. // merge the update. This does not apply to tag store instructions which
  1736. // ignore the address part of the source register.
1737. // Nor does it apply to STGPi, which, unlike normal stores, does not have
1738. // unpredictable behavior in this case and always performs writeback
1739. // after reading the source register value.
  1740. if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
  1741. bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
  1742. for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
  1743. Register DestReg = getLdStRegOp(MemMI, i).getReg();
  1744. if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
  1745. return E;
  1746. }
  1747. }
  1748. // Track which register units have been modified and used between the first
  1749. // insn (inclusive) and the second insn.
  1750. ModifiedRegUnits.clear();
  1751. UsedRegUnits.clear();
  1752. MBBI = next_nodbg(MBBI, E);
  1753. // We can't post-increment the stack pointer if any instruction between
  1754. // the memory access (I) and the increment (MBBI) can access the memory
  1755. // region defined by [SP, MBBI].
  1756. const bool BaseRegSP = BaseReg == AArch64::SP;
  1757. if (BaseRegSP && needsWinCFI(I->getMF())) {
1758. // FIXME: For now, we always block the optimization over SP in Windows
1759. // targets, as it requires adjusting the unwind/debug info; messing up
1760. // the unwind info can actually cause a miscompile.
  1761. return E;
  1762. }
  1763. for (unsigned Count = 0; MBBI != E && Count < Limit;
  1764. MBBI = next_nodbg(MBBI, E)) {
  1765. MachineInstr &MI = *MBBI;
  1766. // Don't count transient instructions towards the search limit since there
  1767. // may be different numbers of them if e.g. debug information is present.
  1768. if (!MI.isTransient())
  1769. ++Count;
  1770. // If we found a match, return it.
  1771. if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
  1772. return MBBI;
  1773. // Update the status of what the instruction clobbered and used.
  1774. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
  1775. // Otherwise, if the base register is used or modified, we have no match, so
  1776. // return early.
  1777. // If we are optimizing SP, do not allow instructions that may load or store
1778. // in between the memory access and the optimized value update.
  1779. if (!ModifiedRegUnits.available(BaseReg) ||
  1780. !UsedRegUnits.available(BaseReg) ||
  1781. (BaseRegSP && MBBI->mayLoadOrStore()))
  1782. return E;
  1783. }
  1784. return E;
  1785. }
  1786. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
  1787. MachineBasicBlock::iterator I, unsigned Limit) {
  1788. MachineBasicBlock::iterator B = I->getParent()->begin();
  1789. MachineBasicBlock::iterator E = I->getParent()->end();
  1790. MachineInstr &MemMI = *I;
  1791. MachineBasicBlock::iterator MBBI = I;
  1792. MachineFunction &MF = *MemMI.getMF();
  1793. Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  1794. int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
  1795. // If the load/store is the first instruction in the block, there's obviously
  1796. // not any matching update. Ditto if the memory offset isn't zero.
  1797. if (MBBI == B || Offset != 0)
  1798. return E;
  1799. // If the base register overlaps a destination register, we can't
  1800. // merge the update.
  1801. if (!isTagStore(MemMI)) {
  1802. bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
  1803. for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
  1804. Register DestReg = getLdStRegOp(MemMI, i).getReg();
  1805. if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
  1806. return E;
  1807. }
  1808. }
  1809. const bool BaseRegSP = BaseReg == AArch64::SP;
  1810. if (BaseRegSP && needsWinCFI(I->getMF())) {
1811. // FIXME: For now, we always block the optimization over SP in Windows
1812. // targets, as it requires adjusting the unwind/debug info; messing up
1813. // the unwind info can actually cause a miscompile.
  1814. return E;
  1815. }
  1816. const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  1817. unsigned RedZoneSize =
  1818. Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
  1819. // Track which register units have been modified and used between the first
  1820. // insn (inclusive) and the second insn.
  1821. ModifiedRegUnits.clear();
  1822. UsedRegUnits.clear();
  1823. unsigned Count = 0;
1824. bool MemAccessBeforeSPPreInc = false;
  1825. do {
  1826. MBBI = prev_nodbg(MBBI, B);
  1827. MachineInstr &MI = *MBBI;
  1828. // Don't count transient instructions towards the search limit since there
  1829. // may be different numbers of them if e.g. debug information is present.
  1830. if (!MI.isTransient())
  1831. ++Count;
  1832. // If we found a match, return it.
  1833. if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
  1834. // Check that the update value is within our red zone limit (which may be
  1835. // zero).
1836. if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
  1837. return E;
  1838. return MBBI;
  1839. }
  1840. // Update the status of what the instruction clobbered and used.
  1841. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
  1842. // Otherwise, if the base register is used or modified, we have no match, so
  1843. // return early.
  1844. if (!ModifiedRegUnits.available(BaseReg) ||
  1845. !UsedRegUnits.available(BaseReg))
  1846. return E;
1847. // Keep track of whether we have a memory access before an SP pre-increment;
1848. // in that case we need to validate later that the update amount respects the
1849. // red zone.
1850. if (BaseRegSP && MBBI->mayLoadOrStore())
1851. MemAccessBeforeSPPreInc = true;
  1852. } while (MBBI != B && Count < Limit);
  1853. return E;
  1854. }
  1855. bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
  1856. MachineBasicBlock::iterator &MBBI) {
  1857. MachineInstr &MI = *MBBI;
  1858. // If this is a volatile load, don't mess with it.
  1859. if (MI.hasOrderedMemoryRef())
  1860. return false;
  1861. if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
  1862. return false;
  1863. // Make sure this is a reg+imm.
  1864. // FIXME: It is possible to extend it to handle reg+reg cases.
  1865. if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
  1866. return false;
  1867. // Look backward up to LdStLimit instructions.
  1868. MachineBasicBlock::iterator StoreI;
  1869. if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
  1870. ++NumLoadsFromStoresPromoted;
  1871. // Promote the load. Keeping the iterator straight is a
  1872. // pain, so we let the merge routine tell us what the next instruction
  1873. // is after it's done mucking about.
  1874. MBBI = promoteLoadFromStore(MBBI, StoreI);
  1875. return true;
  1876. }
  1877. return false;
  1878. }
  1879. // Merge adjacent zero stores into a wider store.
  1880. bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
  1881. MachineBasicBlock::iterator &MBBI) {
  1882. assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  1883. MachineInstr &MI = *MBBI;
  1884. MachineBasicBlock::iterator E = MI.getParent()->end();
  1885. if (!TII->isCandidateToMergeOrPair(MI))
  1886. return false;
  1887. // Look ahead up to LdStLimit instructions for a mergable instruction.
  1888. LdStPairFlags Flags;
  1889. MachineBasicBlock::iterator MergeMI =
  1890. findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  1891. if (MergeMI != E) {
  1892. ++NumZeroStoresPromoted;
  1893. // Keeping the iterator straight is a pain, so we let the merge routine tell
  1894. // us what the next instruction is after it's done mucking about.
  1895. MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
  1896. return true;
  1897. }
  1898. return false;
  1899. }
  1900. // Find loads and stores that can be merged into a single load or store pair
  1901. // instruction.
  1902. bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  1903. MachineInstr &MI = *MBBI;
  1904. MachineBasicBlock::iterator E = MI.getParent()->end();
  1905. if (!TII->isCandidateToMergeOrPair(MI))
  1906. return false;
  1907. // Early exit if the offset is not possible to match. (6 bits of positive
  1908. // range, plus allow an extra one in case we find a later insn that matches
  1909. // with Offset-1)
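// For illustration: a scaled LDRXui whose immediate operand is 100 is rejected
// right here, since even after allowing for a neighbour at offset 99 the
// 7-bit signed pair immediate (range [-64, 63] in scaled units) cannot
// encode it.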
  1910. bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  1911. int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  1912. int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  1913. // Allow one more for offset.
  1914. if (Offset > 0)
  1915. Offset -= OffsetStride;
  1916. if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
  1917. return false;
  1918. // Look ahead up to LdStLimit instructions for a pairable instruction.
  1919. LdStPairFlags Flags;
  1920. MachineBasicBlock::iterator Paired =
  1921. findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  1922. if (Paired != E) {
  1923. ++NumPairCreated;
  1924. if (TII->hasUnscaledLdStOffset(MI))
  1925. ++NumUnscaledPairCreated;
  1926. // Keeping the iterator straight is a pain, so we let the merge routine tell
  1927. // us what the next instruction is after it's done mucking about.
  1928. auto Prev = std::prev(MBBI);
  1929. MBBI = mergePairedInsns(MBBI, Paired, Flags);
  1930. // Collect liveness info for instructions between Prev and the new position
  1931. // MBBI.
  1932. for (auto I = std::next(Prev); I != MBBI; I++)
  1933. updateDefinedRegisters(*I, DefinedInBB, TRI);
  1934. return true;
  1935. }
  1936. return false;
  1937. }
1938. bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
1939. MachineBasicBlock::iterator &MBBI) {
  1940. MachineInstr &MI = *MBBI;
  1941. MachineBasicBlock::iterator E = MI.getParent()->end();
  1942. MachineBasicBlock::iterator Update;
  1943. // Look forward to try to form a post-index instruction. For example,
  1944. // ldr x0, [x20]
  1945. // add x20, x20, #32
  1946. // merged into:
  1947. // ldr x0, [x20], #32
  1948. Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  1949. if (Update != E) {
  1950. // Merge the update into the ld/st.
  1951. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
  1952. return true;
  1953. }
  1954. // Don't know how to handle unscaled pre/post-index versions below, so bail.
  1955. if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
  1956. return false;
  1957. // Look back to try to find a pre-index instruction. For example,
  1958. // add x0, x0, #8
  1959. // ldr x1, [x0]
  1960. // merged into:
  1961. // ldr x1, [x0, #8]!
  1962. Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  1963. if (Update != E) {
  1964. // Merge the update into the ld/st.
  1965. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
  1966. return true;
  1967. }
  1968. // The immediate in the load/store is scaled by the size of the memory
  1969. // operation. The immediate in the add we're looking for,
  1970. // however, is not, so adjust here.
  1971. int UnscaledOffset =
  1972. AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
  1973. // Look forward to try to find a pre-index instruction. For example,
  1974. // ldr x1, [x0, #64]
  1975. // add x0, x0, #64
  1976. // merged into:
  1977. // ldr x1, [x0, #64]!
  1978. Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  1979. if (Update != E) {
  1980. // Merge the update into the ld/st.
  1981. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
  1982. return true;
  1983. }
  1984. return false;
  1985. }
  1986. bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
  1987. bool EnableNarrowZeroStOpt) {
  1988. bool Modified = false;
1989. // Four transformations to do here:
  1990. // 1) Find loads that directly read from stores and promote them by
  1991. // replacing with mov instructions. If the store is wider than the load,
  1992. // the load will be replaced with a bitfield extract.
  1993. // e.g.,
  1994. // str w1, [x0, #4]
  1995. // ldrh w2, [x0, #6]
  1996. // ; becomes
  1997. // str w1, [x0, #4]
  1998. // lsr w2, w1, #16
  1999. for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  2000. MBBI != E;) {
  2001. if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
  2002. Modified = true;
  2003. else
  2004. ++MBBI;
  2005. }
  2006. // 2) Merge adjacent zero stores into a wider store.
  2007. // e.g.,
  2008. // strh wzr, [x0]
  2009. // strh wzr, [x0, #2]
  2010. // ; becomes
  2011. // str wzr, [x0]
  2012. // e.g.,
  2013. // str wzr, [x0]
  2014. // str wzr, [x0, #4]
  2015. // ; becomes
  2016. // str xzr, [x0]
  2017. if (EnableNarrowZeroStOpt)
  2018. for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  2019. MBBI != E;) {
  2020. if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
  2021. Modified = true;
  2022. else
  2023. ++MBBI;
  2024. }
  2025. // 3) Find loads and stores that can be merged into a single load or store
  2026. // pair instruction.
  2027. // e.g.,
  2028. // ldr x0, [x2]
  2029. // ldr x1, [x2, #8]
  2030. // ; becomes
  2031. // ldp x0, x1, [x2]
  2032. if (MBB.getParent()->getRegInfo().tracksLiveness()) {
  2033. DefinedInBB.clear();
  2034. DefinedInBB.addLiveIns(MBB);
  2035. }
  2036. for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  2037. MBBI != E;) {
  2038. // Track currently live registers up to this point, to help with
  2039. // searching for a rename register on demand.
  2040. updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
  2041. if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
  2042. Modified = true;
  2043. else
  2044. ++MBBI;
  2045. }
  2046. // 4) Find base register updates that can be merged into the load or store
  2047. // as a base-reg writeback.
  2048. // e.g.,
  2049. // ldr x0, [x2]
  2050. // add x2, x2, #4
  2051. // ; becomes
  2052. // ldr x0, [x2], #4
  2053. for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  2054. MBBI != E;) {
  2055. if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
  2056. Modified = true;
  2057. else
  2058. ++MBBI;
  2059. }
  2060. return Modified;
  2061. }
  2062. bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  2063. if (skipFunction(Fn.getFunction()))
  2064. return false;
  2065. Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  2066. TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  2067. TRI = Subtarget->getRegisterInfo();
  2068. AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  2069. // Resize the modified and used register unit trackers. We do this once
  2070. // per function and then clear the register units each time we optimize a load
  2071. // or store.
  2072. ModifiedRegUnits.init(*TRI);
  2073. UsedRegUnits.init(*TRI);
  2074. DefinedInBB.init(*TRI);
  2075. bool Modified = false;
  2076. bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  2077. for (auto &MBB : Fn) {
  2078. auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
  2079. Modified |= M;
  2080. }
  2081. return Modified;
  2082. }
  2083. // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
  2084. // stores near one another? Note: The pre-RA instruction scheduler already has
  2085. // hooks to try and schedule pairable loads/stores together to improve pairing
  2086. // opportunities. Thus, pre-RA pairing pass may not be worth the effort.
  2087. // FIXME: When pairing store instructions it's very possible for this pass to
  2088. // hoist a store with a KILL marker above another use (without a KILL marker).
  2089. // The resulting IR is invalid, but nothing uses the KILL markers after this
  2090. // pass, so it's never caused a problem in practice.
  2091. /// createAArch64LoadStoreOptimizationPass - returns an instance of the
  2092. /// load / store optimization pass.
  2093. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  2094. return new AArch64LoadStoreOpt();
  2095. }