AArch64LoadStoreOptimizer.cpp

  1. //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file contains a pass that performs load / store related peephole
  10. // optimizations. This pass should be run after register allocation.
  11. //
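// For illustration, typical rewrites performed by this pass are:
//   ldr  w0, [x2]
//   ldr  w1, [x2, #4]      =>   ldp  w0, w1, [x2]
//
//   str  w0, [x2]
//   add  x2, x2, #8        =>   str  w0, [x2], #8    (post-index fold)
//
//   add  x2, x2, #8
//   str  w0, [x2]          =>   str  w0, [x2, #8]!   (pre-index fold)
//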
  12. //===----------------------------------------------------------------------===//
  13. #include "AArch64InstrInfo.h"
  14. #include "AArch64MachineFunctionInfo.h"
  15. #include "AArch64Subtarget.h"
  16. #include "MCTargetDesc/AArch64AddressingModes.h"
  17. #include "llvm/ADT/BitVector.h"
  18. #include "llvm/ADT/SmallVector.h"
  19. #include "llvm/ADT/Statistic.h"
  20. #include "llvm/ADT/StringRef.h"
  21. #include "llvm/ADT/iterator_range.h"
  22. #include "llvm/Analysis/AliasAnalysis.h"
  23. #include "llvm/CodeGen/MachineBasicBlock.h"
  24. #include "llvm/CodeGen/MachineFunction.h"
  25. #include "llvm/CodeGen/MachineFunctionPass.h"
  26. #include "llvm/CodeGen/MachineInstr.h"
  27. #include "llvm/CodeGen/MachineInstrBuilder.h"
  28. #include "llvm/CodeGen/MachineOperand.h"
  29. #include "llvm/CodeGen/MachineRegisterInfo.h"
  30. #include "llvm/CodeGen/TargetRegisterInfo.h"
  31. #include "llvm/IR/DebugLoc.h"
  32. #include "llvm/MC/MCAsmInfo.h"
  33. #include "llvm/MC/MCRegisterInfo.h"
  34. #include "llvm/Pass.h"
  35. #include "llvm/Support/CommandLine.h"
  36. #include "llvm/Support/Debug.h"
  37. #include "llvm/Support/DebugCounter.h"
  38. #include "llvm/Support/ErrorHandling.h"
  39. #include "llvm/Support/raw_ostream.h"
  40. #include <cassert>
  41. #include <cstdint>
  42. #include <functional>
  43. #include <iterator>
  44. #include <limits>
  45. using namespace llvm;
  46. #define DEBUG_TYPE "aarch64-ldst-opt"
  47. STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
  48. STATISTIC(NumPostFolded, "Number of post-index updates folded");
  49. STATISTIC(NumPreFolded, "Number of pre-index updates folded");
  50. STATISTIC(NumUnscaledPairCreated,
  51. "Number of load/store from unscaled generated");
  52. STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
  53. STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
  54. DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
  55. "Controls which pairs are considered for renaming");
  56. // The LdStLimit limits how far we search for load/store pairs.
  57. static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
  58. cl::init(20), cl::Hidden);
  59. // The UpdateLimit limits how far we search for update instructions when we form
  60. // pre-/post-index instructions.
  61. static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
  62. cl::Hidden);
  63. // Enable register renaming to find additional store pairing opportunities.
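// When renaming is enabled, the register stored by the first instruction of a
// prospective pair may be rewritten to an otherwise unused register, so that
// intervening code that clobbers the original register no longer blocks
// forming the pair (see canRenameUpToDef and the renaming logic in
// mergePairedInsns below).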
  64. static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
  65. cl::init(true), cl::Hidden);
  66. #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
  67. namespace {
  68. using LdStPairFlags = struct LdStPairFlags {
  69. // If a matching instruction is found, MergeForward is set to true if the
  70. // merge is to remove the first instruction and replace the second with
  71. // a pair-wise insn, and false if the reverse is true.
  72. bool MergeForward = false;
  73. // SExtIdx gives the index of the result of the load pair that must be
  74. // extended. The value of SExtIdx assumes that the paired load produces the
  75. // value in this order: (I, returned iterator), i.e., -1 means no value has
  76. // to be extended, 0 means I, and 1 means the returned iterator.
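// For example, pairing an LDRSWui with an LDRWui yields an LDPWi; SExtIdx then
// selects which result must afterwards be sign-extended to 64 bits (done with
// a separate SBFMXri in mergePairedInsns).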
  77. int SExtIdx = -1;
  78. // If not none, RenameReg can be used to rename the result register of the
  79. // first store in a pair. Currently this only works when merging stores
  80. // forward.
  81. Optional<MCPhysReg> RenameReg = None;
  82. LdStPairFlags() = default;
  83. void setMergeForward(bool V = true) { MergeForward = V; }
  84. bool getMergeForward() const { return MergeForward; }
  85. void setSExtIdx(int V) { SExtIdx = V; }
  86. int getSExtIdx() const { return SExtIdx; }
  87. void setRenameReg(MCPhysReg R) { RenameReg = R; }
  88. void clearRenameReg() { RenameReg = None; }
  89. Optional<MCPhysReg> getRenameReg() const { return RenameReg; }
  90. };
  91. struct AArch64LoadStoreOpt : public MachineFunctionPass {
  92. static char ID;
  93. AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
  94. initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  95. }
  96. AliasAnalysis *AA;
  97. const AArch64InstrInfo *TII;
  98. const TargetRegisterInfo *TRI;
  99. const AArch64Subtarget *Subtarget;
  100. // Track which register units have been modified and used.
  101. LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  102. LiveRegUnits DefinedInBB;
  103. void getAnalysisUsage(AnalysisUsage &AU) const override {
  104. AU.addRequired<AAResultsWrapperPass>();
  105. MachineFunctionPass::getAnalysisUsage(AU);
  106. }
  107. // Scan the instructions looking for a load/store that can be combined
  108. // with the current instruction into a load/store pair.
  109. // Return the matching instruction if one is found, else MBB->end().
  110. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
  111. LdStPairFlags &Flags,
  112. unsigned Limit,
  113. bool FindNarrowMerge);
  114. // Scan the instructions looking for a store that writes to the address from
  115. // which the current load instruction reads. Return true if one is found.
  116. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
  117. MachineBasicBlock::iterator &StoreI);
  118. // Merge the two narrow zero-store instructions indicated into a single wider store.
  119. MachineBasicBlock::iterator
  120. mergeNarrowZeroStores(MachineBasicBlock::iterator I,
  121. MachineBasicBlock::iterator MergeMI,
  122. const LdStPairFlags &Flags);
  123. // Merge the two instructions indicated into a single pair-wise instruction.
  124. MachineBasicBlock::iterator
  125. mergePairedInsns(MachineBasicBlock::iterator I,
  126. MachineBasicBlock::iterator Paired,
  127. const LdStPairFlags &Flags);
  128. // Promote the load that reads directly from the address stored to.
  129. MachineBasicBlock::iterator
  130. promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
  131. MachineBasicBlock::iterator StoreI);
  132. // Scan the instruction list to find a base register update that can
  133. // be combined with the current instruction (a load or store) using
  134. // pre or post indexed addressing with writeback. Scan forwards.
  135. MachineBasicBlock::iterator
  136. findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
  137. int UnscaledOffset, unsigned Limit);
  138. // Scan the instruction list to find a base register update that can
  139. // be combined with the current instruction (a load or store) using
  140. // pre or post indexed addressing with writeback. Scan backwards.
  141. MachineBasicBlock::iterator
  142. findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
  143. // Find an instruction that updates the base register of the ld/st
  144. // instruction.
  145. bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
  146. unsigned BaseReg, int Offset);
  147. // Merge a pre- or post-index base register update into a ld/st instruction.
  148. MachineBasicBlock::iterator
  149. mergeUpdateInsn(MachineBasicBlock::iterator I,
  150. MachineBasicBlock::iterator Update, bool IsPreIdx);
  151. // Find and merge zero store instructions.
  152. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
  153. // Find and pair ldr/str instructions.
  154. bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
  155. // Find and promote load instructions which read directly from store.
  156. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
  157. // Find and merge base register updates before or after a ld/st instruction.
  158. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
  159. bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
  160. bool runOnMachineFunction(MachineFunction &Fn) override;
  161. MachineFunctionProperties getRequiredProperties() const override {
  162. return MachineFunctionProperties().set(
  163. MachineFunctionProperties::Property::NoVRegs);
  164. }
  165. StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
  166. };
  167. char AArch64LoadStoreOpt::ID = 0;
  168. } // end anonymous namespace
  169. INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
  170. AARCH64_LOAD_STORE_OPT_NAME, false, false)
  171. static bool isNarrowStore(unsigned Opc) {
  172. switch (Opc) {
  173. default:
  174. return false;
  175. case AArch64::STRBBui:
  176. case AArch64::STURBBi:
  177. case AArch64::STRHHui:
  178. case AArch64::STURHHi:
  179. return true;
  180. }
  181. }
  182. // These instructions set the memory tag and either keep the memory contents
  183. // unchanged or set them to zero, ignoring the address part of the source register.
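// (These are MTE tag-setting stores: STG/ST2G update the allocation tag(s) for
// one/two 16-byte granules, and the STZG/STZ2G forms additionally zero the
// tagged memory.)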
  184. static bool isTagStore(const MachineInstr &MI) {
  185. switch (MI.getOpcode()) {
  186. default:
  187. return false;
  188. case AArch64::STGOffset:
  189. case AArch64::STZGOffset:
  190. case AArch64::ST2GOffset:
  191. case AArch64::STZ2GOffset:
  192. return true;
  193. }
  194. }
  195. static unsigned getMatchingNonSExtOpcode(unsigned Opc,
  196. bool *IsValidLdStrOpc = nullptr) {
  197. if (IsValidLdStrOpc)
  198. *IsValidLdStrOpc = true;
  199. switch (Opc) {
  200. default:
  201. if (IsValidLdStrOpc)
  202. *IsValidLdStrOpc = false;
  203. return std::numeric_limits<unsigned>::max();
  204. case AArch64::STRDui:
  205. case AArch64::STURDi:
  206. case AArch64::STRDpre:
  207. case AArch64::STRQui:
  208. case AArch64::STURQi:
  209. case AArch64::STRQpre:
  210. case AArch64::STRBBui:
  211. case AArch64::STURBBi:
  212. case AArch64::STRHHui:
  213. case AArch64::STURHHi:
  214. case AArch64::STRWui:
  215. case AArch64::STRWpre:
  216. case AArch64::STURWi:
  217. case AArch64::STRXui:
  218. case AArch64::STRXpre:
  219. case AArch64::STURXi:
  220. case AArch64::LDRDui:
  221. case AArch64::LDURDi:
  222. case AArch64::LDRDpre:
  223. case AArch64::LDRQui:
  224. case AArch64::LDURQi:
  225. case AArch64::LDRQpre:
  226. case AArch64::LDRWui:
  227. case AArch64::LDURWi:
  228. case AArch64::LDRWpre:
  229. case AArch64::LDRXui:
  230. case AArch64::LDURXi:
  231. case AArch64::LDRXpre:
  232. case AArch64::STRSui:
  233. case AArch64::STURSi:
  234. case AArch64::STRSpre:
  235. case AArch64::LDRSui:
  236. case AArch64::LDURSi:
  237. case AArch64::LDRSpre:
  238. return Opc;
  239. case AArch64::LDRSWui:
  240. return AArch64::LDRWui;
  241. case AArch64::LDURSWi:
  242. return AArch64::LDURWi;
  243. }
  244. }
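// Returns the opcode that stores twice the width of Opc; used when merging two
// adjacent narrow zero stores, e.g.
//   strh wzr, [x0]
//   strh wzr, [x0, #2]     =>   str wzr, [x0]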
  245. static unsigned getMatchingWideOpcode(unsigned Opc) {
  246. switch (Opc) {
  247. default:
  248. llvm_unreachable("Opcode has no wide equivalent!");
  249. case AArch64::STRBBui:
  250. return AArch64::STRHHui;
  251. case AArch64::STRHHui:
  252. return AArch64::STRWui;
  253. case AArch64::STURBBi:
  254. return AArch64::STURHHi;
  255. case AArch64::STURHHi:
  256. return AArch64::STURWi;
  257. case AArch64::STURWi:
  258. return AArch64::STURXi;
  259. case AArch64::STRWui:
  260. return AArch64::STRXui;
  261. }
  262. }
  263. static unsigned getMatchingPairOpcode(unsigned Opc) {
  264. switch (Opc) {
  265. default:
  266. llvm_unreachable("Opcode has no pairwise equivalent!");
  267. case AArch64::STRSui:
  268. case AArch64::STURSi:
  269. return AArch64::STPSi;
  270. case AArch64::STRSpre:
  271. return AArch64::STPSpre;
  272. case AArch64::STRDui:
  273. case AArch64::STURDi:
  274. return AArch64::STPDi;
  275. case AArch64::STRDpre:
  276. return AArch64::STPDpre;
  277. case AArch64::STRQui:
  278. case AArch64::STURQi:
  279. return AArch64::STPQi;
  280. case AArch64::STRQpre:
  281. return AArch64::STPQpre;
  282. case AArch64::STRWui:
  283. case AArch64::STURWi:
  284. return AArch64::STPWi;
  285. case AArch64::STRWpre:
  286. return AArch64::STPWpre;
  287. case AArch64::STRXui:
  288. case AArch64::STURXi:
  289. return AArch64::STPXi;
  290. case AArch64::STRXpre:
  291. return AArch64::STPXpre;
  292. case AArch64::LDRSui:
  293. case AArch64::LDURSi:
  294. return AArch64::LDPSi;
  295. case AArch64::LDRSpre:
  296. return AArch64::LDPSpre;
  297. case AArch64::LDRDui:
  298. case AArch64::LDURDi:
  299. return AArch64::LDPDi;
  300. case AArch64::LDRDpre:
  301. return AArch64::LDPDpre;
  302. case AArch64::LDRQui:
  303. case AArch64::LDURQi:
  304. return AArch64::LDPQi;
  305. case AArch64::LDRQpre:
  306. return AArch64::LDPQpre;
  307. case AArch64::LDRWui:
  308. case AArch64::LDURWi:
  309. return AArch64::LDPWi;
  310. case AArch64::LDRWpre:
  311. return AArch64::LDPWpre;
  312. case AArch64::LDRXui:
  313. case AArch64::LDURXi:
  314. return AArch64::LDPXi;
  315. case AArch64::LDRXpre:
  316. return AArch64::LDPXpre;
  317. case AArch64::LDRSWui:
  318. case AArch64::LDURSWi:
  319. return AArch64::LDPSWi;
  320. }
  321. }
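// Returns true if StoreInst writes at least as many bytes as LoadInst reads,
// using the same addressing form (scaled vs. unscaled), so the loaded value
// can potentially be forwarded from the store.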
  322. static unsigned isMatchingStore(MachineInstr &LoadInst,
  323. MachineInstr &StoreInst) {
  324. unsigned LdOpc = LoadInst.getOpcode();
  325. unsigned StOpc = StoreInst.getOpcode();
  326. switch (LdOpc) {
  327. default:
  328. llvm_unreachable("Unsupported load instruction!");
  329. case AArch64::LDRBBui:
  330. return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
  331. StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  332. case AArch64::LDURBBi:
  333. return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
  334. StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  335. case AArch64::LDRHHui:
  336. return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
  337. StOpc == AArch64::STRXui;
  338. case AArch64::LDURHHi:
  339. return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
  340. StOpc == AArch64::STURXi;
  341. case AArch64::LDRWui:
  342. return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  343. case AArch64::LDURWi:
  344. return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  345. case AArch64::LDRXui:
  346. return StOpc == AArch64::STRXui;
  347. case AArch64::LDURXi:
  348. return StOpc == AArch64::STURXi;
  349. }
  350. }
  351. static unsigned getPreIndexedOpcode(unsigned Opc) {
  352. // FIXME: We don't currently support creating pre-indexed loads/stores when
  353. // the load or store is the unscaled version. If we decide to perform such an
  354. // optimization in the future the cases for the unscaled loads/stores will
  355. // need to be added here.
  356. switch (Opc) {
  357. default:
  358. llvm_unreachable("Opcode has no pre-indexed equivalent!");
  359. case AArch64::STRSui:
  360. return AArch64::STRSpre;
  361. case AArch64::STRDui:
  362. return AArch64::STRDpre;
  363. case AArch64::STRQui:
  364. return AArch64::STRQpre;
  365. case AArch64::STRBBui:
  366. return AArch64::STRBBpre;
  367. case AArch64::STRHHui:
  368. return AArch64::STRHHpre;
  369. case AArch64::STRWui:
  370. return AArch64::STRWpre;
  371. case AArch64::STRXui:
  372. return AArch64::STRXpre;
  373. case AArch64::LDRSui:
  374. return AArch64::LDRSpre;
  375. case AArch64::LDRDui:
  376. return AArch64::LDRDpre;
  377. case AArch64::LDRQui:
  378. return AArch64::LDRQpre;
  379. case AArch64::LDRBBui:
  380. return AArch64::LDRBBpre;
  381. case AArch64::LDRHHui:
  382. return AArch64::LDRHHpre;
  383. case AArch64::LDRWui:
  384. return AArch64::LDRWpre;
  385. case AArch64::LDRXui:
  386. return AArch64::LDRXpre;
  387. case AArch64::LDRSWui:
  388. return AArch64::LDRSWpre;
  389. case AArch64::LDPSi:
  390. return AArch64::LDPSpre;
  391. case AArch64::LDPSWi:
  392. return AArch64::LDPSWpre;
  393. case AArch64::LDPDi:
  394. return AArch64::LDPDpre;
  395. case AArch64::LDPQi:
  396. return AArch64::LDPQpre;
  397. case AArch64::LDPWi:
  398. return AArch64::LDPWpre;
  399. case AArch64::LDPXi:
  400. return AArch64::LDPXpre;
  401. case AArch64::STPSi:
  402. return AArch64::STPSpre;
  403. case AArch64::STPDi:
  404. return AArch64::STPDpre;
  405. case AArch64::STPQi:
  406. return AArch64::STPQpre;
  407. case AArch64::STPWi:
  408. return AArch64::STPWpre;
  409. case AArch64::STPXi:
  410. return AArch64::STPXpre;
  411. case AArch64::STGOffset:
  412. return AArch64::STGPreIndex;
  413. case AArch64::STZGOffset:
  414. return AArch64::STZGPreIndex;
  415. case AArch64::ST2GOffset:
  416. return AArch64::ST2GPreIndex;
  417. case AArch64::STZ2GOffset:
  418. return AArch64::STZ2GPreIndex;
  419. case AArch64::STGPi:
  420. return AArch64::STGPpre;
  421. }
  422. }
  423. static unsigned getPostIndexedOpcode(unsigned Opc) {
  424. switch (Opc) {
  425. default:
  426. llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  427. case AArch64::STRSui:
  428. case AArch64::STURSi:
  429. return AArch64::STRSpost;
  430. case AArch64::STRDui:
  431. case AArch64::STURDi:
  432. return AArch64::STRDpost;
  433. case AArch64::STRQui:
  434. case AArch64::STURQi:
  435. return AArch64::STRQpost;
  436. case AArch64::STRBBui:
  437. return AArch64::STRBBpost;
  438. case AArch64::STRHHui:
  439. return AArch64::STRHHpost;
  440. case AArch64::STRWui:
  441. case AArch64::STURWi:
  442. return AArch64::STRWpost;
  443. case AArch64::STRXui:
  444. case AArch64::STURXi:
  445. return AArch64::STRXpost;
  446. case AArch64::LDRSui:
  447. case AArch64::LDURSi:
  448. return AArch64::LDRSpost;
  449. case AArch64::LDRDui:
  450. case AArch64::LDURDi:
  451. return AArch64::LDRDpost;
  452. case AArch64::LDRQui:
  453. case AArch64::LDURQi:
  454. return AArch64::LDRQpost;
  455. case AArch64::LDRBBui:
  456. return AArch64::LDRBBpost;
  457. case AArch64::LDRHHui:
  458. return AArch64::LDRHHpost;
  459. case AArch64::LDRWui:
  460. case AArch64::LDURWi:
  461. return AArch64::LDRWpost;
  462. case AArch64::LDRXui:
  463. case AArch64::LDURXi:
  464. return AArch64::LDRXpost;
  465. case AArch64::LDRSWui:
  466. return AArch64::LDRSWpost;
  467. case AArch64::LDPSi:
  468. return AArch64::LDPSpost;
  469. case AArch64::LDPSWi:
  470. return AArch64::LDPSWpost;
  471. case AArch64::LDPDi:
  472. return AArch64::LDPDpost;
  473. case AArch64::LDPQi:
  474. return AArch64::LDPQpost;
  475. case AArch64::LDPWi:
  476. return AArch64::LDPWpost;
  477. case AArch64::LDPXi:
  478. return AArch64::LDPXpost;
  479. case AArch64::STPSi:
  480. return AArch64::STPSpost;
  481. case AArch64::STPDi:
  482. return AArch64::STPDpost;
  483. case AArch64::STPQi:
  484. return AArch64::STPQpost;
  485. case AArch64::STPWi:
  486. return AArch64::STPWpost;
  487. case AArch64::STPXi:
  488. return AArch64::STPXpost;
  489. case AArch64::STGOffset:
  490. return AArch64::STGPostIndex;
  491. case AArch64::STZGOffset:
  492. return AArch64::STZGPostIndex;
  493. case AArch64::ST2GOffset:
  494. return AArch64::ST2GPostIndex;
  495. case AArch64::STZ2GOffset:
  496. return AArch64::STZ2GPostIndex;
  497. case AArch64::STGPi:
  498. return AArch64::STGPpost;
  499. }
  500. }
  501. static bool isPairedLdSt(const MachineInstr &MI) {
  502. switch (MI.getOpcode()) {
  503. default:
  504. return false;
  505. case AArch64::LDPSi:
  506. case AArch64::LDPSWi:
  507. case AArch64::LDPDi:
  508. case AArch64::LDPQi:
  509. case AArch64::LDPWi:
  510. case AArch64::LDPXi:
  511. case AArch64::STPSi:
  512. case AArch64::STPDi:
  513. case AArch64::STPQi:
  514. case AArch64::STPWi:
  515. case AArch64::STPXi:
  516. case AArch64::STGPi:
  517. return true;
  518. }
  519. }
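// Returns true if FirstMI is a pre-indexed load/store that can form a
// pre-indexed pair with the plain (scaled or unscaled) load/store MI of the
// same width, e.g. STRXpre followed by STRXui/STURXi can become STPXpre.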
  520. static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
  521. unsigned OpcA = FirstMI.getOpcode();
  522. unsigned OpcB = MI.getOpcode();
  523. switch (OpcA) {
  524. default:
  525. return false;
  526. case AArch64::STRSpre:
  527. return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  528. case AArch64::STRDpre:
  529. return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  530. case AArch64::STRQpre:
  531. return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  532. case AArch64::STRWpre:
  533. return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  534. case AArch64::STRXpre:
  535. return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  536. case AArch64::LDRSpre:
  537. return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  538. case AArch64::LDRDpre:
  539. return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  540. case AArch64::LDRQpre:
  541. return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  542. case AArch64::LDRWpre:
  543. return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  544. case AArch64::LDRXpre:
  545. return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  546. }
  547. }
  548. // Returns the scale and offset range of pre/post indexed variants of MI.
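// E.g. for LDPXi/STPXi the scale is 8 and the writeback immediate range
// [-64, 63] covers byte offsets [-512, 504]; plain pre/post-indexed
// loads/stores take an unscaled 9-bit signed byte offset in [-256, 255].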
  549. static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
  550. int &MinOffset, int &MaxOffset) {
  551. bool IsPaired = isPairedLdSt(MI);
  552. bool IsTagStore = isTagStore(MI);
  553. // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  554. // as in the "unsigned offset" variant.
  555. // All other pre/post indexed ldst instructions are unscaled.
  556. Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
  557. if (IsPaired) {
  558. MinOffset = -64;
  559. MaxOffset = 63;
  560. } else {
  561. MinOffset = -256;
  562. MaxOffset = 255;
  563. }
  564. }
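// Operand layout assumed by the accessors below:
//   plain ld/st:            Rt, base, offset
//   paired ld/st (LDP/STP): Rt, Rt2, base, offset
//   pre-indexed ld/st:      base (writeback def), Rt, base, offset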
  565. static MachineOperand &getLdStRegOp(MachineInstr &MI,
  566. unsigned PairedRegOp = 0) {
  567. assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  568. bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
  569. if (IsPreLdSt)
  570. PairedRegOp += 1;
  571. unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
  572. return MI.getOperand(Idx);
  573. }
  574. static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
  575. unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
  576. return MI.getOperand(Idx);
  577. }
  578. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
  579. unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
  580. return MI.getOperand(Idx);
  581. }
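// Returns true when the bytes read by LoadInst lie entirely within the bytes
// written by StoreInst, comparing both offsets in unscaled (byte) form.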
  582. static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
  583. MachineInstr &StoreInst,
  584. const AArch64InstrInfo *TII) {
  585. assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  586. int LoadSize = TII->getMemScale(LoadInst);
  587. int StoreSize = TII->getMemScale(StoreInst);
  588. int UnscaledStOffset = TII->hasUnscaledLdStOffset(StoreInst)
  589. ? getLdStOffsetOp(StoreInst).getImm()
  590. : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  591. int UnscaledLdOffset = TII->hasUnscaledLdStOffset(LoadInst)
  592. ? getLdStOffsetOp(LoadInst).getImm()
  593. : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  594. return (UnscaledStOffset <= UnscaledLdOffset) &&
  595. (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
  596. }
  597. static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  598. unsigned Opc = MI.getOpcode();
  599. return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
  600. isNarrowStore(Opc)) &&
  601. getLdStRegOp(MI).getReg() == AArch64::WZR;
  602. }
  603. static bool isPromotableLoadFromStore(MachineInstr &MI) {
  604. switch (MI.getOpcode()) {
  605. default:
  606. return false;
  607. // Scaled instructions.
  608. case AArch64::LDRBBui:
  609. case AArch64::LDRHHui:
  610. case AArch64::LDRWui:
  611. case AArch64::LDRXui:
  612. // Unscaled instructions.
  613. case AArch64::LDURBBi:
  614. case AArch64::LDURHHi:
  615. case AArch64::LDURWi:
  616. case AArch64::LDURXi:
  617. return true;
  618. }
  619. }
  620. static bool isMergeableLdStUpdate(MachineInstr &MI) {
  621. unsigned Opc = MI.getOpcode();
  622. switch (Opc) {
  623. default:
  624. return false;
  625. // Scaled instructions.
  626. case AArch64::STRSui:
  627. case AArch64::STRDui:
  628. case AArch64::STRQui:
  629. case AArch64::STRXui:
  630. case AArch64::STRWui:
  631. case AArch64::STRHHui:
  632. case AArch64::STRBBui:
  633. case AArch64::LDRSui:
  634. case AArch64::LDRDui:
  635. case AArch64::LDRQui:
  636. case AArch64::LDRXui:
  637. case AArch64::LDRWui:
  638. case AArch64::LDRHHui:
  639. case AArch64::LDRBBui:
  640. case AArch64::STGOffset:
  641. case AArch64::STZGOffset:
  642. case AArch64::ST2GOffset:
  643. case AArch64::STZ2GOffset:
  644. case AArch64::STGPi:
  645. // Unscaled instructions.
  646. case AArch64::STURSi:
  647. case AArch64::STURDi:
  648. case AArch64::STURQi:
  649. case AArch64::STURWi:
  650. case AArch64::STURXi:
  651. case AArch64::LDURSi:
  652. case AArch64::LDURDi:
  653. case AArch64::LDURQi:
  654. case AArch64::LDURWi:
  655. case AArch64::LDURXi:
  656. // Paired instructions.
  657. case AArch64::LDPSi:
  658. case AArch64::LDPSWi:
  659. case AArch64::LDPDi:
  660. case AArch64::LDPQi:
  661. case AArch64::LDPWi:
  662. case AArch64::LDPXi:
  663. case AArch64::STPSi:
  664. case AArch64::STPDi:
  665. case AArch64::STPQi:
  666. case AArch64::STPWi:
  667. case AArch64::STPXi:
  668. // Make sure this is a reg+imm (as opposed to an address reloc).
  669. if (!getLdStOffsetOp(MI).isImm())
  670. return false;
  671. return true;
  672. }
  673. }
  674. MachineBasicBlock::iterator
  675. AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
  676. MachineBasicBlock::iterator MergeMI,
  677. const LdStPairFlags &Flags) {
  678. assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
  679. "Expected promotable zero stores.");
  680. MachineBasicBlock::iterator E = I->getParent()->end();
  681. MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  682. // If NextI is the second of the two instructions to be merged, we need
  683. // to skip one further. Either way, the merge will invalidate the iterator,
  684. // and we don't need to scan the new instruction, as it's a pairwise
  685. // instruction, which we're not considering for further action anyway.
  686. if (NextI == MergeMI)
  687. NextI = next_nodbg(NextI, E);
  688. unsigned Opc = I->getOpcode();
  689. bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  690. int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
  691. bool MergeForward = Flags.getMergeForward();
  692. // Insert our new paired instruction after whichever of the paired
  693. // instructions MergeForward indicates.
  694. MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  695. // MergeForward also determines from which instruction we copy the base
  696. // register operand, so the flags stay compatible with the input code.
  697. const MachineOperand &BaseRegOp =
  698. MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
  699. // Which register is Rt and which is Rt2 depends on the offset order.
  700. MachineInstr *RtMI;
  701. if (getLdStOffsetOp(*I).getImm() ==
  702. getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
  703. RtMI = &*MergeMI;
  704. else
  705. RtMI = &*I;
  706. int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  707. // Change the scaled offset from small to large type.
  708. if (IsScaled) {
  709. assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");
  710. OffsetImm /= 2;
  711. }
  712. // Construct the new instruction.
  713. DebugLoc DL = I->getDebugLoc();
  714. MachineBasicBlock *MBB = I->getParent();
  715. MachineInstrBuilder MIB;
  716. MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
  717. .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
  718. .add(BaseRegOp)
  719. .addImm(OffsetImm)
  720. .cloneMergedMemRefs({&*I, &*MergeMI})
  721. .setMIFlags(I->mergeFlagsWith(*MergeMI));
  722. (void)MIB;
  723. LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
  724. LLVM_DEBUG(I->print(dbgs()));
  725. LLVM_DEBUG(dbgs() << " ");
  726. LLVM_DEBUG(MergeMI->print(dbgs()));
  727. LLVM_DEBUG(dbgs() << " with instruction:\n ");
  728. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  729. LLVM_DEBUG(dbgs() << "\n");
  730. // Erase the old instructions.
  731. I->eraseFromParent();
  732. MergeMI->eraseFromParent();
  733. return NextI;
  734. }
  735. // Apply Fn to all instructions between MI and the beginning of the block, until
  736. // a def for DefReg is reached. Returns true iff Fn returns true for all
  737. // visited instructions. Stop after visiting Limit iterations.
  738. static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
  739. const TargetRegisterInfo *TRI, unsigned Limit,
  740. std::function<bool(MachineInstr &, bool)> &Fn) {
  741. auto MBB = MI.getParent();
  742. for (MachineInstr &I :
  743. instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
  744. if (!Limit)
  745. return false;
  746. --Limit;
  747. bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
  748. return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
  749. TRI->regsOverlap(MOP.getReg(), DefReg);
  750. });
  751. if (!Fn(I, isDef))
  752. return false;
  753. if (isDef)
  754. break;
  755. }
  756. return true;
  757. }
  758. static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
  759. const TargetRegisterInfo *TRI) {
  760. for (const MachineOperand &MOP : phys_regs_and_masks(MI))
  761. if (MOP.isReg() && MOP.isKill())
  762. Units.removeReg(MOP.getReg());
  763. for (const MachineOperand &MOP : phys_regs_and_masks(MI))
  764. if (MOP.isReg() && !MOP.isKill())
  765. Units.addReg(MOP.getReg());
  766. }
  767. MachineBasicBlock::iterator
  768. AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
  769. MachineBasicBlock::iterator Paired,
  770. const LdStPairFlags &Flags) {
  771. MachineBasicBlock::iterator E = I->getParent()->end();
  772. MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  773. // If NextI is the second of the two instructions to be merged, we need
  774. // to skip one further. Either way, the merge will invalidate the iterator,
  775. // and we don't need to scan the new instruction, as it's a pairwise
  776. // instruction, which we're not considering for further action anyway.
  777. if (NextI == Paired)
  778. NextI = next_nodbg(NextI, E);
  779. int SExtIdx = Flags.getSExtIdx();
  780. unsigned Opc =
  781. SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  782. bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  783. int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
  784. bool MergeForward = Flags.getMergeForward();
  785. Optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  786. if (MergeForward && RenameReg) {
  787. MCRegister RegToRename = getLdStRegOp(*I).getReg();
  788. DefinedInBB.addReg(*RenameReg);
  789. // Return the sub/super register for RenameReg, matching the size of
  790. // OriginalReg.
  791. auto GetMatchingSubReg = [this,
  792. RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
  793. for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
  794. if (TRI->getMinimalPhysRegClass(OriginalReg) ==
  795. TRI->getMinimalPhysRegClass(SubOrSuper))
  796. return SubOrSuper;
  797. llvm_unreachable("Should have found matching sub or super register!");
  798. };
  799. std::function<bool(MachineInstr &, bool)> UpdateMIs =
  800. [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
  801. if (IsDef) {
  802. bool SeenDef = false;
  803. for (auto &MOP : MI.operands()) {
  804. // Rename the first explicit definition and all implicit
  805. // definitions matching RegToRename.
  806. if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
  807. (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
  808. TRI->regsOverlap(MOP.getReg(), RegToRename)) {
  809. assert((MOP.isImplicit() ||
  810. (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
  811. "Need renamable operands");
  812. MOP.setReg(GetMatchingSubReg(MOP.getReg()));
  813. SeenDef = true;
  814. }
  815. }
  816. } else {
  817. for (auto &MOP : MI.operands()) {
  818. if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
  819. TRI->regsOverlap(MOP.getReg(), RegToRename)) {
  820. assert((MOP.isImplicit() ||
  821. (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
  822. "Need renamable operands");
  823. MOP.setReg(GetMatchingSubReg(MOP.getReg()));
  824. }
  825. }
  826. }
  827. LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
  828. return true;
  829. };
  830. forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
  831. #if !defined(NDEBUG)
  832. // Make sure the register used for renaming is not used between the paired
  833. // instructions. That would trash the content before the new paired
  834. // instruction.
  835. for (auto &MI :
  836. iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
  837. std::next(I), std::next(Paired)))
  838. assert(all_of(MI.operands(),
  839. [this, &RenameReg](const MachineOperand &MOP) {
  840. return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
  841. !TRI->regsOverlap(MOP.getReg(), *RenameReg);
  842. }) &&
  843. "Rename register used between paired instruction, trashing the "
  844. "content");
  845. #endif
  846. }
  847. // Insert our new paired instruction after whichever of the paired
  848. // instructions MergeForward indicates.
  849. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  850. // MergeForward also determines from which instruction we copy the base
  851. // register operand, so the flags stay compatible with the input code.
  852. const MachineOperand &BaseRegOp =
  853. MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I);
  854. int Offset = getLdStOffsetOp(*I).getImm();
  855. int PairedOffset = getLdStOffsetOp(*Paired).getImm();
  856. bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  857. if (IsUnscaled != PairedIsUnscaled) {
  858. // We're trying to pair instructions that differ in how they are scaled. If
  859. // I is scaled then scale the offset of Paired accordingly. Otherwise, do
  860. // the opposite (i.e., make Paired's offset unscaled).
  861. int MemSize = TII->getMemScale(*Paired);
  862. if (PairedIsUnscaled) {
  863. // If the unscaled offset isn't a multiple of the MemSize, we can't
  864. // pair the operations together.
  865. assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
  866. "Offset should be a multiple of the stride!");
  867. PairedOffset /= MemSize;
  868. } else {
  869. PairedOffset *= MemSize;
  870. }
  871. }
  872. // Which register is Rt and which is Rt2 depends on the offset order.
  873. // However, for pre-indexed load/stores, Rt should be the register of the
  874. // pre-indexed load/store.
  875. MachineInstr *RtMI, *Rt2MI;
  876. if (Offset == PairedOffset + OffsetStride &&
  877. !AArch64InstrInfo::isPreLdSt(*I)) {
  878. RtMI = &*Paired;
  879. Rt2MI = &*I;
  880. // Here we swapped the assumption made for SExtIdx.
  881. // I.e., we turn ldp I, Paired into ldp Paired, I.
  882. // Update the index accordingly.
  883. if (SExtIdx != -1)
  884. SExtIdx = (SExtIdx + 1) % 2;
  885. } else {
  886. RtMI = &*I;
  887. Rt2MI = &*Paired;
  888. }
  889. int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
  890. // Scale the immediate offset, if necessary.
  891. if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
  892. assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
  893. "Unscaled offset cannot be scaled.");
  894. OffsetImm /= TII->getMemScale(*RtMI);
  895. }
  896. // Construct the new instruction.
  897. MachineInstrBuilder MIB;
  898. DebugLoc DL = I->getDebugLoc();
  899. MachineBasicBlock *MBB = I->getParent();
  900. MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  901. MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  902. // Kill flags may become invalid when moving stores for pairing.
  903. if (RegOp0.isUse()) {
  904. if (!MergeForward) {
  905. // Clear kill flags on store if moving upwards. Example:
  906. // STRWui %w0, ...
  907. // USE %w1
  908. // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
  909. RegOp0.setIsKill(false);
  910. RegOp1.setIsKill(false);
  911. } else {
  912. // Clear kill flags of the first store's register. Example:
  913. // STRWui %w1, ...
  914. // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
  915. // STRW %w0
  916. Register Reg = getLdStRegOp(*I).getReg();
  917. for (MachineInstr &MI : make_range(std::next(I), Paired))
  918. MI.clearRegisterKills(Reg, TRI);
  919. }
  920. }
  921. unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  922. MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
  923. // Adds the pre-index operand for pre-indexed ld/st pairs.
  924. if (AArch64InstrInfo::isPreLdSt(*RtMI))
  925. MIB.addReg(BaseRegOp.getReg(), RegState::Define);
  926. MIB.add(RegOp0)
  927. .add(RegOp1)
  928. .add(BaseRegOp)
  929. .addImm(OffsetImm)
  930. .cloneMergedMemRefs({&*I, &*Paired})
  931. .setMIFlags(I->mergeFlagsWith(*Paired));
  932. (void)MIB;
  933. LLVM_DEBUG(
  934. dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  935. LLVM_DEBUG(I->print(dbgs()));
  936. LLVM_DEBUG(dbgs() << " ");
  937. LLVM_DEBUG(Paired->print(dbgs()));
  938. LLVM_DEBUG(dbgs() << " with instruction:\n ");
  939. if (SExtIdx != -1) {
  940. // Generate the sign extension for the proper result of the ldp.
  941. // I.e., with X1, that would be:
  942. // %w1 = KILL %w1, implicit-def %x1
  943. // %x1 = SBFMXri killed %x1, 0, 31
  944. MachineOperand &DstMO = MIB->getOperand(SExtIdx);
  945. // Right now, DstMO has the extended register, since it comes from an
  946. // extended opcode.
  947. Register DstRegX = DstMO.getReg();
  948. // Get the W variant of that register.
  949. Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
  950. // Update the result of LDP to use the W instead of the X variant.
  951. DstMO.setReg(DstRegW);
  952. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  953. LLVM_DEBUG(dbgs() << "\n");
  954. // Make the machine verifier happy by providing a definition for
  955. // the X register.
  956. // Insert this definition right after the generated LDP, i.e., before
  957. // InsertionPoint.
  958. MachineInstrBuilder MIBKill =
  959. BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
  960. .addReg(DstRegW)
  961. .addReg(DstRegX, RegState::Define);
  962. MIBKill->getOperand(2).setImplicit();
  963. // Create the sign extension.
  964. MachineInstrBuilder MIBSXTW =
  965. BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
  966. .addReg(DstRegX)
  967. .addImm(0)
  968. .addImm(31);
  969. (void)MIBSXTW;
  970. LLVM_DEBUG(dbgs() << " Extend operand:\n ");
  971. LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  972. } else {
  973. LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  974. }
  975. LLVM_DEBUG(dbgs() << "\n");
  976. if (MergeForward)
  977. for (const MachineOperand &MOP : phys_regs_and_masks(*I))
  978. if (MOP.isReg() && MOP.isKill())
  979. DefinedInBB.addReg(MOP.getReg());
  980. // Erase the old instructions.
  981. I->eraseFromParent();
  982. Paired->eraseFromParent();
  983. return NextI;
  984. }
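// Store-to-load forwarding: a load that reads bytes just written by a
// preceding store is replaced, e.g.
//   str w0, [x2]
//   ldr w1, [x2]           =>   str w0, [x2]
//                                mov w1, w0
// If the load is narrower than the store, a UBFM/AND of the stored register
// extracts the loaded bytes instead (little-endian targets only).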
  985. MachineBasicBlock::iterator
  986. AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
  987. MachineBasicBlock::iterator StoreI) {
  988. MachineBasicBlock::iterator NextI =
  989. next_nodbg(LoadI, LoadI->getParent()->end());
  990. int LoadSize = TII->getMemScale(*LoadI);
  991. int StoreSize = TII->getMemScale(*StoreI);
  992. Register LdRt = getLdStRegOp(*LoadI).getReg();
  993. const MachineOperand &StMO = getLdStRegOp(*StoreI);
  994. Register StRt = getLdStRegOp(*StoreI).getReg();
  995. bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
  996. assert((IsStoreXReg ||
  997. TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
  998. "Unexpected RegClass");
  999. MachineInstr *BitExtMI;
  1000. if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
  1001. // Remove the load if its destination register is the same as the register
  1002. // holding the stored value.
  1003. if (StRt == LdRt && LoadSize == 8) {
  1004. for (MachineInstr &MI : make_range(StoreI->getIterator(),
  1005. LoadI->getIterator())) {
  1006. if (MI.killsRegister(StRt, TRI)) {
  1007. MI.clearRegisterKills(StRt, TRI);
  1008. break;
  1009. }
  1010. }
  1011. LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
  1012. LLVM_DEBUG(LoadI->print(dbgs()));
  1013. LLVM_DEBUG(dbgs() << "\n");
  1014. LoadI->eraseFromParent();
  1015. return NextI;
  1016. }
  1017. // Replace the load with a mov if the load and store are of the same size.
  1018. BitExtMI =
  1019. BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
  1020. TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
  1021. .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
  1022. .add(StMO)
  1023. .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
  1024. .setMIFlags(LoadI->getFlags());
  1025. } else {
  1026. // FIXME: Currently we disable this transformation in big-endian targets as
  1027. // performance and correctness are verified only in little-endian.
  1028. if (!Subtarget->isLittleEndian())
  1029. return NextI;
  1030. bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
  1031. assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
  1032. "Unsupported ld/st match");
  1033. assert(LoadSize <= StoreSize && "Invalid load size");
  1034. int UnscaledLdOffset = IsUnscaled
  1035. ? getLdStOffsetOp(*LoadI).getImm()
  1036. : getLdStOffsetOp(*LoadI).getImm() * LoadSize;
  1037. int UnscaledStOffset = IsUnscaled
  1038. ? getLdStOffsetOp(*StoreI).getImm()
  1039. : getLdStOffsetOp(*StoreI).getImm() * StoreSize;
  1040. int Width = LoadSize * 8;
  1041. Register DestReg =
  1042. IsStoreXReg ? Register(TRI->getMatchingSuperReg(
  1043. LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
  1044. : LdRt;
  1045. assert((UnscaledLdOffset >= UnscaledStOffset &&
  1046. (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
  1047. "Invalid offset");
  1048. int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
  1049. int Imms = Immr + Width - 1;
  1050. if (UnscaledLdOffset == UnscaledStOffset) {
  1051. uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
  1052. | ((Immr) << 6) // immr
  1053. | ((Imms) << 0) // imms
  1054. ;
  1055. BitExtMI =
  1056. BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
  1057. TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
  1058. DestReg)
  1059. .add(StMO)
  1060. .addImm(AndMaskEncoded)
  1061. .setMIFlags(LoadI->getFlags());
  1062. } else {
  1063. BitExtMI =
  1064. BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
  1065. TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
  1066. DestReg)
  1067. .add(StMO)
  1068. .addImm(Immr)
  1069. .addImm(Imms)
  1070. .setMIFlags(LoadI->getFlags());
  1071. }
  1072. }
  1073. // Clear kill flags between store and load.
  1074. for (MachineInstr &MI : make_range(StoreI->getIterator(),
  1075. BitExtMI->getIterator()))
  1076. if (MI.killsRegister(StRt, TRI)) {
  1077. MI.clearRegisterKills(StRt, TRI);
  1078. break;
  1079. }
  1080. LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
  1081. LLVM_DEBUG(StoreI->print(dbgs()));
  1082. LLVM_DEBUG(dbgs() << " ");
  1083. LLVM_DEBUG(LoadI->print(dbgs()));
  1084. LLVM_DEBUG(dbgs() << " with instructions:\n ");
  1085. LLVM_DEBUG(StoreI->print(dbgs()));
  1086. LLVM_DEBUG(dbgs() << " ");
  1087. LLVM_DEBUG((BitExtMI)->print(dbgs()));
  1088. LLVM_DEBUG(dbgs() << "\n");
  1089. // Erase the old instructions.
  1090. LoadI->eraseFromParent();
  1091. return NextI;
  1092. }
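// Paired loads/stores encode a 7-bit signed immediate that is implicitly
// scaled by the access size, hence the element-offset range [-64, 63] checked
// below.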
  1093. static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  1094. // Convert the byte-offset used by unscaled into an "element" offset used
  1095. // by the scaled pair load/store instructions.
  1096. if (IsUnscaled) {
  1097. // If the byte-offset isn't a multiple of the stride, there's no point
  1098. // trying to match it.
  1099. if (Offset % OffsetStride)
  1100. return false;
  1101. Offset /= OffsetStride;
  1102. }
  1103. return Offset <= 63 && Offset >= -64;
  1104. }
  1105. // Do alignment, specialized to power of 2 and for signed ints,
  1106. // avoiding having to do a C-style cast from uint64_t to int when
  1107. // using alignTo from include/llvm/Support/MathExtras.h.
  1108. // FIXME: Move this function to include/MathExtras.h?
  1109. static int alignTo(int Num, int PowOf2) {
  1110. return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
  1111. }
  1112. static bool mayAlias(MachineInstr &MIa,
  1113. SmallVectorImpl<MachineInstr *> &MemInsns,
  1114. AliasAnalysis *AA) {
  1115. for (MachineInstr *MIb : MemInsns)
  1116. if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
  1117. return true;
  1118. return false;
  1119. }
  1120. bool AArch64LoadStoreOpt::findMatchingStore(
  1121. MachineBasicBlock::iterator I, unsigned Limit,
  1122. MachineBasicBlock::iterator &StoreI) {
  1123. MachineBasicBlock::iterator B = I->getParent()->begin();
  1124. MachineBasicBlock::iterator MBBI = I;
  1125. MachineInstr &LoadMI = *I;
  1126. Register BaseReg = getLdStBaseOp(LoadMI).getReg();
  1127. // If the load is the first instruction in the block, there's obviously
  1128. // not any matching store.
  1129. if (MBBI == B)
  1130. return false;
  1131. // Track which register units have been modified and used between the first
  1132. // insn and the second insn.
  1133. ModifiedRegUnits.clear();
  1134. UsedRegUnits.clear();
  1135. unsigned Count = 0;
  1136. do {
  1137. MBBI = prev_nodbg(MBBI, B);
  1138. MachineInstr &MI = *MBBI;
  1139. // Don't count transient instructions towards the search limit since there
  1140. // may be different numbers of them if e.g. debug information is present.
  1141. if (!MI.isTransient())
  1142. ++Count;
  1143. // If the load instruction reads directly from the address to which the
  1144. // store instruction writes and the stored value is not modified, we can
  1145. // promote the load. Since we do not handle stores with pre-/post-index,
  1146. // it's unnecessary to check if BaseReg is modified by the store itself.
  1147. // Also, we can't handle stores without an immediate offset operand,
  1148. // since the operand might be the address of a global variable.
  1149. if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
  1150. BaseReg == getLdStBaseOp(MI).getReg() && getLdStOffsetOp(MI).isImm() &&
  1151. isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
  1152. ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
  1153. StoreI = MBBI;
  1154. return true;
  1155. }
  1156. if (MI.isCall())
  1157. return false;
  1158. // Update modified / uses register units.
  1159. LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
  1160. // Otherwise, if the base register is modified, we have no match, so
  1161. // return early.
  1162. if (!ModifiedRegUnits.available(BaseReg))
  1163. return false;
  1164. // If we encounter a store aliased with the load, return early.
  1165. if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
  1166. return false;
  1167. } while (MBBI != B && Count < Limit);
  1168. return false;
  1169. }
  1170. // Returns true if FirstMI and MI are candidates for merging or pairing.
  1171. // Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(FirstMI);

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X>pre - LDR<S,D,Q,W,X>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
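
// Illustrative (simplified) example of why renaming helps: in
//   str w8, [x9]
//   mov w8, #5          ; redefines w8 between the two stores
//   str w1, [x9, #4]
// the first store cannot simply be moved down to form "stp w8, w1, [x9]",
// because w8 no longer holds the stored value at that point. If w8 (up to its
// defining instruction) can be renamed to an otherwise unused register, the
// two stores can still be paired.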
static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.
  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return false;

  auto RegToRename = getLdStRegOp(FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(FirstMI).isKill() &&
      !any_of(FirstMI.operands(),
              [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegToRename, MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n");
    return false;
  }
  auto canRenameMOP = [TRI](const MachineOperand &MOP) {
    if (MOP.isReg()) {
      auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
      // Renaming registers with multiple disjunct sub-registers (e.g. the
      // result of a LD3) means that all sub-registers are renamed, potentially
      // impacting other instructions we did not check. Bail out.
      // Note that this relies on the structure of the AArch64 register file. In
      // particular, a subregister cannot be written without overwriting the
      // whole register.
      if (RegClass->HasDisjunctSubRegs) {
        LLVM_DEBUG(
            dbgs()
            << " Cannot rename operands with multiple disjunct subregisters ("
            << MOP << ")\n");
        return false;
      }
    }
    return MOP.isImplicit() ||
           (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
  };

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions currently ("
                        << MI << ")\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find a non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI
                          << "\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP)) {
          LLVM_DEBUG(dbgs()
                     << " Cannot rename " << MOP << " in " << MI << "\n");
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP)) {
          LLVM_DEBUG(dbgs()
                     << " Cannot rename " << MOP << " in " << MI << "\n");
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

// Check if we can find a physical register for renaming. This register must:
// * not be defined up to FirstMI (checked using DefinedInBB),
// * not be used between MI and the defining instruction of the register to
//   rename (checked using UsedInBetween), and
// * be available in all used register classes (checked using RequiredClasses).
static Optional<MCPhysReg> tryToFindRegisterToRename(
    MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  auto &MF = *FirstMI.getParent()->getParent();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(TRI->sub_and_superregs_inclusive(PR),
                    [C, TRI](MCPhysReg SubOrSuper) {
                      return C == TRI->getMinimalPhysRegClass(SubOrSuper);
                    });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return None;
}

/// Scan the instructions looking for a load/store that can be combined with
/// the current instruction into a wider equivalent or a load/store pair.
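/// For example (illustrative), starting from the first load in
///   ldr x0, [x2]
///   add x3, x3, #1       ; unrelated instruction, scanned over
///   ldr x1, [x2, #8]
/// the search returns the second load so that the two can later be rewritten
/// as "ldp x0, x1, [x2]".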
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  Optional<bool> MaybeCanRename = None;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can't be paired with a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
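        // For example (illustrative), an 8-byte pre-index store followed by a
        // store at offset 8 from the updated base:
        //   str x0, [x1, #-16]!
        //   str x2, [x1, #8]
        // can be rewritten as "stp x0, x2, [x1, #-16]!", so MIOffset must
        // equal the access size here.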
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed =
              !UsedRegUnits.available(getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified =
              !ModifiedRegUnits.available(getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction are
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame = TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                                                 getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
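          // (Concretely, for 8-byte accesses the pair immediate covers byte
          // offsets [-512, 504], while a single scaled load/store covers
          // [0, 32760].)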
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        // If the destination register of one load is the same register or a
        // sub/super register of the other load, bail and keep looking. A
        // load-pair instruction with both destination registers the same is
        // UNPREDICTABLE and will result in an exception.
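        // (For example, an "ldp x0, x0, [x1]" must never be created.)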
        if (MayLoad &&
            TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
          LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                            TRI);
          MemInsns.push_back(&MI);
          continue;
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1 [x2]
        //   ldr x2 [x3]
        //   ldr x4 [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2]
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
            !(MI.mayLoad() &&
              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!(MayLoad &&
              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          if (DebugCounter::shouldExecute(RegRenamingCounter)) {
            if (!MaybeCanRename)
              MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
                                                 RequiredClasses, TRI)};

            if (*MaybeCanRename) {
              Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
                  FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses,
                  TRI);
              if (MaybeRenameReg) {
                Flags.setRenameReg(*MaybeRenameReg);
                Flags.setMergeForward(true);
                MBBIWithRenameReg = MBBI;
              }
            }
          }
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // The instruction wasn't a matching load or store; stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall())
      return E;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  (void)MIB;

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instruction:\n ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
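
// For illustration (not exhaustive), with BaseReg == x2 and a zero Offset:
//   add x2, x2, #16          ; matches, provided 16 is a multiple of the
//                            ;   scale and the scaled value is in range
//   add x2, x3, #16          ; no match: source and destination must both be x2
//   add x2, x2, #1, lsl #12  ; no match: shifted immediates are rejected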
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}

static bool needsWinCFI(const MachineFunction *MF) {
  return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         MF->getFunction().needsUnwindTableEntry();
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset =
      getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions which
  // ignore the address part of the source register.
  // This does not apply to STGPi as well, which does not have unpredictable
  // behavior in this case unlike normal stores, and always performs writeback
  // after reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires adjusting the unwind/debug info; messing up the
    // unwind info can actually cause a miscompile.
    return E;
  }

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    // If we are optimizing SP, do not allow instructions that may load or
    // store in between the load and the optimized value update.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP in windows
    // targets as it requires adjusting the unwind/debug info; messing up the
    // unwind info can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
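
  // Illustrative example of the red-zone concern (assuming a 128-byte red
  // zone): folding
  //   sub sp, sp, #144
  //   str x1, [x19]       ; may touch the newly allocated stack area
  //   str x0, [sp]
  // into
  //   str x1, [x19]
  //   str x0, [sp, #-144]!
  // moves the SP decrement below the intervening store, so that store could
  // write below SP; 144 exceeds the red zone, so the fold must be rejected.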

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc &&
          MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;

    // Keep track of whether there is a memory access before an SP
    // pre-increment; in that case we need to validate later that the update
    // amount respects the red zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;
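  // For example (illustrative), a scaled "ldr x0, [x1, #512]" (immediate 64)
  // survives this check because a partner at immediate 63 would give the pair
  // a minimum offset that still fits the signed 7-bit field, whereas an
  // immediate of 65 or more can never pair and is rejected here.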

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(MBBI);
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  //   ldr x0, [x20]
  //   add x20, x20, #32
  // merged into:
  //   ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  //   add x0, x0, #8
  //   ldr x1, [x0]
  // merged into:
  //   ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  //   ldr x1, [x0, #64]
  //   add x0, x0, #64
  // merged into:
  //   ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }

  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a
  // load or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}