AArch64FalkorHWPFFix.cpp 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838
  1. //===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
  9. /// that may inhibit the HW prefetching. This is done in two steps. Before
  10. /// ISel, we mark strided loads (i.e. those that will likely benefit from
  11. /// prefetching) with metadata. Then, after opcodes have been finalized, we
  12. /// insert MOVs and re-write loads to prevent unintentional tag collisions.
  13. // ===---------------------------------------------------------------------===//
  14. #include "AArch64.h"
  15. #include "AArch64InstrInfo.h"
  16. #include "AArch64Subtarget.h"
  17. #include "AArch64TargetMachine.h"
  18. #include "llvm/ADT/DenseMap.h"
  19. #include "llvm/ADT/DepthFirstIterator.h"
  20. #include "llvm/ADT/SmallVector.h"
  21. #include "llvm/ADT/Statistic.h"
  22. #include "llvm/Analysis/LoopInfo.h"
  23. #include "llvm/Analysis/ScalarEvolution.h"
  24. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  25. #include "llvm/CodeGen/LiveRegUnits.h"
  26. #include "llvm/CodeGen/MachineBasicBlock.h"
  27. #include "llvm/CodeGen/MachineFunction.h"
  28. #include "llvm/CodeGen/MachineFunctionPass.h"
  29. #include "llvm/CodeGen/MachineInstr.h"
  30. #include "llvm/CodeGen/MachineInstrBuilder.h"
  31. #include "llvm/CodeGen/MachineLoopInfo.h"
  32. #include "llvm/CodeGen/MachineOperand.h"
  33. #include "llvm/CodeGen/MachineRegisterInfo.h"
  34. #include "llvm/CodeGen/TargetPassConfig.h"
  35. #include "llvm/CodeGen/TargetRegisterInfo.h"
  36. #include "llvm/IR/DebugLoc.h"
  37. #include "llvm/IR/Dominators.h"
  38. #include "llvm/IR/Function.h"
  39. #include "llvm/IR/Instruction.h"
  40. #include "llvm/IR/Instructions.h"
  41. #include "llvm/IR/Metadata.h"
  42. #include "llvm/InitializePasses.h"
  43. #include "llvm/Pass.h"
  44. #include "llvm/Support/Casting.h"
  45. #include "llvm/Support/Debug.h"
  46. #include "llvm/Support/DebugCounter.h"
  47. #include "llvm/Support/raw_ostream.h"
  48. #include <cassert>
  49. #include <iterator>
  50. #include <utility>
  51. using namespace llvm;
  52. #define DEBUG_TYPE "aarch64-falkor-hwpf-fix"
  53. STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
  54. STATISTIC(NumCollisionsAvoided,
  55. "Number of HW prefetch tag collisions avoided");
  56. STATISTIC(NumCollisionsNotAvoided,
  57. "Number of HW prefetch tag collisions not avoided due to lack of registers");
  58. DEBUG_COUNTER(FixCounter, "falkor-hwpf",
  59. "Controls which tag collisions are avoided");
  60. namespace {
  61. class FalkorMarkStridedAccesses {
  62. public:
  63. FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
  64. : LI(LI), SE(SE) {}
  65. bool run();
  66. private:
  67. bool runOnLoop(Loop &L);
  68. LoopInfo &LI;
  69. ScalarEvolution &SE;
  70. };
  71. class FalkorMarkStridedAccessesLegacy : public FunctionPass {
  72. public:
  73. static char ID; // Pass ID, replacement for typeid
  74. FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
  75. initializeFalkorMarkStridedAccessesLegacyPass(
  76. *PassRegistry::getPassRegistry());
  77. }
  78. void getAnalysisUsage(AnalysisUsage &AU) const override {
  79. AU.addRequired<TargetPassConfig>();
  80. AU.addPreserved<DominatorTreeWrapperPass>();
  81. AU.addRequired<LoopInfoWrapperPass>();
  82. AU.addPreserved<LoopInfoWrapperPass>();
  83. AU.addRequired<ScalarEvolutionWrapperPass>();
  84. AU.addPreserved<ScalarEvolutionWrapperPass>();
  85. }
  86. bool runOnFunction(Function &F) override;
  87. };
  88. } // end anonymous namespace
  89. char FalkorMarkStridedAccessesLegacy::ID = 0;
  90. INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
  91. "Falkor HW Prefetch Fix", false, false)
  92. INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
  93. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  94. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  95. INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
  96. "Falkor HW Prefetch Fix", false, false)
  97. FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
  98. return new FalkorMarkStridedAccessesLegacy();
  99. }
  100. bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
  101. TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  102. const AArch64Subtarget *ST =
  103. TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
  104. if (ST->getProcFamily() != AArch64Subtarget::Falkor)
  105. return false;
  106. if (skipFunction(F))
  107. return false;
  108. LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  109. ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  110. FalkorMarkStridedAccesses LDP(LI, SE);
  111. return LDP.run();
  112. }
  113. bool FalkorMarkStridedAccesses::run() {
  114. bool MadeChange = false;
  115. for (Loop *L : LI)
  116. for (Loop *LIt : depth_first(L))
  117. MadeChange |= runOnLoop(*LIt);
  118. return MadeChange;
  119. }
  120. bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
  121. // Only mark strided loads in the inner-most loop
  122. if (!L.isInnermost())
  123. return false;
  124. bool MadeChange = false;
  125. for (BasicBlock *BB : L.blocks()) {
  126. for (Instruction &I : *BB) {
  127. LoadInst *LoadI = dyn_cast<LoadInst>(&I);
  128. if (!LoadI)
  129. continue;
  130. Value *PtrValue = LoadI->getPointerOperand();
  131. if (L.isLoopInvariant(PtrValue))
  132. continue;
  133. const SCEV *LSCEV = SE.getSCEV(PtrValue);
  134. const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
  135. if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
  136. continue;
  137. LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
  138. MDNode::get(LoadI->getContext(), {}));
  139. ++NumStridedLoadsMarked;
  140. LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
  141. MadeChange = true;
  142. }
  143. }
  144. return MadeChange;
  145. }
  146. namespace {
  147. class FalkorHWPFFix : public MachineFunctionPass {
  148. public:
  149. static char ID;
  150. FalkorHWPFFix() : MachineFunctionPass(ID) {
  151. initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
  152. }
  153. bool runOnMachineFunction(MachineFunction &Fn) override;
  154. void getAnalysisUsage(AnalysisUsage &AU) const override {
  155. AU.setPreservesCFG();
  156. AU.addRequired<MachineLoopInfo>();
  157. MachineFunctionPass::getAnalysisUsage(AU);
  158. }
  159. MachineFunctionProperties getRequiredProperties() const override {
  160. return MachineFunctionProperties().set(
  161. MachineFunctionProperties::Property::NoVRegs);
  162. }
  163. private:
  164. void runOnLoop(MachineLoop &L, MachineFunction &Fn);
  165. const AArch64InstrInfo *TII;
  166. const TargetRegisterInfo *TRI;
  167. DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
  168. bool Modified;
  169. };
  170. /// Bits from load opcodes used to compute HW prefetcher instruction tags.
  171. struct LoadInfo {
  172. LoadInfo() = default;
  173. Register DestReg;
  174. Register BaseReg;
  175. int BaseRegIdx = -1;
  176. const MachineOperand *OffsetOpnd = nullptr;
  177. bool IsPrePost = false;
  178. };
  179. } // end anonymous namespace
  180. char FalkorHWPFFix::ID = 0;
  181. INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
  182. "Falkor HW Prefetch Fix Late Phase", false, false)
  183. INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
  184. INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
  185. "Falkor HW Prefetch Fix Late Phase", false, false)
  186. static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  187. return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
  188. }
  189. static std::optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  190. int DestRegIdx;
  191. int BaseRegIdx;
  192. int OffsetIdx;
  193. bool IsPrePost;
  194. switch (MI.getOpcode()) {
  195. default:
  196. return std::nullopt;
  197. case AArch64::LD1i64:
  198. case AArch64::LD2i64:
  199. DestRegIdx = 0;
  200. BaseRegIdx = 3;
  201. OffsetIdx = -1;
  202. IsPrePost = false;
  203. break;
  204. case AArch64::LD1i8:
  205. case AArch64::LD1i16:
  206. case AArch64::LD1i32:
  207. case AArch64::LD2i8:
  208. case AArch64::LD2i16:
  209. case AArch64::LD2i32:
  210. case AArch64::LD3i8:
  211. case AArch64::LD3i16:
  212. case AArch64::LD3i32:
  213. case AArch64::LD3i64:
  214. case AArch64::LD4i8:
  215. case AArch64::LD4i16:
  216. case AArch64::LD4i32:
  217. case AArch64::LD4i64:
  218. DestRegIdx = -1;
  219. BaseRegIdx = 3;
  220. OffsetIdx = -1;
  221. IsPrePost = false;
  222. break;
  223. case AArch64::LD1Onev1d:
  224. case AArch64::LD1Onev2s:
  225. case AArch64::LD1Onev4h:
  226. case AArch64::LD1Onev8b:
  227. case AArch64::LD1Onev2d:
  228. case AArch64::LD1Onev4s:
  229. case AArch64::LD1Onev8h:
  230. case AArch64::LD1Onev16b:
  231. case AArch64::LD1Rv1d:
  232. case AArch64::LD1Rv2s:
  233. case AArch64::LD1Rv4h:
  234. case AArch64::LD1Rv8b:
  235. case AArch64::LD1Rv2d:
  236. case AArch64::LD1Rv4s:
  237. case AArch64::LD1Rv8h:
  238. case AArch64::LD1Rv16b:
  239. DestRegIdx = 0;
  240. BaseRegIdx = 1;
  241. OffsetIdx = -1;
  242. IsPrePost = false;
  243. break;
  244. case AArch64::LD1Twov1d:
  245. case AArch64::LD1Twov2s:
  246. case AArch64::LD1Twov4h:
  247. case AArch64::LD1Twov8b:
  248. case AArch64::LD1Twov2d:
  249. case AArch64::LD1Twov4s:
  250. case AArch64::LD1Twov8h:
  251. case AArch64::LD1Twov16b:
  252. case AArch64::LD1Threev1d:
  253. case AArch64::LD1Threev2s:
  254. case AArch64::LD1Threev4h:
  255. case AArch64::LD1Threev8b:
  256. case AArch64::LD1Threev2d:
  257. case AArch64::LD1Threev4s:
  258. case AArch64::LD1Threev8h:
  259. case AArch64::LD1Threev16b:
  260. case AArch64::LD1Fourv1d:
  261. case AArch64::LD1Fourv2s:
  262. case AArch64::LD1Fourv4h:
  263. case AArch64::LD1Fourv8b:
  264. case AArch64::LD1Fourv2d:
  265. case AArch64::LD1Fourv4s:
  266. case AArch64::LD1Fourv8h:
  267. case AArch64::LD1Fourv16b:
  268. case AArch64::LD2Twov2s:
  269. case AArch64::LD2Twov4s:
  270. case AArch64::LD2Twov8b:
  271. case AArch64::LD2Twov2d:
  272. case AArch64::LD2Twov4h:
  273. case AArch64::LD2Twov8h:
  274. case AArch64::LD2Twov16b:
  275. case AArch64::LD2Rv1d:
  276. case AArch64::LD2Rv2s:
  277. case AArch64::LD2Rv4s:
  278. case AArch64::LD2Rv8b:
  279. case AArch64::LD2Rv2d:
  280. case AArch64::LD2Rv4h:
  281. case AArch64::LD2Rv8h:
  282. case AArch64::LD2Rv16b:
  283. case AArch64::LD3Threev2s:
  284. case AArch64::LD3Threev4h:
  285. case AArch64::LD3Threev8b:
  286. case AArch64::LD3Threev2d:
  287. case AArch64::LD3Threev4s:
  288. case AArch64::LD3Threev8h:
  289. case AArch64::LD3Threev16b:
  290. case AArch64::LD3Rv1d:
  291. case AArch64::LD3Rv2s:
  292. case AArch64::LD3Rv4h:
  293. case AArch64::LD3Rv8b:
  294. case AArch64::LD3Rv2d:
  295. case AArch64::LD3Rv4s:
  296. case AArch64::LD3Rv8h:
  297. case AArch64::LD3Rv16b:
  298. case AArch64::LD4Fourv2s:
  299. case AArch64::LD4Fourv4h:
  300. case AArch64::LD4Fourv8b:
  301. case AArch64::LD4Fourv2d:
  302. case AArch64::LD4Fourv4s:
  303. case AArch64::LD4Fourv8h:
  304. case AArch64::LD4Fourv16b:
  305. case AArch64::LD4Rv1d:
  306. case AArch64::LD4Rv2s:
  307. case AArch64::LD4Rv4h:
  308. case AArch64::LD4Rv8b:
  309. case AArch64::LD4Rv2d:
  310. case AArch64::LD4Rv4s:
  311. case AArch64::LD4Rv8h:
  312. case AArch64::LD4Rv16b:
  313. DestRegIdx = -1;
  314. BaseRegIdx = 1;
  315. OffsetIdx = -1;
  316. IsPrePost = false;
  317. break;
  318. case AArch64::LD1i64_POST:
  319. case AArch64::LD2i64_POST:
  320. DestRegIdx = 1;
  321. BaseRegIdx = 4;
  322. OffsetIdx = 5;
  323. IsPrePost = true;
  324. break;
  325. case AArch64::LD1i8_POST:
  326. case AArch64::LD1i16_POST:
  327. case AArch64::LD1i32_POST:
  328. case AArch64::LD2i8_POST:
  329. case AArch64::LD2i16_POST:
  330. case AArch64::LD2i32_POST:
  331. case AArch64::LD3i8_POST:
  332. case AArch64::LD3i16_POST:
  333. case AArch64::LD3i32_POST:
  334. case AArch64::LD3i64_POST:
  335. case AArch64::LD4i8_POST:
  336. case AArch64::LD4i16_POST:
  337. case AArch64::LD4i32_POST:
  338. case AArch64::LD4i64_POST:
  339. DestRegIdx = -1;
  340. BaseRegIdx = 4;
  341. OffsetIdx = 5;
  342. IsPrePost = true;
  343. break;
  344. case AArch64::LD1Onev1d_POST:
  345. case AArch64::LD1Onev2s_POST:
  346. case AArch64::LD1Onev4h_POST:
  347. case AArch64::LD1Onev8b_POST:
  348. case AArch64::LD1Onev2d_POST:
  349. case AArch64::LD1Onev4s_POST:
  350. case AArch64::LD1Onev8h_POST:
  351. case AArch64::LD1Onev16b_POST:
  352. case AArch64::LD1Rv1d_POST:
  353. case AArch64::LD1Rv2s_POST:
  354. case AArch64::LD1Rv4h_POST:
  355. case AArch64::LD1Rv8b_POST:
  356. case AArch64::LD1Rv2d_POST:
  357. case AArch64::LD1Rv4s_POST:
  358. case AArch64::LD1Rv8h_POST:
  359. case AArch64::LD1Rv16b_POST:
  360. DestRegIdx = 1;
  361. BaseRegIdx = 2;
  362. OffsetIdx = 3;
  363. IsPrePost = true;
  364. break;
  365. case AArch64::LD1Twov1d_POST:
  366. case AArch64::LD1Twov2s_POST:
  367. case AArch64::LD1Twov4h_POST:
  368. case AArch64::LD1Twov8b_POST:
  369. case AArch64::LD1Twov2d_POST:
  370. case AArch64::LD1Twov4s_POST:
  371. case AArch64::LD1Twov8h_POST:
  372. case AArch64::LD1Twov16b_POST:
  373. case AArch64::LD1Threev1d_POST:
  374. case AArch64::LD1Threev2s_POST:
  375. case AArch64::LD1Threev4h_POST:
  376. case AArch64::LD1Threev8b_POST:
  377. case AArch64::LD1Threev2d_POST:
  378. case AArch64::LD1Threev4s_POST:
  379. case AArch64::LD1Threev8h_POST:
  380. case AArch64::LD1Threev16b_POST:
  381. case AArch64::LD1Fourv1d_POST:
  382. case AArch64::LD1Fourv2s_POST:
  383. case AArch64::LD1Fourv4h_POST:
  384. case AArch64::LD1Fourv8b_POST:
  385. case AArch64::LD1Fourv2d_POST:
  386. case AArch64::LD1Fourv4s_POST:
  387. case AArch64::LD1Fourv8h_POST:
  388. case AArch64::LD1Fourv16b_POST:
  389. case AArch64::LD2Twov2s_POST:
  390. case AArch64::LD2Twov4s_POST:
  391. case AArch64::LD2Twov8b_POST:
  392. case AArch64::LD2Twov2d_POST:
  393. case AArch64::LD2Twov4h_POST:
  394. case AArch64::LD2Twov8h_POST:
  395. case AArch64::LD2Twov16b_POST:
  396. case AArch64::LD2Rv1d_POST:
  397. case AArch64::LD2Rv2s_POST:
  398. case AArch64::LD2Rv4s_POST:
  399. case AArch64::LD2Rv8b_POST:
  400. case AArch64::LD2Rv2d_POST:
  401. case AArch64::LD2Rv4h_POST:
  402. case AArch64::LD2Rv8h_POST:
  403. case AArch64::LD2Rv16b_POST:
  404. case AArch64::LD3Threev2s_POST:
  405. case AArch64::LD3Threev4h_POST:
  406. case AArch64::LD3Threev8b_POST:
  407. case AArch64::LD3Threev2d_POST:
  408. case AArch64::LD3Threev4s_POST:
  409. case AArch64::LD3Threev8h_POST:
  410. case AArch64::LD3Threev16b_POST:
  411. case AArch64::LD3Rv1d_POST:
  412. case AArch64::LD3Rv2s_POST:
  413. case AArch64::LD3Rv4h_POST:
  414. case AArch64::LD3Rv8b_POST:
  415. case AArch64::LD3Rv2d_POST:
  416. case AArch64::LD3Rv4s_POST:
  417. case AArch64::LD3Rv8h_POST:
  418. case AArch64::LD3Rv16b_POST:
  419. case AArch64::LD4Fourv2s_POST:
  420. case AArch64::LD4Fourv4h_POST:
  421. case AArch64::LD4Fourv8b_POST:
  422. case AArch64::LD4Fourv2d_POST:
  423. case AArch64::LD4Fourv4s_POST:
  424. case AArch64::LD4Fourv8h_POST:
  425. case AArch64::LD4Fourv16b_POST:
  426. case AArch64::LD4Rv1d_POST:
  427. case AArch64::LD4Rv2s_POST:
  428. case AArch64::LD4Rv4h_POST:
  429. case AArch64::LD4Rv8b_POST:
  430. case AArch64::LD4Rv2d_POST:
  431. case AArch64::LD4Rv4s_POST:
  432. case AArch64::LD4Rv8h_POST:
  433. case AArch64::LD4Rv16b_POST:
  434. DestRegIdx = -1;
  435. BaseRegIdx = 2;
  436. OffsetIdx = 3;
  437. IsPrePost = true;
  438. break;
  439. case AArch64::LDRBBroW:
  440. case AArch64::LDRBBroX:
  441. case AArch64::LDRBBui:
  442. case AArch64::LDRBroW:
  443. case AArch64::LDRBroX:
  444. case AArch64::LDRBui:
  445. case AArch64::LDRDl:
  446. case AArch64::LDRDroW:
  447. case AArch64::LDRDroX:
  448. case AArch64::LDRDui:
  449. case AArch64::LDRHHroW:
  450. case AArch64::LDRHHroX:
  451. case AArch64::LDRHHui:
  452. case AArch64::LDRHroW:
  453. case AArch64::LDRHroX:
  454. case AArch64::LDRHui:
  455. case AArch64::LDRQl:
  456. case AArch64::LDRQroW:
  457. case AArch64::LDRQroX:
  458. case AArch64::LDRQui:
  459. case AArch64::LDRSBWroW:
  460. case AArch64::LDRSBWroX:
  461. case AArch64::LDRSBWui:
  462. case AArch64::LDRSBXroW:
  463. case AArch64::LDRSBXroX:
  464. case AArch64::LDRSBXui:
  465. case AArch64::LDRSHWroW:
  466. case AArch64::LDRSHWroX:
  467. case AArch64::LDRSHWui:
  468. case AArch64::LDRSHXroW:
  469. case AArch64::LDRSHXroX:
  470. case AArch64::LDRSHXui:
  471. case AArch64::LDRSWl:
  472. case AArch64::LDRSWroW:
  473. case AArch64::LDRSWroX:
  474. case AArch64::LDRSWui:
  475. case AArch64::LDRSl:
  476. case AArch64::LDRSroW:
  477. case AArch64::LDRSroX:
  478. case AArch64::LDRSui:
  479. case AArch64::LDRWl:
  480. case AArch64::LDRWroW:
  481. case AArch64::LDRWroX:
  482. case AArch64::LDRWui:
  483. case AArch64::LDRXl:
  484. case AArch64::LDRXroW:
  485. case AArch64::LDRXroX:
  486. case AArch64::LDRXui:
  487. case AArch64::LDURBBi:
  488. case AArch64::LDURBi:
  489. case AArch64::LDURDi:
  490. case AArch64::LDURHHi:
  491. case AArch64::LDURHi:
  492. case AArch64::LDURQi:
  493. case AArch64::LDURSBWi:
  494. case AArch64::LDURSBXi:
  495. case AArch64::LDURSHWi:
  496. case AArch64::LDURSHXi:
  497. case AArch64::LDURSWi:
  498. case AArch64::LDURSi:
  499. case AArch64::LDURWi:
  500. case AArch64::LDURXi:
  501. DestRegIdx = 0;
  502. BaseRegIdx = 1;
  503. OffsetIdx = 2;
  504. IsPrePost = false;
  505. break;
  506. case AArch64::LDRBBpost:
  507. case AArch64::LDRBBpre:
  508. case AArch64::LDRBpost:
  509. case AArch64::LDRBpre:
  510. case AArch64::LDRDpost:
  511. case AArch64::LDRDpre:
  512. case AArch64::LDRHHpost:
  513. case AArch64::LDRHHpre:
  514. case AArch64::LDRHpost:
  515. case AArch64::LDRHpre:
  516. case AArch64::LDRQpost:
  517. case AArch64::LDRQpre:
  518. case AArch64::LDRSBWpost:
  519. case AArch64::LDRSBWpre:
  520. case AArch64::LDRSBXpost:
  521. case AArch64::LDRSBXpre:
  522. case AArch64::LDRSHWpost:
  523. case AArch64::LDRSHWpre:
  524. case AArch64::LDRSHXpost:
  525. case AArch64::LDRSHXpre:
  526. case AArch64::LDRSWpost:
  527. case AArch64::LDRSWpre:
  528. case AArch64::LDRSpost:
  529. case AArch64::LDRSpre:
  530. case AArch64::LDRWpost:
  531. case AArch64::LDRWpre:
  532. case AArch64::LDRXpost:
  533. case AArch64::LDRXpre:
  534. DestRegIdx = 1;
  535. BaseRegIdx = 2;
  536. OffsetIdx = 3;
  537. IsPrePost = true;
  538. break;
  539. case AArch64::LDNPDi:
  540. case AArch64::LDNPQi:
  541. case AArch64::LDNPSi:
  542. case AArch64::LDPQi:
  543. case AArch64::LDPDi:
  544. case AArch64::LDPSi:
  545. DestRegIdx = -1;
  546. BaseRegIdx = 2;
  547. OffsetIdx = 3;
  548. IsPrePost = false;
  549. break;
  550. case AArch64::LDPSWi:
  551. case AArch64::LDPWi:
  552. case AArch64::LDPXi:
  553. DestRegIdx = 0;
  554. BaseRegIdx = 2;
  555. OffsetIdx = 3;
  556. IsPrePost = false;
  557. break;
  558. case AArch64::LDPQpost:
  559. case AArch64::LDPQpre:
  560. case AArch64::LDPDpost:
  561. case AArch64::LDPDpre:
  562. case AArch64::LDPSpost:
  563. case AArch64::LDPSpre:
  564. DestRegIdx = -1;
  565. BaseRegIdx = 3;
  566. OffsetIdx = 4;
  567. IsPrePost = true;
  568. break;
  569. case AArch64::LDPSWpost:
  570. case AArch64::LDPSWpre:
  571. case AArch64::LDPWpost:
  572. case AArch64::LDPWpre:
  573. case AArch64::LDPXpost:
  574. case AArch64::LDPXpre:
  575. DestRegIdx = 1;
  576. BaseRegIdx = 3;
  577. OffsetIdx = 4;
  578. IsPrePost = true;
  579. break;
  580. }
  581. // Loads from the stack pointer don't get prefetched.
  582. Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
  583. if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
  584. return std::nullopt;
  585. LoadInfo LI;
  586. LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
  587. LI.BaseReg = BaseReg;
  588. LI.BaseRegIdx = BaseRegIdx;
  589. LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
  590. LI.IsPrePost = IsPrePost;
  591. return LI;
  592. }
  593. static std::optional<unsigned> getTag(const TargetRegisterInfo *TRI,
  594. const MachineInstr &MI,
  595. const LoadInfo &LI) {
  596. unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
  597. unsigned Base = TRI->getEncodingValue(LI.BaseReg);
  598. unsigned Off;
  599. if (LI.OffsetOpnd == nullptr)
  600. Off = 0;
  601. else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
  602. LI.OffsetOpnd->isCPI())
  603. return std::nullopt;
  604. else if (LI.OffsetOpnd->isReg())
  605. Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
  606. else
  607. Off = LI.OffsetOpnd->getImm() >> 2;
  608. return makeTag(Dest, Base, Off);
  609. }
  610. void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
  611. // Build the initial tag map for the whole loop.
  612. TagMap.clear();
  613. for (MachineBasicBlock *MBB : L.getBlocks())
  614. for (MachineInstr &MI : *MBB) {
  615. std::optional<LoadInfo> LInfo = getLoadInfo(MI);
  616. if (!LInfo)
  617. continue;
  618. std::optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
  619. if (!Tag)
  620. continue;
  621. TagMap[*Tag].push_back(&MI);
  622. }
  623. bool AnyCollisions = false;
  624. for (auto &P : TagMap) {
  625. auto Size = P.second.size();
  626. if (Size > 1) {
  627. for (auto *MI : P.second) {
  628. if (TII->isStridedAccess(*MI)) {
  629. AnyCollisions = true;
  630. break;
  631. }
  632. }
  633. }
  634. if (AnyCollisions)
  635. break;
  636. }
  637. // Nothing to fix.
  638. if (!AnyCollisions)
  639. return;
  640. MachineRegisterInfo &MRI = Fn.getRegInfo();
  641. // Go through all the basic blocks in the current loop and fix any streaming
  642. // loads to avoid collisions with any other loads.
  643. LiveRegUnits LR(*TRI);
  644. for (MachineBasicBlock *MBB : L.getBlocks()) {
  645. LR.clear();
  646. LR.addLiveOuts(*MBB);
  647. for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
  648. MachineInstr &MI = *I;
  649. if (!TII->isStridedAccess(MI))
  650. continue;
  651. std::optional<LoadInfo> OptLdI = getLoadInfo(MI);
  652. if (!OptLdI)
  653. continue;
  654. LoadInfo LdI = *OptLdI;
  655. std::optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
  656. if (!OptOldTag)
  657. continue;
  658. auto &OldCollisions = TagMap[*OptOldTag];
  659. if (OldCollisions.size() <= 1)
  660. continue;
  661. bool Fixed = false;
  662. LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
  663. if (!DebugCounter::shouldExecute(FixCounter)) {
  664. LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
  665. continue;
  666. }
  667. // Add the non-base registers of MI as live so we don't use them as
  668. // scratch registers.
  669. for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
  670. if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
  671. continue;
  672. MachineOperand &MO = MI.getOperand(OpI);
  673. if (MO.isReg() && MO.readsReg())
  674. LR.addReg(MO.getReg());
  675. }
  676. for (unsigned ScratchReg : AArch64::GPR64RegClass) {
  677. if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
  678. continue;
  679. LoadInfo NewLdI(LdI);
  680. NewLdI.BaseReg = ScratchReg;
  681. unsigned NewTag = *getTag(TRI, MI, NewLdI);
  682. // Scratch reg tag would collide too, so don't use it.
  683. if (TagMap.count(NewTag))
  684. continue;
  685. LLVM_DEBUG(dbgs() << "Changing base reg to: "
  686. << printReg(ScratchReg, TRI) << '\n');
  687. // Rewrite:
  688. // Xd = LOAD Xb, off
  689. // to:
  690. // Xc = MOV Xb
  691. // Xd = LOAD Xc, off
  692. DebugLoc DL = MI.getDebugLoc();
  693. BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
  694. .addReg(AArch64::XZR)
  695. .addReg(LdI.BaseReg)
  696. .addImm(0);
  697. MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
  698. BaseOpnd.setReg(ScratchReg);
  699. // If the load does a pre/post increment, then insert a MOV after as
  700. // well to update the real base register.
  701. if (LdI.IsPrePost) {
  702. LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
  703. << printReg(ScratchReg, TRI) << '\n');
  704. MI.getOperand(0).setReg(
  705. ScratchReg); // Change tied operand pre/post update dest.
  706. BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
  707. TII->get(AArch64::ORRXrs), LdI.BaseReg)
  708. .addReg(AArch64::XZR)
  709. .addReg(ScratchReg)
  710. .addImm(0);
  711. }
  712. for (int I = 0, E = OldCollisions.size(); I != E; ++I)
  713. if (OldCollisions[I] == &MI) {
  714. std::swap(OldCollisions[I], OldCollisions[E - 1]);
  715. OldCollisions.pop_back();
  716. break;
  717. }
  718. // Update TagMap to reflect instruction changes to reduce the number
  719. // of later MOVs to be inserted. This needs to be done after
  720. // OldCollisions is updated since it may be relocated by this
  721. // insertion.
  722. TagMap[NewTag].push_back(&MI);
  723. ++NumCollisionsAvoided;
  724. Fixed = true;
  725. Modified = true;
  726. break;
  727. }
  728. if (!Fixed)
  729. ++NumCollisionsNotAvoided;
  730. }
  731. }
  732. }
  733. bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
  734. auto &ST = Fn.getSubtarget<AArch64Subtarget>();
  735. if (ST.getProcFamily() != AArch64Subtarget::Falkor)
  736. return false;
  737. if (skipFunction(Fn.getFunction()))
  738. return false;
  739. TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  740. TRI = ST.getRegisterInfo();
  741. MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
  742. Modified = false;
  743. for (MachineLoop *I : LI)
  744. for (MachineLoop *L : depth_first(I))
  745. // Only process inner-loops
  746. if (L->isInnermost())
  747. runOnLoop(*L, Fn);
  748. return Modified;
  749. }
  750. FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }