AArch64FalkorHWPFFix.cpp

//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
/// that may inhibit the HW prefetching. This is done in two steps. Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata. Then, after opcodes have been finalized, we
/// insert MOVs and re-write loads to prevent unintentional tag collisions.
//===----------------------------------------------------------------------===//
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "aarch64-falkor-hwpf-fix"

STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
          "Number of HW prefetch tag collisions avoided");
STATISTIC(NumCollisionsNotAvoided,
          "Number of HW prefetch tag collisions not avoided due to lack of "
          "registers");

DEBUG_COUNTER(FixCounter, "falkor-hwpf",
              "Controls which tag collisions are avoided");
namespace {

class FalkorMarkStridedAccesses {
public:
  FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
      : LI(LI), SE(SE) {}

  bool run();

private:
  bool runOnLoop(Loop &L);

  LoopInfo &LI;
  ScalarEvolution &SE;
};

class FalkorMarkStridedAccessesLegacy : public FunctionPass {
public:
  static char ID; // Pass ID, replacement for typeid

  FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
    initializeFalkorMarkStridedAccessesLegacyPass(
        *PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
  }

  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace
char FalkorMarkStridedAccessesLegacy::ID = 0;

INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
                      "Falkor HW Prefetch Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
                    "Falkor HW Prefetch Fix", false, false)

FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
  return new FalkorMarkStridedAccessesLegacy();
}
bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const AArch64Subtarget *ST =
      TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
  if (ST->getProcFamily() != AArch64Subtarget::Falkor)
    return false;

  if (skipFunction(F))
    return false;

  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();

  FalkorMarkStridedAccesses LDP(LI, SE);
  return LDP.run();
}
bool FalkorMarkStridedAccesses::run() {
  bool MadeChange = false;

  for (Loop *L : LI)
    for (Loop *LIt : depth_first(L))
      MadeChange |= runOnLoop(*LIt);

  return MadeChange;
}
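
// A load is marked as strided when its pointer is an affine add-recurrence in
// this loop, i.e. it advances by a fixed step each iteration. The metadata
// added here is what the late machine pass below relies on after ISel.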
bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
  // Only mark strided loads in the inner-most loop
  if (!L.isInnermost())
    return false;

  bool MadeChange = false;
  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      LoadInst *LoadI = dyn_cast<LoadInst>(&I);
      if (!LoadI)
        continue;

      Value *PtrValue = LoadI->getPointerOperand();
      if (L.isLoopInvariant(PtrValue))
        continue;

      const SCEV *LSCEV = SE.getSCEV(PtrValue);
      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
        continue;

      LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
                         MDNode::get(LoadI->getContext(), {}));
      ++NumStridedLoadsMarked;
      LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
      MadeChange = true;
    }
  }

  return MadeChange;
}
namespace {

class FalkorHWPFFix : public MachineFunctionPass {
public:
  static char ID;

  FalkorHWPFFix() : MachineFunctionPass(ID) {
    initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

private:
  void runOnLoop(MachineLoop &L, MachineFunction &Fn);

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
  bool Modified;
};

/// Bits from load opcodes used to compute HW prefetcher instruction tags.
struct LoadInfo {
  LoadInfo() = default;

  Register DestReg;
  Register BaseReg;
  int BaseRegIdx = -1;
  const MachineOperand *OffsetOpnd = nullptr;
  bool IsPrePost = false;
};

} // end anonymous namespace

char FalkorHWPFFix::ID = 0;

INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
                      "Falkor HW Prefetch Fix Late Phase", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(FalkorHWPFFix, "aarch64-falkor-hwpf-fix-late",
                    "Falkor HW Prefetch Fix Late Phase", false, false)
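
// A load's prefetcher tag is built from the low 4 bits of the destination
// register encoding, the low 4 bits of the base register encoding, and 6 bits
// derived from the offset operand (see getTag below).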
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}
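
// Decode which operands of a handled load opcode hold the destination, base,
// and offset (-1 meaning "no such operand"), and whether the load writes back
// to its base register. Unhandled opcodes and loads off the stack pointer
// yield None.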
static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
  int DestRegIdx;
  int BaseRegIdx;
  int OffsetIdx;
  bool IsPrePost;

  switch (MI.getOpcode()) {
  default:
    return None;

  case AArch64::LD1i64:
  case AArch64::LD2i64:
    DestRegIdx = 0;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1i8:
  case AArch64::LD1i16:
  case AArch64::LD1i32:
  case AArch64::LD2i8:
  case AArch64::LD2i16:
  case AArch64::LD2i32:
  case AArch64::LD3i8:
  case AArch64::LD3i16:
  case AArch64::LD3i32:
  case AArch64::LD3i64:
  case AArch64::LD4i8:
  case AArch64::LD4i16:
  case AArch64::LD4i32:
  case AArch64::LD4i64:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Onev1d:
  case AArch64::LD1Onev2s:
  case AArch64::LD1Onev4h:
  case AArch64::LD1Onev8b:
  case AArch64::LD1Onev2d:
  case AArch64::LD1Onev4s:
  case AArch64::LD1Onev8h:
  case AArch64::LD1Onev16b:
  case AArch64::LD1Rv1d:
  case AArch64::LD1Rv2s:
  case AArch64::LD1Rv4h:
  case AArch64::LD1Rv8b:
  case AArch64::LD1Rv2d:
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;

  case AArch64::LD1Twov1d:
  case AArch64::LD1Twov2s:
  case AArch64::LD1Twov4h:
  case AArch64::LD1Twov8b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Twov4s:
  case AArch64::LD1Twov8h:
  case AArch64::LD1Twov16b:
  case AArch64::LD1Threev1d:
  case AArch64::LD1Threev2s:
  case AArch64::LD1Threev4h:
  case AArch64::LD1Threev8b:
  case AArch64::LD1Threev2d:
  case AArch64::LD1Threev4s:
  case AArch64::LD1Threev8h:
  case AArch64::LD1Threev16b:
  case AArch64::LD1Fourv1d:
  case AArch64::LD1Fourv2s:
  case AArch64::LD1Fourv4h:
  case AArch64::LD1Fourv8b:
  case AArch64::LD1Fourv2d:
  case AArch64::LD1Fourv4s:
  case AArch64::LD1Fourv8h:
  case AArch64::LD1Fourv16b:
  case AArch64::LD2Twov2s:
  case AArch64::LD2Twov4s:
  case AArch64::LD2Twov8b:
  case AArch64::LD2Twov2d:
  case AArch64::LD2Twov4h:
  case AArch64::LD2Twov8h:
  case AArch64::LD2Twov16b:
  case AArch64::LD2Rv1d:
  case AArch64::LD2Rv2s:
  case AArch64::LD2Rv4s:
  case AArch64::LD2Rv8b:
  case AArch64::LD2Rv2d:
  case AArch64::LD2Rv4h:
  case AArch64::LD2Rv8h:
  case AArch64::LD2Rv16b:
  case AArch64::LD3Threev2s:
  case AArch64::LD3Threev4h:
  case AArch64::LD3Threev8b:
  case AArch64::LD3Threev2d:
  case AArch64::LD3Threev4s:
  case AArch64::LD3Threev8h:
  case AArch64::LD3Threev16b:
  case AArch64::LD3Rv1d:
  case AArch64::LD3Rv2s:
  case AArch64::LD3Rv4h:
  case AArch64::LD3Rv8b:
  case AArch64::LD3Rv2d:
  case AArch64::LD3Rv4s:
  case AArch64::LD3Rv8h:
  case AArch64::LD3Rv16b:
  case AArch64::LD4Fourv2s:
  case AArch64::LD4Fourv4h:
  case AArch64::LD4Fourv8b:
  case AArch64::LD4Fourv2d:
  case AArch64::LD4Fourv4s:
  case AArch64::LD4Fourv8h:
  case AArch64::LD4Fourv16b:
  case AArch64::LD4Rv1d:
  case AArch64::LD4Rv2s:
  case AArch64::LD4Rv4h:
  case AArch64::LD4Rv8b:
  case AArch64::LD4Rv2d:
  case AArch64::LD4Rv4s:
  case AArch64::LD4Rv8h:
  case AArch64::LD4Rv16b:
    DestRegIdx = -1;
    BaseRegIdx = 1;
    OffsetIdx = -1;
    IsPrePost = false;
    break;
  case AArch64::LD1i64_POST:
  case AArch64::LD2i64_POST:
    DestRegIdx = 1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1i8_POST:
  case AArch64::LD1i16_POST:
  case AArch64::LD1i32_POST:
  case AArch64::LD2i8_POST:
  case AArch64::LD2i16_POST:
  case AArch64::LD2i32_POST:
  case AArch64::LD3i8_POST:
  case AArch64::LD3i16_POST:
  case AArch64::LD3i32_POST:
  case AArch64::LD3i64_POST:
  case AArch64::LD4i8_POST:
  case AArch64::LD4i16_POST:
  case AArch64::LD4i32_POST:
  case AArch64::LD4i64_POST:
    DestRegIdx = -1;
    BaseRegIdx = 4;
    OffsetIdx = 5;
    IsPrePost = true;
    break;

  case AArch64::LD1Onev1d_POST:
  case AArch64::LD1Onev2s_POST:
  case AArch64::LD1Onev4h_POST:
  case AArch64::LD1Onev8b_POST:
  case AArch64::LD1Onev2d_POST:
  case AArch64::LD1Onev4s_POST:
  case AArch64::LD1Onev8h_POST:
  case AArch64::LD1Onev16b_POST:
  case AArch64::LD1Rv1d_POST:
  case AArch64::LD1Rv2s_POST:
  case AArch64::LD1Rv4h_POST:
  case AArch64::LD1Rv8b_POST:
  case AArch64::LD1Rv2d_POST:
  case AArch64::LD1Rv4s_POST:
  case AArch64::LD1Rv8h_POST:
  case AArch64::LD1Rv16b_POST:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  case AArch64::LD1Twov1d_POST:
  case AArch64::LD1Twov2s_POST:
  case AArch64::LD1Twov4h_POST:
  case AArch64::LD1Twov8b_POST:
  case AArch64::LD1Twov2d_POST:
  case AArch64::LD1Twov4s_POST:
  case AArch64::LD1Twov8h_POST:
  case AArch64::LD1Twov16b_POST:
  case AArch64::LD1Threev1d_POST:
  case AArch64::LD1Threev2s_POST:
  case AArch64::LD1Threev4h_POST:
  case AArch64::LD1Threev8b_POST:
  case AArch64::LD1Threev2d_POST:
  case AArch64::LD1Threev4s_POST:
  case AArch64::LD1Threev8h_POST:
  case AArch64::LD1Threev16b_POST:
  case AArch64::LD1Fourv1d_POST:
  case AArch64::LD1Fourv2s_POST:
  case AArch64::LD1Fourv4h_POST:
  case AArch64::LD1Fourv8b_POST:
  case AArch64::LD1Fourv2d_POST:
  case AArch64::LD1Fourv4s_POST:
  case AArch64::LD1Fourv8h_POST:
  case AArch64::LD1Fourv16b_POST:
  case AArch64::LD2Twov2s_POST:
  case AArch64::LD2Twov4s_POST:
  case AArch64::LD2Twov8b_POST:
  case AArch64::LD2Twov2d_POST:
  case AArch64::LD2Twov4h_POST:
  case AArch64::LD2Twov8h_POST:
  case AArch64::LD2Twov16b_POST:
  case AArch64::LD2Rv1d_POST:
  case AArch64::LD2Rv2s_POST:
  case AArch64::LD2Rv4s_POST:
  case AArch64::LD2Rv8b_POST:
  case AArch64::LD2Rv2d_POST:
  case AArch64::LD2Rv4h_POST:
  case AArch64::LD2Rv8h_POST:
  case AArch64::LD2Rv16b_POST:
  case AArch64::LD3Threev2s_POST:
  case AArch64::LD3Threev4h_POST:
  case AArch64::LD3Threev8b_POST:
  case AArch64::LD3Threev2d_POST:
  case AArch64::LD3Threev4s_POST:
  case AArch64::LD3Threev8h_POST:
  case AArch64::LD3Threev16b_POST:
  case AArch64::LD3Rv1d_POST:
  case AArch64::LD3Rv2s_POST:
  case AArch64::LD3Rv4h_POST:
  case AArch64::LD3Rv8b_POST:
  case AArch64::LD3Rv2d_POST:
  case AArch64::LD3Rv4s_POST:
  case AArch64::LD3Rv8h_POST:
  case AArch64::LD3Rv16b_POST:
  case AArch64::LD4Fourv2s_POST:
  case AArch64::LD4Fourv4h_POST:
  case AArch64::LD4Fourv8b_POST:
  case AArch64::LD4Fourv2d_POST:
  case AArch64::LD4Fourv4s_POST:
  case AArch64::LD4Fourv8h_POST:
  case AArch64::LD4Fourv16b_POST:
  case AArch64::LD4Rv1d_POST:
  case AArch64::LD4Rv2s_POST:
  case AArch64::LD4Rv4h_POST:
  case AArch64::LD4Rv8b_POST:
  case AArch64::LD4Rv2d_POST:
  case AArch64::LD4Rv4s_POST:
  case AArch64::LD4Rv8h_POST:
  case AArch64::LD4Rv16b_POST:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;
  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBBui:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRBui:
  case AArch64::LDRDl:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRDui:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHHui:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRHui:
  case AArch64::LDRQl:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRQui:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBWui:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSBXui:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHWui:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSHXui:
  case AArch64::LDRSWl:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSWui:
  case AArch64::LDRSl:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRSui:
  case AArch64::LDRWl:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRWui:
  case AArch64::LDRXl:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::LDRXui:
  case AArch64::LDURBBi:
  case AArch64::LDURBi:
  case AArch64::LDURDi:
  case AArch64::LDURHHi:
  case AArch64::LDURHi:
  case AArch64::LDURQi:
  case AArch64::LDURSBWi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSHWi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSWi:
  case AArch64::LDURSi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    DestRegIdx = 0;
    BaseRegIdx = 1;
    OffsetIdx = 2;
    IsPrePost = false;
    break;

  case AArch64::LDRBBpost:
  case AArch64::LDRBBpre:
  case AArch64::LDRBpost:
  case AArch64::LDRBpre:
  case AArch64::LDRDpost:
  case AArch64::LDRDpre:
  case AArch64::LDRHHpost:
  case AArch64::LDRHHpre:
  case AArch64::LDRHpost:
  case AArch64::LDRHpre:
  case AArch64::LDRQpost:
  case AArch64::LDRQpre:
  case AArch64::LDRSBWpost:
  case AArch64::LDRSBWpre:
  case AArch64::LDRSBXpost:
  case AArch64::LDRSBXpre:
  case AArch64::LDRSHWpost:
  case AArch64::LDRSHWpre:
  case AArch64::LDRSHXpost:
  case AArch64::LDRSHXpre:
  case AArch64::LDRSWpost:
  case AArch64::LDRSWpre:
  case AArch64::LDRSpost:
  case AArch64::LDRSpre:
  case AArch64::LDRWpost:
  case AArch64::LDRWpre:
  case AArch64::LDRXpost:
  case AArch64::LDRXpre:
    DestRegIdx = 1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = true;
    break;

  case AArch64::LDNPDi:
  case AArch64::LDNPQi:
  case AArch64::LDNPSi:
  case AArch64::LDPQi:
  case AArch64::LDPDi:
  case AArch64::LDPSi:
    DestRegIdx = -1;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
    DestRegIdx = 0;
    BaseRegIdx = 2;
    OffsetIdx = 3;
    IsPrePost = false;
    break;

  case AArch64::LDPQpost:
  case AArch64::LDPQpre:
  case AArch64::LDPDpost:
  case AArch64::LDPDpre:
  case AArch64::LDPSpost:
  case AArch64::LDPSpre:
    DestRegIdx = -1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;

  case AArch64::LDPSWpost:
  case AArch64::LDPSWpre:
  case AArch64::LDPWpost:
  case AArch64::LDPWpre:
  case AArch64::LDPXpost:
  case AArch64::LDPXpre:
    DestRegIdx = 1;
    BaseRegIdx = 3;
    OffsetIdx = 4;
    IsPrePost = true;
    break;
  }
  // Loads from the stack pointer don't get prefetched.
  Register BaseReg = MI.getOperand(BaseRegIdx).getReg();
  if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
    return None;

  LoadInfo LI;
  LI.DestReg =
      DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
  LI.BaseReg = BaseReg;
  LI.BaseRegIdx = BaseRegIdx;
  LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
  LI.IsPrePost = IsPrePost;
  return LI;
}
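
// Compute the prefetcher tag for a load. Offsets that are globals, symbols,
// or constant-pool references have no usable encoding, so no tag is returned
// for them; register offsets are distinguished from immediates by setting an
// extra bit in the offset field.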
static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
                                 const MachineInstr &MI, const LoadInfo &LI) {
  unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
  unsigned Base = TRI->getEncodingValue(LI.BaseReg);
  unsigned Off;
  if (LI.OffsetOpnd == nullptr)
    Off = 0;
  else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
           LI.OffsetOpnd->isCPI())
    return None;
  else if (LI.OffsetOpnd->isReg())
    Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
  else
    Off = LI.OffsetOpnd->getImm() >> 2;

  return makeTag(Dest, Base, Off);
}
void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
  // Build the initial tag map for the whole loop.
  TagMap.clear();
  for (MachineBasicBlock *MBB : L.getBlocks())
    for (MachineInstr &MI : *MBB) {
      Optional<LoadInfo> LInfo = getLoadInfo(MI);
      if (!LInfo)
        continue;
      Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
      if (!Tag)
        continue;
      TagMap[*Tag].push_back(&MI);
    }
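
  // Only bother fixing this loop if some tag is shared by more than one load
  // and at least one of the colliding loads is a strided access.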
  bool AnyCollisions = false;
  for (auto &P : TagMap) {
    auto Size = P.second.size();
    if (Size > 1) {
      for (auto *MI : P.second) {
        if (TII->isStridedAccess(*MI)) {
          AnyCollisions = true;
          break;
        }
      }
    }
    if (AnyCollisions)
      break;
  }

  // Nothing to fix.
  if (!AnyCollisions)
    return;
  MachineRegisterInfo &MRI = Fn.getRegInfo();

  // Go through all the basic blocks in the current loop and fix any streaming
  // loads to avoid collisions with any other loads.
  LiveRegUnits LR(*TRI);
  for (MachineBasicBlock *MBB : L.getBlocks()) {
    LR.clear();
    LR.addLiveOuts(*MBB);
    for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
      MachineInstr &MI = *I;
      if (!TII->isStridedAccess(MI))
        continue;

      Optional<LoadInfo> OptLdI = getLoadInfo(MI);
      if (!OptLdI)
        continue;

      LoadInfo LdI = *OptLdI;
      Optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
      if (!OptOldTag)
        continue;

      auto &OldCollisions = TagMap[*OptOldTag];
      if (OldCollisions.size() <= 1)
        continue;

      bool Fixed = false;
      LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);

      if (!DebugCounter::shouldExecute(FixCounter)) {
        LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
        continue;
      }

      // Add the non-base registers of MI as live so we don't use them as
      // scratch registers.
      for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
        if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
          continue;
        MachineOperand &MO = MI.getOperand(OpI);
        if (MO.isReg() && MO.readsReg())
          LR.addReg(MO.getReg());
      }
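
      // Look for a free, non-reserved GPR64 to stand in for the base register.
      // A candidate is only usable if loading through it would not recreate a
      // tag that already exists in this loop.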
      for (unsigned ScratchReg : AArch64::GPR64RegClass) {
        if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
          continue;

        LoadInfo NewLdI(LdI);
        NewLdI.BaseReg = ScratchReg;
        unsigned NewTag = *getTag(TRI, MI, NewLdI);
        // Scratch reg tag would collide too, so don't use it.
        if (TagMap.count(NewTag))
          continue;

        LLVM_DEBUG(dbgs() << "Changing base reg to: "
                          << printReg(ScratchReg, TRI) << '\n');

        // Rewrite:
        //   Xd = LOAD Xb, off
        // to:
        //   Xc = MOV Xb
        //   Xd = LOAD Xc, off
        DebugLoc DL = MI.getDebugLoc();
        BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
            .addReg(AArch64::XZR)
            .addReg(LdI.BaseReg)
            .addImm(0);
        MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
        BaseOpnd.setReg(ScratchReg);

        // If the load does a pre/post increment, then insert a MOV after as
        // well to update the real base register.
        if (LdI.IsPrePost) {
          LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
                            << printReg(ScratchReg, TRI) << '\n');
          MI.getOperand(0).setReg(
              ScratchReg); // Change tied operand pre/post update dest.
          BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
                  TII->get(AArch64::ORRXrs), LdI.BaseReg)
              .addReg(AArch64::XZR)
              .addReg(ScratchReg)
              .addImm(0);
        }

        for (int I = 0, E = OldCollisions.size(); I != E; ++I)
          if (OldCollisions[I] == &MI) {
            std::swap(OldCollisions[I], OldCollisions[E - 1]);
            OldCollisions.pop_back();
            break;
          }

        // Update TagMap to reflect instruction changes to reduce the number
        // of later MOVs to be inserted. This needs to be done after
        // OldCollisions is updated since it may be relocated by this
        // insertion.
        TagMap[NewTag].push_back(&MI);
        ++NumCollisionsAvoided;
        Fixed = true;
        Modified = true;
        break;
      }

      if (!Fixed)
        ++NumCollisionsNotAvoided;
    }
  }
}
bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
  auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  if (ST.getProcFamily() != AArch64Subtarget::Falkor)
    return false;

  if (skipFunction(Fn.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  TRI = ST.getRegisterInfo();

  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();

  Modified = false;

  for (MachineLoop *I : LI)
    for (MachineLoop *L : depth_first(I))
      // Only process inner loops
      if (L->isInnermost())
        runOnLoop(*L, Fn);

  return Modified;
}

FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }