//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// If a load follows a store and reloads data that the store has written to
// memory, Intel microarchitectures can in many cases forward the data directly
// from the store to the load. This "store forwarding" saves cycles by enabling
// the load to obtain the data directly instead of accessing it from the cache
// or from memory.
// A "store forward block" occurs when a store cannot be forwarded to the load.
// The most typical case of a store forward block on Intel Core
// microarchitectures is a small store that cannot be forwarded to a larger
// load. The estimated penalty of a store forward block is ~13 cycles.
//
// This pass tries to recognize and handle cases where a "store forward block"
// is created by the compiler when lowering memcpy calls to a sequence
// of a load and a store.
//
// The pass currently only handles cases where memcpy is lowered to
// XMM/YMM registers; it tries to break the memcpy into smaller copies.
// Breaking the memcpy should be possible since there is no atomicity
// guarantee for loads and stores to XMM/YMM.
//
// It could be better for performance to solve the problem by loading to
// XMM/YMM, inserting the partial store into the register, and only then
// storing back from XMM/YMM to memory, but this would result in a more
// conservative optimization since it requires that we prove, for every memory
// access between the blocking store and the load, whether or not it aliases
// before we can move the store, whereas the transformation done here is
// correct regardless of other memory accesses.
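//
// Illustrative example (not taken from a real compilation): assume a 4-byte
// store writes into the middle of a 32-byte region that is then copied with a
// YMM load/store pair:
//
//   movl    %ecx, 8(%rdi)        # small store into the copied range
//   vmovups (%rdi), %ymm0        # 32-byte load - forwarding is blocked
//   vmovups %ymm0, (%rsi)        # 32-byte store
//
// The pass breaks the YMM copy into smaller loads and stores so that the four
// bytes written by the small store are reloaded by a load of the same size,
// which the hardware can forward, while the surrounding bytes are copied by
// separate loads that do not overlap the small store.
//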
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <map>

using namespace llvm;
#define DEBUG_TYPE "x86-avoid-SFB"

static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
    "x86-disable-avoid-SFB", cl::Hidden,
    cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));

static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);
namespace {

using DisplacementSizeMap = std::map<int64_t, unsigned>;

class X86AvoidSFBPass : public MachineFunctionPass {
public:
  static char ID;
  X86AvoidSFBPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AAResultsWrapperPass>();
  }

private:
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const X86RegisterInfo *TRI = nullptr;
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  SmallVector<MachineInstr *, 2> ForRemoval;
  AliasAnalysis *AA = nullptr;

  /// Collect pairs of a load followed by a store to memory which look
  /// like a memcpy.
  void findPotentiallyBlockedCopies(MachineFunction &MF);
  /// Break the memcpy's load and store into smaller copies
  /// such that each memory load that was blocked by a smaller store
  /// would now be copied separately.
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  /// Break a copy of size Size to smaller copies.
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);
  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);

  bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;

  unsigned getRegSizeInBytes(MachineInstr *Inst);
};

} // end anonymous namespace
char X86AvoidSFBPass::ID = 0;

INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
                      "X86 Avoid Store Forwarding Blocks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
                    "X86 Avoid Store Forwarding Blocks", false, false)

FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}
static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}
static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
  return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
}
static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}
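// A store can potentially block the wide load if it is narrower than the load
// and writes into the loaded range: any of the narrow GPR stores below
// qualifies, and when the load is YMM-sized a full XMM-width store is also
// narrower than the load and can block it.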
static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr ||
              Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}
static const int MOV128SZ = 16;
static const int MOV64SZ = 8;
static const int MOV32SZ = 4;
static const int MOV16SZ = 2;
static const int MOV8SZ = 1;

static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Instruction Opcode");
  }
  return 0;
}
static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Instruction Opcode");
  }
  return 0;
}
static int getAddrOffset(const MachineInstr *MI) {
  const MCInstrDesc &Descl = MI->getDesc();
  int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
  assert(AddrOffset != -1 && "Expected Memory Operand");
  AddrOffset += X86II::getOperandBias(Descl);
  return AddrOffset;
}

static MachineOperand &getBaseOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrBaseReg);
}

static MachineOperand &getDispOperand(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  return MI->getOperand(AddrOffset + X86::AddrDisp);
}

// Relevant addressing modes contain only a base register and an immediate
// displacement, or a frame index and an immediate displacement.
// TODO: Consider expanding to other addressing modes in the future.
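// For example, `movq %rax, 16(%rdi)` (base register plus immediate
// displacement) is handled, while scaled or indexed forms such as
// `movq %rax, (%rdi,%rcx,4)` are not.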
static bool isRelevantAddressingMode(MachineInstr *MI) {
  int AddrOffset = getAddrOffset(MI);
  const MachineOperand &Base = getBaseOperand(MI);
  const MachineOperand &Disp = getDispOperand(MI);
  const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
  const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
  const MachineOperand &Segment =
      MI->getOperand(AddrOffset + X86::AddrSegmentReg);

  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!Disp.isImm())
    return false;
  if (Scale.getImm() != 1)
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
  return true;
}
// Collect potentially blocking stores.
// Limit the number of instructions we inspect backwards, since the effect of
// the store block won't be visible if the store and load instructions have
// enough instructions in between to keep the core busy.
static SmallVector<MachineInstr *, 2>
findPotentialBlockers(MachineInstr *LoadInst) {
  SmallVector<MachineInstr *, 2> PotentialBlockers;
  unsigned BlockCount = 0;
  const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
  for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
            E = LoadInst->getParent()->rend();
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    MachineInstr &MI = *PBInst;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
    PotentialBlockers.push_back(&MI);
  }
  // If we didn't reach the instruction limit, try the predecessor blocks.
  // Ideally we should traverse the predecessor blocks in depth with some
  // coloring algorithm, but for now let's just look at the first order
  // predecessors.
  if (BlockCount < InspectionLimit) {
    MachineBasicBlock *MBB = LoadInst->getParent();
    int LimitLeft = InspectionLimit - BlockCount;
    for (MachineBasicBlock *PMBB : MBB->predecessors()) {
      int PredCount = 0;
      for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
        PotentialBlockers.push_back(&PBInst);
      }
    }
  }
  return PotentialBlockers;
}
void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
                                int64_t LoadDisp, MachineInstr *StoreInst,
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
                                int64_t SMMOffset) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  MachineBasicBlock *MBB = LoadInst->getParent();
  MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
  MachineMemOperand *SMMO = *StoreInst->memoperands_begin();

  Register Reg1 = MRI->createVirtualRegister(
      TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
  MachineInstr *NewLoad =
      BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
              Reg1)
          .add(LoadBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(LoadDisp)
          .addReg(X86::NoRegister)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(LMMO, LMMOffset, Size));
  if (LoadBase.isReg())
    getBaseOperand(NewLoad).setIsKill(false);
  LLVM_DEBUG(NewLoad->dump());
  // If the load and store are consecutive, use the loadInst location to
  // reduce register pressure.
  MachineInstr *StInst = StoreInst;
  auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                                MBB->instr_begin());
  if (PrevInstrIt.getNodePtr() == LoadInst)
    StInst = LoadInst;
  MachineInstr *NewStore =
      BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
          .add(StoreBase)
          .addImm(1)
          .addReg(X86::NoRegister)
          .addImm(StoreDisp)
          .addReg(X86::NoRegister)
          .addReg(Reg1)
          .addMemOperand(
              MBB->getParent()->getMachineMemOperand(SMMO, SMMOffset, Size));
  if (StoreBase.isReg())
    getBaseOperand(NewStore).setIsKill(false);
  MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
  NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
  LLVM_DEBUG(NewStore->dump());
}
void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
                                  int64_t LdDispImm, MachineInstr *StoreInst,
                                  int64_t StDispImm, int64_t LMMOffset,
                                  int64_t SMMOffset) {
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
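  // Greedily emit the widest copy that still fits in the remaining Size:
  // 16 bytes (only when splitting a YMM copy), then 8, 4, 2 and finally 1
  // byte, advancing the displacements and memory-operand offsets as we go.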
  while (Size > 0) {
    if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
      Size = Size - MOV128SZ;
      buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
                StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
                StDisp, MOV128SZ, LMMOffset, SMMOffset);
      LdDisp += MOV128SZ;
      StDisp += MOV128SZ;
      LMMOffset += MOV128SZ;
      SMMOffset += MOV128SZ;
      continue;
    }
    if (Size - MOV64SZ >= 0) {
      Size = Size - MOV64SZ;
      buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
                MOV64SZ, LMMOffset, SMMOffset);
      LdDisp += MOV64SZ;
      StDisp += MOV64SZ;
      LMMOffset += MOV64SZ;
      SMMOffset += MOV64SZ;
      continue;
    }
    if (Size - MOV32SZ >= 0) {
      Size = Size - MOV32SZ;
      buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
                MOV32SZ, LMMOffset, SMMOffset);
      LdDisp += MOV32SZ;
      StDisp += MOV32SZ;
      LMMOffset += MOV32SZ;
      SMMOffset += MOV32SZ;
      continue;
    }
    if (Size - MOV16SZ >= 0) {
      Size = Size - MOV16SZ;
      buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
                MOV16SZ, LMMOffset, SMMOffset);
      LdDisp += MOV16SZ;
      StDisp += MOV16SZ;
      LMMOffset += MOV16SZ;
      SMMOffset += MOV16SZ;
      continue;
    }
    if (Size - MOV8SZ >= 0) {
      Size = Size - MOV8SZ;
      buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
                MOV8SZ, LMMOffset, SMMOffset);
      LdDisp += MOV8SZ;
      StDisp += MOV8SZ;
      LMMOffset += MOV8SZ;
      SMMOffset += MOV8SZ;
      continue;
    }
  }
  assert(Size == 0 && "Wrong size division");
}
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
  MachineOperand &LoadBase = getBaseOperand(LoadInst);
  MachineOperand &StoreBase = getBaseOperand(StoreInst);
  auto *StorePrevNonDbgInstr =
      prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
                 LoadInst->getParent()->instr_begin())
          .getNodePtr();
  if (LoadBase.isReg()) {
    MachineInstr *LastLoad = LoadInst->getPrevNode();
    // If the original load and store to xmm/ymm were consecutive,
    // then the partial copies were also created in
    // a consecutive order to reduce register pressure,
    // and the location of the last load is before the last store.
    if (StorePrevNonDbgInstr == LoadInst)
      LastLoad = LoadInst->getPrevNode()->getPrevNode();
    getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
  }
  if (StoreBase.isReg()) {
    MachineInstr *StInst = StoreInst;
    if (StorePrevNonDbgInstr == LoadInst)
      StInst = LoadInst;
    getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
  }
}
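// Conservatively treat the two memory operands as aliasing when either one has
// no IR value to query; otherwise ask AliasAnalysis about the overlapping
// extents of the two accesses.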
bool X86AvoidSFBPass::alias(const MachineMemOperand &Op1,
                            const MachineMemOperand &Op2) const {
  if (!Op1.getValue() || !Op2.getValue())
    return true;

  int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
  int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
  int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;

  return !AA->isNoAlias(
      MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
      MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
}
void X86AvoidSFBPass::findPotentiallyBlockedCopies(MachineFunction &MF) {
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
        continue;
      Register DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
        continue;
      for (MachineOperand &StoreMO :
           llvm::make_early_inc_range(MRI->use_nodbg_operands(DefVR))) {
        MachineInstr &StoreMI = *StoreMO.getParent();
        // Skip cases where the memcpy may overlap.
        if (StoreMI.getParent() == MI.getParent() &&
            isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
            isRelevantAddressingMode(&MI) &&
            isRelevantAddressingMode(&StoreMI) &&
            MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
          if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
            BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
        }
      }
    }
}
unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
  const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
                                     *LoadInst->getParent()->getParent());
  return TRI->getRegSizeInBits(*TRC) / 8;
}
void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LdDispImm = getDispOperand(LoadInst).getImm();
  int64_t StDispImm = getDispOperand(StoreInst).getImm();
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;

  int64_t LdDisp1 = LdDispImm;
  int64_t LdDisp2 = 0;
  int64_t StDisp1 = StDispImm;
  int64_t StDisp2 = 0;
  unsigned Size1 = 0;
  unsigned Size2 = 0;
  int64_t LdStDelta = StDispImm - LdDispImm;
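  // Walk the blocking stores in increasing displacement order. For each one,
  // first copy the unblocked region that precedes it, then copy the region
  // covered by the blocking store itself; the tail that follows the last
  // blocking store is copied after the loop.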
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying overlapping areas.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;

    // Build a copy for the region up to the current blocking store's
    // displacement.
    buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
                SMMOffset);
    // Build a copy for the current blocking store.
    buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, LMMOffset + Size1,
                SMMOffset + Size1);
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
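  // Copy the remainder of the loaded region that follows the last blocking
  // store.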
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, LMMOffset,
              SMMOffset);
}
static bool hasSameBaseOpValue(MachineInstr *LoadInst,
                               MachineInstr *StoreInst) {
  const MachineOperand &LoadBase = getBaseOperand(LoadInst);
  const MachineOperand &StoreBase = getBaseOperand(StoreInst);
  if (LoadBase.isReg() != StoreBase.isReg())
    return false;
  if (LoadBase.isReg())
    return LoadBase.getReg() == StoreBase.getReg();
  return LoadBase.getIndex() == StoreBase.getIndex();
}
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}
// Keep track of all stores blocking a load.
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Choose the smallest blocking store starting at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}
// Remove blocking stores contained in each other.
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;
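  // The map is visited in increasing displacement order. Keep a stack whose
  // entries have strictly increasing end offsets: when the current store does
  // not extend past the store on top of the stack it is nested inside it, so
  // pop the outer store and keep only the inner one.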
  SmallVector<std::pair<int64_t, unsigned>, 0> DispSizeStack;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
      DispSizeStack.pop_back();
    }
    DispSizeStack.push_back(DispSizePair);
  }
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
}
bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  if (DisableX86AvoidStoreForwardBlocks || skipFunction(MF.getFunction()) ||
      !MF.getSubtarget<X86Subtarget>().is64Bit())
    return false;

  MRI = &MF.getRegInfo();
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
  // Look for a load then a store to XMM/YMM which look like a memcpy.
  findPotentiallyBlockedCopies(MF);

  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    MachineInstr *LoadInst = LoadStoreInstPair.first;
    int64_t LdDispImm = getDispOperand(LoadInst).getImm();
    DisplacementSizeMap BlockingStoresDispSizeMap;

    SmallVector<MachineInstr *, 2> PotentialBlockers =
        findPotentialBlockers(LoadInst);
    for (auto *PBInst : PotentialBlockers) {
      if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
                                        LoadInst->getOpcode()) ||
          !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
        continue;
      int64_t PBstDispImm = getDispOperand(PBInst).getImm();
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
      // This check doesn't cover all cases, but it will suffice for now.
      // TODO: take branch probability into consideration; if the blocking
      // store is in a rarely reached block, breaking the memcpy could hurt
      // performance.
      if (hasSameBaseOpValue(LoadInst, PBInst) &&
          isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
                          PBstSize))
        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
                                        PBstSize);
    }

    if (BlockingStoresDispSizeMap.empty())
      continue;

    // We found a store forward block: break the memcpy's load and store
    // into smaller copies such that each smaller store that was causing
    // the block is now copied separately.
    MachineInstr *StoreInst = LoadStoreInstPair.second;
    LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
    LLVM_DEBUG(LoadInst->dump());
    LLVM_DEBUG(StoreInst->dump());
    LLVM_DEBUG(dbgs() << "Replaced with:\n");
    removeRedundantBlockingStores(BlockingStoresDispSizeMap);
    breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
    updateKillStatus(LoadInst, StoreInst);
    ForRemoval.push_back(LoadInst);
    ForRemoval.push_back(StoreInst);
    Changed = true;
  }
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  }
  ForRemoval.clear();
  BlockedLoadsStoresPairs.clear();
  LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);

  return Changed;
}