//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimization --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
// inefficiency on some targets.
//
// For example:
//    fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
//    dup v3.4s, v2.s[1]
//    fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
//    st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
//    zip1 v2.4s, v0.4s, v1.4s
//    zip2 v3.4s, v0.4s, v1.4s
//    stp q2, q3, addr
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <unordered_map>

using namespace llvm;
#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // The two maps below are used to cache decisions instead of recomputing:
  // This map caches instruction replacement decisions within a function and
  // across functions.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // This map caches, per target, the decision of whether to exit the
  // interleaved store instruction replacement subpass early.
  std::unordered_map<std::string, bool> InterlEarlyExit;
  typedef enum {
    VectorElem,
    Interleave
  } Subpass;

  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };

#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
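
  // As an illustration, the first RuleST2 entry in the table below expands to
  // the initializer:
  //   {AArch64::ST2Twov2d,
  //    {AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::STPQi},
  //    AArch64::FPR128RegClass}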
  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
  // A costly instruction is replaced in this work by N efficient instructions.
  // The maximum of N is currently 10 and it is for the ST4 case.
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
  }
  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions stored in the
  /// array InstDescRepl.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);

  /// Determine if we need to exit the instruction replacement optimization
  /// subpasses early. This makes sure that no compile time is spent in this
  /// pass for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction, and extract the source
  /// operands of the ST2/ST4 instruction from it.
  /// Example of such an instruction:
  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
                         unsigned* StRegKill, unsigned NumArg) const;

  /// Load/Store interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructions and classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};
char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Check if the replacement decision is already available in the cached
  // table; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If a target does not define resources for the instructions
  // of interest, then return false for no replacement.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}
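
// Illustrative example with hypothetical latencies: if on some CPU the
// scheduling model gives ST2Twov4s a latency of 8 cycles while ZIP1v4i32,
// ZIP2v4i32 and STPQi cost 2 + 2 + 1 = 5 cycles in total, the comparison
// above (8 > 5) selects the replacement. The verdict is then cached per
// (opcode, CPU) pair in SIMDInstrTable.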
/// Determine if we need to exit this pass for a kind of instruction
/// replacement early. This makes sure that no compile time is spent in this
/// pass for targets with no need for any of these optimizations beyond
/// performing this check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For this optimization, check by comparing the latency of a representative
  // instruction to that of the replacement instructions.
  // TODO: check for all concerned instructions.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For this optimization, check for all concerned instructions.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}
/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
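  // Scan backwards from MI to the start of its basic block, looking for a DUP
  // with the same opcode, source register and lane number. Such a DUP defines
  // its destination earlier in the same block, so its value can be reused.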
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, FMLA, FMLS, FMUL,
/// and FMULX, and hence they are hardcoded.
///
/// For example:
///   fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into:
///   dup v3.4s, v2.s[1]      // DUP not necessary if redundant
///   fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
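  // E.g. the fused forms (FMLA/FMLS) carry a tied accumulator operand:
  //   %dst = FMLAv4i32_indexed %acc, %src1, %src2, lane    (5 operands)
  // whereas FMUL/FMULX do not:
  //   %dst = FMULv4i32_indexed %src1, %src2, lane          (4 operands)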
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction. Note that if an equivalent DUP instruction
    // has already been created before, then use that one instead of creating
    // a new one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
/// Load/Store interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
///   st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
///   zip1 v2.4s, v0.4s, v1.4s
///   zip2 v3.4s, v0.4s, v1.4s
///   stp q2, q3, addr
///
/// For example:
///   st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
///   zip1 v4.4s, v0.4s, v2.4s
///   zip2 v5.4s, v0.4s, v2.4s
///   zip1 v6.4s, v1.4s, v3.4s
///   zip2 v7.4s, v1.4s, v3.4s
///   zip1 v8.4s, v4.4s, v6.4s
///   zip2 v9.4s, v4.4s, v6.4s
///   zip1 v10.4s, v5.4s, v7.4s
///   zip2 v11.4s, v5.4s, v7.4s
///   stp q8, q9, addr
///   stp q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches any of the rewriting rules, then
  // gather information about parameters of the new instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
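      // The ST2/ST4 sources arrive packed into a single register sequence;
      // processSeqRegInst below checks that its unique defining instruction
      // is a REG_SEQUENCE and recovers the individual source registers.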
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers, but only for non-store instructions.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }
  if (!Match)
    return false;

  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement instructions composed of ZIP1, ZIP2, and STP. At
  // this point the code generation is hardcoded and does not rely on the IRT
  // table used above, given that code generation for the ST2 replacement
  // differs somewhat from that for the ST4 replacement. (We could have encoded
  // more of this in the table, but that would add complexity.)
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0])
        .addReg(StReg[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1])
        .addReg(StReg[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
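    // The STP immediate is a scaled offset: each unit is the size of one
    // stored register, so the offset of 2 below places this second pair at
    // addr + 32 bytes for Q registers (addr + 16 bytes for D registers).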
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}
/// Process the REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/ST4 instruction from it.
/// Example of such an instruction:
///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;
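
  // After the destination, REG_SEQUENCE operands come in (register,
  // sub-register index) pairs: operand 2*i+1 is the i-th source register and
  // operand 2*i+2 is its dsub/qsub sub-register index.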
  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());

    // Validation check for the other arguments.
    if (DefiningMI->getOperand(2*i+2).isImm()) {
      switch (DefiningMI->getOperand(2*i+2).getImm()) {
      default:
        return false;

      case AArch64::dsub0:
      case AArch64::dsub1:
      case AArch64::dsub2:
      case AArch64::dsub3:
      case AArch64::qsub0:
      case AArch64::qsub1:
      case AArch64::qsub2:
      case AArch64::qsub3:
        break;
      }
    } else
      return false;
  }
  return true;
}
/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  const AArch64InstrInfo *AAII =
      static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
  if (!AAII)
    return false;
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;
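  // Without a detailed instruction scheduling model there is no latency
  // information to compare, so no profitability decision can be made above.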

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}
/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}