
//===-- AArch64AdvSIMDScalar.cpp - AdvSIMD scalar operation optimization -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// When profitable, replace GPR-targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
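//
// For example, when i64 data already lives in the FP/SIMD register file, a
// GPR add costs a cross-class copy at every boundary (illustrative assembly
// only; actual register assignment is up to the allocator):
//
//   fmov  x1, d1        // FPR64 -> GPR64 copy
//   fmov  x2, d2        // FPR64 -> GPR64 copy
//   add   x0, x1, x2
//   fmov  d0, x0        // GPR64 -> FPR64 copy
//
// whereas the AdvSIMD scalar form needs none:
//
//   add   d0, d1, d2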
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// TODO: Graph-based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// holistic approach.
//
// This optimization is very similar in spirit to the register allocator's
// spill placement, only here we're determining where to place cross-class
// register copies rather than spills. As such, a similar approach is
// called for.
//
// We want to build up a set of graphs of all instructions which are candidates
// for transformation along with instructions which generate their inputs and
// consume their outputs. For each edge in the graph, we assign a weight
// based on whether there is a copy required there (weight zero if not) and
// the block frequency of the block containing the defining or using
// instruction, whichever is less. Our optimization is then a graph problem
// to minimize the total weight of all the graphs, then transform instructions
// and add or remove copy instructions as called for to implement the
// solution.
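//
// Sketched as a formula (a design note only; this cost model is not
// implemented below), the weight of a graph edge e between a def in block A
// and a use in block B would be
//
//   weight(e) = needsCrossClassCopy(e) ? min(freq(A), freq(B)) : 0
//
// and the chosen transform set would minimize the total weight over all
// edges.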
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-scalar"

// Allow forcing all i64 operations that have AdvSIMD scalar equivalents to
// use them. For stress-testing the transformation function.
static cl::opt<bool>
    TransformAll("aarch64-simd-scalar-force-all",
                 cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
                 cl::init(false), cl::Hidden);
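// (With the cl::opt above, the stress mode is requested as, e.g.,
// "llc ... -aarch64-simd-scalar-force-all". Note that, at the time of
// writing, this pass is not part of the default AArch64 pipeline;
// AArch64TargetMachine decides whether to schedule it at all.)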
STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");

#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"

namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;

private:
  // isProfitableToTransform - Predicate function to determine whether an
  // instruction should be transformed to its equivalent AdvSIMD scalar
  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
  bool isProfitableToTransform(const MachineInstr &MI) const;

  // transformInstruction - Perform the transformation of an instruction
  // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
  // to be the correct register class, minimizing cross-class copies.
  void transformInstruction(MachineInstr &MI);

  // processMachineBasicBlock - Main optimization loop.
  bool processMachineBasicBlock(MachineBasicBlock *MBB);

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
    initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
                AARCH64_ADVSIMD_NAME, false, false)
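// isGPR64 / isFPR64 - Classify a register reference (register plus optional
// subregister index) as either a plain GPR64, or a D-sized FP/SIMD value (an
// FPR64 directly, or the dsub subregister of an FPR128).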
static bool isGPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (SubReg)
    return false;
  if (Register::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
  return AArch64::GPR64RegClass.contains(Reg);
}

static bool isFPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (Register::isVirtualRegister(Reg))
    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
            SubReg == 0) ||
           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
            SubReg == AArch64::dsub);
  // Physical register references just check the register class directly.
  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
}

// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return nullptr if the instruction is not a copy.
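// Three forms are recognized: "FMOV Xd, Dn" / "FMOV Dd, Xn", a lane-zero
// "UMOV Xd, Vn.d[0]" extract, and a plain COPY directly to/from FPR64 or via
// the dsub subregister of an FPR128. On a match, SubReg is set to the
// subregister index needed to read the source as a D register (0 if none).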
static MachineOperand *getSrcFromCopy(MachineInstr *MI,
                                      const MachineRegisterInfo *MRI,
                                      unsigned &SubReg) {
  SubReg = 0;
  // The "FMOV Xd, Dn" instruction is the typical form.
  if (MI->getOpcode() == AArch64::FMOVDXr ||
      MI->getOpcode() == AArch64::FMOVXDr)
    return &MI->getOperand(1);
  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
  // these at this stage, but it's easy to check for.
  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
    SubReg = AArch64::dsub;
    return &MI->getOperand(1);
  }
  // Or just a plain COPY instruction. This can be directly to/from FPR64,
  // or it can be a dsub subreg reference to an FPR128.
  if (MI->getOpcode() == AArch64::COPY) {
    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
      return &MI->getOperand(1);
    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI)) {
      SubReg = MI->getOperand(1).getSubReg();
      return &MI->getOperand(1);
    }
  }

  // Otherwise, this is some other kind of instruction.
  return nullptr;
}
// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
// that we're considering transforming to, return that AdvSIMD opcode. For all
// others, return the original opcode.
static unsigned getTransformOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    break;
  // FIXME: Lots more possibilities.
  case AArch64::ADDXrr:
    return AArch64::ADDv1i64;
  case AArch64::SUBXrr:
    return AArch64::SUBv1i64;
  case AArch64::ANDXrr:
    return AArch64::ANDv8i8;
  case AArch64::EORXrr:
    return AArch64::EORv8i8;
  case AArch64::ORRXrr:
    return AArch64::ORRv8i8;
  }
  // No AdvSIMD equivalent, so just return the original opcode.
  return Opc;
}

static bool isTransformable(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return Opc != getTransformOpcode(Opc);
}
// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
bool AArch64AdvSIMDScalar::isProfitableToTransform(
    const MachineInstr &MI) const {
  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
  // early exit since that's the common case.
  if (!isTransformable(MI))
    return false;

  // Count the number of copies we'll need to add and approximate the number
  // of copies that a transform will enable us to remove.
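  // In the worst case the transform needs three new cross-class copies: one
  // FPR64 copy for each of the two GPR64 sources, plus one copy of the FPR64
  // result back out to the GPR64 destination (see transformInstruction).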
  unsigned NumNewCopies = 3;
  unsigned NumRemovableCopies = 0;

  Register OrigSrc0 = MI.getOperand(1).getReg();
  Register OrigSrc1 = MI.getOperand(2).getReg();
  unsigned SubReg0;
  unsigned SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, we don't need to insert a new copy.
    if (MOSrc0)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
      ++NumRemovableCopies;
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    if (MOSrc1)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
      ++NumRemovableCopies;
  }

  // If any of the uses of the original instruction is a cross-class copy,
  // that's a copy that will be removable if we transform. Likewise, if
  // any of the uses is a transformable instruction, it's likely the transforms
  // will chain, enabling us to save a copy there, too. This is an aggressive
  // heuristic that approximates the graph-based cost analysis described above.
  Register Dst = MI.getOperand(0).getReg();
  bool AllUsesAreCopies = true;
  for (MachineRegisterInfo::use_instr_nodbg_iterator
           Use = MRI->use_instr_nodbg_begin(Dst),
           E = MRI->use_instr_nodbg_end();
       Use != E; ++Use) {
    unsigned SubReg;
    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
      ++NumRemovableCopies;
    // If the use is an INSERT_SUBREG, that's still something that can
    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
    // preferable to have it use the FPR64 in most cases, as if the source
    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
    // Ditto for a lane insert.
    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
             Use->getOpcode() == AArch64::INSvi64gpr)
      ;
    else
      AllUsesAreCopies = false;
  }
  // If all of the uses of the original destination register are copies to
  // FPR64, then we won't end up having a new copy back to GPR64 either.
  if (AllUsesAreCopies)
    --NumNewCopies;

  // If a transform will not increase the number of cross-class copies required,
  // return true.
  if (NumNewCopies <= NumRemovableCopies)
    return true;

  // Finally, even if we otherwise wouldn't transform, check if we're forcing
  // transformation of everything.
  return TransformAll;
}
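// A minimal worked example of the heuristic above (hypothetical MIR): for
//   %2:gpr64 = ADDXrr %0, %1
// where %0 and %1 are each defined by an FPR64-to-GPR64 copy with no other
// users, and %2's only use is a GPR64-to-FPR64 copy, NumNewCopies falls from
// 3 to 0 and NumRemovableCopies rises to 3, so the transform is taken.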
static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
                                unsigned Dst, unsigned Src, bool IsKill) {
  MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                                    TII->get(AArch64::COPY), Dst)
                                .addReg(Src, getKillRegState(IsKill));
  LLVM_DEBUG(dbgs() << "  adding copy: " << *MIB);
  ++NumCopiesInserted;
  return MIB;
}
// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
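//
// For example (hypothetical virtual registers, MIR-like notation):
//
//   %0:gpr64 = COPY %f:fpr64    ; feeding copy; deleted if %0 has no other
//                               ; users
//   %1:gpr64 = ADDXrr %0, %0
//
// becomes
//
//   %2:fpr64 = ADDv1i64 %f, %f
//   %1:gpr64 = COPY %2          ; copy back out; see the FIXMEs below about
//                               ; avoiding it when all users want an FPR64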
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
  LLVM_DEBUG(dbgs() << "Scalar transform: " << MI);
  MachineBasicBlock *MBB = MI.getParent();
  unsigned OldOpc = MI.getOpcode();
  unsigned NewOpc = getTransformOpcode(OldOpc);
  assert(OldOpc != NewOpc && "transform an instruction to itself?!");

  // Check if we need a copy for the source registers.
  Register OrigSrc0 = MI.getOperand(1).getReg();
  Register OrigSrc1 = MI.getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  bool KillSrc0 = false, KillSrc1 = false;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, reference its source directly and, if
    // there are no other users of the original source, delete that copy.
    if (MOSrc0) {
      Src0 = MOSrc0->getReg();
      KillSrc0 = MOSrc0->isKill();
      // Src0 is going to be reused, so it cannot be killed anymore.
      MOSrc0->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc0)) {
        assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If the source was from a copy, reference its source directly and, if
    // there are no other users of the original source, delete that copy.
    if (MOSrc1) {
      Src1 = MOSrc1->getReg();
      KillSrc1 = MOSrc1->isKill();
      // Src1 is going to be reused, so it cannot be killed anymore.
      MOSrc1->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc1)) {
        assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }

  // If we weren't able to reference the original source directly, create a
  // copy.
  if (!Src0) {
    SubReg0 = 0;
    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
    KillSrc0 = true;
  }
  if (!Src1) {
    SubReg1 = 0;
    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
    KillSrc1 = true;
  }

  // Create a vreg for the destination.
  // FIXME: No need to do this if the ultimate user expects an FPR64.
  // Check for that and avoid the copy if possible.
  Register Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

  // For now, all of the new instructions have the same simple three-register
  // form, so no need to special case based on what instruction we're
  // building.
  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
      .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
      .addReg(Src1, getKillRegState(KillSrc1), SubReg1);

  // Now copy the result back out to a GPR.
  // FIXME: Try to avoid this if all uses could actually just use the FPR64
  // directly.
  insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);

  // Erase the old instruction.
  MI.eraseFromParent();

  ++NumScalarInsnsUsed;
}
// processMachineBasicBlock - Main optimization loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
  bool Changed = false;
  for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
    if (isProfitableToTransform(MI)) {
      transformInstruction(MI);
      Changed = true;
    }
  }
  return Changed;
}
// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");

  if (skipFunction(mf.getFunction()))
    return false;

  MRI = &mf.getRegInfo();
  TII = mf.getSubtarget().getInstrInfo();

  // Just check things on a one-block-at-a-time basis.
  for (MachineBasicBlock &MBB : mf)
    if (processMachineBasicBlock(&MBB))
      Changed = true;
  return Changed;
}
// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
// to add the pass to the PassManager.
FunctionPass *llvm::createAArch64AdvSIMDScalar() {
  return new AArch64AdvSIMDScalar();
}