
//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));
#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif
namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;
  MachineBlockFrequencyInfo *MBFI;
  ProfileSummaryInfo *PSI;
  RegisterClassInfo RegClassInfo;
  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);
  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
} // end anonymous namespace
char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}
/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}
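
// Illustrative example (register class names are hypothetical, not from any
// particular target): a full COPY between two virtual registers of the same
// class, e.g. %1:gpr = COPY %0:gpr, is expected to be coalesced away by the
// register allocator, so isTransientMI treats it as free when operand
// latencies are computed below.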
/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && MO.getReg().isVirtual()))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
                InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
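
// Worked example for getDepth (all cycle counts hypothetical): suppose
// InsInstrs = {I0, I1} where I1 uses the virtual register defined by I0. If
// I0's trace operand has depth 2 and operand latency 3, I0's depth is
// 2 + 3 = 5; with an I0 -> I1 operand latency of 4, the depth of I1 (the
// NewRoot) and hence the return value is 5 + 4 = 9.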
/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && MO.getReg().isVirtual()))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
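
// Example (made-up latencies): if NewRoot defines %v and %v's first user
// forms an in-trace dependence with Root, the def-use operand latency (say 3)
// is used; otherwise the whole-instruction latency of NewRoot (say 4) is
// taken. NewRootLatency is the maximum of these values over all defs.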
/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
  case MachineCombinerPattern::SUBADD_OP1:
  case MachineCombinerPattern::SUBADD_OP2:
  case MachineCombinerPattern::FMADD_AX:
  case MachineCombinerPattern::FMADD_XA:
  case MachineCombinerPattern::FMSUB:
  case MachineCombinerPattern::FNMSUB:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}
/// Estimate the latency of the new and original instruction sequence by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}
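
// Worked example (hypothetical latencies): with InsInstrs = {A, B} where
// computeInstrLatency(A) = 2 and getLatency(MI, B, BlockTrace) = 3,
// NewRootLatency = 2 + 3 = 5; with DelInstrs = {M, N} of latencies 4 and 1,
// RootLatency = 4 + 1 = 5. Both sums assume each sequence forms a single
// dependence chain, as noted above.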
bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: for now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we could do better if the
  // RegPressureTracker class were aware of the TIE attribute; then we could
  // accurately compare the register pressure of DelInstrs and InsInstrs.
  return true;
}
/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions by
  // summing up the latencies of the new and old instruction sequences.
  unsigned NewRootLatency, RootLatency;
  std::tie(NewRootLatency, RootLatency) =
      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}
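
// Numeric sketch for the default objective (all cycle counts hypothetical):
// with NewRootDepth = 6, NewRootLatency = 4, RootDepth = 5, RootLatency = 6
// and RootSlack = 0, NewCycleCount = 6 + 4 = 10 and OldCycleCount =
// 5 + 6 + 0 = 11, so 10 <= 11 and the transform is accepted even though the
// depth itself got worse.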
/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}
/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}
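
// Example (assumed numbers): if the trace's resource length is 12 cycles
// before the rewrite and the inserted sched classes outweigh the deleted
// ones so that ResLenAfterCombine = 13, the rewrite is rejected whenever the
// target keeps getExtendResourceLenLimit() at its default of 0, since
// 13 > 12 + 0.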
/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param MinInstr is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVector<MachineInstr *, 16> InsInstrs,
                                     SmallVector<MachineInstr *, 16> DelInstrs,
                                     MachineTraceMetrics::Ensemble *MinInstr,
                                     SparseSet<LiveRegUnit> &RegUnits,
                                     const TargetInstrInfo *TII,
                                     MachineCombinerPattern Pattern,
                                     bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because genAlternativeCodeSequence does not yet know whether
  // InsInstrs or DelInstrs is the better pattern, so it must not have side
  // effects on the function. For example, constant pool entry creation is
  // delayed until here, after InsInstrs has been selected as the better
  // pattern; otherwise the constant pool entry created for InsInstrs would
  // not be deleted even if InsInstrs turned out not to be the better pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    MinInstr->invalidate(MBB);

  NumInstCombined++;
}
// Check that the difference between original and new latency is non-increasing
// for later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.

  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining patterns.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!MinInstr)
    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
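    //
    // For example (illustrative latencies only): with ADD = 1, MUL = 3 and
    // MADD = 4 cycles, replacing an ADD on the critical path by a MADD can
    // lengthen that path by up to 3 cycles when the MUL's operands arrive no
    // later than the ADD's other operand.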
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.
    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);
    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
                          /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}
bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  MinInstr = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << " Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}