//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif
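
// A minimal usage sketch for the options above (the input file name is
// hypothetical; the dump and -debug-only output require an assertions-enabled
// build, since dump_intrs is only consulted inside LLVM_DEBUG):
//
//   llc -run-pass=machine-combiner -debug-only=machine-combiner \
//       -machine-combiner-inc-threshold=100 \
//       -machine-combiner-dump-subst-intrs input.mir -o -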

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;
  MachineBlockFrequencyInfo *MBFI;
  ProfileSummaryInfo *PSI;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              MachineCombinerPattern Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth, etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register, not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
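
// Worked example for getDepth (illustrative cycle counts, no particular
// target): let InsInstrs be { t1 = ADD a, b;  t2 = ADD t1, c } where a is
// defined in the trace at depth 2 and b, c at depth 0, and every def-use
// edge has an operand latency of 1. The first ADD gets depth
// max(2 + 1, 0 + 1) = 3; for the second ADD the t1 operand is resolved
// through InstrIdxForVirtReg, giving max(3 + 1, 0 + 1) = 4, which is the
// value returned for NewRoot.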

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
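
// Intent of the special case above, sketched: when the value NewRoot defines
// is consumed by an instruction inside the trace, only the def-use operand
// latency to that first user matters (it may be shorter than the full
// instruction latency, e.g. with operand forwarding); otherwise the schedule
// model's whole-instruction latency is used as a conservative fallback.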

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
  case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
  case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
    return CombinerObjective::MustReduceDepth;
  case MachineCombinerPattern::REASSOC_XY_BCA:
  case MachineCombinerPattern::REASSOC_XY_BAC:
    return CombinerObjective::MustReduceRegisterPressure;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequence by summing
/// up the latencies of the inserted and deleted instructions. This assumes
/// that the inserted and deleted instructions are dependent instruction chains,
/// which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}
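
// Worked example (made-up latencies): for a MUL + ADD -> MADD rewrite,
// DelInstrs = {MUL, ADD} and InsInstrs = {MADD}. With MUL = 3 cycles,
// ADD = 1 cycle and MADD = 4 cycles, the returned pair is
// {NewRootLatency, RootLatency} = {4, 3 + 1} = {4, 4}: the two dependent
// chains are estimated to be equally long.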

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineCombinerPattern Pattern) {
  // FIXME: For now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute; then we can get an
  // accurate comparison of register pressure between DelInstrs and InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions by
  // summing up their latencies.
  unsigned NewRootLatency, RootLatency;
  std::tie(NewRootLatency, RootLatency) =
      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}
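
// Worked comparison (illustrative numbers): with NewRootDepth = 4,
// NewRootLatency = 4, RootDepth = 3, RootLatency = 4 and RootSlack = 2,
// NewCycleCount = 4 + 4 = 8. With accurate slack, OldCycleCount =
// 3 + 4 + 2 = 9 and the transform is accepted (8 <= 9) even though the
// depth grew; with SlackIsAccurate == false, OldCycleCount = 7 and the
// transform is rejected (8 > 7).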

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <=
          ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}
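
// Intuition, with made-up numbers: the resource length is roughly the number
// of cycles the trace occupies its most contended processor resource. If a
// block keeps one multiply unit busy for 10 cycles, swapping a MUL + ADD
// pair for a single MADD on that same unit leaves the length at 10, while a
// rewrite that adds an extra multiply-class instruction pushes it to 11 and
// is rejected unless the target's getExtendResourceLenLimit() permits the
// overshoot.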

/// \returns true when the new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize,
                                   bool OptForSize) {
  if (OptForSize && (NewSize < OldSize))
    return true;
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param MinInstr is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVector<MachineInstr *, 16> InsInstrs,
                                     SmallVector<MachineInstr *, 16> DelInstrs,
                                     MachineTraceMetrics::Ensemble *MinInstr,
                                     SparseSet<LiveRegUnit> &RegUnits,
                                     const TargetInstrInfo *TII,
                                     MachineCombinerPattern Pattern,
                                     bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, we have not yet
  // decided whether InsInstrs or DelInstrs is the better pattern, so we do
  // not want to generate side effects in the function. For example, we need
  // to delay constant pool entry creation until after InsInstrs is selected
  // as the better pattern; otherwise the constant pool entry created for
  // InsInstrs would not be deleted even if InsInstrs is not the better
  // pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    MinInstr->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}
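
// For example (hypothetical latencies): if pattern A saves 3 cycles
// (RootLatency 6 vs. NewRootLatency 3) and a later pattern B saves only 1
// cycle, the order [A, B] passes this check; the order [B, A] trips the
// assert, flagging that getMachineCombinerPatterns() emitted the more
// profitable pattern too late.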

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!MinInstr)
    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency (see the numeric illustration at the end of
    // this comment).
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.
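    //
    // To make the pitfall concrete (made-up latencies): if MUL takes 4
    // cycles, ADD takes 1 and MADD takes 5, and only the ADD lies on the
    // critical path, fusing pulls the multiply's operand chain onto that
    // path and can turn a 1-cycle step into a 5-cycle one; the checks below
    // are what reject such a substitution.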
    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed = true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways ||
          doSubstitute(NewInstCount, OldInstCount, OptForSize)) {
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }
  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  MinInstr = nullptr;
  OptSize = MF.getFunction().hasOptSize();
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << " Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}