X86FixupBWInsts.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. //===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. /// \file
  9. /// This file defines the pass that looks through the machine instructions
  10. /// late in the compilation, and finds byte or word instructions that
  11. /// can be profitably replaced with 32 bit instructions that give equivalent
  12. /// results for the bits of the results that are used. There are two possible
  13. /// reasons to do this.
  14. ///
  15. /// One reason is to avoid false-dependences on the upper portions
  16. /// of the registers. Only instructions that have a destination register
  17. /// which is not in any of the source registers can be affected by this.
  18. /// Any instruction where one of the source registers is also the destination
  19. /// register is unaffected, because it has a true dependence on the source
  20. /// register already. So, this consideration primarily affects load
  21. /// instructions and register-to-register moves. It would
  22. /// seem like cmov(s) would also be affected, but because of the way cmov is
  23. /// really implemented by most machines as reading both the destination and
  24. /// and source registers, and then "merging" the two based on a condition,
  25. /// it really already should be considered as having a true dependence on the
  26. /// destination register as well.
  27. ///
  28. /// The other reason to do this is for potential code size savings. Word
  29. /// operations need an extra override byte compared to their 32 bit
  30. /// versions. So this can convert many word operations to their larger
  31. /// size, saving a byte in encoding. This could introduce partial register
  32. /// dependences where none existed however. As an example take:
  33. /// orw ax, $0x1000
  34. /// addw ax, $3
  35. /// now if this were to get transformed into
  36. /// orw ax, $1000
  37. /// addl eax, $3
  38. /// because the addl encodes shorter than the addw, this would introduce
  39. /// a use of a register that was only partially written earlier. On older
  40. /// Intel processors this can be quite a performance penalty, so this should
  41. /// probably only be done when it can be proven that a new partial dependence
  42. /// wouldn't be created, or when your know a newer processor is being
  43. /// targeted, or when optimizing for minimum code size.
  44. ///
  45. //===----------------------------------------------------------------------===//
  46. #include "X86.h"
  47. #include "X86InstrInfo.h"
  48. #include "X86Subtarget.h"
  49. #include "llvm/ADT/Statistic.h"
  50. #include "llvm/Analysis/ProfileSummaryInfo.h"
  51. #include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
  52. #include "llvm/CodeGen/LivePhysRegs.h"
  53. #include "llvm/CodeGen/MachineFunctionPass.h"
  54. #include "llvm/CodeGen/MachineInstrBuilder.h"
  55. #include "llvm/CodeGen/MachineLoopInfo.h"
  56. #include "llvm/CodeGen/MachineRegisterInfo.h"
  57. #include "llvm/CodeGen/MachineSizeOpts.h"
  58. #include "llvm/CodeGen/Passes.h"
  59. #include "llvm/CodeGen/TargetInstrInfo.h"
  60. #include "llvm/Support/Debug.h"
  61. #include "llvm/Support/raw_ostream.h"
  62. using namespace llvm;
  63. #define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
  64. #define FIXUPBW_NAME "x86-fixup-bw-insts"
  65. #define DEBUG_TYPE FIXUPBW_NAME
  66. // Option to allow this optimization pass to have fine-grained control.
  67. static cl::opt<bool>
  68. FixupBWInsts("fixup-byte-word-insts",
  69. cl::desc("Change byte and word instructions to larger sizes"),
  70. cl::init(true), cl::Hidden);
  71. namespace {
  72. class FixupBWInstPass : public MachineFunctionPass {
  73. /// Loop over all of the instructions in the basic block replacing applicable
  74. /// byte or word instructions with better alternatives.
  75. void processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
  76. /// This sets the \p SuperDestReg to the 32 bit super reg of the original
  77. /// destination register of the MachineInstr passed in. It returns true if
  78. /// that super register is dead just prior to \p OrigMI, and false if not.
  79. bool getSuperRegDestIfDead(MachineInstr *OrigMI,
  80. Register &SuperDestReg) const;
  81. /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit
  82. /// register if it is safe to do so. Return the replacement instruction if
  83. /// OK, otherwise return nullptr.
  84. MachineInstr *tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const;
  85. /// Change the MachineInstr \p MI into the equivalent 32-bit copy if it is
  86. /// safe to do so. Return the replacement instruction if OK, otherwise return
  87. /// nullptr.
  88. MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
  89. /// Change the MachineInstr \p MI into the equivalent extend to 32 bit
  90. /// register if it is safe to do so. Return the replacement instruction if
  91. /// OK, otherwise return nullptr.
  92. MachineInstr *tryReplaceExtend(unsigned New32BitOpcode,
  93. MachineInstr *MI) const;
  94. // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if
  95. // possible. Return the replacement instruction if OK, return nullptr
  96. // otherwise.
  97. MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const;
  98. public:
  99. static char ID;
  100. StringRef getPassName() const override { return FIXUPBW_DESC; }
  101. FixupBWInstPass() : MachineFunctionPass(ID) { }
  102. void getAnalysisUsage(AnalysisUsage &AU) const override {
  103. AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
  104. // guide some heuristics.
  105. AU.addRequired<ProfileSummaryInfoWrapperPass>();
  106. AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  107. MachineFunctionPass::getAnalysisUsage(AU);
  108. }
  109. /// Loop over all of the basic blocks, replacing byte and word instructions by
  110. /// equivalent 32 bit instructions where performance or code size can be
  111. /// improved.
  112. bool runOnMachineFunction(MachineFunction &MF) override;
  113. MachineFunctionProperties getRequiredProperties() const override {
  114. return MachineFunctionProperties().set(
  115. MachineFunctionProperties::Property::NoVRegs);
  116. }
  117. private:
  118. MachineFunction *MF = nullptr;
  119. /// Machine instruction info used throughout the class.
  120. const X86InstrInfo *TII = nullptr;
  121. const TargetRegisterInfo *TRI = nullptr;
  122. /// Local member for function's OptForSize attribute.
  123. bool OptForSize = false;
  124. /// Machine loop info used for guiding some heruistics.
  125. MachineLoopInfo *MLI = nullptr;
  126. /// Register Liveness information after the current instruction.
  127. LivePhysRegs LiveRegs;
  128. ProfileSummaryInfo *PSI;
  129. MachineBlockFrequencyInfo *MBFI;
  130. };
  131. char FixupBWInstPass::ID = 0;
  132. }
  133. INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false)
  134. FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); }
  135. bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
  136. if (!FixupBWInsts || skipFunction(MF.getFunction()))
  137. return false;
  138. this->MF = &MF;
  139. TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  140. TRI = MF.getRegInfo().getTargetRegisterInfo();
  141. MLI = &getAnalysis<MachineLoopInfo>();
  142. PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  143. MBFI = (PSI && PSI->hasProfileSummary()) ?
  144. &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
  145. nullptr;
  146. LiveRegs.init(TII->getRegisterInfo());
  147. LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
  148. // Process all basic blocks.
  149. for (auto &MBB : MF)
  150. processBasicBlock(MF, MBB);
  151. LLVM_DEBUG(dbgs() << "End X86FixupBWInsts\n";);
  152. return true;
  153. }
  154. /// Check if after \p OrigMI the only portion of super register
  155. /// of the destination register of \p OrigMI that is alive is that
  156. /// destination register.
  157. ///
  158. /// If so, return that super register in \p SuperDestReg.
  159. bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
  160. Register &SuperDestReg) const {
  161. const X86RegisterInfo *TRI = &TII->getRegisterInfo();
  162. Register OrigDestReg = OrigMI->getOperand(0).getReg();
  163. SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
  164. const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg);
  165. // Make sure that the sub-register that this instruction has as its
  166. // destination is the lowest order sub-register of the super-register.
  167. // If it isn't, then the register isn't really dead even if the
  168. // super-register is considered dead.
  169. if (SubRegIdx == X86::sub_8bit_hi)
  170. return false;
  171. // If neither the destination-super register nor any applicable subregisters
  172. // are live after this instruction, then the super register is safe to use.
  173. if (!LiveRegs.contains(SuperDestReg)) {
  174. // If the original destination register was not the low 8-bit subregister
  175. // then the super register check is sufficient.
  176. if (SubRegIdx != X86::sub_8bit)
  177. return true;
  178. // If the original destination register was the low 8-bit subregister and
  179. // we also need to check the 16-bit subregister and the high 8-bit
  180. // subregister.
  181. if (!LiveRegs.contains(getX86SubSuperRegister(OrigDestReg, 16)) &&
  182. !LiveRegs.contains(getX86SubSuperRegister(SuperDestReg, 8,
  183. /*High=*/true)))
  184. return true;
  185. // Otherwise, we have a little more checking to do.
  186. }
  187. // If we get here, the super-register destination (or some part of it) is
  188. // marked as live after the original instruction.
  189. //
  190. // The X86 backend does not have subregister liveness tracking enabled,
  191. // so liveness information might be overly conservative. Specifically, the
  192. // super register might be marked as live because it is implicitly defined
  193. // by the instruction we are examining.
  194. //
  195. // However, for some specific instructions (this pass only cares about MOVs)
  196. // we can produce more precise results by analysing that MOV's operands.
  197. //
  198. // Indeed, if super-register is not live before the mov it means that it
  199. // was originally <read-undef> and so we are free to modify these
  200. // undef upper bits. That may happen in case where the use is in another MBB
  201. // and the vreg/physreg corresponding to the move has higher width than
  202. // necessary (e.g. due to register coalescing with a "truncate" copy).
  203. // So, we would like to handle patterns like this:
  204. //
  205. // %bb.2: derived from LLVM BB %if.then
  206. // Live Ins: %rdi
  207. // Predecessors according to CFG: %bb.0
  208. // %ax<def> = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax
  209. // ; No implicit %eax
  210. // Successors according to CFG: %bb.3(?%)
  211. //
  212. // %bb.3: derived from LLVM BB %if.end
  213. // Live Ins: %eax Only %ax is actually live
  214. // Predecessors according to CFG: %bb.2 %bb.1
  215. // %ax = KILL %ax, implicit killed %eax
  216. // RET 0, %ax
  217. unsigned Opc = OrigMI->getOpcode(); (void)Opc;
  218. // These are the opcodes currently known to work with the code below, if
  219. // something // else will be added we need to ensure that new opcode has the
  220. // same properties.
  221. if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr &&
  222. Opc != X86::MOV16rr)
  223. return false;
  224. bool IsDefined = false;
  225. for (auto &MO: OrigMI->implicit_operands()) {
  226. if (!MO.isReg())
  227. continue;
  228. assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
  229. if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
  230. IsDefined = true;
  231. // If MO is a use of any part of the destination register but is not equal
  232. // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg.
  233. // For example, if OrigDestReg is %al then an implicit use of %ah, %ax,
  234. // %eax, or %rax will prevent us from using the %eax register.
  235. if (MO.isUse() && !TRI->isSubRegisterEq(OrigDestReg, MO.getReg()) &&
  236. TRI->regsOverlap(SuperDestReg, MO.getReg()))
  237. return false;
  238. }
  239. // Reg is not Imp-def'ed -> it's live both before/after the instruction.
  240. if (!IsDefined)
  241. return false;
  242. // Otherwise, the Reg is not live before the MI and the MOV can't
  243. // make it really live, so it's in fact dead even after the MI.
  244. return true;
  245. }
  246. MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
  247. MachineInstr *MI) const {
  248. Register NewDestReg;
  249. // We are going to try to rewrite this load to a larger zero-extending
  250. // load. This is safe if all portions of the 32 bit super-register
  251. // of the original destination register, except for the original destination
  252. // register are dead. getSuperRegDestIfDead checks that.
  253. if (!getSuperRegDestIfDead(MI, NewDestReg))
  254. return nullptr;
  255. // Safe to change the instruction.
  256. MachineInstrBuilder MIB =
  257. BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
  258. unsigned NumArgs = MI->getNumOperands();
  259. for (unsigned i = 1; i < NumArgs; ++i)
  260. MIB.add(MI->getOperand(i));
  261. MIB.setMemRefs(MI->memoperands());
  262. // If it was debug tracked, record a substitution.
  263. if (unsigned OldInstrNum = MI->peekDebugInstrNum()) {
  264. unsigned Subreg = TRI->getSubRegIndex(MIB->getOperand(0).getReg(),
  265. MI->getOperand(0).getReg());
  266. unsigned NewInstrNum = MIB->getDebugInstrNum(*MF);
  267. MF->makeDebugValueSubstitution({OldInstrNum, 0}, {NewInstrNum, 0}, Subreg);
  268. }
  269. return MIB;
  270. }
  271. MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
  272. assert(MI->getNumExplicitOperands() == 2);
  273. auto &OldDest = MI->getOperand(0);
  274. auto &OldSrc = MI->getOperand(1);
  275. Register NewDestReg;
  276. if (!getSuperRegDestIfDead(MI, NewDestReg))
  277. return nullptr;
  278. Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
  279. // This is only correct if we access the same subregister index: otherwise,
  280. // we could try to replace "movb %ah, %al" with "movl %eax, %eax".
  281. const X86RegisterInfo *TRI = &TII->getRegisterInfo();
  282. if (TRI->getSubRegIndex(NewSrcReg, OldSrc.getReg()) !=
  283. TRI->getSubRegIndex(NewDestReg, OldDest.getReg()))
  284. return nullptr;
  285. // Safe to change the instruction.
  286. // Don't set src flags, as we don't know if we're also killing the superreg.
  287. // However, the superregister might not be defined; make it explicit that
  288. // we don't care about the higher bits by reading it as Undef, and adding
  289. // an imp-use on the original subregister.
  290. MachineInstrBuilder MIB =
  291. BuildMI(*MF, MI->getDebugLoc(), TII->get(X86::MOV32rr), NewDestReg)
  292. .addReg(NewSrcReg, RegState::Undef)
  293. .addReg(OldSrc.getReg(), RegState::Implicit);
  294. // Drop imp-defs/uses that would be redundant with the new def/use.
  295. for (auto &Op : MI->implicit_operands())
  296. if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg))
  297. MIB.add(Op);
  298. return MIB;
  299. }
  300. MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode,
  301. MachineInstr *MI) const {
  302. Register NewDestReg;
  303. if (!getSuperRegDestIfDead(MI, NewDestReg))
  304. return nullptr;
  305. // Don't interfere with formation of CBW instructions which should be a
  306. // shorter encoding than even the MOVSX32rr8. It's also immune to partial
  307. // merge issues on Intel CPUs.
  308. if (MI->getOpcode() == X86::MOVSX16rr8 &&
  309. MI->getOperand(0).getReg() == X86::AX &&
  310. MI->getOperand(1).getReg() == X86::AL)
  311. return nullptr;
  312. // Safe to change the instruction.
  313. MachineInstrBuilder MIB =
  314. BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
  315. unsigned NumArgs = MI->getNumOperands();
  316. for (unsigned i = 1; i < NumArgs; ++i)
  317. MIB.add(MI->getOperand(i));
  318. MIB.setMemRefs(MI->memoperands());
  319. if (unsigned OldInstrNum = MI->peekDebugInstrNum()) {
  320. unsigned Subreg = TRI->getSubRegIndex(MIB->getOperand(0).getReg(),
  321. MI->getOperand(0).getReg());
  322. unsigned NewInstrNum = MIB->getDebugInstrNum(*MF);
  323. MF->makeDebugValueSubstitution({OldInstrNum, 0}, {NewInstrNum, 0}, Subreg);
  324. }
  325. return MIB;
  326. }
  327. MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
  328. MachineBasicBlock &MBB) const {
  329. // See if this is an instruction of the type we are currently looking for.
  330. switch (MI->getOpcode()) {
  331. case X86::MOV8rm:
  332. // Replace 8-bit loads with the zero-extending version if not optimizing
  333. // for size. The extending op is cheaper across a wide range of uarch and
  334. // it avoids a potentially expensive partial register stall. It takes an
  335. // extra byte to encode, however, so don't do this when optimizing for size.
  336. if (!OptForSize)
  337. return tryReplaceLoad(X86::MOVZX32rm8, MI);
  338. break;
  339. case X86::MOV16rm:
  340. // Always try to replace 16 bit load with 32 bit zero extending.
  341. // Code size is the same, and there is sometimes a perf advantage
  342. // from eliminating a false dependence on the upper portion of
  343. // the register.
  344. return tryReplaceLoad(X86::MOVZX32rm16, MI);
  345. case X86::MOV8rr:
  346. case X86::MOV16rr:
  347. // Always try to replace 8/16 bit copies with a 32 bit copy.
  348. // Code size is either less (16) or equal (8), and there is sometimes a
  349. // perf advantage from eliminating a false dependence on the upper portion
  350. // of the register.
  351. return tryReplaceCopy(MI);
  352. case X86::MOVSX16rr8:
  353. return tryReplaceExtend(X86::MOVSX32rr8, MI);
  354. case X86::MOVSX16rm8:
  355. return tryReplaceExtend(X86::MOVSX32rm8, MI);
  356. case X86::MOVZX16rr8:
  357. return tryReplaceExtend(X86::MOVZX32rr8, MI);
  358. case X86::MOVZX16rm8:
  359. return tryReplaceExtend(X86::MOVZX32rm8, MI);
  360. default:
  361. // nothing to do here.
  362. break;
  363. }
  364. return nullptr;
  365. }
  366. void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
  367. MachineBasicBlock &MBB) {
  368. // This algorithm doesn't delete the instructions it is replacing
  369. // right away. By leaving the existing instructions in place, the
  370. // register liveness information doesn't change, and this makes the
  371. // analysis that goes on be better than if the replaced instructions
  372. // were immediately removed.
  373. //
  374. // This algorithm always creates a replacement instruction
  375. // and notes that and the original in a data structure, until the
  376. // whole BB has been analyzed. This keeps the replacement instructions
  377. // from making it seem as if the larger register might be live.
  378. SmallVector<std::pair<MachineInstr *, MachineInstr *>, 8> MIReplacements;
  379. // Start computing liveness for this block. We iterate from the end to be able
  380. // to update this for each instruction.
  381. LiveRegs.clear();
  382. // We run after PEI, so we need to AddPristinesAndCSRs.
  383. LiveRegs.addLiveOuts(MBB);
  384. OptForSize = MF.getFunction().hasOptSize() ||
  385. llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
  386. for (MachineInstr &MI : llvm::reverse(MBB)) {
  387. if (MachineInstr *NewMI = tryReplaceInstr(&MI, MBB))
  388. MIReplacements.push_back(std::make_pair(&MI, NewMI));
  389. // We're done with this instruction, update liveness for the next one.
  390. LiveRegs.stepBackward(MI);
  391. }
  392. while (!MIReplacements.empty()) {
  393. MachineInstr *MI = MIReplacements.back().first;
  394. MachineInstr *NewMI = MIReplacements.back().second;
  395. MIReplacements.pop_back();
  396. MBB.insert(MI, NewMI);
  397. MBB.erase(MI);
  398. }
  399. }