//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
// idiv bx
// idiv bx
// idiv bx
// idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
// inc rax
// inc rax
// inc rax
// inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
// imul rax, rbx
// imul rcx, rbx
// imul rdx, rbx
// imul r8, rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// from and writes to EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

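// Returns the variables of `Instr` that have tied operands, i.e. variables
// whose operands are constrained to be assigned the same register.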
static SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

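// Assigns each instruction's memory operands a distinct offset into the
// scratch space, duplicating instructions if needed so that at least
// kMinNumDifferentAddresses different addresses are used.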
void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

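// Generates as many independent copies of `IT` as possible by assigning each
// tied variable, in every copy, a register that no other copy uses. Registers
// are handed out round-robin from each variable's non-forbidden candidates.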
static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
    const LLVMState &State, const InstructionTemplate &IT,
    const ArrayRef<const Variable *> TiedVariables,
    const BitVector &ForbiddenRegisters) {
  std::vector<InstructionTemplate> Instructions;
  // Assign registers to variables in a round-robin manner. This is simple but
  // ensures that the most register-constrained variable does not get starved.
  std::vector<BitVector> PossibleRegsForVar;
  for (const Variable *Var : TiedVariables) {
    assert(Var);
    const Operand &Op = IT.getInstr().getPrimaryOperand(*Var);
    assert(Op.isReg());
    BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits();
    remove(PossibleRegs, ForbiddenRegisters);
    PossibleRegsForVar.push_back(std::move(PossibleRegs));
  }
  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
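  // Keep creating copies until one of the variables runs out of registers to
  // assign.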
  while (true) {
    InstructionTemplate TmpIT = IT;
    // Find a possible register for each variable in turn, marking the
    // register as taken.
    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
      const int NextPossibleReg =
          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
      if (NextPossibleReg <= 0) {
        return Instructions;
      }
      TmpIT.getValueFor(*TiedVariables[VarId]) =
          MCOperand::createReg(NextPossibleReg);
      // Bump iterator.
      Iterators[VarId] = NextPossibleReg;
      // Prevent other variables from using the register.
      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
        OtherPossibleRegs.reset(NextPossibleReg);
      }
    }
    Instructions.push_back(std::move(TmpIT));
  }
}

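// Creates a code template for `Variant`: depending on how the instruction
// aliases with itself, it is either repeated as-is, its tied variables are
// statically renamed, or its defs and uses are picked so that they do not
// alias each other.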
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
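  // Compute how the instruction's defs alias with its own uses. Without any
  // aliasing the instruction is already parallel; if the aliasing goes through
  // implicit operands it cannot be removed by renaming. In both cases we just
  // repeat the variant.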
  const AliasingConfigurations SelfAliasing(Instr, Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    CT.Info = "instruction has tied variables, using static renaming.";
    CT.Instructions = generateSnippetUsingStaticRenaming(
        State, Variant, TiedVariables, ForbiddenRegisters);
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }

  // No tied variables, we pick random values for defs.
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
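  // Widen the implicit sets to every register that aliases them, so that e.g.
  // sub-registers of an implicit use are also avoided when picking defs.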
  const auto ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const auto ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);
  BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      // Do not use forbidden registers and regs that are implicitly used.
      // Note that we don't try to avoid using implicit defs explicitly.
      remove(PossibleRegisters, ForbiddenRegisters);
      remove(PossibleRegisters, ImplicitUseAliases);
      if (!PossibleRegisters.any())
        return make_error<StringError>(
            Twine("no available registers:\ncandidates:\n")
                .concat(debugString(State.getRegInfo(),
                                    Op.getRegisterAliasing().sourceBits()))
                .concat("\nforbidden:\n")
                .concat(debugString(State.getRegInfo(), ForbiddenRegisters))
                .concat("\nimplicit use:\n")
                .concat(debugString(State.getRegInfo(), ImplicitUseAliases)),
            inconvertibleErrorCode());
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  // Note that we don't try to avoid using implicit uses explicitly.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ForbiddenRegisters);
      remove(PossibleRegisters, DefAliases);
      remove(PossibleRegisters, ImplicitDefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables, picking Uses different from Defs.";
  CT.Instructions.push_back(std::move(Variant));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm