  1. //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "ParallelSnippetGenerator.h"
  9. #include "BenchmarkRunner.h"
  10. #include "MCInstrDescView.h"
  11. #include "Target.h"
// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.
// Ideally we would like the only limitation on executing instructions to be the
// availability of the CPU resources (e.g. execution ports) needed to execute
// them, instead of the availability of their data dependencies.
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
// idiv bx
// idiv bx
// idiv bx
// idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
// inc rax
// inc rax
// inc rax
// inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
// imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
// imul rax, rbx
// imul rcx, rbx
// imul rdx, rbx
// imul r8, rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
  73. namespace llvm {
  74. namespace exegesis {
  75. static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  76. SmallVector<const Variable *, 8> Result;
  77. for (const auto &Var : Instr.Variables)
  78. if (Var.hasTiedOperands())
  79. return true;
  80. return false;
  81. }
  82. ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
  83. void ParallelSnippetGenerator::instantiateMemoryOperands(
  84. const unsigned ScratchSpacePointerInReg,
  85. std::vector<InstructionTemplate> &Instructions) const {
  86. if (ScratchSpacePointerInReg == 0)
  87. return; // no memory operands.
  88. const auto &ET = State.getExegesisTarget();
  89. const unsigned MemStep = ET.getMaxMemoryAccessSize();
  90. const size_t OriginalInstructionsSize = Instructions.size();
  91. size_t I = 0;
  92. for (InstructionTemplate &IT : Instructions) {
  93. ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
  94. ++I;
  95. }
  96. while (Instructions.size() < kMinNumDifferentAddresses) {
  97. InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
  98. ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
  99. ++I;
  100. Instructions.push_back(std::move(IT));
  101. }
  102. assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
  103. "not enough scratch space");
  104. }
// How registers are assigned across the instructions of a generated snippet.
enum class RegRandomizationStrategy : uint8_t {
  // Pick a random candidate register for each operand of each instruction.
  PickRandomRegs,
  // Pick one register per operand position and reuse it in every instruction.
  SingleStaticRegPerOperand,
  // Pick a single register and reuse it for all positions.
  SingleStaticReg,

  // Bounds for iteration over this enum (see the enum_iteration_traits
  // specialization below).
  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};
  112. } // namespace exegesis
// Mark RegRandomizationStrategy as iterable so it can be enumerated with
// llvm's sequence utilities (presumably enum_seq from ADT/Sequence.h —
// TODO(review): confirm against the caller).
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};
  116. namespace exegesis {
  117. const char *getDescription(RegRandomizationStrategy S) {
  118. switch (S) {
  119. case RegRandomizationStrategy::PickRandomRegs:
  120. return "randomizing registers";
  121. case RegRandomizationStrategy::SingleStaticRegPerOperand:
  122. return "one unique register for each position";
  123. case RegRandomizationStrategy::SingleStaticReg:
  124. return "reusing the same register for all positions";
  125. }
  126. llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
  127. }
// Chooses a register for the explicit register operand \p Op of \p IT such
// that defs and uses do not alias each other (which would serialize the
// snippet). \p Uses / \p Defs hold the registers already picked for previous
// operands and instructions. Returns one of:
//  - std::nullopt_t: no acceptable register remains;
//  - MCOperand: reuse the value a previously generated instruction picked
//    for this operand (static strategies);
//  - Register: a freshly chosen register.
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");
  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break; // Fall through to the random selection below.
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      // Static strategies: reuse whatever the first generated instruction
      // picked for this operand position.
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      // SingleStaticReg only (first instruction): prefer a register that is
      // already recorded as a use, so that all use positions share one reg.
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }
  // Random (or fallback) selection: start from all registers this operand can
  // take and carve away everything that would create a dependency.
  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);
  if (Op.isDef()) {
    // A def must not alias an implicit use, nor any explicit use picked so
    // far, or subsequent instructions would read what we just wrote.
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }
  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using same reg for multiple Use's is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      // Per-operand static registers must be unique, so exclude registers
      // already taken by other use positions.
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }
  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, important bit: if we have used some register for def,
    // then we can not use that same register for *any* use,
    // be it either an untied use, or an use tied to a def.
    // But def-ing same regs is fine, as long as there are no uses!
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }
  if (!PossibleRegisters.any())
    return std::nullopt;
  return randomBit(PossibleRegisters);
}
// Produces one instance of \p IT with every unset explicit register operand
// assigned per strategy \p S, recording newly picked registers in \p Uses /
// \p Defs so later instructions can avoid them. Returns std::nullopt as soon
// as any operand runs out of candidate registers.
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    // Only explicit, non-memory register operands that are still unassigned
    // need a register; everything else is left untouched.
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");
    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);
    // No candidate register left for this operand: give up on this instance.
    if (std::holds_alternative<std::nullopt_t>(R))
      return {};
    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      // Reused a value from a previously generated instruction as-is.
      MCOp = std::get<MCOperand>(R);
    else {
      // Freshly picked register: record it so later operands/instructions
      // avoid conflicting with it.
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}
// Generates as many mutually independent instances of \p IT as register
// pressure allows under strategy \p S. For instructions without tied
// variables a single instance is produced.
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
  // Expand the implicit registers to everything they alias with.
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);
  // Registers picked so far, shared across all generated instances.
  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;
  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    // Ran out of candidate registers: return what we have (may be empty).
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    // Without tied variables a single instance is enough.
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}
// Builds one or more CodeTemplates for \p Variant whose instructions can
// execute in parallel (no data dependencies between repetitions), one per
// applicable register randomization strategy. Fails if a strategy produced
// no instructions at all.
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  // Instructions with memory operands address a scratch buffer through a
  // target-provided pointer register; 0 means no scratch space is needed.
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  if (SelfAliasing.empty()) {
    // The instruction cannot alias itself: repeating one random instance is
    // already parallel.
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    // Implicit operands alias each other; no register assignment can break
    // the dependency chain, so just repeat one instance.
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate backedge",
  // and the template we will produce will have only a single instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  // Decide which strategies are worth trying for this instruction.
  // NOTE(review): the thresholds (>=3 / >=2 untied use registers) appear to
  // ensure each strategy can actually produce distinct assignments — confirm.
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    // A strategy that produced nothing is a hard error for this variant.
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}
  321. constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
  322. } // namespace exegesis
  323. } // namespace llvm