//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ParallelSnippetGenerator.h"

#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing instructions to be
// the availability of the CPU resources (e.g. execution ports) needed to
// execute them, instead of the availability of their data dependencies.
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// from and writes to EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

namespace llvm {
namespace exegesis {

static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      return true;
  return false;
}

ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;

void ParallelSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
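  // Each template gets a distinct offset into the scratch space (I * MemStep)
  // so that the repeated memory accesses do not alias and therefore do not
  // create memory dependencies between the generated instructions.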
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

enum class RegRandomizationStrategy : uint8_t {
  PickRandomRegs,
  SingleStaticRegPerOperand,
  SingleStaticReg,

  FIRST = PickRandomRegs,
  LAST = SingleStaticReg,
};

} // namespace exegesis

template <>
struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
  static constexpr bool is_iterable = true;
};

namespace exegesis {

const char *getDescription(RegRandomizationStrategy S) {
  switch (S) {
  case RegRandomizationStrategy::PickRandomRegs:
    return "randomizing registers";
  case RegRandomizationStrategy::SingleStaticRegPerOperand:
    return "one unique register for each position";
  case RegRandomizationStrategy::SingleStaticReg:
    return "reusing the same register for all positions";
  }
  llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
}

static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
    const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
         !IT.getValueFor(Op).isValid());
  assert((!Op.isUse() || !Op.isTied()) &&
         "Not expecting to see a tied use reg");

  if (Op.isUse()) {
    switch (S) {
    case RegRandomizationStrategy::PickRandomRegs:
      break;
    case RegRandomizationStrategy::SingleStaticReg:
    case RegRandomizationStrategy::SingleStaticRegPerOperand: {
      if (!Instructions.empty())
        return Instructions.front().getValueFor(Op);
      if (S != RegRandomizationStrategy::SingleStaticReg)
        break;
      BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      if (std::optional<int> CommonBit =
              getFirstCommonBit(PossibleRegisters, UseAliases))
        return *CommonBit;
      break;
    }
    }
  }

  BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
  remove(PossibleRegisters, ForbiddenRegisters);

  if (Op.isDef()) {
    remove(PossibleRegisters, ImplicitUseAliases);
    const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
    remove(PossibleRegisters, UseAliases);
  }

  if (Op.isUse()) {
    remove(PossibleRegisters, ImplicitDefAliases);
    // NOTE: in general, using the same reg for multiple uses is fine.
    if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
      const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
      remove(PossibleRegisters, UseAliases);
    }
  }

  bool IsDefWithTiedUse =
      Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
  if (Op.isUse() || IsDefWithTiedUse) {
    // Now, the important bit: if we have already used some register for a
    // def, then we cannot use that same register for *any* use, be it an
    // untied use or a use tied to a def.
    // But def-ing the same regs is fine, as long as there are no uses!
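    // For example, if an earlier instruction in the snippet def'd RAX,
    // picking RAX (or any aliasing register, such as EAX) for this use
    // would recreate exactly the read-after-write dependency we are trying
    // to avoid.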
    const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
    remove(PossibleRegisters, DefsAliases);
  }

  if (!PossibleRegisters.any())
    return std::nullopt;

  return randomBit(PossibleRegisters);
}

static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const BitVector &ForbiddenRegisters,
    const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
    BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
    const ArrayRef<InstructionTemplate> Instructions,
    RegRandomizationStrategy S) {
  const Instruction &Instr = IT.getInstr();
  for (const Operand &Op : Instr.Operands) {
    if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
        IT.getValueFor(Op).isValid())
      continue;
    assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");

    std::variant<std::nullopt_t, MCOperand, Register> R =
        generateSingleRegisterForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Op, Instructions, S);

    if (std::holds_alternative<std::nullopt_t>(R))
      return {};

    MCOperand MCOp;
    if (std::holds_alternative<MCOperand>(R))
      MCOp = std::get<MCOperand>(R);
    else {
      Register RandomReg = std::get<Register>(R);
      if (Op.isDef())
        Defs.set(RandomReg);
      if (Op.isUse())
        Uses.set(RandomReg);
      MCOp = MCOperand::createReg(RandomReg);
    }
    IT.getValueFor(Op) = MCOp;
  }
  return IT;
}

static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
    const LLVMState &State, const InstructionTemplate &IT,
    RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
  // We don't want to accidentally serialize the instruction,
  // so we must be sure that we don't pick a def that is an implicit use,
  // or a use that is an implicit def, so record implicit regs now.
  BitVector ImplicitUses(State.getRegInfo().getNumRegs());
  BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
  for (const auto &Op : IT.getInstr().Operands) {
    if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
      assert(Op.isImplicitReg() && "Not an implicit register operand?");
      if (Op.isUse())
        ImplicitUses.set(Op.getImplicitReg());
      else {
        assert(Op.isDef() && "Not a use and not a def?");
        ImplicitDefs.set(Op.getImplicitReg());
      }
    }
  }
  const BitVector ImplicitUseAliases =
      getAliasedBits(State.getRegInfo(), ImplicitUses);
  const BitVector ImplicitDefAliases =
      getAliasedBits(State.getRegInfo(), ImplicitDefs);
  BitVector Defs(State.getRegInfo().getNumRegs());
  BitVector Uses(State.getRegInfo().getNumRegs());
  std::vector<InstructionTemplate> Instructions;

  while (true) {
    std::optional<InstructionTemplate> TmpIT =
        generateSingleSnippetForInstrAvoidingDefUseOverlap(
            State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
            Uses, Defs, IT, Instructions, S);
    if (!TmpIT)
      return Instructions;
    Instructions.push_back(std::move(*TmpIT));
    if (!hasVariablesWithTiedOperands(IT.getInstr()))
      return Instructions;
    assert(Instructions.size() <= 128 && "Stuck in endless loop?");
  }
}

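// Note: an illustrative sketch, assuming an x86 target. For an instruction
// with a tied variable, e.g. ADD64rr (the destination is tied to the first
// source), generateSnippetForInstrAvoidingDefUseOverlap() keeps emitting
// instructions until registers run out, so SingleStaticReg would roughly
// produce the "imul" pattern from the file header:
//    add rax, rbx
//    add rcx, rbx
//    add rdx, rbx
// whereas PickRandomRegs would also draw the source register at random for
// each instruction. For instructions without tied variables, the snippet
// holds a single instruction template, which the runner repeats.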
Expected<std::vector<CodeTemplate>>
ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();
  CodeTemplate CT;
  CT.ScratchSpacePointerInReg =
      Instr.hasMemoryOperands()
          ? State.getExegesisTarget().getScratchMemoryRegister(
                State.getTargetMachine().getTargetTriple())
          : 0;
  const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(Variant));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }

  std::vector<CodeTemplate> Result;
  bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
  // If there are no tied operands, then we don't want to "saturate the
  // backedge", and the template we produce will have only a single
  // instruction.
  unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
    return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
           !Op.isTied();
  });
  SmallVector<RegRandomizationStrategy, 3> Strategies;
  if (HasTiedOperands || NumUntiedUseRegs >= 3)
    Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
  if (NumUntiedUseRegs >= 2)
    Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
  Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);

  for (RegRandomizationStrategy S : Strategies) {
    CodeTemplate CurrCT = CT.clone();
    CurrCT.Info =
        Twine("instruction has ")
            .concat(HasTiedOperands ? "" : "no ")
            .concat("tied variables, avoiding "
                    "Read-After-Write issue, picking random def and use "
                    "registers not aliasing each other, for uses, ")
            .concat(getDescription(S))
            .str();
    CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
        State, Variant, S, ForbiddenRegisters);
    if (CurrCT.Instructions.empty())
      return make_error<StringError>(
          Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
          inconvertibleErrorCode());
    instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
                              CurrCT.Instructions);
    Result.push_back(std::move(CurrCT));
  }
  return Result;
}

constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm