//===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "ParallelSnippetGenerator.h" #include "BenchmarkRunner.h" #include "MCInstrDescView.h" #include "Target.h" // FIXME: Load constants into registers (e.g. with fld1) to not break // instructions like x87. // Ideally we would like the only limitation on executing instructions to be the // availability of the CPU resources (e.g. execution ports) needed to execute // them, instead of the availability of their data dependencies. // To achieve that, one approach is to generate instructions that do not have // data dependencies between them. // // For some instructions, this is trivial: // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // mov rax, qword ptr [rsi] // For the above snippet, haswell just renames rax four times and executes the // four instructions two at a time on P23 and P0126. // // For some instructions, we just need to make sure that the source is // different from the destination. For example, IDIV8r reads from GPR and // writes to AX. We just need to ensure that the Var is assigned a // register which is different from AX: // idiv bx // idiv bx // idiv bx // idiv bx // The above snippet will be able to fully saturate the ports, while the same // with ax would issue one uop every `latency(IDIV8r)` cycles. // // Some instructions make this harder because they both read and write from // the same register: // inc rax // inc rax // inc rax // inc rax // This has a data dependency from each instruction to the next, limit the // number of instructions that can be issued in parallel. // It turns out that this is not a big issue on recent Intel CPUs because they // have heuristics to balance port pressure. In the snippet above, subsequent // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs // might end up executing them all on P0 (just because they can), or try // avoiding P5 because it's usually under high pressure from vector // instructions. // This issue is even more important for high-latency instructions because // they increase the idle time of the CPU, e.g. : // imul rax, rbx // imul rax, rbx // imul rax, rbx // imul rax, rbx // // To avoid that, we do the renaming statically by generating as many // independent exclusive assignments as possible (until all possible registers // are exhausted) e.g.: // imul rax, rbx // imul rcx, rbx // imul rdx, rbx // imul r8, rbx // // Some instruction even make the above static renaming impossible because // they implicitly read and write from the same operand, e.g. ADC16rr reads // and writes from EFLAGS. // In that case we just use a greedy register assignment and hope for the // best. namespace llvm { namespace exegesis { static SmallVector getVariablesWithTiedOperands(const Instruction &Instr) { SmallVector Result; for (const auto &Var : Instr.Variables) if (Var.hasTiedOperands()) Result.push_back(&Var); return Result; } ParallelSnippetGenerator::~ParallelSnippetGenerator() = default; void ParallelSnippetGenerator::instantiateMemoryOperands( const unsigned ScratchSpacePointerInReg, std::vector &Instructions) const { if (ScratchSpacePointerInReg == 0) return; // no memory operands. const auto &ET = State.getExegesisTarget(); const unsigned MemStep = ET.getMaxMemoryAccessSize(); const size_t OriginalInstructionsSize = Instructions.size(); size_t I = 0; for (InstructionTemplate &IT : Instructions) { ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); ++I; } while (Instructions.size() < kMinNumDifferentAddresses) { InstructionTemplate IT = Instructions[I % OriginalInstructionsSize]; ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep); ++I; Instructions.push_back(std::move(IT)); } assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize && "not enough scratch space"); } static std::vector generateSnippetUsingStaticRenaming( const LLVMState &State, const InstructionTemplate &IT, const ArrayRef TiedVariables, const BitVector &ForbiddenRegisters) { std::vector Instructions; // Assign registers to variables in a round-robin manner. This is simple but // ensures that the most register-constrained variable does not get starved. std::vector PossibleRegsForVar; for (const Variable *Var : TiedVariables) { assert(Var); const Operand &Op = IT.getInstr().getPrimaryOperand(*Var); assert(Op.isReg()); BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits(); remove(PossibleRegs, ForbiddenRegisters); PossibleRegsForVar.push_back(std::move(PossibleRegs)); } SmallVector Iterators(TiedVariables.size(), 0); while (true) { InstructionTemplate TmpIT = IT; // Find a possible register for each variable in turn, marking the // register as taken. for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) { const int NextPossibleReg = PossibleRegsForVar[VarId].find_next(Iterators[VarId]); if (NextPossibleReg <= 0) { return Instructions; } TmpIT.getValueFor(*TiedVariables[VarId]) = MCOperand::createReg(NextPossibleReg); // Bump iterator. Iterators[VarId] = NextPossibleReg; // Prevent other variables from using the register. for (BitVector &OtherPossibleRegs : PossibleRegsForVar) { OtherPossibleRegs.reset(NextPossibleReg); } } Instructions.push_back(std::move(TmpIT)); } } Expected> ParallelSnippetGenerator::generateCodeTemplates( InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const { const Instruction &Instr = Variant.getInstr(); CodeTemplate CT; CT.ScratchSpacePointerInReg = Instr.hasMemoryOperands() ? State.getExegesisTarget().getScratchMemoryRegister( State.getTargetMachine().getTargetTriple()) : 0; const AliasingConfigurations SelfAliasing(Instr, Instr); if (SelfAliasing.empty()) { CT.Info = "instruction is parallel, repeating a random one."; CT.Instructions.push_back(std::move(Variant)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } if (SelfAliasing.hasImplicitAliasing()) { CT.Info = "instruction is serial, repeating a random one."; CT.Instructions.push_back(std::move(Variant)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } const auto TiedVariables = getVariablesWithTiedOperands(Instr); if (!TiedVariables.empty()) { CT.Info = "instruction has tied variables, using static renaming."; CT.Instructions = generateSnippetUsingStaticRenaming( State, Variant, TiedVariables, ForbiddenRegisters); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } // No tied variables, we pick random values for defs. // We don't want to accidentally serialize the instruction, // so we must be sure that we don't pick a def that is an implicit use, // or a use that is an implicit def, so record implicit regs now. BitVector ImplicitUses(State.getRegInfo().getNumRegs()); BitVector ImplicitDefs(State.getRegInfo().getNumRegs()); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) { assert(Op.isImplicitReg() && "Not an implicit register operand?"); if (Op.isUse()) ImplicitUses.set(Op.getImplicitReg()); else { assert(Op.isDef() && "Not a use and not a def?"); ImplicitDefs.set(Op.getImplicitReg()); } } } const auto ImplicitUseAliases = getAliasedBits(State.getRegInfo(), ImplicitUses); const auto ImplicitDefAliases = getAliasedBits(State.getRegInfo(), ImplicitDefs); BitVector Defs(State.getRegInfo().getNumRegs()); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); // Do not use forbidden registers and regs that are implicitly used. // Note that we don't try to avoid using implicit defs explicitly. remove(PossibleRegisters, ForbiddenRegisters); remove(PossibleRegisters, ImplicitUseAliases); if (!PossibleRegisters.any()) return make_error( Twine("no available registers:\ncandidates:\n") .concat(debugString(State.getRegInfo(), Op.getRegisterAliasing().sourceBits())) .concat("\nforbidden:\n") .concat(debugString(State.getRegInfo(), ForbiddenRegisters)) .concat("\nimplicit use:\n") .concat(debugString(State.getRegInfo(), ImplicitUseAliases)), inconvertibleErrorCode()); const auto RandomReg = randomBit(PossibleRegisters); Defs.set(RandomReg); Variant.getValueFor(Op) = MCOperand::createReg(RandomReg); } } // And pick random use values that are not reserved and don't alias with defs. // Note that we don't try to avoid using implicit uses explicitly. const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); remove(PossibleRegisters, ForbiddenRegisters); remove(PossibleRegisters, DefAliases); remove(PossibleRegisters, ImplicitDefAliases); assert(PossibleRegisters.any() && "No register left to choose from"); const auto RandomReg = randomBit(PossibleRegisters); Variant.getValueFor(Op) = MCOperand::createReg(RandomReg); } } CT.Info = "instruction has no tied variables picking Uses different from defs"; CT.Instructions.push_back(std::move(Variant)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return getSingleton(std::move(CT)); } constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses; } // namespace exegesis } // namespace llvm