//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif
#if defined(__x86_64__) && defined(_MSC_VER)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif

namespace llvm {
namespace exegesis {

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

static cl::opt<bool>
    DisableUpperSSERegisters("x86-disable-upper-sse-registers",
                             cl::desc("Disable XMM8-XMM15 register usage"),
                             cl::cat(BenchmarkOptions), cl::init(false));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r:
  case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r:
  case X86II::MRM0X: case X86II::MRM1X: case X86II::MRM2X: case X86II::MRM3X:
  case X86II::MRM4X: case X86II::MRM5X: case X86II::MRM6X: case X86II::MRM7X:
  case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
  case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
  case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
  case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
  case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
  case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
  case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
  case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
  case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
  case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
  case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
  case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
  case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
  case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
  case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
  case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
  case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m:
  case X86II::MRM4m: case X86II::MRM5m: case X86II::MRM6m: case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}
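// Note: the string instructions (e.g. MOVS/STOS/LODS, encoded with the
// RawFrmSrc/RawFrmDst/RawFrmDstSrc forms above) address memory implicitly
// through RSI/RDI, which is why they are rejected as "non uniform memory
// access" instead of being handled like the MRM*m forms.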
// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
  case X86::WRFSBASE:
  case X86::WRFSBASE64:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` is given
// the addressing base and index registers and restricts the set of candidate
// LEA destination registers.
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }
  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
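          // For uops/parallel measurements the destination must not alias the
          // base or index register, otherwise consecutive LEAs would form a
          // dependency chain (the serial generator above does the opposite).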
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant and provides methods to load the stack value into a register.
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
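  // Usage sketch (see setRegTo() below): e.g. materializing a 128-bit constant
  // into an XMM register,
  //   ConstantInliner CI(Value);
  //   return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  // expands to SUB64ri8 (allocate 16 bytes), four MOV32mi stores, a MOVDQUrm
  // load into Reg, and ADD64ri8 (release).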
  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions, make sure
    // these exceptions are flushed now.
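    // (_clearfp() on MSVC clears the pending exception flags; the fwait on
    // GCC/Clang forces any pending exception to be delivered before the saved
    // environment is restored.)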
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&                \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, "
          "LIBPFM_HAS_FIELD_CYCLES, or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    if (DisableUpperSSERegisters)
      return ArrayRef(kUnavailableRegistersSSE,
                      sizeof(kUnavailableRegistersSSE) /
                          sizeof(kUnavailableRegistersSSE[0]));
    return ArrayRef(kUnavailableRegisters, std::size(kUnavailableRegisters));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the
    // benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine, because
    // the counter uses some Intel-specific magic and it could get
    // confused and think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
  static const unsigned kUnavailableRegistersSSE[12];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
// decoder load.
const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
    X86::AH,    X86::BH,    X86::CH,    X86::DH,    X86::XMM8,  X86::XMM9,
    X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
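  // Adding the bias converts the memory-operand number into the absolute
  // operand index of the base register, so the five setMemOp() calls below
  // fill BaseReg, ScaleAmt, IndexReg, Disp and Segment in order.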
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
      X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
    switch (Value.getBitWidth()) {
    case 8:
      if (STI.getFeatureBits()[X86::FeatureDQI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVBkm);
      }
      [[fallthrough]];
    case 16:
      if (STI.getFeatureBits()[X86::FeatureAVX512]) {
        ConstantInliner CI(Value.zextOrTrunc(16));
        return CI.loadAndFinalize(Reg, 16, X86::KMOVWkm);
      }
      break;
    case 32:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVDkm);
      }
      break;
    case 64:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVQkm);
      }
      break;
    }
  }
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
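// For example, a conditional move such as CMOV32rr carries a condition-code
// operand (OPERAND_COND_CODE); the code below enumerates one choice per valid
// condition code and emits one InstructionTemplate per combination, capped at
// MaxConfigsPerOpcode.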
std::vector<InstructionTemplate>
ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);

  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm