//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <float.h>
#include <immintrin.h>
#include <intrin.h>
#endif

namespace llvm {
namespace exegesis {

static cl::OptionCategory
    BenchmarkOptions("llvm-exegesis benchmark x86-options");

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` is given
// the addressing base and index registers and restricts the set of candidate
// LEA destination registers.
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");
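  // An X86 memory reference is 5 consecutive operands: base register, scale
  // amount, index register, displacement and segment register. For LEA the
  // memory reference starts at operand 1, right after the destination.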
  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }
  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
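// With MOV32mi, for example, this assembles to `movl $Imm, OffsetBytes(%rsp)`.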
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}
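
// Allocates `Bytes` on the stack and writes the (sign-extended) constant into
// it in 4-byte, then 2-byte, then 1-byte chunks.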
void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions, so make sure
    // these exceptions are flushed now.
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // We can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&               \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    return makeArrayRef(kUnavailableRegisters,
                        sizeof(kUnavailableRegisters) /
                            sizeof(kUnavailableRegisters[0]));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                             \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine, because the
    // counter uses some Intel-specific magic and could otherwise get confused
    // and think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||           \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}
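
// Note: RDI and RCX are the first integer argument registers of the SysV and
// Win64 calling conventions, respectively.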
unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}
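
// Emits `add $-1, <counter>` followed by `jne TargetMBB`, i.e. loops back to
// TargetMBB until the counter reaches zero.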
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}
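
// GPRs are materialized directly with a MOVri; vector, x87, and status
// registers go through scratch stack memory via ConstantInliner.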
std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm