//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Host.h"

#include <memory>
#include <string>
#include <vector>

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif

#if defined(__x86_64__) && defined(_MSC_VER)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif

namespace llvm {
namespace exegesis {

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
//
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));

static cl::opt<bool>
    DisableUpperSSERegisters("x86-disable-upper-sse-registers",
                             cl::desc("Disable XMM8-XMM15 register usage"),
                             cl::cat(BenchmarkOptions), cl::init(false));
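
// Illustrative usage (not part of this file's logic): these flags are read
// from the llvm-exegesis command line, e.g.
//   llvm-exegesis -mode=latency -opcode-name=ADD64rr -x86-lbr-sample-period=521
// The -mode and -opcode-name flags shown above are assumed from the common
// llvm-exegesis CLI; only the x86-* options are defined in this file.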

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
  case X86::WRFSBASE:
  case X86::WRFSBASE64:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` takes the
// addressing base and index registers and narrows the set of candidate LEA
// destination registers.
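// For example, with BaseReg = RAX, IndexReg = RCX, LogScale = 1 and Disp = 42,
// the generated instruction is `LEA 42(%RAX, %RCX, 2)` and the template's
// Config string is "42(%RAX, %RCX, 2)" (see the formatv() call below).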
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }
  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = ESP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = ESP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant and provides methods to load the stack value into a register.
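// Typical use, as in setRegTo() below (illustrative):
//   ConstantInliner CI(Value);
//   return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
// which emits SUB RSP, 16; MOV?mi stores of the constant; MOVDQU Reg, [RSP];
// ADD RSP, 16.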
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = ESP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}
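
// Allocates `Bytes` of stack space and copies `Constant_` into it, widening
// the constant by sign-extension if needed and storing 4, 2, then 1 byte(s)
// at a time so that any byte count is covered.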
void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {
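// Saves the host's x87/MMX/SSE state (FXSAVE64) and EFLAGS on construction and
// restores them on destruction; used via withSavedState() below so that a
// benchmarked snippet cannot leave the process in a corrupted state.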
class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#ifdef __x86_64__
# if defined(_MSC_VER)
    _fxsave64(FPState);
    Eflags = __readeflags();
# elif defined(__GNUC__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
#ifdef __x86_64__
# if defined(_MSC_VER)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
# elif defined(__GNUC__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
# endif
#else
    llvm_unreachable("X86 exegesis running on non-X86 target");
#endif
  }

private:
#ifdef __x86_64__
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without
      // __linux__ (for now)
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) &&                \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    if (DisableUpperSSERegisters)
      return ArrayRef(kUnavailableRegistersSSE,
                      sizeof(kUnavailableRegistersSSE) /
                          sizeof(kUnavailableRegistersSSE[0]));
    return ArrayRef(kUnavailableRegisters, std::size(kUnavailableRegisters));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine because
    // the counter uses some Intel-specific magic and it could get confused and
    // think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) ||            \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    llvm_unreachable("Running X86 exegesis on non-X86 target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
  static const unsigned kUnavailableRegistersSSE[12];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
// decoder load.
const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
    X86::AH,    X86::BH,    X86::CH,    X86::DH,    X86::XMM8,  X86::XMM9,
    X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}
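
// Note: RCX and RDI below are the first integer argument registers of the
// Windows and SysV x86-64 calling conventions, respectively; presumably the
// scratch memory pointer is handed to the snippet as its first argument.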
unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}
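
// Emits `ADD kLoopCounterReg, -1` followed by `JNE TargetMBB`: the ADD updates
// ZF, so the backward jump is taken until the loop counter reaches zero.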
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}
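
// Materializes `Value` into `Reg`: an immediate move for GPRs, a stack-backed
// load (via ConstantInliner) for mask/MMX/SSE/AVX/x87 registers, POPF for
// EFLAGS, and LDMXCSR/FLDCW for MXCSR/FPCW. Returns an empty vector for
// registers that are not handled yet.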
std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
      X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
    switch (Value.getBitWidth()) {
    case 8:
      if (STI.getFeatureBits()[X86::FeatureDQI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVBkm);
      }
      [[fallthrough]];
    case 16:
      if (STI.getFeatureBits()[X86::FeatureAVX512]) {
        ConstantInliner CI(Value.zextOrTrunc(16));
        return CI.loadAndFinalize(Reg, 16, X86::KMOVWkm);
      }
      break;
    case 32:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVDkm);
      }
      break;
    case 64:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVQkm);
      }
      break;
    }
  }
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
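// For example, an instruction with an OPERAND_COND_CODE variable (e.g.
// CMOV64rr or SETCCr) is expanded into one variant per condition code, from
// COND_O up to LAST_VALID_COND, capped at MaxConfigsPerOpcode.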
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);

  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });

  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm