X86ScheduleBdVer2.td 57 KB


  1. //=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling -*- tablegen -*-=//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines the machine model for AMD bdver2 (Piledriver) to support
  10. // instruction scheduling and other instruction cost heuristics.
  11. // Based on:
  12. // * AMD Software Optimization Guide for AMD Family 15h Processors.
  13. // https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
  14. // * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
  15. // http://www.agner.org/optimize/microarchitecture.pdf
  16. // * https://www.realworldtech.com/bulldozer/
  17. // Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
  18. //
  19. //===----------------------------------------------------------------------===//
  // Scheduling machine model record for AMD bdver2 (Piledriver). The
  // definitions that follow are attached to it through
  // `let SchedModel = BdVer2Model in { ... }`. MicroOpBufferSize uses the same
  // 128-entry guess as the PdRCU retire control unit declared further down.
  20. def BdVer2Model : SchedMachineModel {
  21. let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired.
  22. let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed.
  23. let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer.
  24. let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency.
  25. let HighLatency = 25; // FIXME: any better choice?
  26. let MispredictPenalty = 20; // Minimum branch misdirection penalty.
  27. let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
  28. // FIXME: Incomplete. This flag is set to allow the scheduler to assign
  29. // a default model to unrecognized opcodes.
  30. let CompleteModel = 0;
  31. } // SchedMachineModel
  32. let SchedModel = BdVer2Model in {
  33. //===----------------------------------------------------------------------===//
  34. // Pipes
  35. //===----------------------------------------------------------------------===//
  36. // There are a total of eight pipes.
  37. //===----------------------------------------------------------------------===//
  38. // Integer execution pipes
  39. //
  40. // Two EX (ALU) pipes.
  41. def PdEX0 : ProcResource<1>; // ALU, Integer Pipe0
  42. def PdEX1 : ProcResource<1>; // ALU, Integer Pipe1
  43. def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>;
  44. // Two AGLU pipes, identical.
  45. def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]
  46. //===----------------------------------------------------------------------===//
  47. // Floating point execution pipes
  48. //
  49. // Four FPU pipes.
  50. def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
  51. def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
  52. def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
  53. def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3
  54. // FPU grouping
  55. def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>;
  56. def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>;
  57. //===----------------------------------------------------------------------===//
  58. // RCU
  59. //===----------------------------------------------------------------------===//
  60. // The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
  61. // On the other hand, the RCU reorder buffer size for Piledriver does not
  62. // seem to be specified in any trustworthy source.
  63. // But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had
  64. // RCU reorder buffer size of 128. So that is a good guess for now.
  65. def PdRCU : RetireControlUnit<128, 4>;
  66. //===----------------------------------------------------------------------===//
  67. // Pipelines
  68. //===----------------------------------------------------------------------===//
  69. // There are a total of two pipelines, each one with its own scheduler.
  70. //===----------------------------------------------------------------------===//
  71. // Integer Pipeline Scheduling
  72. //
  73. // There is one Integer Scheduler per core.
  74. // Integer physical register file has 96 registers of 64-bit.
  75. def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>;
  76. // Unified Integer, Memory Scheduler has 40 entries.
  77. def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
  78. // Scheduler entries shared by the grouped EX and AGLU pipes.
  79. let BufferSize = 40;
  80. }
  81. //===----------------------------------------------------------------------===//
  82. // FPU Pipeline Scheduling
  83. //
  84. // The FPU unit is shared between the two cores.
  85. // FP physical register file has 160 registers of 128-bit.
  86. // Operations on 256-bit data types are cracked into two COPs.
  87. def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
  88. // Unified FP Scheduler has 64 entries.
  89. def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
  90. // Scheduler entries shared by the four grouped FPU pipes.
  91. let BufferSize = 64;
  92. }
  93. //===----------------------------------------------------------------------===//
  94. // Functional units
  95. //===----------------------------------------------------------------------===//
  96. //===----------------------------------------------------------------------===//
  97. // Load-Store Units
  98. //
  // Load resource (two units), declared with PdAGLU01 as its Super resource.
  99. let Super = PdAGLU01 in
  100. def PdLoad : ProcResource<2> {
  101. // For Piledriver, the load queue is 40 entries deep.
  102. let BufferSize = 40;
  103. }
  // Registers PdLoad as the resource backing the model's load queue.
  104. def PdLoadQueue : LoadQueue<PdLoad>;
  // Store resource (one unit), also declared with PdAGLU01 as its Super resource.
  105. let Super = PdAGLU01 in
  106. def PdStore : ProcResource<1> {
  107. // For Piledriver, the store queue is 24 entries deep.
  108. let BufferSize = 24;
  109. }
  // Registers PdStore as the resource backing the model's store queue.
  110. def PdStoreQueue : StoreQueue<PdStore>;
  111. //===----------------------------------------------------------------------===//
  112. // Integer Execution Units
  113. //
  114. def PdDiv : ProcResource<1>; // PdEX0; unpipelined integer division
  115. def PdCount : ProcResource<1>; // PdEX0; POPCNT, LZCNT
  116. def PdMul : ProcResource<1>; // PdEX1; integer multiplication
  117. def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches
  118. //===----------------------------------------------------------------------===//
  119. // Floating-Point Units
  120. //
  121. // Two FMAC/FPFMA units.
  122. def PdFPFMA : ProcResource<2>; // PdFPU0, PdFPU1
  123. // One 128-bit integer multiply-accumulate unit.
  124. def PdFPMMA : ProcResource<1>; // PdFPU0
  125. // One fp conversion unit.
  126. def PdFPCVT : ProcResource<1>; // PdFPU0
  127. // One unit for shuffles, packs, permutes, shifts.
  128. def PdFPXBR : ProcResource<1>; // PdFPU1
  129. // Two 128-bit packed integer units.
  130. def PdFPMAL : ProcResource<2>; // PdFPU2, PdFPU3
  131. // One FP store unit.
  132. def PdFPSTO : ProcResource<1>; // PdFPU3
  133. //===----------------------------------------------------------------------===//
  134. // Basic helper classes.
  135. //===----------------------------------------------------------------------===//
  136. // Many SchedWrites are defined in pairs with and without a folded load.
  137. // Instructions with folded loads are usually micro-fused, so they only appear
  138. // as two micro-ops when dispatched by the schedulers.
  139. // This multiclass defines the resource usage for variants with and without
  140. // folded loads.
  //
  // PdWriteRes itself emits a single WriteRes for SchedRW; the folded-load
  // counterpart is produced by __pdWriteResPair, which instantiates this
  // multiclass twice.
  //   SchedRW  - the SchedWrite being described.
  //   ExePorts - resources consumed by the write.
  //   Lat      - value assigned to Latency (default 1).
  //   Res      - value assigned to ResourceCycles (default: empty list).
  //   UOps     - value assigned to NumMicroOps (default 1).
  141. multiclass PdWriteRes<SchedWrite SchedRW,
  142. list<ProcResourceKind> ExePorts, int Lat = 1,
  143. list<int> Res = [], int UOps = 1> {
  144. def : WriteRes<SchedRW, ExePorts> {
  145. let Latency = Lat;
  146. let ResourceCycles = Res;
  147. let NumMicroOps = UOps;
  148. }
  149. }
  // Internal helper: defines the resources for both the register form of
  // SchedRW and its folded-load form (SchedRW.Folded).
  //  * Register form: arguments are forwarded unchanged to PdWriteRes.
  //  * Folded form: PdLoad is prepended to ExePorts, the latency is
  //    Lat + LoadLat and the micro-op count is UOps + LoadUOps.
  //  * Folded ResourceCycles: left empty when Res is empty and LoadRes is 1;
  //    otherwise LoadRes is prepended to Res, where an empty Res is first
  //    expanded to a list of 1s with one entry per element of ExePorts, so the
  //    result has one entry for PdLoad plus one per execution port.
  150. multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
  151. list<ProcResourceKind> ExePorts, int Lat,
  152. list<int> Res, int UOps,
  153. int LoadLat, int LoadRes, int LoadUOps> {
  154. defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
  155. defm : PdWriteRes<SchedRW.Folded,
  156. !listconcat([PdLoad], ExePorts),
  157. !add(Lat, LoadLat),
  158. !if(!and(!empty(Res), !eq(LoadRes, 1)),
  159. [],
  160. !listconcat([LoadRes],
  161. !if(!empty(Res),
  162. !listsplat(1, !size(ExePorts)),
  163. Res))),
  164. !add(UOps, LoadUOps)>;
  165. }
  // Register/folded-load pair whose folded form adds 4 cycles of latency and
  // 3 PdLoad resource cycles. Used below for the integer writes (WriteALU,
  // WriteIMul*, WriteDiv*, shifts, ...). LoadUOps (default 0) is the number of
  // extra micro-ops added to the folded form.
  166. multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
  167. list<ProcResourceKind> ExePorts, int Lat = 1,
  168. list<int> Res = [], int UOps = 1,
  169. int LoadUOps = 0> {
  170. defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
  171. /*LoadLat*/4, /*LoadRes*/3, LoadUOps>;
  172. }
  // Register/folded-load pair whose folded form adds 5 cycles of latency and
  // 3 PdLoad resource cycles. Used below for the scalar and X-suffixed
  // floating-point writes (WriteFAdd, WriteFAddX, WriteFMul, ...). UOps
  // defaults to 1; LoadUOps (default 0) is added to it for the folded form.
  173. multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
  174. list<ProcResourceKind> ExePorts, int Lat = 1,
  175. list<int> Res = [], int UOps = 1,
  176. int LoadUOps = 0> {
  177. defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
  178. /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
  179. }
  // Register/folded-load pair used below for the Y-suffixed writes
  // (WriteFAddY, WriteFMulY, ...). It passes the same load parameters as
  // PdWriteResXMMPair (5 cycles of latency, 3 PdLoad resource cycles), but Lat
  // has no default and UOps defaults to 2 instead of 1.
  180. multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
  181. list<ProcResourceKind> ExePorts, int Lat,
  182. list<int> Res = [], int UOps = 2,
  183. int LoadUOps = 0> {
  184. defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
  185. /*LoadLat*/5, /*LoadRes*/3, LoadUOps>;
  186. }
  187. //===----------------------------------------------------------------------===//
  188. // Here be dragons.
  189. //===----------------------------------------------------------------------===//
  190. // L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
  191. // needn't be available until 4 cycles after the memory operand.
  192. def : ReadAdvance<ReadAfterLd, 4>;
  193. // Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
  194. // until 5 cycles after the memory operand.
  195. def : ReadAdvance<ReadAfterVecLd, 5>;
  196. def : ReadAdvance<ReadAfterVecXLd, 5>;
  197. def : ReadAdvance<ReadAfterVecYLd, 5>;
  198. // Transfer from int domain to ivec domain incurs additional latency of 8..10cy
  199. // Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller
  200. // and Excavator pipeline", "Data delay between different execution domains"
  201. def : ReadAdvance<ReadInt2Fpu, -10>;
  202. // A folded store needs a cycle on the PdStore for the store data.
  203. def : WriteRes<WriteRMW, [PdStore]>;
  204. ////////////////////////////////////////////////////////////////////////////////
  205. // Loads, stores, and moves, not folded with other operations.
  206. ////////////////////////////////////////////////////////////////////////////////
  207. def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; }
  208. def : WriteRes<WriteStore, [PdStore]>;
  209. def : WriteRes<WriteStoreNT, [PdStore]>;
  210. def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
  211. defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
  212. // Load/store MXCSR.
  213. // FIXME: These are copy and pasted from WriteLoad/Store.
  214. def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
  215. def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; }
  216. // Treat misc copies as a move.
  217. def : InstRW<[WriteMove], (instrs COPY)>;
  218. ////////////////////////////////////////////////////////////////////////////////
  219. // Idioms that clear a register, like xorps %xmm0, %xmm0.
  220. // These can often bypass execution ports completely.
  221. ////////////////////////////////////////////////////////////////////////////////
  222. def : WriteRes<WriteZero, [/*No ExePorts*/]>;
  223. ////////////////////////////////////////////////////////////////////////////////
  224. // Branches don't produce values, so they have no latency, but they still
  225. // consume resources. Indirect branches can fold loads.
  226. ////////////////////////////////////////////////////////////////////////////////
  227. defm : PdWriteResExPair<WriteJump, [PdEX1, PdBranch]>;
  228. ////////////////////////////////////////////////////////////////////////////////
  229. // Special case scheduling classes.
  230. ////////////////////////////////////////////////////////////////////////////////
  231. def : WriteRes<WriteSystem, [PdEX01]> { let Latency = 100; }
  232. def : WriteRes<WriteMicrocoded, [PdEX01]> { let Latency = 100; }
  233. def : WriteRes<WriteFence, [PdStore]>;
  234. def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
  235. let Latency = 6;
  236. }
  237. def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
  238. def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
  239. let Latency = 184;
  240. let ResourceCycles = [375];
  241. let NumMicroOps = 45;
  242. }
  243. def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
  244. "LSL(16|32|64)rr")>;
  245. // Nops don't have dependencies, so there's no actual latency, but we set this
  246. // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
  247. def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; }
  248. ////////////////////////////////////////////////////////////////////////////////
  249. // Arithmetic.
  250. ////////////////////////////////////////////////////////////////////////////////
  251. defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>;
  252. def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> {
  253. let Latency = 6;
  254. let ResourceCycles = [3, 2, 1];
  255. let NumMicroOps = 1;
  256. }
  257. def : SchedAlias<WriteALURMW, PdWriteALURMW>;
  258. def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
  259. let Latency = 6;
  260. let ResourceCycles = [88];
  261. let NumMicroOps = 4;
  262. }
  263. def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
  264. def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
  265. let Latency = 2;
  266. let ResourceCycles = [2];
  267. let NumMicroOps = 2;
  268. }
  269. def : InstRW<[PdWriteBMI1],
  270. (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
  271. BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
  272. BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
  273. BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
  274. TZMSK32rr, TZMSK64rr)>;
  275. def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> {
  276. let Latency = 6;
  277. let ResourceCycles = [3, 3];
  278. let NumMicroOps = 2;
  279. }
  280. def : InstRW<[PdWriteBMI1m],
  281. (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
  282. BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
  283. BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
  284. BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
  285. TZMSK32rm, TZMSK64rm)>;
  286. defm : PdWriteResExPair<WriteADC, [PdEX01], 1, [2]>;
  287. def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> {
  288. let ResourceCycles = [3];
  289. }
  290. def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>;
  291. defm : PdWriteRes<WriteBSWAP32, [PdEX01]>;
  292. defm : PdWriteRes<WriteBSWAP64, [PdEX01]>;
  293. defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [3], 5>;
  294. defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [44, 1, 1], 2>;
  295. defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
  296. def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
  297. let Latency = 3;
  298. let ResourceCycles = [3];
  299. let NumMicroOps = 3;
  300. }
  301. def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
  302. def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
  303. let Latency = 3;
  304. let ResourceCycles = [23];
  305. let NumMicroOps = 5;
  306. }
  307. def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
  308. def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
  309. let Latency = 3;
  310. let ResourceCycles = [21];
  311. let NumMicroOps = 6;
  312. }
  313. def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
  314. (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
  315. def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
  316. let Latency = 3;
  317. let ResourceCycles = [26];
  318. let NumMicroOps = 18;
  319. }
  320. def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
  321. def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
  322. let Latency = 3;
  323. let ResourceCycles = [69];
  324. let NumMicroOps = 22;
  325. }
  326. def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
  327. def PdWriteXADD : SchedWriteRes<[PdEX1]> {
  328. let Latency = 1;
  329. let ResourceCycles = [1];
  330. let NumMicroOps = 2;
  331. }
  332. def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
  333. def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
  334. let Latency = 6;
  335. let ResourceCycles = [20];
  336. let NumMicroOps = 4;
  337. }
  338. def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
  339. defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4, [1, 4]>;
  340. defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [1, 5], 2>;
  341. defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [1, 5], 2>;
  342. defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4, [1, 2]>;
  343. defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4, [1, 4]>;
  344. defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [1, 2], 1, 1>;
  345. defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>;
  346. defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>;
  347. defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4],1, 1>;
  348. defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>;
  349. // BMI2 MULX
  350. defm : X86WriteResUnsupported<WriteIMulH>;
  351. defm : X86WriteResUnsupported<WriteIMulHLd>;
  352. defm : X86WriteResPairUnsupported<WriteMULX32>;
  353. defm : X86WriteResPairUnsupported<WriteMULX64>;
  354. defm : PdWriteResExPair<WriteDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
  355. defm : PdWriteResExPair<WriteDiv16, [PdEX1, PdDiv], 15, [1, 15], 2>;
  356. defm : PdWriteResExPair<WriteDiv32, [PdEX1, PdDiv], 14, [1, 14], 2>;
  357. defm : PdWriteResExPair<WriteDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
  358. defm : PdWriteResExPair<WriteIDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
  359. defm : PdWriteResExPair<WriteIDiv16, [PdEX1, PdDiv], 15, [1, 17], 2>;
  360. defm : PdWriteResExPair<WriteIDiv32, [PdEX1, PdDiv], 14, [1, 25], 2>;
  361. defm : PdWriteResExPair<WriteIDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
  362. defm : PdWriteResExPair<WriteCRC32, [PdEX01], 2, [4], 3>;
  363. def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
  364. let Latency = 5;
  365. let ResourceCycles = [10];
  366. let NumMicroOps = 5;
  367. }
  368. def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
  369. def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
  370. let Latency = 6;
  371. let ResourceCycles = [12];
  372. let NumMicroOps = 7;
  373. }
  374. def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
  375. def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
  376. let Latency = 10;
  377. let ResourceCycles = [17];
  378. let NumMicroOps = 11;
  379. }
  380. def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
  381. defm : PdWriteResExPair<WriteCMOV, [PdEX01]>; // Conditional move.
  // Memory-source CMOV. The variant below inspects immediate operand 7 (the
  // condition code): BE, A, L, GE, LE and G select PdWriteCMOVm (latency 5,
  // 2 micro-ops, 3 cycles each on PdLoad and PdEX01); every other condition
  // falls through to the generic WriteCMOV.Folded.
  382. def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> {
  383. let Latency = 5;
  384. let ResourceCycles = [3, 3];
  385. let NumMicroOps = 2;
  386. }
  387. def PdWriteCMOVmVar : SchedWriteVariant<[
  388. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>,
  389. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>, [PdWriteCMOVm]>,
  390. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>, [PdWriteCMOVm]>,
  391. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>,
  392. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>,
  393. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>, [PdWriteCMOVm]>,
  394. SchedVar<NoSchedPred, [WriteCMOV.Folded]>
  395. ]>;
  396. def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>;
  397. defm : PdWriteRes<WriteFCMOV, [PdFPU0, PdFPFMA]>; // x87 conditional move.
  398. def : WriteRes<WriteSETCC, [PdEX01]>; // Setcc.
  399. def : WriteRes<WriteSETCCStore, [PdEX01, PdStore]>;
  // SETcc to memory. The variant below inspects immediate operand 5 (the
  // condition code): GE, G, LE and L select the 2-micro-op write that holds
  // PdEX01 for 2 cycles; every other condition uses the generic
  // WriteSETCCStore.
  400. def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
  401. let ResourceCycles = [2];
  402. let NumMicroOps = 2;
  403. }
  404. def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[
  405. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  406. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  407. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  408. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  409. SchedVar<NoSchedPred, [WriteSETCCStore]>
  410. ]>;
  411. def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>;
  412. defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [4], 2>;
  413. def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
  414. let Latency = 2;
  415. let ResourceCycles = [4];
  416. let NumMicroOps = 4;
  417. }
  418. def : InstRW<[PdWriteLAHF], (instrs LAHF)>;
  419. def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
  420. let Latency = 2;
  421. let ResourceCycles = [2];
  422. let NumMicroOps = 2;
  423. }
  424. def : InstRW<[PdWriteSAHF], (instrs SAHF)>;
  425. defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [2], 1>;
  426. defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [2, 3], 1>;
  427. defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [7, 2], 7>;
  428. defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [2], 2>;
  429. defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
  430. defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
  431. def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> {
  432. let Latency = 7;
  433. let ResourceCycles = [42, 1];
  434. let NumMicroOps = 4;
  435. }
  436. def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>;
  437. def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
  438. let Latency = 7;
  439. let ResourceCycles = [44, 1];
  440. let NumMicroOps = 10;
  441. }
  442. def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
  443. // This is for simple LEAs with one or two input operands.
  444. def : WriteRes<WriteLEA, [PdEX01]> { let ResourceCycles = [2]; }
  445. // This write is used for slow LEA instructions.
  446. def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> {
  447. let Latency = 2;
  448. let ResourceCycles = [2];
  449. }
  450. // On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
  451. // or an LEA with a `Scale` value different than 1.
  452. def PdSlowLEAPredicate : MCSchedPredicate<
  453. CheckAny<[
  454. // A 3-operand LEA (base, index, offset).
  455. IsThreeOperandsLEAFn,
  456. // An LEA with a "Scale" different than 1.
  457. CheckAll<[
  458. CheckIsImmOperand<2>,
  459. CheckNot<CheckImmOperand<2, 1>>
  460. ]>
  461. ]>
  462. >;
  // LEA32r/LEA64r/LEA64_32r: LEAs matching PdSlowLEAPredicate use
  // PdWrite3OpsLEA (latency 2); all others use the generic WriteLEA.
  463. def PdWriteLEA : SchedWriteVariant<[
  464. SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>,
  465. SchedVar<NoSchedPred, [WriteLEA]>
  466. ]>;
  467. def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
  468. def PdWriteLEA16r : SchedWriteRes<[PdEX01]> {
  469. let ResourceCycles = [3];
  470. let NumMicroOps = 2;
  471. }
  472. def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>;
  473. // Bit counts.
  474. defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
  475. defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [8], 7, 2>;
  476. defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4, [4]>;
  477. defm : PdWriteResExPair<WriteLZCNT, [PdEX0], 2, [2], 2>;
  478. defm : PdWriteResExPair<WriteTZCNT, [PdEX0], 2, [2], 2>;
  479. // BMI1 BEXTR, BMI2 BZHI
  480. defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [2], 2>;
  481. defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [2], 2>;
  482. defm : PdWriteResExPair<WriteBZHI, [PdEX01]>;
  483. def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> {
  484. let Latency = 2;
  485. let ResourceCycles = [4];
  486. let NumMicroOps = 2;
  487. }
  488. def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>;
  489. def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> {
  490. let Latency = 2;
  491. let ResourceCycles = [5];
  492. let NumMicroOps = 2;
  493. }
  494. def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>;
  495. ////////////////////////////////////////////////////////////////////////////////
  496. // Integer shifts and rotates.
  497. ////////////////////////////////////////////////////////////////////////////////
  498. defm : PdWriteResExPair<WriteShift, [PdEX01], 1, [2]>;
  499. defm : PdWriteResExPair<WriteShiftCL, [PdEX01]>;
  500. defm : PdWriteResExPair<WriteRotate, [PdEX01], 1, [2]>;
  501. defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
  502. def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
  503. let Latency = 12;
  504. let ResourceCycles = [24];
  505. let NumMicroOps = 26;
  506. }
  507. def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
  508. def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
  509. let Latency = 12;
  510. let ResourceCycles = [23];
  511. let NumMicroOps = 23;
  512. }
  513. def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
  514. def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
  515. let Latency = 11;
  516. let ResourceCycles = [22];
  517. let NumMicroOps = 24;
  518. }
  519. def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
  520. def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
  521. let Latency = 10;
  522. let ResourceCycles = [20];
  523. let NumMicroOps = 22;
  524. }
  525. def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
  526. def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
  527. let Latency = 10;
  528. let ResourceCycles = [19];
  529. let NumMicroOps = 19;
  530. }
  531. def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
  532. def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> {
  533. let Latency = 7;
  534. let ResourceCycles = [14];
  535. let NumMicroOps = 17;
  536. }
  537. def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>;
  538. def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> {
  539. let Latency = 7;
  540. let ResourceCycles = [13];
  541. let NumMicroOps = 16;
  542. }
  543. def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>;
  544. def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
  545. let Latency = 7;
  546. let ResourceCycles = [14];
  547. let NumMicroOps = 15;
  548. }
  549. def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
  550. def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
  551. let Latency = 9;
  552. let ResourceCycles = [18];
  553. let NumMicroOps = 20;
  554. }
  555. def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
  556. def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
  557. let Latency = 11;
  558. let ResourceCycles = [21];
  559. let NumMicroOps = 21;
  560. }
  561. def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
  562. def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
  563. let Latency = 8;
  564. let ResourceCycles = [15];
  565. let NumMicroOps = 16;
  566. }
  567. def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
  568. def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
  569. let Latency = 13;
  570. let ResourceCycles = [25];
  571. let NumMicroOps = 25;
  572. }
  573. def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
  574. // SHLD/SHRD.
  575. defm : PdWriteRes<WriteSHDrri, [PdEX01], 3, [6], 6>;
  576. defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 3, [8], 7>;
  577. def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
  578. let Latency = 3;
  579. let ResourceCycles = [6];
  580. let NumMicroOps = 6;
  581. }
  582. def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>;
  583. def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
  584. let Latency = 3;
  585. let ResourceCycles = [6];
  586. let NumMicroOps = 7;
  587. }
  588. def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
  589. SHLD32rrCL,
  590. SHRD32rrCL)>;
  591. defm : PdWriteRes<WriteSHDmri, [PdLoad, PdEX01], 4, [1, 22], 8>;
  592. defm : PdWriteRes<WriteSHDmrcl, [PdLoad, PdEX01], 4, [1, 22], 8>;
  593. ////////////////////////////////////////////////////////////////////////////////
  594. // Floating point. This covers both scalar and vector operations.
  595. ////////////////////////////////////////////////////////////////////////////////
  596. defm : PdWriteRes<WriteFLD0, [PdFPU1, PdFPSTO], 3>;
  597. defm : PdWriteRes<WriteFLD1, [PdFPU1, PdFPSTO], 3>;
  598. defm : PdWriteRes<WriteFLDC, [PdFPU1, PdFPSTO], 3>;
  599. defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
  600. defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
  601. defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>;
  602. defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>;
  603. defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>;
  604. defm : PdWriteRes<WriteFStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
  605. defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
  606. defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>;
  607. def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> {
  608. let Latency = 2;
  609. let ResourceCycles = [1, 3, 1];
  610. let NumMicroOps = 2;
  611. }
  612. def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
  613. def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  614. let NumMicroOps = 8;
  615. }
  616. def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;
  617. defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
  618. defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
  619. defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
  620. defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
  621. defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
  622. defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
  623. defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
  624. defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
  625. defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
  626. defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
  627. defm : X86WriteResUnsupported<WriteFMoveZ>;
  628. defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>;
  629. defm : PdWriteResXMMPair<WriteFAdd, [PdFPU0, PdFPFMA], 5>;
  630. defm : PdWriteResXMMPair<WriteFAddX, [PdFPU0, PdFPFMA], 5>;
  631. defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [1, 2]>;
  632. defm : X86WriteResPairUnsupported<WriteFAddZ>;
  633. def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
  634. let Latency = 5;
  635. let ResourceCycles = [3, 1, 10];
  636. }
  637. def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m,
  638. SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m,
  639. SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>;
  640. defm : PdWriteResXMMPair<WriteFAdd64, [PdFPU0, PdFPFMA], 5>;
  641. defm : PdWriteResXMMPair<WriteFAdd64X, [PdFPU0, PdFPFMA], 5>;
  642. defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [1, 2]>;
  643. defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
  // FP compares (WriteFCmp*): PdFPU0 + PdFPFMA, value 2 in the latency
  // position; the 32-bit and 64-bit element classes are configured alike.
  644. defm : PdWriteResXMMPair<WriteFCmp, [PdFPU0, PdFPFMA], 2>;
  645. defm : PdWriteResXMMPair<WriteFCmpX, [PdFPU0, PdFPFMA], 2>;
  646. defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [1, 2]>;
  647. defm : X86WriteResPairUnsupported<WriteFCmpZ>;
  648. defm : PdWriteResXMMPair<WriteFCmp64, [PdFPU0, PdFPFMA], 2>;
  649. defm : PdWriteResXMMPair<WriteFCmp64X, [PdFPU0, PdFPFMA], 2>;
  650. defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
  651. defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
  // WriteFCom* additionally lists PdEX0 as a resource and passes an empty
  // cycles list, so the multiclass default is used for the cycle counts.
  652. defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  653. defm : PdWriteResXMMPair<WriteFComX, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  // Override for FCOM/FCOMP with an `m` suffix: Latency 6 on PdFPU1 + PdFPFMA.
  654. def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
  655. let Latency = 6;
  656. }
  657. def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;
  // Override for TST_F and UCOM_FPPr: only the resources are given here, all
  // other SchedWriteRes fields keep their defaults.
  658. def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
  659. def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
  // FP multiply: unlike FP add (PdFPU0), these classes use PdFPU1 together
  // with PdFPFMA; the value in the latency position is 5 throughout.
  660. defm : PdWriteResXMMPair<WriteFMul, [PdFPU1, PdFPFMA], 5>;
  661. defm : PdWriteResXMMPair<WriteFMulX, [PdFPU1, PdFPFMA], 5>;
  662. defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [1, 2]>;
  663. defm : X86WriteResPairUnsupported<WriteFMulZ>;
  // Override for the x87 multiply instructions listed below (all with an `m`
  // suffix): Latency 5, ResourceCycles [3, 1, 10] in the order of the
  // resource list [PdLoad, PdFPU1, PdFPFMA].
  664. def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> {
  665. let Latency = 5;
  666. let ResourceCycles = [3, 1, 10];
  667. }
  668. def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>;
  // The 64-bit-element multiply classes use the same arguments as above.
  669. defm : PdWriteResXMMPair<WriteFMul64, [PdFPU1, PdFPFMA], 5>;
  670. defm : PdWriteResXMMPair<WriteFMul64X, [PdFPU1, PdFPFMA], 5>;
  671. defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [1, 2]>;
  672. defm : X86WriteResPairUnsupported<WriteFMul64Z>;
  // Fused multiply-add: scalar, XMM and YMM classes share identical
  // arguments ([PdFPU, PdFPFMA], 5, [1, 3]).
  673. defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5, [1, 3]>;
  674. defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5, [1, 3]>;
  675. defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 3]>;
  676. defm : X86WriteResPairUnsupported<WriteFMAZ>;
  // Dot products. The inline "/*or 29*/" on WriteDPPSY is the original
  // author's note of an alternative value for the argument that follows it.
  677. defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 10], 15, 2>;
  678. defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 14], 16, 2>;
  679. defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>;
  680. defm : X86WriteResPairUnsupported<WriteDPPSZ>;
  // Per-instruction override for VDPPSrri: Latency 27, ResourceCycles
  // [1, 14] and 17 micro-ops, replacing the generic WriteDPPS values above.
  681. def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
  682. let Latency = 27;
  683. let ResourceCycles = [1, 14];
  684. let NumMicroOps = 17;
  685. }
  686. def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
  // Reciprocal and reciprocal-square-root estimates on PdFPU1 + PdFPFMA.
  // NOTE(review): the explicit cycle lists are not uniform across sibling
  // classes (WriteFRcpY uses [2, 1]; WriteFRsqrt has [1, 2] while
  // WriteFRsqrtX uses the default) -- confirm this is intentional.
  687. defm : PdWriteResXMMPair<WriteFRcp, [PdFPU1, PdFPFMA], 5>;
  688. defm : PdWriteResXMMPair<WriteFRcpX, [PdFPU1, PdFPFMA], 5>;
  689. defm : PdWriteResYMMPair<WriteFRcpY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
  690. defm : X86WriteResPairUnsupported<WriteFRcpZ>;
  691. defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5, [1, 2]>;
  692. defm : PdWriteResXMMPair<WriteFRsqrtX, [PdFPU1, PdFPFMA], 5>;
  693. defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 2]>;
  694. defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
  // FP divide: value 9 in the latency position; the Y classes double the
  // cycle list of the scalar/XMM classes ([1, 9] -> [2, 18]).
  695. defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  696. defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  697. defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  698. defm : X86WriteResPairUnsupported<WriteFDivZ>;
  // Override for the x87 divide / reverse-divide instructions listed below
  // (all with an `m` suffix): Latency 9, ResourceCycles [3, 1, 18] in the
  // order of the resource list [PdLoad, PdFPU0, PdFPFMA].
  // NOTE(review): this uses PdFPU0 whereas the WriteFDiv* classes above use
  // PdFPU1 -- confirm against the hardware documentation.
  699. def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
  700. let Latency = 9;
  701. let ResourceCycles = [3, 1, 18];
  702. }
  703. def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
  704. DIVR_FI16m, DIVR_FI32m,
  705. DIV_F32m, DIV_F64m,
  706. DIVR_F32m, DIVR_F64m)>;
  // The 64-bit-element divide classes use the same arguments as above.
  707. defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  708. defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  709. defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  710. defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
  // FP square root: configured with the same arguments as FP divide
  // ([PdFPU1, PdFPFMA], 9, [1, 9], doubled to [2, 18] for the Y classes).
  711. defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  712. defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  713. defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  714. defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
  715. defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  716. defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  717. defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  718. defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
  // NOTE(review): WriteFSqrt80 has 1 in the latency position but 18 cycles
  // on the second resource; the 1 may be a placeholder -- confirm.
  719. defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 18]>;
  720. defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA], 1, [1, 4]>;
  // FP rounding on PdFPU1 + PdFPSTO, followed by per-instruction overrides
  // for the VFRCZ* family, which use the same two resources.
  721. defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4, []>;
  722. defm : PdWriteResYMMPair<WriteFRndY, [PdFPU1, PdFPSTO], 4, [2, 1], 2>;
  723. defm : X86WriteResPairUnsupported<WriteFRndZ>;
  // VFRCZPD/VFRCZPS rr: Latency 10, cycles [2, 1], 2 micro-ops.
  724. def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  725. let Latency = 10;
  726. let ResourceCycles = [2, 1];
  727. let NumMicroOps = 2;
  728. }
  729. def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>;
  // VFRCZSD/VFRCZSS rr: same latency and micro-op count as the PD/PS forms
  // above, but 10 cycles on PdFPU1 instead of 2.
  730. def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  731. let Latency = 10;
  732. let ResourceCycles = [10, 1];
  733. let NumMicroOps = 2;
  734. }
  735. def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>;
  // All four rm forms share one entry: Latency 15, cycles [2, 1],
  // 3 micro-ops. No load resource is listed for these rm forms.
  736. def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  737. let Latency = 15;
  738. let ResourceCycles = [2, 1];
  739. let NumMicroOps = 3;
  740. }
  741. def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
  742. VFRCZSDrm, VFRCZSSrm)>;
  // Yrr forms: Latency 10, cycles [3, 1], 4 micro-ops.
  743. def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  744. let Latency = 10;
  745. let ResourceCycles = [3, 1];
  746. let NumMicroOps = 4;
  747. }
  748. def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
  // Yrm forms: Latency 15, cycles [4, 1], 8 micro-ops.
  749. def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  750. let Latency = 15;
  751. let ResourceCycles = [4, 1];
  752. let NumMicroOps = 8;
  753. }
  754. def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
  // FP logic, test, shuffle and blend classes. Most use PdFPU01 + PdFPFMA;
  // the test classes additionally list PdEX0.
  755. defm : PdWriteResXMMPair<WriteFLogic, [PdFPU01, PdFPFMA], 2, [1, 2]>;
  756. defm : PdWriteResYMMPair<WriteFLogicY, [PdFPU01, PdFPFMA], 2, [2, 2]>;
  757. defm : X86WriteResPairUnsupported<WriteFLogicZ>;
  // NOTE(review): WriteFTest names PdFPU0 while WriteFTestY names PdFPU01 --
  // confirm the difference is intentional.
  758. defm : PdWriteResXMMPair<WriteFTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  759. defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>;
  760. defm : X86WriteResPairUnsupported<WriteFTestZ>;
  761. defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2, [1, 2]>;
  762. defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
  763. defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
  // Per-instruction override for VBROADCASTF128: Latency 7, cycles [1, 3],
  // 2 micro-ops.
  764. def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  765. let Latency = 7;
  766. let ResourceCycles = [1, 3];
  767. let NumMicroOps = 2;
  768. }
  769. def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
  770. defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU01, PdFPFMA], 3, [1, 2]>;
  771. defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA], 3, [2, 4], 2>;
  772. defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
  773. defm : PdWriteResXMMPair<WriteFBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
  774. defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU01, PdFPFMA], 2, [2, 3], 2>;
  775. defm : X86WriteResPairUnsupported<WriteFBlendZ>;
  776. defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU01, PdFPFMA], 2, [1, 3]>;
  777. defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
  778. defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
  // WriteFShuffle256 is configured through the XMM pair multiclass, while
  // WriteFVarShuffle256 is declared unsupported.
  779. defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [1, 3], 2>;
  780. defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
  // Per-instruction overrides for the 128-bit lane extract and permute
  // instructions; all four use PdFPU01 + PdFPFMA.
  // VEXTRACTF128rr: Latency 2, cycles [1, 2], default micro-op count.
  781. def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  782. let Latency = 2;
  783. let ResourceCycles = [1, 2];
  784. }
  785. def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
  // VEXTRACTF128mr: Latency 7, cycles [1, 4], 2 micro-ops. No store
  // resource is listed for this mr form.
  786. def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  787. let Latency = 7;
  788. let ResourceCycles = [1, 4];
  789. let NumMicroOps = 2;
  790. }
  791. def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
  // VPERM2F128rr: Latency 4, cycles [1, 6], 8 micro-ops.
  792. def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  793. let Latency = 4;
  794. let ResourceCycles = [1, 6];
  795. let NumMicroOps = 8;
  796. }
  797. def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
  // VPERM2F128rm: the "4 + 4" note gives Latency 8 as the rr latency of 4
  // plus 4 more (the model's LoadLatency is 4); cycles [1, 8], 10 micro-ops.
  798. def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  799. let Latency = 8; // 4 + 4
  800. let ResourceCycles = [1, 8];
  801. let NumMicroOps = 10;
  802. }
  803. def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
  804. ////////////////////////////////////////////////////////////////////////////////
  805. // Conversions.
  806. ////////////////////////////////////////////////////////////////////////////////
  // FP -> integer conversions. The scalar classes (SS2I / SD2I) list
  // PdFPFMA and PdEX0 in addition to [PdFPU0, PdFPCVT, PdFPSTO].
  807. defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
  808. defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
  809. defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  810. defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
  811. defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
  812. defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  813. defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
  814. defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
  // Per-instruction override for MMX_CVTTPD2PIrr: Latency 6, 2 micro-ops.
  815. def PdWriteMMX_CVTTPD2PIrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  816. let Latency = 6;
  817. let NumMicroOps = 2;
  818. }
  819. def : InstRW<[PdWriteMMX_CVTTPD2PIrr], (instrs MMX_CVTTPD2PIrr)>;
  // Integer -> FP conversions.
  // NOTE(review): it is not clear from this text whether each ".Folded"
  // FIXME below refers to the entry before or after it -- confirm against
  // the upstream layout before acting on them.
  820. // FIXME: f+3 ST, LD+STC latency
  821. defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
  822. // FIXME: .Folded version is one NumMicroOp *less*..
  823. defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
  824. defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  825. defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
  826. defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
  827. // FIXME: .Folded version is one NumMicroOp *less*..
  // Override for the four scalar integer -> FP rr instructions listed
  // below: Latency 13, cycles [1, 3, 1], 2 micro-ops.
  828. def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  829. let Latency = 13;
  830. let ResourceCycles = [1, 3, 1];
  831. let NumMicroOps = 2;
  832. }
  833. def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>;
  834. defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  835. defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
  836. defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
  // FP <-> FP precision conversions (single <-> double).
  837. defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  838. defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  839. defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
  840. defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
  841. defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  842. defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  843. defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
  844. defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
  // Per-instruction overrides for MMX conversions; both entries use
  // 2 micro-ops and differ only in latency (6 vs 4).
  845. def PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  846. let Latency = 6;
  847. let NumMicroOps = 2;
  848. }
  849. def : InstRW<[PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr], (instrs MMX_CVTPD2PIrr,
  850. MMX_CVTPI2PDrr)>;
  851. def PdWriteMMX_CVTPI2PSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  852. let Latency = 4;
  853. let NumMicroOps = 2;
  854. }
  855. def : InstRW<[PdWriteMMX_CVTPI2PSrr], (instrs MMX_CVTPI2PSrr)>;
  // Half-precision conversions. PH2PS uses the pair multiclasses; PS2PH and
  // its store ("St") forms use the plain PdWriteRes multiclass, and the
  // store forms add PdStore to the resource list.
  856. defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>;
  857. defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>;
  858. defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
  859. defm : PdWriteRes<WriteCvtPS2PH, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2>;
  860. defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
  861. defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
  862. defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU0, PdFPCVT, PdFPSTO, PdStore], 4, [1, 2, 1, 1], 3>;
  863. defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>;
  864. defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
  865. ////////////////////////////////////////////////////////////////////////////////
  866. // Vector integer operations.
  867. ////////////////////////////////////////////////////////////////////////////////
  // Vector integer loads: all use [PdLoad, PdFPU01, PdFPMAL]; plain and
  // non-temporal loads have 5 in the latency position, masked loads have 6.
  868. defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
  869. defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
  870. defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>;
  871. defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>;
  872. defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>;
  873. defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>;
  874. defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>;
  // Vector integer stores on [PdStore, PdFPU23, PdFPSTO]; note the large
  // middle cycle entry (36) for the Y store.
  875. defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
  876. defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
  877. defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>;
  // Per-instruction override for VMOVDQUYmr: only NumMicroOps (8) is set;
  // latency and resource cycles keep the SchedWriteRes defaults.
  878. def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  879. let NumMicroOps = 8;
  880. }
  881. def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
  // Non-temporal stores use PdFPU1 rather than PdFPU23.
  882. defm : PdWriteRes<WriteVecStoreNT, [PdStore, PdFPU1, PdFPSTO], 2>;
  883. defm : PdWriteRes<WriteVecStoreNTY, [PdStore, PdFPU1, PdFPSTO], 2, [2, 2, 2], 4>;
  // The integer masked-store classes are all declared unsupported.
  884. defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
  885. defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
  886. defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
  887. defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
  // Vector integer register moves on PdFPU01 + PdFPMAL.
  888. defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
  889. defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
  890. defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
  891. defm : X86WriteResUnsupported<WriteVecMoveZ>;
  // Per-instruction override for MOVDQArr: the body is empty, so every
  // SchedWriteRes field keeps its default value.
  892. def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  893. }
  894. def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>;
  // Per-instruction override for MMX_MOVQ2DQrr: Latency 4.
  895. def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  896. let Latency = 4;
  897. }
  898. def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>;
  // Moves between vector registers and GPRs: both have 11 in the latency
  // position; the to-GPR direction also lists PdEX0.
  899. defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 11>;
  900. defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 11, [1, 2], 2>;
  // Vector integer ALU, shift, multiply and SAD classes. Throughout this
  // group the Y and Z classes are declared unsupported; only the scalar/MMX
  // and X (128-bit) classes are modelled.
  901. defm : PdWriteResXMMPair<WriteVecALU, [PdFPU01, PdFPMAL], 2>;
  902. defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  903. defm : X86WriteResPairUnsupported<WriteVecALUY>;
  904. defm : X86WriteResPairUnsupported<WriteVecALUZ>;
  905. defm : PdWriteResXMMPair<WriteVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
  906. defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU01, PdFPMAL], 3, [1, 2]>;
  907. defm : X86WriteResPairUnsupported<WriteVecShiftY>;
  908. defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
  909. defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  910. defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  911. defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
  912. defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
  // Integer multiplies use PdFPU0 + PdFPMMA instead of PdFPU01 + PdFPMAL.
  913. defm : PdWriteResXMMPair<WriteVecIMul, [PdFPU0, PdFPMMA], 4>;
  914. defm : PdWriteResXMMPair<WriteVecIMulX, [PdFPU0, PdFPMMA], 4>;
  915. defm : X86WriteResPairUnsupported<WriteVecIMulY>;
  916. defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
  // WritePMULLD lists four resources: both PdFPU0 and PdFPU01, plus both
  // PdFPMMA and PdFPMAL.
  917. defm : PdWriteResXMMPair<WritePMULLD, [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL], 5, [2, 1, 2, 1]>;
  918. defm : X86WriteResPairUnsupported<WritePMULLDY>;
  919. defm : X86WriteResPairUnsupported<WritePMULLDZ>;
  // Per-instruction override for the four VPMACS*DQ rr instructions listed
  // below: Latency 4.
  920. def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> {
  921. let Latency = 4;
  922. }
  923. def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
  924. VPMACSSDQLrr)>;
  925. defm : PdWriteResXMMPair<WriteMPSAD, [PdFPU0, PdFPMMA], 9, [1, 4], 8>;
  926. defm : X86WriteResPairUnsupported<WriteMPSADY>;
  927. defm : X86WriteResPairUnsupported<WriteMPSADZ>;
  // Per-instruction override for VMPSADBWrri: Latency 8, cycles [1, 4],
  // 10 micro-ops (the generic WriteMPSAD entry above passes 9 and 8).
  928. def PdWriteVMPSADBW : SchedWriteRes<[PdFPU0, PdFPMMA]> {
  929. let Latency = 8;
  930. let ResourceCycles = [1, 4];
  931. let NumMicroOps = 10;
  932. }
  933. def : InstRW<[PdWriteVMPSADBW], (instrs VMPSADBWrri)>;
  934. defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
  935. defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
  936. defm : X86WriteResPairUnsupported<WritePSADBWY>;
  937. defm : X86WriteResPairUnsupported<WritePSADBWZ>;
  // Vector integer min-position, shuffle, blend, logic and test classes.
  938. defm : PdWriteResXMMPair<WritePHMINPOS, [PdFPU0, PdFPMAL], 4, [], 2>;
  939. defm : PdWriteResXMMPair<WriteShuffle, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  940. defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  // WriteShuffleY is the one Y class in this group that is modelled rather
  // than declared unsupported.
  941. defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU01, PdFPMAL], 2, [1, 4]>;
  942. defm : X86WriteResPairUnsupported<WriteShuffleZ>;
  943. defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU01, PdFPMAL], 3, [1, 2]>;
  944. defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU01, PdFPMAL], 3, [1, 3]>;
  945. defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
  946. defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
  // Per-instruction override for VPPERMrrr / VPPERMrrr_REV: Latency 2,
  // cycles [1, 3].
  947. def PdWriteVPPERM : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  948. let Latency = 2;
  949. let ResourceCycles = [1, 3];
  950. }
  951. def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>;
  952. defm : PdWriteResXMMPair<WriteBlend, [PdFPU01, PdFPMAL], 2>;
  953. defm : X86WriteResPairUnsupported<WriteBlendY>;
  954. defm : X86WriteResPairUnsupported<WriteBlendZ>;
  955. defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  956. defm : X86WriteResPairUnsupported<WriteVarBlendY>;
  957. defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
  958. defm : PdWriteResXMMPair<WriteVecLogic, [PdFPU01, PdFPMAL], 2>;
  959. defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  960. defm : X86WriteResPairUnsupported<WriteVecLogicY>;
  961. defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
  // The integer test classes use PdFPFMA (not PdFPMAL) and also list PdEX0.
  962. defm : PdWriteResXMMPair<WriteVecTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  963. defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>;
  964. defm : X86WriteResPairUnsupported<WriteVecTestZ>;
  // The three 256-bit classes below pass only the resource list, so every
  // other argument takes the multiclass default.
  // NOTE(review): these look like placeholder entries -- confirm.
  965. defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>;
  966. defm : PdWriteResXMMPair<WriteVPMOV256, [PdFPU01, PdFPMAL]>;
  967. defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
  968. defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU01, PdFPMAL], 3, [1, 2]>;
  969. defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
  970. defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
  971. ////////////////////////////////////////////////////////////////////////////////
  972. // Vector insert/extract operations.
  973. ////////////////////////////////////////////////////////////////////////////////
  // Insert writes use PdFPU01 + PdFPMAL (plus PdLoad for the "Ld" form);
  // extract writes go through PdEX0, or through PdFPSTO + PdStore for the
  // "St" form.
  974. defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [1, 3], 2>;
  975. defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>;
  976. defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>;
  977. defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>;
  // Per-instruction override for EXTRQ / EXTRQI: Latency 3, cycles [1, 3].
  978. def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  979. let Latency = 3;
  980. let ResourceCycles = [1, 3];
  981. }
  982. def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
  983. ////////////////////////////////////////////////////////////////////////////////
  984. // SSE42 String instructions.
  985. ////////////////////////////////////////////////////////////////////////////////
  // String compares: the IStr classes use three resources, while the EStr
  // classes use six (adding PdStore, PdLoad and PdFPMAL).
  986. defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>;
  987. defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 7, [1, 8, 1], 7, 2>;
  988. defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>;
  989. defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>;
  990. ////////////////////////////////////////////////////////////////////////////////
  991. // MOVMSK Instructions.
  992. ////////////////////////////////////////////////////////////////////////////////
  // Mask extraction: FP and vector forms have 12 in the latency position,
  // the MMX form has 10; all three use [PdFPU0, PdFPFMA, PdEX0].
  993. defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
  994. defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
  995. defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
  // The Z entry below is commented out in the original.
  // NOTE(review): presumably no WriteVecMOVMSKZ class exists -- confirm.
  996. // defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
  997. defm : PdWriteRes<WriteMMXMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
  998. ////////////////////////////////////////////////////////////////////////////////
  999. // AES Instructions.
  1000. ////////////////////////////////////////////////////////////////////////////////
  // AES: all three classes use PdFPU0 + PdFPMMA. IMC and KeyGen have 5 in
  // the latency position; Dec/Enc has 9 and a final argument of 2.
  1001. defm : PdWriteResXMMPair<WriteAESIMC, [PdFPU0, PdFPMMA], 5>;
  1002. defm : PdWriteResXMMPair<WriteAESKeyGen, [PdFPU0, PdFPMMA], 5>;
  1003. defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
  1004. ////////////////////////////////////////////////////////////////////////////////
  1005. // Horizontal add/sub instructions.
  1006. ////////////////////////////////////////////////////////////////////////////////
  // FP horizontal add/sub on PdFPU0 + PdFPFMA; integer horizontal add/sub
  // on PdFPU01 + PdFPMAL.
  1007. defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [1, 5], 3, 1>;
  1008. defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>;
  1009. defm : X86WriteResPairUnsupported<WriteFHAddZ>;
  1010. defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>;
  1011. defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  1012. defm : X86WriteResPairUnsupported<WritePHAddY>;
  1013. defm : X86WriteResPairUnsupported<WritePHAddZ>;
  // The two InstRW records below bind the listed XMM/VEX horizontal
  // add/sub instructions explicitly to WritePHAdd (rr forms) and to
  // WritePHAdd.Folded (rm forms), i.e. to the WritePHAdd values above
  // rather than the WritePHAddX ones.
  1014. def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
  1015. PHADDWrr, PHSUBWrr,
  1016. PHADDSWrr, PHSUBSWrr,
  1017. VPHADDDrr, VPHSUBDrr,
  1018. VPHADDWrr, VPHSUBWrr,
  1019. VPHADDSWrr, VPHSUBSWrr)>;
  1020. def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
  1021. PHADDWrm, PHSUBWrm,
  1022. PHADDSWrm, PHSUBSWrm,
  1023. VPHADDDrm, VPHSUBDrm,
  1024. VPHADDWrm, VPHSUBWrm,
  1025. VPHADDSWrm, VPHSUBSWrm)>;
  1026. ////////////////////////////////////////////////////////////////////////////////
  1027. // Carry-less multiplication instructions.
  1028. ////////////////////////////////////////////////////////////////////////////////
  // Carry-less multiply on PdFPU0 + PdFPMMA.
  1029. defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
  // Per-instruction override for VPCLMULQDQrr: same latency (12) and cycles
  // ([1, 7]) as the generic entry above, but with 6 micro-ops.
  1030. def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
  1031. let Latency = 12;
  1032. let ResourceCycles = [1, 7];
  1033. let NumMicroOps = 6;
  1034. }
  1035. def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
  1036. ////////////////////////////////////////////////////////////////////////////////
  1037. // SSE4A instructions.
  1038. ////////////////////////////////////////////////////////////////////////////////
  // Per-instruction overrides for INSERTQ and INSERTQI. Both have Latency 3
  // on PdFPU01 + PdFPMAL; they differ only in the PdFPMAL cycle count
  // (2 for INSERTQ, 3 for INSERTQI).
  1039. def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  1040. let Latency = 3;
  1041. let ResourceCycles = [1, 2];
  1042. }
  1043. def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>;
  1044. def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  1045. let Latency = 3;
  1046. let ResourceCycles = [1, 3];
  1047. }
  1048. def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>;
  1049. ////////////////////////////////////////////////////////////////////////////////
  1050. // AVX instructions.
  1051. ////////////////////////////////////////////////////////////////////////////////
  // Per-instruction override for the YMM broadcast-from-memory instructions
  // VBROADCASTSDYrm / VBROADCASTSSYrm: Latency 6, cycles [1, 2, 4],
  // 2 micro-ops. The InstRW also attaches ReadAfterLd as a read operand.
  1052. def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
  1053. let Latency = 6;
  1054. let ResourceCycles = [1, 2, 4];
  1055. let NumMicroOps = 2;
  1056. }
  1057. def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
  1058. VBROADCASTSSYrm)>;
  // VZEROALL and VZEROUPPER are modelled with an empty resource list: only
  // latency and micro-op count are given (90 / 32 and 46 / 16 respectively).
  1059. def PdWriteVZEROALL : SchedWriteRes<[]> {
  1060. let Latency = 90;
  1061. let NumMicroOps = 32;
  1062. }
  1063. def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;
  1064. def PdWriteVZEROUPPER : SchedWriteRes<[]> {
  1065. let Latency = 46;
  1066. let NumMicroOps = 16;
  1067. }
  1068. def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;
  1069. ///////////////////////////////////////////////////////////////////////////////
  1070. // SchedWriteVariant definitions.
  1071. ///////////////////////////////////////////////////////////////////////////////
  // Write used for recognized zero-idioms: no resources and Latency 0.
  1072. def PdWriteZeroLatency : SchedWriteRes<[]> {
  1073. let Latency = 0;
  1074. }
  // Every variant below follows one pattern: if ZeroIdiomPredicate matches,
  // the instruction is scheduled as PdWriteZeroLatency; otherwise the
  // TruePred fallback selects the instruction's regular write class.
  // GPR variant: falls back to WriteALU.
  1075. def PdWriteZeroIdiom : SchedWriteVariant<[
  1076. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1077. SchedVar<MCSchedPredicate<TruePred>, [WriteALU]>
  1078. ]>;
  1079. def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
  1080. XOR32rr, XOR64rr)>;
  // XMM FP logic variant: falls back to WriteFLogic.
  1081. def PdWriteFZeroIdiom : SchedWriteVariant<[
  1082. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1083. SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]>
  1084. ]>;
  1085. def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
  1086. XORPDrr, VXORPDrr,
  1087. ANDNPSrr, VANDNPSrr,
  1088. ANDNPDrr, VANDNPDrr)>;
  // The YMM forms named in the next comment are not in the list above, so
  // they do not get the zero-latency variant.
  1089. // VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1.
  // MMX logic variant: falls back to WriteVecLogic.
  1090. def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
  1091. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1092. SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]>
  1093. ]>;
  1094. def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;
  // XMM integer logic variant: falls back to WriteVecLogicX.
  1095. def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
  1096. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1097. SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]>
  1098. ]>;
  1099. def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
  1100. PANDNrr, VPANDNrr)>;
  // MMX integer ALU variant (subtract / compare-greater): falls back to
  // WriteVecALU.
  1101. def PdWriteVZeroIdiomALU : SchedWriteVariant<[
  1102. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1103. SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]>
  1104. ]>;
  1105. def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr,
  1106. MMX_PSUBQrr, MMX_PSUBWrr,
  1107. MMX_PCMPGTBrr,
  1108. MMX_PCMPGTDrr,
  1109. MMX_PCMPGTWrr)>;
  // XMM integer ALU variant: falls back to WriteVecALUX.
  1110. def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
  1111. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1112. SchedVar<MCSchedPredicate<TruePred>, [WriteVecALUX]>
  1113. ]>;
  1114. def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
  1115. PSUBDrr, VPSUBDrr,
  1116. PSUBQrr, VPSUBQrr,
  1117. PSUBWrr, VPSUBWrr,
  1118. PCMPGTBrr, VPCMPGTBrr,
  1119. PCMPGTDrr, VPCMPGTDrr,
  1120. PCMPGTWrr, VPCMPGTWrr)>;
  1121. ///////////////////////////////////////////////////////////////////////////////
  1122. // Dependency breaking instructions.
  1123. ///////////////////////////////////////////////////////////////////////////////
  // Lists the instructions treated as zero-idioms when ZeroIdiomPredicate
  // holds, grouped into GPR, MMX, SSE and AVX classes.
  // The original note below refers to the lists that follow: the AVX class
  // contains VPCMPGTQrr, whereas the SSE class has no PCMPGTQrr entry.
  1124. // VPCMPGTQ, but not PCMPGTQ!
  1125. def : IsZeroIdiomFunction<[
  1126. // GPR Zero-idioms.
  1127. DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
  1128. // MMX Zero-idioms.
  1129. DepBreakingClass<[
  1130. MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
  1131. MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
  1132. MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
  1133. MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
  1134. ], ZeroIdiomPredicate>,
  1135. // SSE Zero-idioms.
  1136. DepBreakingClass<[
  1137. // fp variants.
  1138. XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
  1139. // int variants.
  1140. PXORrr, PANDNrr,
  1141. PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
  1142. PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
  1143. PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
  1144. ], ZeroIdiomPredicate>,
  1145. // AVX Zero-idioms.
  1146. DepBreakingClass<[
  1147. // xmm fp variants.
  1148. VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
  1149. // xmm int variants.
  1150. VPXORrr, VPANDNrr,
  1151. VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
  1152. VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
  1153. VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  1154. // ymm variants.
  1155. VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  1156. ], ZeroIdiomPredicate>
  1157. ]>;
  // Lists the instructions treated as dependency-breaking. SBB and the
  // PCMPEQ* entries are guarded by ZeroIdiomPredicate; CMP is guarded by
  // CheckSameRegOperand<0, 1> instead, i.e. a check on operands 0 and 1.
  // The inline notes record that the quadword compares (PCMPEQQrr,
  // VPCMPEQQrr) are deliberately left out.
  1158. def : IsDepBreakingFunction<[
  1159. // GPR
  1160. DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  1161. DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
  1162. // MMX
  1163. DepBreakingClass<[
  1164. MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
  1165. ], ZeroIdiomPredicate>,
  1166. // SSE
  1167. DepBreakingClass<[
  1168. PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
  1169. // But not PCMPEQQrr.
  1170. ], ZeroIdiomPredicate>,
  1171. // AVX
  1172. DepBreakingClass<[
  1173. VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
  1174. // But not VPCMPEQQrr.
  1175. ], ZeroIdiomPredicate>
  1176. ]>;
  1177. } // SchedModel