X86ScheduleBdVer2.td 57 KB


  1. //=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines the machine model for AMD bdver2 (Piledriver) to support
  10. // instruction scheduling and other instruction cost heuristics.
  11. // Based on:
  12. // * AMD Software Optimization Guide for AMD Family 15h Processors.
  13. // https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
  14. // * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
  15. // http://www.agner.org/optimize/microarchitecture.pdf
  16. // * https://www.realworldtech.com/bulldozer/
  17. // Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
  18. //
  19. //===----------------------------------------------------------------------===//
  20. def BdVer2Model : SchedMachineModel { // Model selected by the `let SchedModel = BdVer2Model` scope that follows.
  21. let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired.
  22. let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed (same value as PdRCU).
  23. let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer.
  24. let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency.
  25. let HighLatency = 25; // FIXME: any better choice?
  26. let MispredictPenalty = 20; // Minimum branch misdirection penalty.
  27. let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
  28. // FIXME: Incomplete. This flag is set to allow the scheduler to assign
  29. // a default model to unrecognized opcodes.
  30. let CompleteModel = 0;
  31. } // SchedMachineModel
  32. let SchedModel = BdVer2Model in {
  33. //===----------------------------------------------------------------------===//
  34. // Pipes
  35. //===----------------------------------------------------------------------===//
  36. // There are a total of eight pipes.
  37. //===----------------------------------------------------------------------===//
  38. // Integer execution pipes
  39. //
  40. // Two EX (ALU) pipes.
  41. def PdEX0 : ProcResource<1>; // ALU, Integer Pipe0
  42. def PdEX1 : ProcResource<1>; // ALU, Integer Pipe1
  43. def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>; // Group: either of the two ALU pipes
  44. // Two AGLU pipes, identical.
  45. def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]; one resource with two units
  46. //===----------------------------------------------------------------------===//
  47. // Floating point execution pipes
  48. //
  49. // Four FPU pipes.
  50. def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
  51. def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
  52. def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
  53. def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3
  54. // FPU grouping
  55. def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>; // Group: Pipe0 or Pipe1
  56. def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>; // Group: Pipe2 or Pipe3
  57. //===----------------------------------------------------------------------===//
  58. // RCU
  59. //===----------------------------------------------------------------------===//
  60. // The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
  61. // On the other hand, the RCU reorder buffer size for Piledriver does not
  62. // seem to be specified in any trustworthy source.
  63. // But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had
  64. // RCU reorder buffer size of 128. So that is a good guess for now.
  65. def PdRCU : RetireControlUnit<128, 4>; // <reorder buffer size (guess), macro-ops retired per cycle>
  66. //===----------------------------------------------------------------------===//
  67. // Pipelines
  68. //===----------------------------------------------------------------------===//
  69. // There are a total of two pipelines, each one with its own scheduler.
  70. //===----------------------------------------------------------------------===//
  71. // Integer Pipeline Scheduling
  72. //
  73. // There is one Integer Scheduler per core.
  74. // Integer physical register file has 96 registers of 64-bit.
  75. def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>; // 96 entries, covering the GR64 and CCR classes
  76. // Unified Integer, Memory Scheduler has 40 entries.
  77. def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
  78. // Number of entries in the unified integer/memory scheduler.
  79. let BufferSize = 40;
  80. }
  81. //===----------------------------------------------------------------------===//
  82. // FPU Pipeline Scheduling
  83. //
  84. // The FPU unit is shared between the two cores.
  85. // FP physical register file has 160 registers of 128-bit.
  86. // Operations on 256-bit data types are cracked into two COPs.
  87. def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>; // Per-class costs: VR256 is charged 2 entries
  88. // Unified FP Scheduler has 64 entries.
  89. def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
  90. // Number of entries in the unified FP scheduler.
  91. let BufferSize = 64;
  92. }
  93. //===----------------------------------------------------------------------===//
  94. // Functional units
  95. //===----------------------------------------------------------------------===//
  96. //===----------------------------------------------------------------------===//
  97. // Load-Store Units
  98. //
  99. let Super = PdAGLU01 in // PdLoad is declared as a sub-resource of the AGLU pipes.
  100. def PdLoad : ProcResource<2> {
  101. // For Piledriver, the load queue is 40 entries deep.
  102. let BufferSize = 40;
  103. }
  104. def PdLoadQueue : LoadQueue<PdLoad>; // Load queue is tied to the PdLoad resource.
  105. let Super = PdAGLU01 in // PdStore is likewise a sub-resource of the AGLU pipes.
  106. def PdStore : ProcResource<1> {
  107. // For Piledriver, the store queue is 24 entries deep.
  108. let BufferSize = 24;
  109. }
  110. def PdStoreQueue : StoreQueue<PdStore>; // Store queue is tied to the PdStore resource.
  111. //===----------------------------------------------------------------------===//
  112. // Integer Execution Units
  113. //
  114. def PdDiv : ProcResource<1>; // PdEX0; unpipelined integer division
  115. def PdCount : ProcResource<1>; // PdEX0; POPCNT, LZCNT
  116. def PdMul : ProcResource<1>; // PdEX1; integer multiplication
  117. def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches
  118. //===----------------------------------------------------------------------===//
  119. // Floating-Point Units
  120. //
  121. // Two FMAC/FPFMA units.
  122. def PdFPFMA : ProcResource<2>; // PdFPU0, PdFPU1
  123. // One 128-bit integer multiply-accumulate unit.
  124. def PdFPMMA : ProcResource<1>; // PdFPU0
  125. // One fp conversion unit.
  126. def PdFPCVT : ProcResource<1>; // PdFPU0
  127. // One unit for shuffles, packs, permutes, shifts.
  128. def PdFPXBR : ProcResource<1>; // PdFPU1
  129. // Two 128-bit packed integer units.
  130. def PdFPMAL : ProcResource<2>; // PdFPU2, PdFPU3
  131. // One FP store unit.
  132. def PdFPSTO : ProcResource<1>; // PdFPU3
  133. //===----------------------------------------------------------------------===//
  134. // Basic helper classes.
  135. //===----------------------------------------------------------------------===//
  136. // Many SchedWrites are defined in pairs with and without a folded load.
  137. // Instructions with folded loads are usually micro-fused, so they only appear
  138. // as two micro-ops when dispatched by the schedulers.
  139. // This multiclass defines the resource usage for variants with and without
  140. // folded loads.
  141. multiclass PdWriteRes<SchedWrite SchedRW, // SchedRW: the write class being described.
  142. list<ProcResourceKind> ExePorts, int Lat = 1, // ExePorts: resources consumed; Lat: latency (default 1).
  143. list<int> Res = [], int UOps = 1> { // Res: cycles per ExePorts entry (default []); UOps: micro-op count (default 1).
  144. def : WriteRes<SchedRW, ExePorts> { // Single anonymous WriteRes; parameters are forwarded unchanged.
  145. let Latency = Lat;
  146. let ResourceCycles = Res;
  147. let NumMicroOps = UOps;
  148. }
  149. }
  150. multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW, // Defines both SchedRW and SchedRW.Folded.
  151. list<ProcResourceKind> ExePorts, int Lat,
  152. list<int> Res, int UOps,
  153. int LoadLat, int LoadRes, int LoadUOps> { // Load*: extra latency / PdLoad cycles / micro-ops for the folded form.
  154. defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>; // Variant without a folded load: parameters used as given.
  155. defm : PdWriteRes<SchedRW.Folded, // Variant with a folded load:
  156. !listconcat([PdLoad], ExePorts), // PdLoad is prepended to the resource list,
  157. !add(Lat, LoadLat), // the load latency is added to the latency,
  158. !if(!and(!empty(Res), !eq(LoadRes, 1)), // and if no cycles were given and LoadRes is 1
  159. [], // the cycle list stays empty;
  160. !listconcat([LoadRes], // otherwise it is LoadRes (for PdLoad) followed by
  161. !if(!empty(Res),
  162. !listsplat(1, !size(ExePorts)), // a 1 for every ExePorts entry when Res is empty,
  163. Res))), // or the caller's Res.
  164. !add(UOps, LoadUOps)>; // LoadUOps is added to the micro-op count.
  165. }
  166. multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW, // Pair wrapper around __pdWriteResPair.
  167. list<ProcResourceKind> ExePorts, int Lat = 1,
  168. list<int> Res = [], int UOps = 1,
  169. int LoadUOps = 0> { // By default the folded load adds no micro-ops.
  170. defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
  171. /*LoadLat*/4, /*LoadRes*/3, LoadUOps>; // Folded form: +4 cycles latency, 3 cycles on PdLoad.
  172. }
  173. multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW, // Pair wrapper around __pdWriteResPair.
  174. list<ProcResourceKind> ExePorts, int Lat = 1,
  175. list<int> Res = [], int UOps = 1,
  176. int LoadUOps = 0> { // By default the folded load adds no micro-ops.
  177. defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
  178. /*LoadLat*/5, /*LoadRes*/3, LoadUOps>; // Folded form: +5 cycles latency, 3 cycles on PdLoad.
  179. }
  180. multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW, // Pair wrapper around __pdWriteResPair.
  181. list<ProcResourceKind> ExePorts, int Lat, // Unlike the Ex/XMM wrappers, Lat has no default here.
  182. list<int> Res = [], int UOps = 2, // UOps defaults to 2 rather than 1.
  183. int LoadUOps = 0> { // By default the folded load adds no micro-ops.
  184. defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
  185. /*LoadLat*/5, /*LoadRes*/3, LoadUOps>; // Folded form: +5 cycles latency, 3 cycles on PdLoad.
  186. }
  187. //===----------------------------------------------------------------------===//
  188. // Here be dragons.
  189. //===----------------------------------------------------------------------===//
  190. // L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
  191. // needn't be available until 4 cycles after the memory operand.
  192. def : ReadAdvance<ReadAfterLd, 4>;
  193. // Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
  194. // until 5 cycles after the memory operand.
  195. def : ReadAdvance<ReadAfterVecLd, 5>;
  196. def : ReadAdvance<ReadAfterVecXLd, 5>;
  197. def : ReadAdvance<ReadAfterVecYLd, 5>;
  198. // Transfer from int domain to ivec domain incurs additional latency of 8..10cy
  199. // Reference: Agner, Microarchitecture, "AMD Bulldozer, Piledriver, Steamroller
  200. // and Excavator pipeline", "Data delay between different execution domains"
  201. def : ReadAdvance<ReadInt2Fpu, -10>;
  202. // A folded store needs a cycle on the PdStore for the store data.
  203. def : WriteRes<WriteRMW, [PdStore]>;
  204. ////////////////////////////////////////////////////////////////////////////////
  205. // Loads, stores, and moves, not folded with other operations.
  206. ////////////////////////////////////////////////////////////////////////////////
  207. def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [2]; }
  208. def : WriteRes<WriteStore, [PdStore]>;
  209. def : WriteRes<WriteStoreNT, [PdStore]>;
  210. def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
  211. defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
  212. // Load/store MXCSR.
  213. // FIXME: These are copy and pasted from WriteLoad/Store.
  214. def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
  215. def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; let ResourceCycles = [18]; }
  216. // Treat misc copies as a move.
  217. def : InstRW<[WriteMove], (instrs COPY)>;
  218. ////////////////////////////////////////////////////////////////////////////////
  219. // Idioms that clear a register, like xorps %xmm0, %xmm0.
  220. // These can often bypass execution ports completely.
  221. ////////////////////////////////////////////////////////////////////////////////
  222. def : WriteRes<WriteZero, [/*No ExePorts*/]>;
  223. ////////////////////////////////////////////////////////////////////////////////
  224. // Branches don't produce values, so they have no latency, but they still
  225. // consume resources. Indirect branches can fold loads.
  226. ////////////////////////////////////////////////////////////////////////////////
  227. defm : PdWriteResExPair<WriteJump, [PdEX1, PdBranch]>;
  228. ////////////////////////////////////////////////////////////////////////////////
  229. // Special case scheduling classes.
  230. ////////////////////////////////////////////////////////////////////////////////
  231. def : WriteRes<WriteSystem, [PdEX01]> { let Latency = 100; }
  232. def : WriteRes<WriteMicrocoded, [PdEX01]> { let Latency = 100; }
  233. def : WriteRes<WriteFence, [PdStore]>;
  234. def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
  235. let Latency = 6;
  236. }
  237. def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
  238. def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
  239. let Latency = 184;
  240. let ResourceCycles = [375];
  241. let NumMicroOps = 45;
  242. }
  243. def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
  244. "LSL(16|32|64)rr")>;
  245. // Nops don't have dependencies, so there's no actual latency, but we still tell
  246. // the scheduler that the nop occupies an ALU pipe (2 resource cycles on PdEX01).
  247. def : WriteRes<WriteNop, [PdEX01]> { let ResourceCycles = [2]; }
  248. ////////////////////////////////////////////////////////////////////////////////
  249. // Arithmetic.
  250. ////////////////////////////////////////////////////////////////////////////////
  251. defm : PdWriteResExPair<WriteALU, [PdEX01], 1, [2]>;
  252. def PdWriteALURMW : SchedWriteRes<[PdLoad, PdEX01, PdStore]> {
  253. let Latency = 6;
  254. let ResourceCycles = [3, 2, 1];
  255. let NumMicroOps = 1;
  256. }
  257. def : SchedAlias<WriteALURMW, PdWriteALURMW>;
  258. def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
  259. let Latency = 6;
  260. let ResourceCycles = [88];
  261. let NumMicroOps = 4;
  262. }
  263. def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
  264. def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
  265. let Latency = 2;
  266. let ResourceCycles = [2];
  267. let NumMicroOps = 2;
  268. }
  269. def : InstRW<[PdWriteBMI1],
  270. (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
  271. BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
  272. BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
  273. BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
  274. TZMSK32rr, TZMSK64rr)>;
  275. def PdWriteBMI1m : SchedWriteRes<[PdLoad, PdEX01]> {
  276. let Latency = 6;
  277. let ResourceCycles = [3, 3];
  278. let NumMicroOps = 2;
  279. }
  280. def : InstRW<[PdWriteBMI1m],
  281. (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
  282. BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
  283. BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
  284. BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
  285. TZMSK32rm, TZMSK64rm)>;
  286. defm : PdWriteResExPair<WriteADC, [PdEX01], 1, [2]>;
  287. def PdWriteADCSBB64ri32 : SchedWriteRes<[PdEX01]> {
  288. let ResourceCycles = [3];
  289. }
  290. def : InstRW<[PdWriteADCSBB64ri32], (instrs ADC64ri32, SBB64ri32)>;
  291. defm : PdWriteRes<WriteBSWAP32, [PdEX01]>;
  292. defm : PdWriteRes<WriteBSWAP64, [PdEX01]>;
  293. defm : PdWriteRes<WriteCMPXCHG, [PdEX1], 3, [3], 5>;
  294. defm : PdWriteRes<WriteCMPXCHGRMW, [PdEX1, PdStore, PdLoad], 3, [44, 1, 1], 2>;
  295. defm : PdWriteRes<WriteXCHG, [PdEX1], 1, [], 2>;
  296. def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
  297. let Latency = 3;
  298. let ResourceCycles = [3];
  299. let NumMicroOps = 3;
  300. }
  301. def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
  302. def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
  303. let Latency = 3;
  304. let ResourceCycles = [23];
  305. let NumMicroOps = 5;
  306. }
  307. def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
  308. def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
  309. let Latency = 3;
  310. let ResourceCycles = [21];
  311. let NumMicroOps = 6;
  312. }
  313. def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
  314. (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
  315. def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
  316. let Latency = 3;
  317. let ResourceCycles = [26];
  318. let NumMicroOps = 18;
  319. }
  320. def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
  321. def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
  322. let Latency = 3;
  323. let ResourceCycles = [69];
  324. let NumMicroOps = 22;
  325. }
  326. def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
  327. def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
  328. let Latency = 6;
  329. let ResourceCycles = [20];
  330. let NumMicroOps = 4;
  331. }
  332. def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
  333. defm : PdWriteResExPair<WriteIMul8, [PdEX1, PdMul], 4, [1, 4]>;
  334. defm : PdWriteResExPair<WriteIMul16, [PdEX1, PdMul], 4, [1, 5], 2>;
  335. defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul], 5, [1, 5], 2>;
  336. defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul], 4, [1, 2]>;
  337. defm : PdWriteResExPair<WriteIMul32, [PdEX1, PdMul], 4, [1, 4]>;
  338. defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul], 4, [1, 2], 1, 1>;
  339. defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul], 4, [1, 2]>;
  340. defm : PdWriteResExPair<WriteIMul64, [PdEX1, PdMul], 6, [1, 6]>;
  341. defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul], 6, [1, 4],1, 1>;
  342. defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul], 6, [1, 4]>;
  343. // BMI2 MULX
  344. defm : X86WriteResUnsupported<WriteIMulH>;
  345. defm : X86WriteResUnsupported<WriteIMulHLd>;
  346. defm : X86WriteResPairUnsupported<WriteMULX32>;
  347. defm : X86WriteResPairUnsupported<WriteMULX64>;
  348. defm : PdWriteResExPair<WriteDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
  349. defm : PdWriteResExPair<WriteDiv16, [PdEX1, PdDiv], 15, [1, 15], 2>;
  350. defm : PdWriteResExPair<WriteDiv32, [PdEX1, PdDiv], 14, [1, 14], 2>;
  351. defm : PdWriteResExPair<WriteDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
  352. defm : PdWriteResExPair<WriteIDiv8, [PdEX1, PdDiv], 12, [1, 12]>;
  353. defm : PdWriteResExPair<WriteIDiv16, [PdEX1, PdDiv], 15, [1, 17], 2>;
  354. defm : PdWriteResExPair<WriteIDiv32, [PdEX1, PdDiv], 14, [1, 25], 2>;
  355. defm : PdWriteResExPair<WriteIDiv64, [PdEX1, PdDiv], 14, [1, 14], 2>;
  356. defm : PdWriteResExPair<WriteCRC32, [PdEX01], 2, [4], 3>;
  357. def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
  358. let Latency = 5;
  359. let ResourceCycles = [10];
  360. let NumMicroOps = 5;
  361. }
  362. def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
  363. def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
  364. let Latency = 6;
  365. let ResourceCycles = [12];
  366. let NumMicroOps = 7;
  367. }
  368. def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
  369. def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
  370. let Latency = 10;
  371. let ResourceCycles = [17];
  372. let NumMicroOps = 11;
  373. }
  374. def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
  375. defm : PdWriteResExPair<WriteCMOV, [PdEX01]>; // Conditional move.
  376. def PdWriteCMOVm : SchedWriteRes<[PdLoad, PdEX01]> { // Write used for the conditions listed in PdWriteCMOVmVar.
  377. let Latency = 5;
  378. let ResourceCycles = [3, 3];
  379. let NumMicroOps = 2;
  380. }
  381. def PdWriteCMOVmVar : SchedWriteVariant<[ // Chooses the write from the condition immediate in operand 7.
  382. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_BE">>, [PdWriteCMOVm]>,
  383. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_A">>, [PdWriteCMOVm]>,
  384. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_L">>, [PdWriteCMOVm]>,
  385. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_GE">>, [PdWriteCMOVm]>,
  386. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_LE">>, [PdWriteCMOVm]>,
  387. SchedVar<MCSchedPredicate<CheckImmOperand_s<7, "X86::COND_G">>, [PdWriteCMOVm]>,
  388. SchedVar<NoSchedPred, [WriteCMOV.Folded]> // Any other condition: the generic folded CMOV write.
  389. ]>;
  390. def : InstRW<[PdWriteCMOVmVar], (instrs CMOV16rm, CMOV32rm, CMOV64rm)>; // Applied to the memory-source CMOV forms.
  391. defm : PdWriteRes<WriteFCMOV, [PdFPU0, PdFPFMA]>; // x87 conditional move.
  392. def : WriteRes<WriteSETCC, [PdEX01]>; // Setcc.
  393. def : WriteRes<WriteSETCCStore, [PdEX01, PdStore]>;
  394. def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> { // Write used for the GE/G/LE/L conditions below.
  395. let ResourceCycles = [2];
  396. let NumMicroOps = 2;
  397. }
  398. def PdSETGEmSETGmSETLEmSETLm : SchedWriteVariant<[ // Chooses the write from the condition immediate in operand 5.
  399. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_GE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  400. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_G">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  401. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_LE">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  402. SchedVar<MCSchedPredicate<CheckImmOperand_s<5, "X86::COND_L">>, [PdWriteSETGEmSETGmSETLEmSETLm]>,
  403. SchedVar<NoSchedPred, [WriteSETCCStore]> // Any other condition: the generic SETcc-store write.
  404. ]>;
  405. def : InstRW<[PdSETGEmSETGmSETLEmSETLm], (instrs SETCCm)>; // Applied to the memory-destination SETcc form.
  406. defm : PdWriteRes<WriteLAHFSAHF, [PdEX01], 2, [4], 2>;
  407. def PdWriteLAHF : SchedWriteRes<[PdEX01]> {
  408. let Latency = 2;
  409. let ResourceCycles = [4];
  410. let NumMicroOps = 4;
  411. }
  412. def : InstRW<[PdWriteLAHF], (instrs LAHF)>;
  413. def PdWriteSAHF : SchedWriteRes<[PdEX01]> {
  414. let Latency = 2;
  415. let ResourceCycles = [2];
  416. let NumMicroOps = 2;
  417. }
  418. def : InstRW<[PdWriteSAHF], (instrs SAHF)>;
  419. defm : PdWriteRes<WriteBitTest, [PdEX01], 1, [2], 1>;
  420. defm : PdWriteRes<WriteBitTestImmLd, [PdEX01, PdLoad], 5, [2, 3], 1>;
  421. defm : PdWriteRes<WriteBitTestRegLd, [PdEX01, PdLoad], 5, [7, 2], 7>;
  422. defm : PdWriteRes<WriteBitTestSet, [PdEX01], 2, [2], 2>;
  423. defm : PdWriteRes<WriteBitTestSetImmLd, [PdEX01, PdLoad], 6, [1, 1], 4>;
  424. defm : PdWriteRes<WriteBitTestSetRegLd, [PdEX01, PdLoad], 6, [1, 1], 10>;
  425. def PdWriteBTSIm : SchedWriteRes<[PdEX01, PdLoad]> {
  426. let Latency = 7;
  427. let ResourceCycles = [42, 1];
  428. let NumMicroOps = 4;
  429. }
  430. def : SchedAlias<WriteBitTestSetImmRMW, PdWriteBTSIm>;
  431. def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
  432. let Latency = 7;
  433. let ResourceCycles = [44, 1];
  434. let NumMicroOps = 10;
  435. }
  436. def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
  437. // This is for simple LEAs with one or two input operands.
  438. def : WriteRes<WriteLEA, [PdEX01]> { let ResourceCycles = [2]; }
  439. // This write is used for slow LEA instructions.
  440. def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> {
  441. let Latency = 2;
  442. let ResourceCycles = [2];
  443. }
  444. // On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
  445. // or an LEA with a `Scale` value different than 1.
  446. def PdSlowLEAPredicate : MCSchedPredicate<
  447. CheckAny<[
  448. // A 3-operand LEA (base, index, offset).
  449. IsThreeOperandsLEAFn,
  450. // An LEA with a "Scale" different than 1.
  451. CheckAll<[
  452. CheckIsImmOperand<2>,
  453. CheckNot<CheckImmOperand<2, 1>>
  454. ]>
  455. ]>
  456. >;
  457. def PdWriteLEA : SchedWriteVariant<[
  458. SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>,
  459. SchedVar<NoSchedPred, [WriteLEA]>
  460. ]>;
  461. def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
  462. def PdWriteLEA16r : SchedWriteRes<[PdEX01]> {
  463. let ResourceCycles = [3];
  464. let NumMicroOps = 2;
  465. }
  466. def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>;
  467. // Bit counts.
  468. defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
  469. defm : PdWriteResExPair<WriteBSR, [PdEX01], 4, [8], 7, 2>;
  470. defm : PdWriteResExPair<WritePOPCNT, [PdEX01], 4, [4]>;
  471. defm : PdWriteResExPair<WriteLZCNT, [PdEX0], 2, [2], 2>;
  472. defm : PdWriteResExPair<WriteTZCNT, [PdEX0], 2, [2], 2>;
  473. // BMI1 BEXTR, BMI2 BZHI
  474. defm : PdWriteResExPair<WriteBEXTR, [PdEX01], 2, [2], 2>;
  475. defm : PdWriteResExPair<WriteBLS, [PdEX01], 2, [2], 2>;
  476. defm : PdWriteResExPair<WriteBZHI, [PdEX01]>;
  477. def PdWriteBEXTRI : SchedWriteRes<[PdEX01]> {
  478. let Latency = 2;
  479. let ResourceCycles = [4];
  480. let NumMicroOps = 2;
  481. }
  482. def : InstRW<[PdWriteBEXTRI], (instrs BEXTRI32ri, BEXTRI64ri)>;
  483. def PdWriteBEXTRIm : SchedWriteRes<[PdEX01]> {
  484. let Latency = 2;
  485. let ResourceCycles = [5];
  486. let NumMicroOps = 2;
  487. }
  488. def : InstRW<[PdWriteBEXTRIm], (instrs BEXTRI32mi, BEXTRI64mi)>;
  489. ////////////////////////////////////////////////////////////////////////////////
  490. // Integer shifts and rotates.
  491. ////////////////////////////////////////////////////////////////////////////////
  492. defm : PdWriteResExPair<WriteShift, [PdEX01], 1, [2]>;
  493. defm : PdWriteResExPair<WriteShiftCL, [PdEX01]>;
  494. defm : PdWriteResExPair<WriteRotate, [PdEX01], 1, [2]>;
  495. defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
  496. def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
  497. let Latency = 12;
  498. let ResourceCycles = [24];
  499. let NumMicroOps = 26;
  500. }
  501. def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
  502. def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
  503. let Latency = 12;
  504. let ResourceCycles = [23];
  505. let NumMicroOps = 23;
  506. }
  507. def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
  508. def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
  509. let Latency = 11;
  510. let ResourceCycles = [22];
  511. let NumMicroOps = 24;
  512. }
  513. def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
  514. def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
  515. let Latency = 10;
  516. let ResourceCycles = [20];
  517. let NumMicroOps = 22;
  518. }
  519. def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
  520. def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
  521. let Latency = 10;
  522. let ResourceCycles = [19];
  523. let NumMicroOps = 19;
  524. }
  525. def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
  526. def PdWriteRCL3264rCL : SchedWriteRes<[PdEX01]> {
  527. let Latency = 7;
  528. let ResourceCycles = [14];
  529. let NumMicroOps = 17;
  530. }
  531. def : InstRW<[PdWriteRCL3264rCL], (instrs RCL32rCL, RCL64rCL)>;
  532. def PdWriteRCR3264rCL : SchedWriteRes<[PdEX01]> {
  533. let Latency = 7;
  534. let ResourceCycles = [13];
  535. let NumMicroOps = 16;
  536. }
  537. def : InstRW<[PdWriteRCR3264rCL], (instrs RCR32rCL, RCR64rCL)>;
  538. def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
  539. let Latency = 7;
  540. let ResourceCycles = [14];
  541. let NumMicroOps = 15;
  542. }
  543. def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
  544. def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
  545. let Latency = 9;
  546. let ResourceCycles = [18];
  547. let NumMicroOps = 20;
  548. }
  549. def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
  550. def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
  551. let Latency = 11;
  552. let ResourceCycles = [21];
  553. let NumMicroOps = 21;
  554. }
  555. def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
  556. def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
  557. let Latency = 8;
  558. let ResourceCycles = [15];
  559. let NumMicroOps = 16;
  560. }
  561. def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
  562. def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
  563. let Latency = 13;
  564. let ResourceCycles = [25];
  565. let NumMicroOps = 25;
  566. }
  567. def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
  568. // SHLD/SHRD.
  569. defm : PdWriteRes<WriteSHDrri, [PdEX01], 3, [6], 6>;
  570. defm : PdWriteRes<WriteSHDrrcl, [PdEX01], 3, [8], 7>;
  571. def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
  572. let Latency = 3;
  573. let ResourceCycles = [6];
  574. let NumMicroOps = 7;
  575. }
  576. def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
  577. SHLD32rrCL,
  578. SHRD32rrCL)>;
  579. defm : PdWriteRes<WriteSHDmri, [PdLoad, PdEX01], 4, [1, 22], 8>;
  580. defm : PdWriteRes<WriteSHDmrcl, [PdLoad, PdEX01], 4, [1, 22], 8>;
  581. ////////////////////////////////////////////////////////////////////////////////
  582. // Floating point. This covers both scalar and vector operations.
  583. ////////////////////////////////////////////////////////////////////////////////
  584. defm : PdWriteRes<WriteFLD0, [PdFPU1, PdFPSTO], 3>;
  585. defm : PdWriteRes<WriteFLD1, [PdFPU1, PdFPSTO], 3>;
  586. defm : PdWriteRes<WriteFLDC, [PdFPU1, PdFPSTO], 3>;
  587. defm : PdWriteRes<WriteFLoad, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
  588. defm : PdWriteRes<WriteFLoadX, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3]>;
  589. defm : PdWriteRes<WriteFLoadY, [PdLoad, PdFPU01, PdFPFMA], 5, [3, 1, 3], 2>;
  590. defm : PdWriteRes<WriteFMaskedLoad, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 1, 4]>;
  591. defm : PdWriteRes<WriteFMaskedLoadY, [PdLoad, PdFPU01, PdFPFMA], 6, [3, 2, 4], 2>;
  592. defm : PdWriteRes<WriteFStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
  593. defm : PdWriteRes<WriteFStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
  594. defm : PdWriteRes<WriteFStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [1, 36, 2], 4>;
  595. def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU23, PdFPSTO]> {
  596. let Latency = 2;
  597. let ResourceCycles = [1, 3, 1];
  598. let NumMicroOps = 2;
  599. }
  600. def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
  601. def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  602. let NumMicroOps = 8;
  603. }
  604. def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;
  605. defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
  606. defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
  607. defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
  608. defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
  609. defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
  610. defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
  611. defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
  612. defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
  613. defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
  614. defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
  615. defm : X86WriteResUnsupported<WriteFMoveZ>;
  616. defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>;
  617. defm : PdWriteResXMMPair<WriteFAdd, [PdFPU0, PdFPFMA], 5>;
  618. defm : PdWriteResXMMPair<WriteFAddX, [PdFPU0, PdFPFMA], 5>;
  619. defm : PdWriteResYMMPair<WriteFAddY, [PdFPU0, PdFPFMA], 5, [1, 2]>;
  620. defm : X86WriteResPairUnsupported<WriteFAddZ>;
  621. def PdWriteX87Add: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
  622. let Latency = 5;
  623. let ResourceCycles = [3, 1, 10];
  624. }
  625. def : InstRW<[PdWriteX87Add], (instrs ADD_FI16m, ADD_FI32m, ADD_F32m, ADD_F64m,
  626. SUB_FI16m, SUB_FI32m, SUB_F32m, SUB_F64m,
  627. SUBR_FI16m, SUBR_FI32m, SUBR_F32m, SUBR_F64m)>;
  628. defm : PdWriteResXMMPair<WriteFAdd64, [PdFPU0, PdFPFMA], 5>;
  629. defm : PdWriteResXMMPair<WriteFAdd64X, [PdFPU0, PdFPFMA], 5>;
  630. defm : PdWriteResYMMPair<WriteFAdd64Y, [PdFPU0, PdFPFMA], 5, [1, 2]>;
  631. defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
  // WriteFCmp* / WriteFCmp64* classes on PdFPU0 + PdFPFMA; Z variants are
  // marked unsupported.
  632. defm : PdWriteResXMMPair<WriteFCmp, [PdFPU0, PdFPFMA], 2>;
  633. defm : PdWriteResXMMPair<WriteFCmpX, [PdFPU0, PdFPFMA], 2>;
  634. defm : PdWriteResYMMPair<WriteFCmpY, [PdFPU0, PdFPFMA], 2, [1, 2]>;
  635. defm : X86WriteResPairUnsupported<WriteFCmpZ>;
  636. defm : PdWriteResXMMPair<WriteFCmp64, [PdFPU0, PdFPFMA], 2>;
  637. defm : PdWriteResXMMPair<WriteFCmp64X, [PdFPU0, PdFPFMA], 2>;
  638. defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
  639. defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
  // WriteFCom* additionally list PdEX0 as a resource.
  640. defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  641. defm : PdWriteResXMMPair<WriteFComX, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  // Override for FCOM/FCOMP with 32m/64m operands: latency 6 on
  // PdFPU1 + PdFPFMA.
  642. def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
  643. let Latency = 6;
  644. }
  645. def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;
  // Override for TST_F and UCOM_FPPr; no fields are set, so the SchedWriteRes
  // defaults apply.
  646. def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
  647. def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
  // WriteFMul* classes, all bound to PdFPU1 + PdFPFMA; WriteFMulZ is marked
  // unsupported.
  648. defm : PdWriteResXMMPair<WriteFMul, [PdFPU1, PdFPFMA], 5>;
  649. defm : PdWriteResXMMPair<WriteFMulX, [PdFPU1, PdFPFMA], 5>;
  650. defm : PdWriteResYMMPair<WriteFMulY, [PdFPU1, PdFPFMA], 5, [1, 2]>;
  651. defm : X86WriteResPairUnsupported<WriteFMulZ>;
  // Dedicated write for the x87 MUL `...m` opcodes listed in the InstRW below:
  // latency 5, with PdLoad in addition to PdFPU1 + PdFPFMA.
  652. def PdWriteX87Mul: SchedWriteRes<[PdLoad, PdFPU1, PdFPFMA]> {
  653. let Latency = 5;
  654. let ResourceCycles = [3, 1, 10];
  655. }
  656. def : InstRW<[PdWriteX87Mul], (instrs MUL_FI16m, MUL_FI32m, MUL_F32m, MUL_F64m)>;
  // WriteFMul64* classes use the same resources and values as WriteFMul*.
  657. defm : PdWriteResXMMPair<WriteFMul64, [PdFPU1, PdFPFMA], 5>;
  658. defm : PdWriteResXMMPair<WriteFMul64X, [PdFPU1, PdFPFMA], 5>;
  659. defm : PdWriteResYMMPair<WriteFMul64Y, [PdFPU1, PdFPFMA], 5, [1, 2]>;
  660. defm : X86WriteResPairUnsupported<WriteFMul64Z>;
  // WriteFMA* classes use the PdFPU resource (rather than a PdFPU0/PdFPU1
  // style name) together with PdFPFMA; WriteFMAZ is marked unsupported.
  661. defm : PdWriteResXMMPair<WriteFMA, [PdFPU, PdFPFMA], 5, [1, 3]>;
  662. defm : PdWriteResXMMPair<WriteFMAX, [PdFPU, PdFPFMA], 5, [1, 3]>;
  663. defm : PdWriteResYMMPair<WriteFMAY, [PdFPU, PdFPFMA], 5, [1, 3]>;
  664. defm : X86WriteResPairUnsupported<WriteFMAZ>;
  // WriteDPPD / WriteDPPS / WriteDPPSY classes on PdFPU1 + PdFPFMA.
  665. defm : PdWriteResXMMPair<WriteDPPD, [PdFPU1, PdFPFMA], 15, [1, 10], 15, 2>;
  666. defm : PdWriteResXMMPair<WriteDPPS, [PdFPU1, PdFPFMA], 25, [1, 14], 16, 2>;
  667. defm : PdWriteResYMMPair<WriteDPPSY, [PdFPU1, PdFPFMA], 27, [2, 25], /*or 29*/ 25, 4>;
  // VDPPSrri gets its own write: latency 27, resource cycles [1, 14],
  // 17 micro-ops.
  668. def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
  669. let Latency = 27;
  670. let ResourceCycles = [1, 14];
  671. let NumMicroOps = 17;
  672. }
  673. def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
  // WriteFRcp* and WriteFRsqrt* classes, all bound to PdFPU1 + PdFPFMA; the Z
  // variants are marked unsupported.
  // NOTE(review): WriteFRsqrt passes [1, 2] while WriteFRsqrtX passes no
  // cycle list -- confirm the asymmetry is intentional.
  674. defm : PdWriteResXMMPair<WriteFRcp, [PdFPU1, PdFPFMA], 5>;
  675. defm : PdWriteResXMMPair<WriteFRcpX, [PdFPU1, PdFPFMA], 5>;
  676. defm : PdWriteResYMMPair<WriteFRcpY, [PdFPU1, PdFPFMA], 5, [2, 1]>;
  677. defm : X86WriteResPairUnsupported<WriteFRcpZ>;
  678. defm : PdWriteResXMMPair<WriteFRsqrt, [PdFPU1, PdFPFMA], 5, [1, 2]>;
  679. defm : PdWriteResXMMPair<WriteFRsqrtX, [PdFPU1, PdFPFMA], 5>;
  680. defm : PdWriteResYMMPair<WriteFRsqrtY, [PdFPU1, PdFPFMA], 5, [2, 2]>;
  681. defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
  // WriteFDiv* classes, all bound to PdFPU1 + PdFPFMA; WriteFDivZ is marked
  // unsupported.
  682. defm : PdWriteResXMMPair<WriteFDiv, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  683. defm : PdWriteResXMMPair<WriteFDivX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  684. defm : PdWriteResYMMPair<WriteFDivY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  685. defm : X86WriteResPairUnsupported<WriteFDivZ>;
  // Dedicated write for the x87 DIV/DIVR `...m` opcodes listed in the InstRW
  // below: latency 9, with PdLoad in the resource list.
  // NOTE(review): this uses PdFPU0 while the WriteFDiv* classes around it use
  // PdFPU1 -- confirm this is intentional.
  686. def PdWriteX87Div: SchedWriteRes<[PdLoad, PdFPU0, PdFPFMA]> {
  687. let Latency = 9;
  688. let ResourceCycles = [3, 1, 18];
  689. }
  690. def : InstRW<[PdWriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
  691. DIVR_FI16m, DIVR_FI32m,
  692. DIV_F32m, DIV_F64m,
  693. DIVR_F32m, DIVR_F64m)>;
  // WriteFDiv64* classes use the same resources and values as WriteFDiv*.
  694. defm : PdWriteResXMMPair<WriteFDiv64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  695. defm : PdWriteResXMMPair<WriteFDiv64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  696. defm : PdWriteResYMMPair<WriteFDiv64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  697. defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
  // WriteFSqrt*, WriteFSqrt64*, WriteFSqrt80 and WriteFSign classes, all bound
  // to PdFPU1 + PdFPFMA; the Z variants are marked unsupported.
  698. defm : PdWriteResXMMPair<WriteFSqrt, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  699. defm : PdWriteResXMMPair<WriteFSqrtX, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  700. defm : PdWriteResYMMPair<WriteFSqrtY, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  701. defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
  702. defm : PdWriteResXMMPair<WriteFSqrt64, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  703. defm : PdWriteResXMMPair<WriteFSqrt64X, [PdFPU1, PdFPFMA], 9, [1, 9]>;
  704. defm : PdWriteResYMMPair<WriteFSqrt64Y, [PdFPU1, PdFPFMA], 9, [2, 18]>;
  705. defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
  706. defm : PdWriteResXMMPair<WriteFSqrt80, [PdFPU1, PdFPFMA], 1, [1, 18]>;
  707. defm : PdWriteResXMMPair<WriteFSign, [PdFPU1, PdFPFMA], 1, [1, 4]>;
  // WriteFRnd* classes on PdFPU1 + PdFPSTO; WriteFRndZ is marked unsupported.
  708. defm : PdWriteResXMMPair<WriteFRnd, [PdFPU1, PdFPSTO], 4, []>;
  709. defm : PdWriteResYMMPair<WriteFRndY, [PdFPU1, PdFPSTO], 4, [2, 1], 2>;
  710. defm : X86WriteResPairUnsupported<WriteFRndZ>;
  // Per-opcode writes for the VFRCZ* instructions, all on PdFPU1 + PdFPSTO.
  // VFRCZPDrr / VFRCZPSrr: latency 10, 2 micro-ops.
  711. def PdWriteVFRCZP : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  712. let Latency = 10;
  713. let ResourceCycles = [2, 1];
  714. let NumMicroOps = 2;
  715. }
  716. def : InstRW<[PdWriteVFRCZP], (instrs VFRCZPDrr, VFRCZPSrr)>;
  // VFRCZSDrr / VFRCZSSrr: same latency and micro-ops as PdWriteVFRCZP, but
  // the first resource-cycle entry is 10 instead of 2.
  717. def PdWriteVFRCZS : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  718. let Latency = 10;
  719. let ResourceCycles = [10, 1];
  720. let NumMicroOps = 2;
  721. }
  722. def : InstRW<[PdWriteVFRCZS], (instrs VFRCZSDrr, VFRCZSSrr)>;
  // The four `rm` forms (packed and scalar) share one write: latency 15,
  // 3 micro-ops.
  723. def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  724. let Latency = 15;
  725. let ResourceCycles = [2, 1];
  726. let NumMicroOps = 3;
  727. }
  728. def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
  729. VFRCZSDrm, VFRCZSSrm)>;
  // VFRCZPSYrr / VFRCZPDYrr: latency 10, 4 micro-ops.
  730. def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  731. let Latency = 10;
  732. let ResourceCycles = [3, 1];
  733. let NumMicroOps = 4;
  734. }
  735. def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
  // VFRCZPSYrm / VFRCZPDYrm: latency 15, 8 micro-ops.
  736. def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
  737. let Latency = 15;
  738. let ResourceCycles = [4, 1];
  739. let NumMicroOps = 8;
  740. }
  741. def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
  // FP logic, test, shuffle and blend classes. Z variants and
  // WriteFVarShuffle256 are marked unsupported.
  742. defm : PdWriteResXMMPair<WriteFLogic, [PdFPU23, PdFPMAL], 2>;
  743. defm : PdWriteResYMMPair<WriteFLogicY, [PdFPU23, PdFPMAL], 2, [2, 2]>;
  744. defm : X86WriteResPairUnsupported<WriteFLogicZ>;
  // WriteFTest* additionally list PdEX0 as a resource.
  745. defm : PdWriteResXMMPair<WriteFTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  746. defm : PdWriteResYMMPair<WriteFTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [4, 4, 1], 4, 2>;
  747. defm : X86WriteResPairUnsupported<WriteFTestZ>;
  748. defm : PdWriteResXMMPair<WriteFShuffle, [PdFPU01, PdFPFMA], 2, [1, 2]>;
  749. defm : PdWriteResYMMPair<WriteFShuffleY, [PdFPU01, PdFPFMA], 2, [2, 4], 2>;
  750. defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
  // Override for VBROADCASTF128: latency 7, resource cycles [1, 3],
  // 2 micro-ops.
  751. def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  752. let Latency = 7;
  753. let ResourceCycles = [1, 3];
  754. let NumMicroOps = 2;
  755. }
  756. def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
  757. defm : PdWriteResXMMPair<WriteFVarShuffle, [PdFPU1, PdFPXBR], 3>;
  758. defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU1, PdFPXBR], 3, [2, 2], 2>;
  759. defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
  760. defm : PdWriteResXMMPair<WriteFBlend, [PdFPU23, PdFPMAL], 2>;
  761. defm : PdWriteResYMMPair<WriteFBlendY, [PdFPU23, PdFPMAL], 2, [2, 2], 2>;
  762. defm : X86WriteResPairUnsupported<WriteFBlendZ>;
  763. defm : PdWriteResXMMPair<WriteFVarBlend, [PdFPU1, PdFPXBR], 2>;
  764. defm : PdWriteResYMMPair<WriteFVarBlendY, [PdFPU1, PdFPXBR], 2, [2, 2], 2>;
  765. defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
  766. defm : PdWriteResXMMPair<WriteFShuffle256, [PdFPU01, PdFPFMA], 2, [1, 3], 2>;
  767. defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
  // Per-opcode writes for VEXTRACTF128 and VPERM2F128, all on
  // PdFPU01 + PdFPFMA.
  // VEXTRACTF128rr: latency 2; NumMicroOps is left at its default.
  768. def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  769. let Latency = 2;
  770. let ResourceCycles = [1, 2];
  771. }
  772. def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
  // VEXTRACTF128mr: latency 7, 2 micro-ops.
  773. def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  774. let Latency = 7;
  775. let ResourceCycles = [1, 4];
  776. let NumMicroOps = 2;
  777. }
  778. def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
  // VPERM2F128rr: latency 4, 8 micro-ops.
  779. def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  780. let Latency = 4;
  781. let ResourceCycles = [1, 6];
  782. let NumMicroOps = 8;
  783. }
  784. def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
  // VPERM2F128rm: latency 8 (annotated "4 + 4" below), 10 micro-ops.
  785. def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
  786. let Latency = 8; // 4 + 4
  787. let ResourceCycles = [1, 8];
  788. let NumMicroOps = 10;
  789. }
  790. def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
  791. ////////////////////////////////////////////////////////////////////////////////
  792. // Conversions.
  793. ////////////////////////////////////////////////////////////////////////////////
  // WriteCvtSS2I / PS2I / SD2I / PD2I classes. All start from
  // PdFPU0 + PdFPCVT + PdFPSTO; some add PdFPFMA and/or PdEX0. Z variants are
  // marked unsupported.
  794. defm : PdWriteResXMMPair<WriteCvtSS2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
  795. defm : PdWriteResXMMPair<WriteCvtPS2I, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
  796. defm : PdWriteResYMMPair<WriteCvtPS2IY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  797. defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
  798. defm : PdWriteResXMMPair<WriteCvtSD2I, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
  799. defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  800. defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
  801. defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
  // Override for MMX_CVTTPD2PIrr: latency 6, 2 micro-ops.
  802. def PdWriteMMX_CVTTPD2PIrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  803. let Latency = 6;
  804. let NumMicroOps = 2;
  805. }
  806. def : InstRW<[PdWriteMMX_CVTTPD2PIrr], (instrs MMX_CVTTPD2PIrr)>;
  // WriteCvtI2SS / I2PS / I2SD / I2PD classes on PdFPU0 + PdFPCVT + PdFPSTO;
  // Z variants are marked unsupported.
  807. // FIXME: f+3 ST, LD+STC latency
  808. defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
  809. // FIXME: .Folded version is one NumMicroOp *less*..
  810. defm : PdWriteResXMMPair<WriteCvtI2PS, [PdFPU0, PdFPCVT, PdFPSTO], 4>;
  811. defm : PdWriteResYMMPair<WriteCvtI2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  812. defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
  813. defm : PdWriteResXMMPair<WriteCvtI2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [], 2>;
  814. // FIXME: .Folded version is one NumMicroOp *less*..
  // Override for the register forms CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr and
  // CVTSI2SSrr: latency 13, 2 micro-ops.
  // NOTE(review): the record name spells "CVTSI2SDr" with a single `r`, while
  // the opcode in the InstRW list is CVTSI2SDrr -- name typo only.
  815. def PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  816. let Latency = 13;
  817. let ResourceCycles = [1, 3, 1];
  818. let NumMicroOps = 2;
  819. }
  820. def : InstRW<[PdWriteCVTSI642SDrr_CVTSI642SSrr_CVTSI2SDr_CVTSI2SSrr], (instrs CVTSI642SDrr, CVTSI642SSrr, CVTSI2SDrr, CVTSI2SSrr)>;
  821. defm : PdWriteResXMMPair<WriteCvtI2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  822. defm : PdWriteResYMMPair<WriteCvtI2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
  823. defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
  // WriteCvtSS2SD / PS2PD / SD2SS / PD2PS classes; Z variants are marked
  // unsupported. WriteCvtPD2PSY is the only one here that also lists PdFPFMA.
  824. defm : PdWriteResXMMPair<WriteCvtSS2SD, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  825. defm : PdWriteResXMMPair<WriteCvtPS2PD, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  826. defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 1>;
  827. defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
  828. defm : PdWriteResXMMPair<WriteCvtSD2SS, [PdFPU0, PdFPCVT, PdFPSTO], 4, [1, 2, 1]>;
  829. defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [], 2>;
  830. defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
  831. defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
  // Override shared by MMX_CVTPD2PIrr and MMX_CVTPI2PDrr: latency 6,
  // 2 micro-ops.
  832. def PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  833. let Latency = 6;
  834. let NumMicroOps = 2;
  835. }
  836. def : InstRW<[PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr], (instrs MMX_CVTPD2PIrr,
  837. MMX_CVTPI2PDrr)>;
  // Override for MMX_CVTPI2PSrr: latency 4, 2 micro-ops.
  838. def PdWriteMMX_CVTPI2PSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> {
  839. let Latency = 4;
  840. let NumMicroOps = 2;
  841. }
  842. def : InstRW<[PdWriteMMX_CVTPI2PSrr], (instrs MMX_CVTPI2PSrr)>;
  // WriteCvtPH2PS* and WriteCvtPS2PH* classes. The `St` classes add PdStore to
  // the resource list. Z variants are marked unsupported.
  843. defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>;
  844. defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>;
  845. defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
  846. defm : PdWriteRes<WriteCvtPS2PH, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2>;
  847. defm : PdWriteRes<WriteCvtPS2PHY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>;
  848. defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
  849. defm : PdWriteRes<WriteCvtPS2PHSt, [PdFPU0, PdFPCVT, PdFPSTO, PdStore], 4, [1, 2, 1, 1], 3>;
  850. defm : PdWriteRes<WriteCvtPS2PHYSt, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA, PdStore], 4, [1, 2, 1, 1, 1], 4>;
  851. defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
  852. ////////////////////////////////////////////////////////////////////////////////
  853. // Vector integer operations.
  854. ////////////////////////////////////////////////////////////////////////////////
  // Vector load classes: all list PdLoad + PdFPU01 + PdFPMAL.
  855. defm : PdWriteRes<WriteVecLoad, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
  856. defm : PdWriteRes<WriteVecLoadX, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 3]>;
  857. defm : PdWriteRes<WriteVecLoadY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 3], 2>;
  858. defm : PdWriteRes<WriteVecLoadNT, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 1, 4]>;
  859. defm : PdWriteRes<WriteVecLoadNTY, [PdLoad, PdFPU01, PdFPMAL], 5, [3, 2, 4]>;
  860. defm : PdWriteRes<WriteVecMaskedLoad, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 1, 2]>;
  861. defm : PdWriteRes<WriteVecMaskedLoadY, [PdLoad, PdFPU01, PdFPMAL], 6, [3, 2, 4], 2>;
  // Vector store classes: PdStore + PdFPU23 + PdFPSTO.
  // NOTE(review): WriteVecStoreY passes 36 as its middle cycle entry, far
  // larger than the neighbouring entries -- confirm it is not a typo.
  862. defm : PdWriteRes<WriteVecStore, [PdStore, PdFPU23, PdFPSTO], 2, [1, 3, 1]>;
  863. defm : PdWriteRes<WriteVecStoreX, [PdStore, PdFPU23, PdFPSTO], 1, [1, 3, 1]>;
  864. defm : PdWriteRes<WriteVecStoreY, [PdStore, PdFPU23, PdFPSTO], 1, [2, 36, 2], 4>;
  // Override for VMOVDQUYmr: only NumMicroOps (8) is set; other fields keep
  // the SchedWriteRes defaults.
  865. def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1, PdFPSTO]> {
  866. let NumMicroOps = 8;
  867. }
  868. def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
  // Non-temporal store classes use PdFPU1 instead of PdFPU23.
  869. defm : PdWriteRes<WriteVecStoreNT, [PdStore, PdFPU1, PdFPSTO], 2>;
  870. defm : PdWriteRes<WriteVecStoreNTY, [PdStore, PdFPU1, PdFPSTO], 2, [2, 2, 2], 4>;
  // All masked-store classes are marked unsupported.
  871. defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
  872. defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
  873. defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
  874. defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
  // Vector register-move classes on PdFPU01 + PdFPMAL; WriteVecMoveZ is
  // marked unsupported.
  875. defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
  876. defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
  877. defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
  878. defm : X86WriteResUnsupported<WriteVecMoveZ>;
  // Override for MOVDQArr with an empty body, so all SchedWriteRes defaults
  // apply.
  879. def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  880. }
  881. def : InstRW<[PdWriteMOVDQArr], (instrs MOVDQArr)>;
  // Override for MMX_MOVQ2DQrr: latency 4.
  882. def PdWriteMOVQ2DQrr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  883. let Latency = 4;
  884. }
  885. def : InstRW<[PdWriteMOVQ2DQrr], (instrs MMX_MOVQ2DQrr)>;
  // Moves between vector registers and GPRs; the ToGpr class also lists PdEX0.
  886. defm : PdWriteRes<WriteVecMoveToGpr, [PdFPU0, PdFPFMA, PdEX0], 11>;
  887. defm : PdWriteRes<WriteVecMoveFromGpr, [PdFPU01, PdFPFMA], 11, [1, 2], 2>;
  // Vector integer ALU, shift and multiply classes. For every group here the
  // Y and Z variants are marked unsupported; only the base and X classes get
  // resources.
  888. defm : PdWriteResXMMPair<WriteVecALU, [PdFPU23, PdFPMAL], 2>;
  889. defm : PdWriteResXMMPair<WriteVecALUX, [PdFPU23, PdFPMAL], 2>;
  890. defm : X86WriteResPairUnsupported<WriteVecALUY>;
  891. defm : X86WriteResPairUnsupported<WriteVecALUZ>;
  892. defm : PdWriteResXMMPair<WriteVecShift, [PdFPU1, PdFPXBR], 3>;
  893. defm : PdWriteResXMMPair<WriteVecShiftX, [PdFPU1, PdFPXBR], 3>;
  894. defm : X86WriteResPairUnsupported<WriteVecShiftY>;
  895. defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
  896. defm : PdWriteResXMMPair<WriteVecShiftImm, [PdFPU1, PdFPXBR], 2>;
  897. defm : PdWriteResXMMPair<WriteVecShiftImmX, [PdFPU1, PdFPXBR], 2>;
  898. defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
  899. defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
  900. defm : PdWriteResXMMPair<WriteVecIMul, [PdFPU0, PdFPMMA], 4>;
  901. defm : PdWriteResXMMPair<WriteVecIMulX, [PdFPU0, PdFPMMA], 4>;
  902. defm : X86WriteResPairUnsupported<WriteVecIMulY>;
  903. defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
  // WritePMULLD lists four resources: PdFPU0, PdFPU01, PdFPMMA and PdFPMAL.
  904. defm : PdWriteResXMMPair<WritePMULLD, [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL], 5, [2, 1, 2, 1]>;
  905. defm : X86WriteResPairUnsupported<WritePMULLDY>;
  906. defm : X86WriteResPairUnsupported<WritePMULLDZ>;
  // Override for the VPMACS*DQ{H,L}rr opcodes listed in the InstRW below:
  // latency 4 on PdFPU0 + PdFPMMA + PdFPMAL.
  907. def PdWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPMMA, PdFPMAL]> {
  908. let Latency = 4;
  909. }
  910. def : InstRW<[PdWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
  911. VPMACSSDQLrr)>;
  // WriteMPSAD and WriteMPSADLd are spelled out as two separate PdWriteRes
  // lines rather than one pair; the Ld form adds PdLoad.
  912. // FIXME: Investigate RR vs RM differences.
  913. defm : PdWriteRes<WriteMPSAD, [PdFPU0, PdFPMMA], 8, [1, 4], 8>;
  914. defm : PdWriteRes<WriteMPSADLd, [PdFPU0, PdFPMMA, PdLoad], 14, [1, 4, 3], 8>;
  915. defm : X86WriteResPairUnsupported<WriteMPSADY>;
  916. defm : X86WriteResPairUnsupported<WriteMPSADZ>;
  // WritePSADBW* and WritePHMINPOS classes; PSADBW Y/Z variants are marked
  // unsupported.
  917. defm : PdWriteResXMMPair<WritePSADBW, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
  918. defm : PdWriteResXMMPair<WritePSADBWX, [PdFPU01, PdFPMAL], 4, [1, 2], 2>;
  919. defm : X86WriteResPairUnsupported<WritePSADBWY>;
  920. defm : X86WriteResPairUnsupported<WritePSADBWZ>;
  921. defm : PdWriteResXMMPair<WritePHMINPOS, [PdFPU0, PdFPMAL], 4, [], 2>;
  // Integer shuffle and blend classes. WriteShuffleY is given resources here,
  // whereas the Y variants of VarShuffle, Blend and VarBlend (and every Z
  // variant) are marked unsupported.
  922. defm : PdWriteResXMMPair<WriteShuffle, [PdFPU1, PdFPXBR], 2>;
  923. defm : PdWriteResXMMPair<WriteShuffleX, [PdFPU1, PdFPXBR], 2>;
  924. defm : PdWriteResYMMPair<WriteShuffleY, [PdFPU1, PdFPXBR], 2, [2, 2]>;
  925. defm : X86WriteResPairUnsupported<WriteShuffleZ>;
  926. defm : PdWriteResXMMPair<WriteVarShuffle, [PdFPU1, PdFPXBR], 3>;
  927. defm : PdWriteResXMMPair<WriteVarShuffleX, [PdFPU1, PdFPXBR], 3>;
  928. defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
  929. defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
  // Override for VPPERMrrr / VPPERMrrr_REV: latency 2.
  930. def PdWriteVPPERM : SchedWriteRes<[PdFPU1, PdFPXBR]> {
  931. let Latency = 2;
  932. let ResourceCycles = [1, 1];
  933. }
  934. def : InstRW<[PdWriteVPPERM], (instrs VPPERMrrr, VPPERMrrr_REV)>;
  // Override for VPPERMrrm / VPPERMrmr: latency 7, with PdLoad added to the
  // resource list.
  935. def PdWriteVPPERMLd : SchedWriteRes<[PdFPU1, PdFPXBR, PdLoad]> {
  936. let Latency = 7;
  937. let ResourceCycles = [1, 1, 3];
  938. }
  939. def : InstRW<[PdWriteVPPERMLd], (instrs VPPERMrrm, VPPERMrmr)>;
  940. defm : PdWriteResXMMPair<WriteBlend, [PdFPU23, PdFPMAL], 2>;
  941. defm : X86WriteResPairUnsupported<WriteBlendY>;
  942. defm : X86WriteResPairUnsupported<WriteBlendZ>;
  943. defm : PdWriteResXMMPair<WriteVarBlend, [PdFPU1, PdFPXBR], 2>;
  944. defm : X86WriteResPairUnsupported<WriteVarBlendY>;
  945. defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
  // Vector logic, test, 256-bit shuffle/move and variable-shift classes.
  946. defm : PdWriteResXMMPair<WriteVecLogic, [PdFPU23, PdFPMAL], 2>;
  947. defm : PdWriteResXMMPair<WriteVecLogicX, [PdFPU23, PdFPMAL], 2>;
  948. defm : X86WriteResPairUnsupported<WriteVecLogicY>;
  949. defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
  // WriteVecTest* additionally list PdEX0 as a resource.
  950. defm : PdWriteResXMMPair<WriteVecTest, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
  951. defm : PdWriteResYMMPair<WriteVecTestY, [PdFPU01, PdFPFMA, PdEX0], 1, [2, 4, 1], 4, 2>;
  952. defm : X86WriteResPairUnsupported<WriteVecTestZ>;
  // NOTE(review): these three 256-bit classes pass only the resource list (so
  // the multiclass defaults apply) and use the XMM pair multiclass rather
  // than being marked unsupported -- presumably placeholders; confirm.
  953. defm : PdWriteResXMMPair<WriteShuffle256, [PdFPU01, PdFPMAL]>;
  954. defm : PdWriteResXMMPair<WriteVPMOV256, [PdFPU01, PdFPMAL]>;
  955. defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
  956. defm : PdWriteResXMMPair<WriteVarVecShift, [PdFPU1, PdFPXBR], 3>;
  957. defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
  958. defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
  959. ////////////////////////////////////////////////////////////////////////////////
  960. // Vector insert/extract operations.
  961. ////////////////////////////////////////////////////////////////////////////////
  // Vector element insert/extract classes. The Ld form adds PdLoad and the St
  // form adds PdStore to the resource list.
  962. defm : PdWriteRes<WriteVecInsert, [PdFPU01, PdFPMAL], 2, [1, 3], 2>;
  963. defm : PdWriteRes<WriteVecInsertLd, [PdFPU01, PdFPMAL, PdLoad], 6, [1, 4, 3], 2>;
  964. defm : PdWriteRes<WriteVecExtract, [PdFPU0, PdFPFMA, PdEX0], 12, [1, 3, 1], 2>;
  965. defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [2, 1, 1], 2>;
  // Override for EXTRQ / EXTRQI: latency 3, resource cycles [1, 3].
  966. def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  967. let Latency = 3;
  968. let ResourceCycles = [1, 3];
  969. }
  970. def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
  971. ////////////////////////////////////////////////////////////////////////////////
  972. // SSE42 String instructions.
  973. ////////////////////////////////////////////////////////////////////////////////
  // WritePCmp{I,E}Str{I,M} classes. The two EStr classes list six resources
  // (including PdStore and PdLoad); the IStr classes list three.
  974. defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 11, [1, 6, 1], 7, 1>;
  975. defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0], 7, [1, 8, 1], 7, 2>;
  976. defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 14, [1, 10, 10, 10, 1, 1], 27, 1>;
  977. defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 10, 10, 10, 1, 1], 27, 1>;
  978. ////////////////////////////////////////////////////////////////////////////////
  979. // MOVMSK Instructions.
  980. ////////////////////////////////////////////////////////////////////////////////
  // MOVMSK classes, all on PdFPU0 + PdFPFMA + PdEX0. WriteVecMOVMSKY is marked
  // unsupported; the WriteVecMOVMSKZ line is left commented out.
  981. defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
  982. defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 12, [], 2>;
  983. defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
  984. // defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
  985. defm : PdWriteRes<WriteMMXMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
  986. ////////////////////////////////////////////////////////////////////////////////
  987. // AES Instructions.
  988. ////////////////////////////////////////////////////////////////////////////////
  // WriteAES* classes, all bound to PdFPU0 + PdFPMMA.
  989. defm : PdWriteResXMMPair<WriteAESIMC, [PdFPU0, PdFPMMA], 5>;
  990. defm : PdWriteResXMMPair<WriteAESKeyGen, [PdFPU0, PdFPMMA], 5>;
  991. defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
  992. ////////////////////////////////////////////////////////////////////////////////
  993. // Horizontal add/sub instructions.
  994. ////////////////////////////////////////////////////////////////////////////////
  // WriteFHAdd* and WritePHAdd* classes; the FHAdd Z variant and the PHAdd
  // Y/Z variants are marked unsupported.
  995. defm : PdWriteResXMMPair<WriteFHAdd, [PdFPU0, PdFPFMA], 11, [1, 5], 3, 1>;
  996. defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [1, 8], 8, 2>;
  997. defm : X86WriteResPairUnsupported<WriteFHAddZ>;
  998. defm : PdWriteResXMMPair<WritePHAdd, [PdFPU01, PdFPMAL], 5, [1, 4], 3, 1>;
  999. defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2, [1, 2]>;
  1000. defm : X86WriteResPairUnsupported<WritePHAddY>;
  1001. defm : X86WriteResPairUnsupported<WritePHAddZ>;
  // Explicitly bind the listed SSE/AVX PHADD*/PHSUB* `rr` opcodes to
  // WritePHAdd and the `rm` opcodes to WritePHAdd.Folded.
  // NOTE(review): presumably these would otherwise pick up WritePHAddX --
  // confirm against the instruction definitions.
  1002. def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
  1003. PHADDWrr, PHSUBWrr,
  1004. PHADDSWrr, PHSUBSWrr,
  1005. VPHADDDrr, VPHSUBDrr,
  1006. VPHADDWrr, VPHSUBWrr,
  1007. VPHADDSWrr, VPHSUBSWrr)>;
  1008. def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
  1009. PHADDWrm, PHSUBWrm,
  1010. PHADDSWrm, PHSUBSWrm,
  1011. VPHADDDrm, VPHSUBDrm,
  1012. VPHADDWrm, VPHSUBWrm,
  1013. VPHADDSWrm, VPHSUBSWrm)>;
  1014. ////////////////////////////////////////////////////////////////////////////////
  1015. // Carry-less multiplication instructions.
  1016. ////////////////////////////////////////////////////////////////////////////////
  // WriteCLMul class on PdFPU0 + PdFPMMA.
  1017. defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [1, 7], 5, 1>;
  // Override for VPCLMULQDQrr: latency 12, resource cycles [1, 7],
  // 6 micro-ops.
  1018. def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
  1019. let Latency = 12;
  1020. let ResourceCycles = [1, 7];
  1021. let NumMicroOps = 6;
  1022. }
  1023. def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
  1024. ////////////////////////////////////////////////////////////////////////////////
  1025. // SSE4A instructions.
  1026. ////////////////////////////////////////////////////////////////////////////////
  // Override for INSERTQ: latency 3, resource cycles [1, 2] on
  // PdFPU01 + PdFPMAL.
  1027. def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  1028. let Latency = 3;
  1029. let ResourceCycles = [1, 2];
  1030. }
  1031. def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ)>;
  // Override for INSERTQI: same latency, but resource cycles [1, 3].
  1032. def PdWriteINSERTQI : SchedWriteRes<[PdFPU01, PdFPMAL]> {
  1033. let Latency = 3;
  1034. let ResourceCycles = [1, 3];
  1035. }
  1036. def : InstRW<[PdWriteINSERTQI], (instrs INSERTQI)>;
  1037. ////////////////////////////////////////////////////////////////////////////////
  1038. // AVX instructions.
  1039. ////////////////////////////////////////////////////////////////////////////////
  // Override for VBROADCASTSDYrm / VBROADCASTSSYrm: latency 6, 2 micro-ops.
  // The InstRW also lists ReadAfterLd after the write.
  1040. def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
  1041. let Latency = 6;
  1042. let ResourceCycles = [1, 2, 4];
  1043. let NumMicroOps = 2;
  1044. }
  1045. def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
  1046. VBROADCASTSSYrm)>;
  // VZEROALL: empty resource list, latency 90, 32 micro-ops.
  1047. def PdWriteVZEROALL : SchedWriteRes<[]> {
  1048. let Latency = 90;
  1049. let NumMicroOps = 32;
  1050. }
  1051. def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;
  // VZEROUPPER: empty resource list, latency 46, 16 micro-ops.
  1052. def PdWriteVZEROUPPER : SchedWriteRes<[]> {
  1053. let Latency = 46;
  1054. let NumMicroOps = 16;
  1055. }
  1056. def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;
  1057. ///////////////////////////////////////////////////////////////////////////////
  1058. // SchedWriteVariant definitions.
  1059. ///////////////////////////////////////////////////////////////////////////////
  // A write with no resources and latency 0; used as the zero-idiom arm of
  // the SchedWriteVariant records in this section.
  1060. def PdWriteZeroLatency : SchedWriteRes<[]> {
  1061. let Latency = 0;
  1062. }
  // Variant for the GPR SUB/XOR `rr` opcodes: selects PdWriteZeroLatency when
  // ZeroIdiomPredicate holds, otherwise falls back to WriteALU.
  1063. def PdWriteZeroIdiom : SchedWriteVariant<[
  1064. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1065. SchedVar<MCSchedPredicate<TruePred>, [WriteALU]>
  1066. ]>;
  1067. def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
  1068. XOR32rr, XOR64rr)>;
  // Variant for the XMM FP XOR/ANDN `rr` opcodes: selects PdWriteZeroLatency
  // when ZeroIdiomPredicate holds, otherwise falls back to WriteFLogic.
  1069. def PdWriteFZeroIdiom : SchedWriteVariant<[
  1070. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1071. SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]>
  1072. ]>;
  1073. def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr,
  1074. XORPDrr, VXORPDrr,
  1075. ANDNPSrr, VANDNPSrr,
  1076. ANDNPDrr, VANDNPDrr)>;
  // NOTE(review): the remark below presumably explains why the YMM forms are
  // not in the InstRW list above (they would not get the zero-latency write).
  1077. // VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1.
  // Variant for MMX_PXORrr / MMX_PANDNrr: selects PdWriteZeroLatency when
  // ZeroIdiomPredicate holds, otherwise falls back to WriteVecLogic.
  1078. def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
  1079. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1080. SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]>
  1081. ]>;
  1082. def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>;
  // Variant for the SSE/AVX PXOR/PANDN `rr` opcodes: selects
  // PdWriteZeroLatency when ZeroIdiomPredicate holds, otherwise falls back to
  // WriteVecLogicX.
  1083. def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
  1084. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1085. SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]>
  1086. ]>;
  1087. def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr,
  1088. PANDNrr, VPANDNrr)>;
  // Variant for the MMX PSUB{B,D,Q,W} and PCMPGT{B,D,W} `rr` opcodes: selects
  // PdWriteZeroLatency when ZeroIdiomPredicate holds, otherwise falls back to
  // WriteVecALU.
  1089. def PdWriteVZeroIdiomALU : SchedWriteVariant<[
  1090. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1091. SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]>
  1092. ]>;
  1093. def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr,
  1094. MMX_PSUBQrr, MMX_PSUBWrr,
  1095. MMX_PCMPGTBrr,
  1096. MMX_PCMPGTDrr,
  1097. MMX_PCMPGTWrr)>;
  // Variant for the SSE/AVX PSUB{B,D,Q,W} and PCMPGT{B,D,W} `rr` opcodes:
  // selects PdWriteZeroLatency when ZeroIdiomPredicate holds, otherwise falls
  // back to WriteVecALUX.
  1098. def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
  1099. SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
  1100. SchedVar<MCSchedPredicate<TruePred>, [WriteVecALUX]>
  1101. ]>;
  1102. def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
  1103. PSUBDrr, VPSUBDrr,
  1104. PSUBQrr, VPSUBQrr,
  1105. PSUBWrr, VPSUBWrr,
  1106. PCMPGTBrr, VPCMPGTBrr,
  1107. PCMPGTDrr, VPCMPGTDrr,
  1108. PCMPGTWrr, VPCMPGTWrr)>;
  1109. ///////////////////////////////////////////////////////////////////////////////
  1110. // Dependency breaking instructions.
  1111. ///////////////////////////////////////////////////////////////////////////////
  // Declares which opcodes count as zero idioms when ZeroIdiomPredicate
  // holds, grouped into GPR, MMX, SSE and AVX classes. As the remark below
  // says, the AVX list includes VPCMPGTQrr while the SSE list has no
  // PCMPGTQrr. The saturating-subtract opcodes (PSUBS*/PSUBUS*) appear here
  // but not in the InstRW lists of the PdWriteVZeroIdiom* variants.
  1112. // VPCMPGTQ, but not PCMPGTQ!
  1113. def : IsZeroIdiomFunction<[
  1114. // GPR Zero-idioms.
  1115. DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
  1116. // MMX Zero-idioms.
  1117. DepBreakingClass<[
  1118. MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
  1119. MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
  1120. MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
  1121. MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
  1122. ], ZeroIdiomPredicate>,
  1123. // SSE Zero-idioms.
  1124. DepBreakingClass<[
  1125. // fp variants.
  1126. XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
  1127. // int variants.
  1128. PXORrr, PANDNrr,
  1129. PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
  1130. PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
  1131. PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
  1132. ], ZeroIdiomPredicate>,
  1133. // AVX Zero-idioms.
  1134. DepBreakingClass<[
  1135. // xmm fp variants.
  1136. VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
  1137. // xmm int variants.
  1138. VPXORrr, VPANDNrr,
  1139. VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
  1140. VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
  1141. VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
  1142. // ymm variants.
  1143. VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
  1144. ], ZeroIdiomPredicate>
  1145. ]>;
  // Declares dependency-breaking opcodes that are not listed as zero idioms
  // above: SBB and the PCMPEQ{B,W,D} families use ZeroIdiomPredicate, while
  // CMP32rr/CMP64rr use CheckSameRegOperand<0, 1> instead.
  1146. def : IsDepBreakingFunction<[
  1147. // GPR
  1148. DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  1149. DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
  1150. // MMX
  1151. DepBreakingClass<[
  1152. MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr
  1153. ], ZeroIdiomPredicate>,
  1154. // SSE
  1155. DepBreakingClass<[
  1156. PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
  1157. // But not PCMPEQQrr.
  1158. ], ZeroIdiomPredicate>,
  1159. // AVX
  1160. DepBreakingClass<[
  1161. VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
  1162. // But not VPCMPEQQrr.
  1163. ], ZeroIdiomPredicate>
  1164. ]>;
  1165. } // SchedModel