AArch64SchedAmpere1.td 43 KB


  1. //=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines the machine model for the Ampere Computing Ampere-1 to
  10. // support instruction scheduling and other instruction cost heuristics.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. // The Ampere-1 core is an out-of-order micro-architecture. The front
  14. // end has branch prediction, with a 10-cycle recovery time from a
  15. // mispredicted branch. Instructions coming out of the front end are
  16. // decoded into internal micro-ops (uops).
  // Top-level machine model record for Ampere-1. The scheduling resources,
  // writes and instruction mappings below attach themselves to this model.
  def Ampere1Model : SchedMachineModel {
    // Front end: 4-way decode and dispatch.
    let IssueWidth            = 4;
    // Out-of-order window: size of the micro-op re-order buffer.
    let MicroOpBufferSize     = 174;
    // Instruction queue size.
    let LoopMicroOpBufferSize = 32;
    // Optimistic load latency.
    let LoadLatency           = 4;
    // Branch mispredict penalty (10-cycle recovery).
    let MispredictPenalty     = 10;
    // Every instruction is expected to have scheduling information.
    let CompleteModel         = 1;

    // SVE and SME instructions are excluded from this model.
    list<Predicate> UnsupportedFeatures =
        !listconcat(SVEUnsupported.F, SMEUnsupported.F);
  }
  27. let SchedModel = Ampere1Model in {
  28. //===----------------------------------------------------------------------===//
  29. // Define each kind of processor resource and number available on Ampere-1.
  30. // Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP,
  31. // and 2 memory) issue into. The integer and FP schedulers can each issue
  32. // one uop per cycle, while the memory schedulers can each issue one load
  33. // and one store address calculation per cycle.
  // Integer pipelines.
  //   A  - single-cycle integer ops, branches, and flag reads/writes
  //   B  - single-cycle integer ops and complex shifts
  //   BS - multi-cycle integer ops
  def Ampere1UnitA  : ProcResource<2>;
  def Ampere1UnitB  : ProcResource<2>;
  def Ampere1UnitBS : ProcResource<1>;

  // Memory pipelines.
  //   L - loads
  //   S - store address calculation
  def Ampere1UnitL  : ProcResource<2>;
  def Ampere1UnitS  : ProcResource<2>;

  // FP / vector pipelines.
  //   X - FP and vector operations, and flag writes
  //   Y - FP and vector operations, and crypto
  //   Z - FP store data and FP-to-integer moves
  def Ampere1UnitX  : ProcResource<1>;
  def Ampere1UnitY  : ProcResource<1>;
  def Ampere1UnitZ  : ProcResource<1>;

  // Groups for uops that may issue to either member pipeline.
  def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
  def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;
  44. //===----------------------------------------------------------------------===//
  45. // Define customized scheduler read/write types specific to the Ampere-1.
  // Writes with a latency of 1 cycle. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 1 in {
    def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
      let NumMicroOps = 2;
    }
  }
  // Writes with a latency of 2 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 2 in {
    def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_2cyc_2AB
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_2cyc_1B_1AB
        : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_2cyc_1B_1A
        : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_2cyc_1AB_1A
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_2cyc_1AB_1L
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_2cyc_1AB_2S
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, Ampere1UnitS]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_2cyc_1AB_1S_1Z
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, Ampere1UnitZ]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_2cyc_1B_1S
        : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_2cyc_1S_1Z
        : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> {
      let NumMicroOps = 2;
    }
  }
  // Single-uop writes with a latency of 3 cycles.
  let Latency = 3, NumMicroOps = 1 in {
    def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]>;
    def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]>;
  }
  // Multi-uop writes with a latency of 3 cycles.
  //
  // Every write in this file encodes its latency in its name
  // (Ampere1Write_<N>cyc_<units>). These three were named "3cyc" but carried
  // Latency = 2; the latency now agrees with the name. This also restores the
  // 2/3/4/5-cycle progression of the ST1 one/two/three/four-register stores,
  // which use the 2cyc_1S_1Z, 3cyc_2S_2Z, 4cyc_3S_3Z and 5cyc_4S_4Z writes.
  let Latency = 3 in {
    def Ampere1Write_3cyc_1B_1S_1AB
        : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, Ampere1UnitAB]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_3cyc_1S_2Z
        : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_3cyc_2S_2Z
        : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 4;
    }
  }
  // Writes with a latency of 4 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 4 in {
    def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_4cyc_2XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_4cyc_1XY_1S_1Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_4cyc_3S_3Z
        : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 6;
    }
  }
  // Writes with a latency of 5 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 5 in {
    def Ampere1Write_5cyc_1AB_1L
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_5cyc_1L_1BS
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_5cyc_2XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_5cyc_4S_4Z
        : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 8;
    }
    def Ampere1Write_5cyc_2XY_2S_2Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 6;
    }
  }
  // Writes with a latency of 6 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 6 in {
    def Ampere1Write_6cyc_2XY_2S_2Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 6;
    }
    def Ampere1Write_6cyc_3XY_3S_3Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitS, Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 9;
    }
    def Ampere1Write_6cyc_1AB_1L
        : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_6cyc_2XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_6cyc_3XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_6cyc_3L
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_6cyc_4L
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitL, Ampere1UnitL]> {
      let NumMicroOps = 4;
    }
    def Ampere1Write_6cyc_1XY_1Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
      let NumMicroOps = 2;
    }
  }
  // Writes with a latency of 7 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 7 in {
    def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
      let NumMicroOps = 1;
    }
    def Ampere1Write_7cyc_1BS_1XY
        : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_7cyc_1L_1XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_7cyc_2L_2XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 4;
    }
    def Ampere1Write_7cyc_2XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_7cyc_4XY_4S_4Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 12;
    }
  }
  // Writes with a latency of 8 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 8 in {
    def Ampere1Write_8cyc_1BS_1A
        : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_8cyc_1BS_2A
        : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, Ampere1UnitA]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_8cyc_2XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_8cyc_4XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 4;
    }
    def Ampere1Write_8cyc_3L_3XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 6;
    }
    def Ampere1Write_8cyc_4L_4XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 8;
    }
  }
  // Writes with a latency of 9 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 9 in {
    def Ampere1Write_9cyc_3L_3XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 6;
    }
    def Ampere1Write_9cyc_4L_4XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 8;
    }
    def Ampere1Write_9cyc_3XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_9cyc_2L_3XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 5;
    }
    def Ampere1Write_9cyc_6XY_4S_4Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 14;
    }
    def Ampere1Write_9cyc_8XY_4S_4Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitS, Ampere1UnitS,
                         Ampere1UnitZ, Ampere1UnitZ,
                         Ampere1UnitZ, Ampere1UnitZ]> {
      let NumMicroOps = 16;
    }
  }
  // Writes with a latency of 10 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 10 in {
    def Ampere1Write_10cyc_2XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_10cyc_1XY_1Z
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_10cyc_1X_1Z
        : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_10cyc_3L_3XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 6;
    }
    def Ampere1Write_10cyc_1A_1BS_1X
        : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_10cyc_1A_1BS_1XY
        : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
      let NumMicroOps = 3;
    }
  }
  // Writes with a latency of 11 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 11 in {
    def Ampere1Write_11cyc_1BS_1L
        : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> {
      let NumMicroOps = 2;
    }
    def Ampere1Write_11cyc_1A_1BS_1X
        : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_11cyc_1A_1BS_1XY
        : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_11cyc_4L_8XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 12;
    }
  }
  // Writes with a latency of 12 cycles. The name suffix lists the units used;
  // NumMicroOps equals the number of units listed.
  let Latency = 12 in {
    def Ampere1Write_12cyc_4L_8XY
        : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitL, Ampere1UnitL,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 12;
    }
    def Ampere1Write_12cyc_3XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 3;
    }
    def Ampere1Write_12cyc_4XY
        : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
                         Ampere1UnitXY, Ampere1UnitXY]> {
      let NumMicroOps = 4;
    }
  }
  // Long-latency single-uop writes: each occupies one unit (BS or the XY
  // group) and differs only in its latency.
  let NumMicroOps = 1 in {
    def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
      let Latency = 18;
    }
    def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let Latency = 19;
    }
    def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let Latency = 25;
    }
    def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let Latency = 32;
    }
    def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
      let Latency = 34;
    }
    def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let Latency = 34;
    }
    def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let Latency = 39;
    }
    def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
      let Latency = 62;
    }
  }
  // Basic arithmetic is resolved per instruction through three variants,
  // listed in this order:
  //   * extended-register operand: two uops, with full flexibility across
  //     Unit A or B for both uops;
  //   * short shift (LSL shift <= 4): a single uop;
  //   * anything else: two uops.
  def Ampere1Write_Arith
      : SchedWriteVariant<[
          SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>,
          SchedVar<AmpereCheapLSL,  [Ampere1Write_1cyc_1AB]>,
          SchedVar<NoSchedPred,     [Ampere1Write_2cyc_1B_1AB]>
        ]>;

  // Flag-setting counterpart: same three cases, using the writes that
  // include Unit A.
  def Ampere1Write_ArithFlagsetting
      : SchedWriteVariant<[
          SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>,
          SchedVar<AmpereCheapLSL,  [Ampere1Write_1cyc_1A]>,
          SchedVar<NoSchedPred,     [Ampere1Write_2cyc_1B_1A]>
        ]>;
  478. //===----------------------------------------------------------------------===//
  479. // Map the target-defined scheduler read/write resources and latencies for Ampere-1.
  480. // This provides a coarse model, which is then specialised below.
  // Immediate moves (MOVN, MOVZ) and plain ALU ops.
  def : WriteRes<WriteImm, [Ampere1UnitAB]>;
  def : WriteRes<WriteI,   [Ampere1UnitAB]>;

  // Two-uop, two-cycle ALU forms.
  let Latency = 2, NumMicroOps = 2 in {
    // ALU of Shifted-Reg
    def : WriteRes<WriteISReg, [Ampere1UnitB, Ampere1UnitA]>;
    // ALU of Extended-Reg
    def : WriteRes<WriteIEReg, [Ampere1UnitAB, Ampere1UnitA]>;
  }

  // EXTR (shifts a register pair) and Shift/Scale.
  def : WriteRes<WriteExtr, [Ampere1UnitB]>;
  def : WriteRes<WriteIS,   [Ampere1UnitB]>;

  // 32-bit and 64-bit divide.
  def : WriteRes<WriteID32, [Ampere1UnitBS]> { let Latency = 18; }
  def : WriteRes<WriteID64, [Ampere1UnitBS]> { let Latency = 34; }

  // 32-bit and 64-bit multiply.
  let Latency = 3 in {
    def : WriteRes<WriteIM32, [Ampere1UnitBS]>;
    def : WriteRes<WriteIM64, [Ampere1UnitBS]>;
  }

  // Branches.
  def : WriteRes<WriteBr,    [Ampere1UnitA]>;
  def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
  // Load from base addr plus immediate offset.
  def : WriteRes<WriteLD, [Ampere1UnitL]> { let Latency = 4; }

  // Stores all have a latency of 1.
  let Latency = 1 in {
    // Store to base addr plus immediate offset.
    def : WriteRes<WriteST, [Ampere1UnitS]>;
    // Store a register pair.
    def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> {
      let NumMicroOps = 2;
    }
    // Store to a register index (maybe scaled).
    def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
      let NumMicroOps = 2;
    }
  }

  // Address computation.
  def : WriteRes<WriteAdr, [Ampere1UnitAB]>;

  // Load from a register index (maybe scaled).
  // NOTE(review): this load uses UnitS (store address calculation) rather
  // than UnitL (load), unlike WriteLD and the *_1AB_1L writes - confirm
  // that this is intended.
  def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitS]> {
    let Latency = 5;
    let NumMicroOps = 2;
  }
  // General floating-point ops.
  def : WriteRes<WriteF, [Ampere1UnitXY]> { let Latency = 2; }
  // Floating-point compare.
  def : WriteRes<WriteFCmp, [Ampere1UnitX]> { let Latency = 5; }
  // Float conversion.
  def : WriteRes<WriteFCvt, [Ampere1UnitXY]> { let Latency = 6; }
  // Float-int register copy. Nothing is overridden here, so the WriteRes
  // class defaults apply.
  def : WriteRes<WriteFCopy, [Ampere1UnitXY]>;
  // Floating-point immediate.
  def : WriteRes<WriteFImm, [Ampere1UnitXY]> { let Latency = 2; }
  // Floating-point multiply.
  def : WriteRes<WriteFMul, [Ampere1UnitXY]> { let Latency = 5; }
  // Floating-point division.
  def : WriteRes<WriteFDiv, [Ampere1UnitXY]> { let Latency = 34; }

  // 64-bit (D) and 128-bit (Q) vector ops.
  let Latency = 3 in {
    def : WriteRes<WriteVd, [Ampere1UnitXY]>;
    def : WriteRes<WriteVq, [Ampere1UnitXY]>;
  }

  // Vector loads.
  def : WriteRes<WriteVLD, [Ampere1UnitL, Ampere1UnitL]> { let Latency = 5; }
  // Vector stores.
  def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> { let Latency = 2; }
  // Atomics are marked unsupported in this model.
  def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }

  // System, barrier and hint instructions: no unit, latency 1.
  let Latency = 1 in {
    def : WriteRes<WriteSys,     []>;
    def : WriteRes<WriteBarrier, []>;
    def : WriteRes<WriteHint,    []>;
  }

  // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
  def : WriteRes<WriteLDHi, []> { let Latency = 4; }
  // Forwarding logic.
  // These reads get no advance (0 cycles).
  foreach Rd = [ReadI, ReadISReg, ReadIEReg, ReadIM, ReadID,
                ReadExtrHi, ReadST, ReadAdrBase, ReadVLD] in
    def : ReadAdvance<Rd, 0>;

  // ReadIMA is advanced by one cycle, but only when the producer is one of
  // the multiply writes (WriteIM32 / WriteIM64).
  def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
  576. //===----------------------------------------------------------------------===//
  577. // Specialising the scheduling model further for Ampere-1.
  // COPY: one uop on either of units A/B.
  def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;

  // Branch instructions
  // -- one uop on unit A
  def : InstRW<[Ampere1Write_1cyc_1A],
               (instrs Bcc, BL, RET,
                       CBZW, CBZX, CBNZW, CBNZX,
                       TBZW, TBZX, TBNZW, TBNZX)>;
  // -- two uops on unit A
  def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;
  // Cryptography instructions
  // -- AES encryption/decryption (including the mix-columns forms)
  def : InstRW<[Ampere1Write_2cyc_1XY],
               (instregex "^AES[DE]", "^AESI?MC")>;
  // -- Polynomial multiplication
  def : InstRW<[Ampere1Write_2cyc_1XY],
               (instregex "^PMUL", "^PMULL")>;
  // -- SHA-3 instructions
  def : InstRW<[Ampere1Write_2cyc_1XY],
               (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;

  // -- SHA-256: hash on X, schedule update on Y
  def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
  def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;

  // -- SHA-512: hash on X, schedule update on Y
  def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
  def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;

  // -- SHA1 choose/majority/parity
  def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
  // -- SHA1 hash/schedule update
  def : InstRW<[Ampere1Write_2cyc_1Y],
               (instregex "^SHA1SU[01]", "^SHA1H")>;
  // FP and vector load instructions
  // -- LD1: load 1-element structure to all lanes (LD1R, non-1D sizes) or
  //    to one lane (LD1i)
  def : InstRW<[Ampere1Write_7cyc_1L_1XY],
               (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)",
                          "^LD1i(8|16|32|64)")>;
  // -- LD1R of 1D size, and LD1 of 1-element structures to 1 register
  def : InstRW<[Ampere1Write_5cyc_1L],
               (instregex "^LD1Rv1d",
                          "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  // -- LD1 of 1-element structures to 2 registers
  def : InstRW<[Ampere1Write_5cyc_2L],
               (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  // -- LD1 of 1-element structures to 3 registers
  def : InstRW<[Ampere1Write_6cyc_3L],
               (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  // -- LD1 of 1-element structures to 4 registers
  def : InstRW<[Ampere1Write_6cyc_4L],
               (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  // -- LD2R to all lanes of 2 registers, 1D size
  def : InstRW<[Ampere1Write_5cyc_2L], (instregex "^LD2Rv1d")>;
  // -- LD2R to all lanes (other sizes), LD2 to one lane of 2 registers,
  //    and LD2 of 2-element structures at 16B/8H/4S/2D size
  def : InstRW<[Ampere1Write_7cyc_2L_2XY],
               (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)",
                          "^LD2i(8|16|32|64)",
                          "^LD2Twov(16b|8h|4s|2d)")>;
  // -- LD2 of 2-element structures to 2 registers, 8B/4H/2S size
  def : InstRW<[Ampere1Write_9cyc_2L_3XY],
               (instregex "^LD2Twov(8b|4h|2s)")>;
  // -- LD3R to all lanes of 3 registers, 1D size
  def : InstRW<[Ampere1Write_6cyc_3L], (instregex "^LD3Rv1d")>;
  // -- LD3R to all lanes (other sizes), LD3 to one lane of 3 registers,
  //    and LD3 of 3-element structures at 2D size
  def : InstRW<[Ampere1Write_8cyc_3L_3XY],
               (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)",
                          "^LD3i(8|16|32|64)",
                          "^LD3Threev2d")>;
  // -- LD3 of 3-element structures to 3 registers, 16B/8H/4S sizes
  def : InstRW<[Ampere1Write_9cyc_3L_3XY],
               (instregex "^LD3Threev(16b|8h|4s)")>;
  // -- LD3 of 3-element structures to 3 registers, 8B/4H/2S sizes
  def : InstRW<[Ampere1Write_10cyc_3L_3XY],
               (instregex "^LD3Threev(8b|4h|2s)")>;
  // -- LD4R to all lanes of 4 registers (1D size), and LD4 to one lane of
  //    4 registers
  // NOTE(review): LD2i and LD3i use writes that include XY uops, while LD4i
  // is mapped to a load-only write - confirm that this is intended.
  def : InstRW<[Ampere1Write_6cyc_4L],
               (instregex "^LD4Rv1d",
                          "^LD4i(8|16|32|64)")>;
  // -- LD4R to all lanes of 4 registers, other sizes
  def : InstRW<[Ampere1Write_8cyc_4L_4XY],
               (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>;
  // -- LD4 of 4-element structures to 4 registers, 2D size
  def : InstRW<[Ampere1Write_9cyc_4L_4XY],
               (instregex "^LD4Fourv2d")>;
  // -- LD4 of 4-element structures to 4 registers, 2S size
  def : InstRW<[Ampere1Write_12cyc_4L_8XY],
               (instregex "^LD4Fourv2s")>;
  // -- LD4 of 4-element structures to 4 registers, other sizes
  def : InstRW<[Ampere1Write_11cyc_4L_8XY],
               (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>;
  // -- Load pair, Q-form
  def : InstRW<[Ampere1Write_5cyc_2L],
               (instregex "LDN?PQ")>;
  // -- Load pair, S/D-form
  // NOTE(review): the pattern has no suffix constraint, so it presumably
  // also covers the integer LDPSW* opcodes - confirm that this is intended.
  def : InstRW<[Ampere1Write_5cyc_1L_1BS],
               (instregex "LDN?P(S|D)")>;
  // -- Load register
  // NOTE(review): the matching store pattern accepts "(ui|i)"; this one only
  // accepts "i" - confirm whether the LDR*ui forms are meant to be covered.
  def : InstRW<[Ampere1Write_5cyc_1L],
               (instregex "LDU?R[BHSDQ]i")>;
  // -- Load register, sign-extended register
  def : InstRW<[Ampere1Write_6cyc_1AB_1L],
               (instregex "LDR[BHSDQ]ro(W|X)")>;
  // FP and vector store instructions
  // -- ST1 from one lane of 1 register
  def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z],
               (instregex "^ST1i(8|16|32|64)")>;
  // -- ST1 of 1-element structures from 1, 2, 3 and 4 registers
  def : InstRW<[Ampere1Write_2cyc_1S_1Z],
               (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  def : InstRW<[Ampere1Write_3cyc_2S_2Z],
               (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  def : InstRW<[Ampere1Write_4cyc_3S_3Z],
               (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  def : InstRW<[Ampere1Write_5cyc_4S_4Z],
               (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  // -- ST2 from one lane of 2 registers, and ST2 of 2-element structures
  //    from 2 registers at 16B/8H/4S/2D sizes
  def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z],
               (instregex "^ST2i(8|16|32|64)",
                          "^ST2Twov(16b|8h|4s|2d)")>;
  // -- ST2 of 2-element structures from 2 registers, 8B/4H/2S sizes
  def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z],
               (instregex "^ST2Twov(8b|4h|2s)")>;
  // -- ST3 from one lane of 3 registers, and ST3 of 3-element structures
  //    from 3 registers
  def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z],
               (instregex "^ST3i(8|16|32|64)",
                          "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
  // -- ST4 from one lane of 4 registers, and ST4 of 4-element structures
  //    from 4 registers at 2D size
  def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z],
               (instregex "^ST4i(8|16|32|64)",
                          "^ST4Fourv2d")>;
  // -- ST4 of 4-element structures from 4 registers, 16B/8H/4S sizes
  def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z],
               (instregex "^ST4Fourv(16b|8h|4s)")>;
  // -- ST4 of 4-element structures from 4 registers, 8B/4H/2S sizes
  def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z],
               (instregex "^ST4Fourv(8b|4h|2s)")>;
  // -- Store pair, Q-form
  def : InstRW<[Ampere1Write_3cyc_2S_2Z],
               (instregex "^STN?PQ")>;
  // -- Store pair, S/D-form
  def : InstRW<[Ampere1Write_3cyc_1S_2Z],
               (instregex "^STN?P[SD]")>;
  // -- Store register
  def : InstRW<[Ampere1Write_2cyc_1S_1Z],
               (instregex "^STU?R[BHSDQ](ui|i)")>;
  // -- Store register, sign-extended register offset
  def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z],
               (instregex "^STR[BHSDQ]ro[XW]")>;
  // FP data processing, bfloat16 format
  // -- conversions
  def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>;
  def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>;
  // -- dot product
  def : InstRW<[Ampere1Write_2cyc_1XY],
               (instregex "^BFDOTv", "^BF16DOT")>;
  // -- matrix multiply-accumulate
  def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>;
  // -- widening multiply-accumulate
  def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>;
  // FP data processing, scalar/vector, half precision
  // -- absolute value/difference, add/sub, negate
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^F(ABD|ABS)v.[fi]16",
                          "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>;
  // -- vector and scalar compares producing a mask
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16",
                          "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>;
  // -- compares, conditional compares and conditional select
  def : InstRW<[Ampere1Write_4cyc_1X],
               (instregex "^FCMPE?H")>;
  def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X],
               (instregex "^FCCMPE?H")>;
  def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY],
               (instregex "^FCSELH")>;
  // -- conversions between FP and integer
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^FCVT[AMNPZ][SU]v.[if]16",
                          "^[SU]CVTFv.[fi]16")>;
  // -- divide
  def : InstRW<[Ampere1Write_25cyc_1XY],
               (instregex "^FDIVv.[if]16", "FDIVH")>;
  // -- min/max: element-wise and pairwise, then across-vector reductions
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>;
  def : InstRW<[Ampere1Write_8cyc_2XY],
               (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>;
  def : InstRW<[Ampere1Write_12cyc_3XY],
               (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>;
  // -- multiply and fused multiply-add/subtract
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^FMULX?v.[if]16",
                          "^FN?M(ADD|SUB)[H]rrr",
                          "^FML[AS]v.[if]16")>;
  def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>;
  // -- reciprocal exponent and reciprocal / reciprocal-sqrt step
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^FRECPXv.[if]16",
                          "^F(RECP|RSQRT)S16")>;
  // -- round to integral
  def : InstRW<[Ampere1Write_4cyc_1XY],
               (instregex "^FRINT[AIMNPXZ]v.[if]16")>;
  // -- square root
  def : InstRW<[Ampere1Write_39cyc_1XY],
               (instregex "^FSQRTv.f16", "^FSQRTHr")>;
  773. // FP data processing, scalar/vector, single/double precision (names with a 32/64 element suffix or an S/D register letter)
  774. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>;
  775. def : InstRW<[Ampere1Write_5cyc_1XY],
  776. (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>;
  777. def : InstRW<[Ampere1Write_5cyc_1XY], // compares whose names carry a v<n>f / v<n>i arrangement
  778. (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>;
  779. def : InstRW<[Ampere1Write_5cyc_1XY], // compares named with a bare 32/64 suffix
  780. (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>;
  781. def : InstRW<[Ampere1Write_5cyc_1X], // FCMP/FCMPE use the ..._1X write, unlike the ..._1XY write of the neighbouring defs
  782. (instregex "^FCMPE?(S|D)")>;
  783. def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], // conditional compare (FCCMP/FCCMPE)
  784. (instregex "^FCCMPE?(S|D)")>;
  785. def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], // conditional select
  786. (instregex "^FCSEL(S|D)")>;
  787. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>;
  788. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>;
  789. def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; // 64-bit divide: vector names plus FDIVD names
  790. def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; // 32-bit divide: vector names plus FDIVS names
  791. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; // element-wise and pairwise (optional P) names
  792. def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; // ...V names, any arrangement digit
  793. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; // NOTE(review): FMUL/FMULX carry a 6cyc write while FMADD/FMLA below carry 5cyc -- confirm against the core's latency tables
  794. def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; // exact opcodes, not reached by the v... pattern above
  795. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; // FMADD/FMSUB/FNMADD/FNMSUB, S and D names
  796. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>;
  797. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>;
  798. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>;
  799. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>;
  800. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; // names where FRINT is followed directly by 32 or 64
  801. def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; // 64-bit square root
  802. def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; // 32-bit square root
  803. // FP miscellaneous instructions (FCVT* conversions, FJCVTZS and the FMOV name families)
  804. def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; // FCVT + rounding letter + S/U + S/U + W/X + H/S/D
  805. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; // FCVT names whose second size letter is H
  806. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; // FCVT names whose second size letter is S or D
  807. def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>;
  808. def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>;
  809. def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; // exact opcode
  810. def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; // FMOV names with H/S/D first, W/X second
  811. def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>;
  812. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; // FMOV H/S/D names followed directly by r or i
  813. def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>;
  814. def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; // FMOV names with W/X first, H/S/D second
  815. // Integer arithmetic and logical instructions (also carries the conditional, divide and multiply entries below)
  816. def : InstRW<[Ampere1Write_1cyc_1A], // add/subtract with carry
  817. (instregex "ADC(W|X)r", "SBC(W|X)r")>;
  818. def : InstRW<[Ampere1Write_Arith], // non-flag-setting ALU names; Ampere1Write_Arith is defined outside this section
  819. (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>;
  820. def : InstRW<[Ampere1Write_ArithFlagsetting], // the ...S (flag-setting) names; write defined outside this section
  821. (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>;
  822. def : InstRW<[Ampere1Write_1cyc_1A], // flag-setting add/subtract with carry
  823. (instregex "(ADC|SBC)S(W|X)r")>;
  824. def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; // exact opcode
  825. def : InstRW<[Ampere1Write_1cyc_1A], // conditional compare
  826. (instregex "(CCMN|CCMP)(X|W)")>;
  827. def : InstRW<[Ampere1Write_1cyc_1A], // conditional select family
  828. (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>;
  829. def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; // W-register divide
  830. def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; // X-register divide
  831. def : InstRW<[Ampere1Write_3cyc_1BS], // SMULH / UMULH
  832. (instregex "(S|U)MULHr")>;
  833. def : InstRW<[Ampere1Write_4cyc_1BS], // NOTE(review): the pattern needs 'r' right after the optional 'L', so names with W/X before the 'r' (presumably MADDWrrr etc.) would not match -- confirm that is intended
  834. (instregex "(S|U)?M(ADD|SUB)L?r")>;
  835. // Integer load instructions (pair, unsigned-offset 'ui', literal 'l', LDTR/LDUR 'i' and register-offset 'ro' name families)
  836. def : InstRW<[Ampere1Write_4cyc_2L], // NOTE(review): every alternative must be followed by X or W, so the LDPSW branch only matches names of the form LDPSW[XW]... -- confirm it matches what was intended
  837. (instregex "(LDNP|LDP|LDPSW)(X|W)")>;
  838. def : InstRW<[Ampere1Write_4cyc_1L], // LDR + B/D/H/Q/S + ui
  839. (instregex "LDR(B|D|H|Q|S)ui")>;
  840. def : InstRW<[Ampere1Write_4cyc_1L], // LDR + D/Q/W/X + l
  841. (instregex "LDR(D|Q|W|X)l")>;
  842. def : InstRW<[Ampere1Write_4cyc_1L], // LDTR names
  843. (instregex "LDTR(B|H|W|X)i")>;
  844. def : InstRW<[Ampere1Write_4cyc_1L], // LDTRS names
  845. (instregex "LDTRS(BW|BX|HW|HX|W)i")>;
  846. def : InstRW<[Ampere1Write_4cyc_1L], // LDUR names
  847. (instregex "LDUR(BB|HH|X|W)i")>;
  848. def : InstRW<[Ampere1Write_4cyc_1L], // LDURS names
  849. (instregex "LDURS(BW|BX|HW|HX|W)i")>;
  850. def : InstRW<[Ampere1Write_5cyc_1AB_1L], // register-offset loads use a different write that also names AB
  851. (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
  852. def : InstRW<[Ampere1Write_1cyc_1L], // prefetch: PRFMl and PRFUMi
  853. (instrs PRFMl, PRFUMi)>; // NOTE(review): PRFUMi was listed twice here; if the second entry was meant to be PRFMui, add it after checking it is not already claimed by another InstRW
  854. def : InstRW<[Ampere1Write_2cyc_1AB_1L], // prefetch, register-offset names (roW and roX)
  855. (instrs PRFMroW, PRFMroX)>;
  856. // Integer miscellaneous instructions (address generation, bitfield, CRC, bit manipulation, move-wide, variable shifts)
  857. def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; // exact opcodes
  858. def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>;
  859. def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; // BFM, SBFM and UBFM names
  860. def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; // CRC32 and CRC32C names
  861. def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; // NOTE(review): only CLS is listed; no pattern in this section matches CLZ names -- confirm
  862. def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; // exact opcodes
  863. def : InstRW<[Ampere1Write_1cyc_1AB], // move-wide immediates (MOVK/MOVN/MOVZ, W and X)
  864. (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
  865. def : InstRW<[Ampere1Write_1cyc_1B], // REV32 is listed for the X name only
  866. (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
  867. def : InstRW<[Ampere1Write_1cyc_1B], // the ...V (register-controlled) shift/rotate names
  868. (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
  869. // Integer store instructions (pair and STTR names in the defs directly below)
  870. def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; // STNP names
  871. def : InstRW<[Ampere1Write_2cyc_1B_1S], // STP, the 'i' names only
  872. (instrs STPWi, STPXi)>;
  873. def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], // STP pre/post names use a write that additionally names AB
  874. (instregex "STP(W|X)(pre|post)")>;
  875. def : InstRW<[Ampere1Write_1cyc_1S], // STTR B/H/W/X, exact opcodes
  876. (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
  877. def : InstRW<[Ampere1Write_1cyc_1S], // single-register stores: STUR...i (BB/HH/X/W) and STR...ui (X/W)
  878. (instregex "STUR(BB|HH|X|W)i",
  879. // NOTE(review): the STUR pattern was listed twice in this def; if the second copy was meant to be STR(BB|HH)ui, add it after checking it does not collide with another InstRW
  880. "STR(X|W)ui")>;
  881. def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; // register-offset stores, roX names
  882. def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>; // roW names use a different write that also names AB
  883. // Pointer authentication (the AUTI*/PACI* defs in this section are commented out, so they contribute no InstRW here)
  884. //def : InstRW<[Ampere1Write_7cyc_1BS],
  885. // (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
  886. def : InstRW<[Ampere1Write_8cyc_1BS_1A], // BRA*, RETA* and ERETA* names
  887. (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
  888. def : InstRW<[Ampere1Write_8cyc_1BS_2A], // BLRA* names use the ..._2A write rather than ..._1A
  889. (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
  890. //def : InstRW<[Ampere1Write_7cyc_1BS],
  891. // (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
  892. def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; // LDRAA / LDRAB names
  893. def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; // exact opcodes
  894. // Vector integer instructions
  895. // -- absolute difference: S/U prefix, ABA or ABD stem, optional L before the arrangement
  896. def : InstRW<
  897. [Ampere1Write_3cyc_1XY],
  898. (instregex "^[SU]AB[AD]L?v")>;
  899. // -- arithmetic: ABS, ADD/SUB, and the S/U-prefixed ADDL, ADDW, RHADD, HADD/HSUB and SUBL/SUBW name families
  900. def : InstRW<[Ampere1Write_3cyc_1XY],
  901. (instregex "^ABSv", "^(ADD|SUB)v",
  902. "^[SU]ADDLv", "^[SU]ADDW", "^[SU]RHADD",
  903. "[SU]H(ADD|SUB)", "[SU]SUB[LW]")>;
  904. // -- arithmetic, horizontal, 16B (add reductions and min/max reductions over v16i8)
  905. def : InstRW<[Ampere1Write_12cyc_4XY],
  906. (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
  907. def : InstRW<[Ampere1Write_12cyc_4XY],
  908. (instregex "^[SU](MIN|MAX)Vv16i8v")>;
  909. // -- arithmetic, horizontal, 4H/4S (add and min/max reductions select the same two arrangements)
  910. def : InstRW<[Ampere1Write_6cyc_2XY], // NOTE(review): this pattern previously read (v8i8|v4i16|v2i32), contradicting the 4H/4S heading and the min/max def below -- confirm against the core's latency tables
  911. (instregex "^[SU]?ADDL?V(v4i16|v4i32)v")>;
  912. def : InstRW<[Ampere1Write_6cyc_2XY],
  913. (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
  914. // -- arithmetic, horizontal, 8B/8H (add and min/max reductions select the same two arrangements)
  915. def : InstRW<[Ampere1Write_9cyc_3XY], // NOTE(review): this pattern previously read (v8i16|v4i32), contradicting the 8B/8H heading and the min/max def below -- confirm against the core's latency tables
  916. (instregex "^[SU]?ADDL?V(v8i8|v8i16)v")>;
  917. def : InstRW<[Ampere1Write_9cyc_3XY],
  918. (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
  919. // -- arithmetic, narrowing: ADDHN/SUBHN names, then the R-prefixed (RADDHN/RSUBHN) names
  920. def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
  921. def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "R(ADD|SUB)HNv.*")>;
  922. // -- arithmetic, pairwise: ADDP plus the S/U-prefixed ADALP and ADDLP names
  923. def : InstRW<[Ampere1Write_3cyc_1XY],
  924. (instregex "^ADDPv", "^[SU]ADALP", "^[SU]ADDLPv")>;
  925. // -- arithmetic, saturating: SQADD/SQSUB/UQADD/UQSUB plus the mixed SUQADD/USQADD names
  926. def : InstRW<[Ampere1Write_3cyc_1XY],
  927. (instregex "^[SU]Q(ADD|SUB)", "^(SU|US)QADD")>;
  928. // -- bit count: CLS, CLZ and CNT
  929. def : InstRW<[Ampere1Write_2cyc_1XY],
  930. (instregex "^C(L[SZ]|NT)v")>;
  931. // -- compare: CM + EQ/GE/GT/LE/LT, and CM + HI/HS
  932. def : InstRW<[Ampere1Write_3cyc_1XY],
  933. (instregex "^CM(EQ|GE|GT|LE|LT)v",
  934. "^CMH[IS]v")>;
  935. // -- compare non-zero (CMTST)
  936. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>;
  937. // -- dot product: SDOT, SUDOT, UDOT and USDOT names
  938. def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(SU?|US?)DOTv")>;
  939. // -- fp reciprocal estimate: FRECPE and FRSQRTE
  940. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FR(ECP|SQRT)Ev")>;
  941. // -- integer reciprocal estimate: URECPE and URSQRTE
  942. def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^UR(ECP|SQRT)Ev")>;
  943. // -- logical: AND, BIC, EOR, ORR, ORN and NOT
  944. def : InstRW<[Ampere1Write_2cyc_1XY],
  945. (instregex "^(AND|BIC|EOR|OR[RN]|NOT)v")>;
  946. // -- logical, narrowing: SHRN/RSHRN, SQSHRN/SQSHRUN and UQXTN names
  947. def : InstRW<[Ampere1Write_5cyc_2XY],
  948. (instregex "R?SHRNv",
  949. "SQSHRU?Nv",
  950. "UQXTNv")>;
  951. // -- matrix multiply (SMMLA, UMMLA, USMMLA; exact opcodes)
  952. def : InstRW<
  953. [Ampere1Write_6cyc_2XY], (instrs SMMLA, UMMLA, USMMLA)>;
  954. // -- max/min: element-wise names first, pairwise (P) names second, both on the same write
  955. def : InstRW<[Ampere1Write_3cyc_1XY],
  956. (instregex "^[SU](MAX|MIN)v")>;
  957. def : InstRW<[Ampere1Write_3cyc_1XY],
  958. (instregex "^[SU](MAX|MIN)Pv")>;
  959. // -- move immediate: MOVI and MVNI
  960. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^M(OV|VN)Iv")>;
  961. // -- multiply: MUL, SMULL/UMULL, SQDMULH/SQDMULL and SQRDMULH names
  962. def : InstRW<[Ampere1Write_3cyc_1XY],
  963. (instregex "MULv", "[SU]MULLv", "SQDMUL[HL]v", "SQRDMULHv")>;
  964. // -- multiply accumulate: MLA/MLS, the S/U/SQD-prefixed MLAL/MLSL names, and SQRDMLAH/SQRDMLSH
  965. def : InstRW<[Ampere1Write_3cyc_1XY],
  966. (instregex "ML[AS]v", "(S|U|SQD)ML[AS]Lv", "SQRDML[AS]Hv")>;
  967. // -- negation, saturating (the pattern covers both the SQABS and SQNEG names)
  968. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQ(ABS|NEG)")>;
  969. // -- reverse bits/bytes: RBIT and REV16/REV32/REV64
  970. def : InstRW<[Ampere1Write_2cyc_1XY],
  971. (instregex "^RBITv", "^REV(16|32|64)v")>;
  972. // -- shift: SSHL/USHL, restricted to the four arrangements spelled out in the pattern
  973. def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
  974. // -- shift and accumulate: SSRA/USRA and the rounding SRSRA/URSRA names
  975. def : InstRW<[Ampere1Write_3cyc_1XY],
  976. (instregex "[SU]R?SRAv")>;
  977. // -- shift, saturating; "^SQSHL" also covers every SQSHLU name, so no separate SQSHLU entry is listed
  978. def : InstRW<[Ampere1Write_3cyc_1XY],
  979. (instregex "^SQRSHLv", "^SQRSHRU?Nv", "^SQSHL",
  980. "^SQXTU?Nv", "^UQSHRNv", "UQRSHRNv",
  981. "^UQR?SHL")>;
  982. // Vector miscellaneous instructions (element moves, table lookups and permutes)
  983. // -- duplicate element (DUPv names containing "lane")
  984. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>;
  985. // -- duplicate from GPR (DUPv names containing "gpr")
  986. def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>;
  987. // -- extract narrow
  988. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>;
  989. // -- insert/extract element (EXTv names and the "lane" INSv names)
  990. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>;
  991. // -- move FP immediate
  992. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>;
  993. // -- move element to GPR (SMOV and UMOV names)
  994. def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>;
  995. // -- move from GPR to any element (the "gpr" INSv names)
  996. def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>;
  997. // -- table lookup: one def per One/Two/Three/Four name suffix, TBL and TBX, 8i8 and 16i8
  998. def : InstRW<[Ampere1Write_2cyc_1XY],
  999. (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
  1000. def : InstRW<[Ampere1Write_4cyc_2XY],
  1001. (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
  1002. def : InstRW<[Ampere1Write_6cyc_3XY],
  1003. (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
  1004. def : InstRW<[Ampere1Write_8cyc_4XY],
  1005. (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
  1006. // -- transpose and unzip (TRN1/TRN2 together with UZP1/UZP2)
  1007. def : InstRW<[Ampere1Write_2cyc_1XY],
  1008. (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
  1009. // -- zip (ZIP1/ZIP2 only; the UZP patterns are in the def directly above, on the same write)
  1010. def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
  1011. } // SchedModel = Ampere1Model