ARMScheduleM7.td 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. //=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. def CortexM7Model : SchedMachineModel {
  13. let IssueWidth = 2; // Dual issue for most instructions.
  14. let MicroOpBufferSize = 0; // The Cortex-M7 is in-order.
  15. let LoadLatency = 2; // Best case for load-use case.
  16. let MispredictPenalty = 4; // Mispredict cost for forward branches is 6,
  17. // but 4 works better
  18. let CompleteModel = 0;
  19. }
  20. let SchedModel = CortexM7Model in {
  21. //===--------------------------------------------------------------------===//
  22. // The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
  23. // pipe. The stages relevant to scheduling are as follows:
  24. //
//   EX1: address generation  shifts
//   EX2: fast load data      ALUs                 FP operation
//   EX3: slow load data      integer writeback    FP operation
//   EX4: store data                               FP writeback
  29. //
  30. // There are shifters in both EX1 and EX2, and some instructions can be
  31. // flexibly allocated between them. EX2 is used as the "zero" point
  32. // for scheduling, so simple ALU operations executing in EX2 will have
  33. // ReadAdvance<0> (the default) for their source operands and Latency = 1.
  34. def M7UnitLoadL : ProcResource<1> { let BufferSize = 0; }
  35. def M7UnitLoadH : ProcResource<1> { let BufferSize = 0; }
  36. def M7UnitLoad : ProcResGroup<[M7UnitLoadL,M7UnitLoadH]> { let BufferSize = 0; }
  37. def M7UnitStore : ProcResource<1> { let BufferSize = 0; }
  38. def M7UnitALU : ProcResource<2>;
  39. def M7UnitShift1 : ProcResource<1> { let BufferSize = 0; }
  40. def M7UnitShift2 : ProcResource<1> { let BufferSize = 0; }
  41. def M7UnitMAC : ProcResource<1> { let BufferSize = 0; }
  42. def M7UnitBranch : ProcResource<1> { let BufferSize = 0; }
  43. def M7UnitVFP : ProcResource<1> { let BufferSize = 0; }
  44. def M7UnitVPortL : ProcResource<1> { let BufferSize = 0; }
  45. def M7UnitVPortH : ProcResource<1> { let BufferSize = 0; }
  46. def M7UnitVPort : ProcResGroup<[M7UnitVPortL,M7UnitVPortH]> { let BufferSize = 0; }
  47. def M7UnitSIMD : ProcResource<1> { let BufferSize = 0; }
  48. //===---------------------------------------------------------------------===//
// Subtarget-specific SchedWrite types which map ProcResources and set latency.
  50. def : WriteRes<WriteALU, [M7UnitALU]> { let Latency = 1; }
  51. // Basic ALU with shifts.
  52. let Latency = 1 in {
  53. def : WriteRes<WriteALUsi, [M7UnitALU, M7UnitShift1]>;
  54. def : WriteRes<WriteALUsr, [M7UnitALU, M7UnitShift1]>;
  55. def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;
  56. }
  57. // Compares.
  58. def : WriteRes<WriteCMP, [M7UnitALU]> { let Latency = 1; }
  59. def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
  60. def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]> { let Latency = 2; }
  61. // Multiplies.
  62. let Latency = 2 in {
  63. def : WriteRes<WriteMUL16, [M7UnitMAC]>;
  64. def : WriteRes<WriteMUL32, [M7UnitMAC]>;
  65. def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
  66. def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; }
  67. }
  68. // Multiply-accumulates.
  69. let Latency = 2 in {
  70. def : WriteRes<WriteMAC16, [M7UnitMAC]>;
  71. def : WriteRes<WriteMAC32, [M7UnitMAC]>;
  72. def : WriteRes<WriteMAC64Lo, [M7UnitMAC]> { let Latency = 2; }
  73. def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; }
  74. }
  75. // Divisions.
  76. // These cannot be dual-issued with any instructions.
  77. def : WriteRes<WriteDIV, [M7UnitALU]> {
  78. let Latency = 7;
  79. let SingleIssue = 1;
  80. }
  81. // Loads/Stores.
  82. def : WriteRes<WriteLd, [M7UnitLoad]> { let Latency = 1; }
  83. def : WriteRes<WritePreLd, [M7UnitLoad]> { let Latency = 2; }
  84. def : WriteRes<WriteST, [M7UnitStore]> { let Latency = 2; }
  85. // Branches.
  86. def : WriteRes<WriteBr, [M7UnitBranch]> { let Latency = 2; }
  87. def : WriteRes<WriteBrL, [M7UnitBranch]> { let Latency = 2; }
  88. def : WriteRes<WriteBrTbl, [M7UnitBranch]> { let Latency = 2; }
  89. // Noop.
  90. def : WriteRes<WriteNoop, []> { let Latency = 0; }
  91. //===---------------------------------------------------------------------===//
  92. // Sched definitions for floating-point instructions
  93. //
  94. // Floating point conversions.
  95. def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
  96. def : WriteRes<WriteFPMOV, [M7UnitVPort]> { let Latency = 3; }
  97. def M7WriteFPMOV64 : SchedWriteRes<[M7UnitVPortL, M7UnitVPortH]> {
  98. let Latency = 3;
  99. }
  100. // The FP pipeline has a latency of 3 cycles.
  101. // ALU operations (32/64-bit). These go down the FP pipeline.
  102. def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
  103. def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
  104. let Latency = 4;
  105. let BeginGroup = 1;
  106. }
  107. // Multiplication
  108. def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]> { let Latency = 3; }
  109. def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
  110. let Latency = 7;
  111. let BeginGroup = 1;
  112. }
  113. // Multiply-accumulate. FPMAC goes down the FP Pipeline.
  114. def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]> { let Latency = 6; }
  115. def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
  116. let Latency = 11;
  117. let BeginGroup = 1;
  118. }
  119. // Division. Effective scheduling latency is 3, though real latency is larger
  120. def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
  121. def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
  122. let Latency = 30;
  123. let BeginGroup = 1;
  124. }
  125. // Square-root. Effective scheduling latency is 3; real latency is larger
  126. def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]> { let Latency = 16; }
  127. def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]> {
  128. let Latency = 30;
  129. let BeginGroup = 1;
  130. }
  131. def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]> {}
  132. // Not used for M7, but needing definitions anyway
  133. def : WriteRes<WriteVLD1, []>;
  134. def : WriteRes<WriteVLD2, []>;
  135. def : WriteRes<WriteVLD3, []>;
  136. def : WriteRes<WriteVLD4, []>;
  137. def : WriteRes<WriteVST1, []>;
  138. def : WriteRes<WriteVST2, []>;
  139. def : WriteRes<WriteVST3, []>;
  140. def : WriteRes<WriteVST4, []>;
  141. def M7SingleIssue : SchedWriteRes<[]> {
  142. let SingleIssue = 1;
  143. let NumMicroOps = 0;
  144. }
  145. def M7Slot0Only : SchedWriteRes<[]> {
  146. let BeginGroup = 1;
  147. let NumMicroOps = 0;
  148. }
  149. // What pipeline stage operands need to be ready for depending on
  150. // where they come from.
  151. def : ReadAdvance<ReadALUsr, 0>;
  152. def : ReadAdvance<ReadMUL, 0>;
  153. def : ReadAdvance<ReadMAC, 1>;
  154. def : ReadAdvance<ReadALU, 0>;
  155. def : ReadAdvance<ReadFPMUL, 0>;
  156. def : ReadAdvance<ReadFPMAC, 3>;
  157. def M7Read_ISS : SchedReadAdvance<-1>; // operands needed at EX1
  158. def M7Read_EX2 : SchedReadAdvance<1>; // operands needed at EX3
  159. def M7Read_EX3 : SchedReadAdvance<2>; // operands needed at EX4
  160. // Non general purpose instructions may not be dual issued. These
  161. // use both issue units.
  162. def M7NonGeneralPurpose : SchedWriteRes<[]> {
  163. // Assume that these will go down the main ALU pipeline.
  164. // In reality, many look likely to stall the whole pipeline.
  165. let Latency = 3;
  166. let SingleIssue = 1;
  167. }
  168. // List the non general purpose instructions.
  169. def : InstRW<[M7NonGeneralPurpose], (instregex "t2MRS", "tSVC", "tBKPT",
  170. "t2MSR", "t2DMB", "t2DSB", "t2ISB",
  171. "t2HVC", "t2SMC", "t2UDF", "ERET",
  172. "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>;
  173. //===---------------------------------------------------------------------===//
  174. // Sched definitions for load/store
  175. //
  176. // Mark whether the loads/stores must be single-issue
  177. // Address operands are needed earlier
  178. // Data operands are needed later
  179. def M7BaseUpdate : SchedWriteRes<[]> {
  180. let Latency = 0; // Update is bypassable out of EX1
  181. let NumMicroOps = 0;
  182. }
  183. def M7LoadLatency1 : SchedWriteRes<[]> {
  184. let Latency = 1;
  185. let NumMicroOps = 0;
  186. }
  187. def M7SlowLoad : SchedWriteRes<[M7UnitLoad]> { let Latency = 2; }
  188. // Byte and half-word loads should have greater latency than other loads.
  189. // So should load exclusive.
  190. def : InstRW<[M7SlowLoad],
  191. (instregex "t2LDR(B|H|SB|SH)pc")>;
  192. def : InstRW<[M7SlowLoad, M7Read_ISS],
  193. (instregex "t2LDR(B|H|SB|SH)T", "t2LDR(B|H|SB|SH)i",
  194. "tLDR(B|H)i")>;
  195. def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
  196. (instregex "t2LDR(B|H|SB|SH)s", "tLDR(B|H)r", "tLDR(SB|SH)")>;
  197. def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
  198. (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;
  199. // Exclusive loads/stores cannot be dual-issued
  200. def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
  201. (instregex "t2LDREX$")>;
  202. def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
  203. (instregex "t2LDREX(B|H)")>;
  204. def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
  205. (instregex "t2STREX(B|H)?$")>;
  206. // Load/store multiples cannot be dual-issued. Note that default scheduling
  207. // occurs around read/write times of individual registers in the list; read
  208. // time for STM cannot be overridden because it is a variadic source operand.
  209. def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
  210. (instregex "(t|t2)LDM(DB|IA)$")>;
  211. def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
  212. (instregex "(t|t2)STM(DB|IA)$")>;
  213. def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
  214. (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
  215. def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
  216. (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;
  217. // Load/store doubles cannot be dual-issued.
  218. def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
  219. M7Read_EX2, M7Read_EX2, M7Read_ISS],
  220. (instregex "t2STRD_(PRE|POST)")>;
  221. def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
  222. (instregex "t2STRDi")>;
  223. def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS],
  224. (instregex "t2LDRD_(PRE|POST)")>;
  225. def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
  226. (instregex "t2LDRDi")>;
  227. // Word load / preload
  228. def : InstRW<[WriteLd],
  229. (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
  230. def : InstRW<[WriteLd, M7Read_ISS],
  231. (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
  232. def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
  233. (instregex "t2LDRs", "t2PL[DI](w)?s", "tLDRr")>;
  234. def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
  235. (instregex "t2LDR_(POST|PRE)")>;
  236. // Stores
  237. def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
  238. (instregex "t2STR(B|H)?_(POST|PRE)")>;
  239. def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
  240. (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
  241. def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
  242. (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;
  243. // TBB/TBH - single-issue only; takes two cycles to issue
  244. def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
  245. let NumMicroOps = 2;
  246. let SingleIssue = 1;
  247. }
  248. def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>;
  249. // VFP loads and stores
  250. def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
  251. def M7LoadDP : SchedWriteRes<[M7UnitLoadL, M7UnitLoadH, M7UnitVPortL, M7UnitVPortH]> {
  252. let Latency = 2;
  253. let SingleIssue = 1;
  254. }
  255. def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
  256. def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPortL, M7UnitVPortH]> {
  257. let SingleIssue = 1;
  258. }
  259. def : InstRW<[M7LoadSP, M7Read_ISS], (instregex "VLDR(S|H)$")>;
  260. def : InstRW<[M7LoadDP, M7Read_ISS], (instregex "VLDRD$")>;
  261. def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>;
  262. def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>;
  263. // Load/store multiples cannot be dual-issued.
  264. def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
  265. (instregex "VLDM(S|D|Q)(DB|IA)$")>;
  266. def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
  267. (instregex "VSTM(S|D|Q)(DB|IA)$")>;
  268. def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
  269. (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
  270. def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
  271. (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;
  272. //===---------------------------------------------------------------------===//
  273. // Sched definitions for ALU
  274. //
  275. // Shifted ALU operands are read a cycle early.
  276. def M7Ex1ReadNoFastBypass : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;
  277. def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
  278. (instregex "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
  279. "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
  280. "t2MOVsr(a|l)")>;
  281. def : InstRW<[WriteALUsi, M7Read_ISS],
  282. (instregex "t2MVNs")>;
  283. // Treat pure shift operations (except for RRX) as if they used the EX1
  284. // shifter but have timing as if they used the EX2 shifter as they usually
  285. // can choose the EX2 shifter when needed. Will miss a few dual-issue cases,
  286. // but the results prove to be better than trying to get them exact.
  287. def : InstRW<[M7WriteShift2, M7Read_ISS], (instregex "t2RRX$")>;
  288. def : InstRW<[WriteALUsi], (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;
  289. // Instructions that use the shifter, but have normal timing.
  290. def : InstRW<[WriteALUsi,M7Slot0Only], (instregex "t2(BFC|BFI)$")>;
  291. // Instructions which are slot zero only but otherwise normal.
  292. def : InstRW<[WriteALU, M7Slot0Only], (instregex "t2CLZ")>;
  293. // MAC operations that don't have SchedRW set.
  294. def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC], (instregex "t2SML[AS]D")>;
  295. // Divides are special because they stall for their latency, and so look like a
  296. // single-cycle as far as scheduling opportunities go. By putting WriteALU
  297. // first, we make the operand latency 1, but keep the instruction latency 7.
  298. def : InstRW<[WriteALU, WriteDIV], (instregex "t2(S|U)DIV")>;
  299. // DSP extension operations
  300. def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
  301. let Latency = 1;
  302. let BeginGroup = 1;
  303. }
  304. def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]> {
  305. let Latency = 2;
  306. let BeginGroup = 1;
  307. }
  308. def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
  309. let Latency = 1;
  310. let BeginGroup = 1;
  311. }
  312. def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
  313. let Latency = 0; // Bypassable out of EX1
  314. let BeginGroup = 1;
  315. }
  316. def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]> {
  317. let Latency = 2;
  318. let BeginGroup = 1;
  319. }
  320. def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
  321. (instregex "t2(S|U)SAT")>;
  322. def : InstRW<[M7WriteSIMD1, ReadALU],
  323. (instregex "(t|t2)(S|U)XT(B|H)")>;
  324. def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
  325. (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)",
  326. "t2SEL")>;
  327. def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
  328. (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
  329. def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
  330. (instregex "t2QD(ADD|SUB)")>;
  331. def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
  332. (instregex "t2(RBIT|REV)", "tREV")>;
  333. def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
  334. (instregex "t2(SBFX|UBFX)")>;
  335. def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
  336. (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
  337. def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
  338. (instregex "t2USADA8")>;
  339. // MSR/MRS
  340. def : InstRW<[M7NonGeneralPurpose], (instregex "MSR", "MRS")>;
  341. //===---------------------------------------------------------------------===//
  342. // Sched definitions for FP operations
  343. //
  344. // Effective scheduling latency is really 3 for nearly all FP operations,
  345. // even if their true latency is higher.
  346. def M7WriteVFPLatOverride : SchedWriteRes<[]> {
  347. let Latency = 3;
  348. let NumMicroOps = 0;
  349. }
  350. def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]> {
  351. let Latency = 3;
  352. let NumMicroOps = 0;
  353. }
  354. // Instructions which are missing default schedules.
  355. def : InstRW<[WriteFPALU32],
  356. (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
  357. def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
  358. (instregex "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;
  359. // VCMP
  360. def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let Latency = 0; }
  361. def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
  362. let Latency = 0;
  363. let BeginGroup = 1;
  364. }
  365. def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
  366. def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;
  367. // VMRS/VMSR
  368. def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
  369. def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]> { let SingleIssue = 1; }
  370. def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
  371. def : InstRW<[M7VMSR], (instregex "VMSR")>;
  372. // VSEL cannot bypass in its implied $cpsr operand; model as earlier read
  373. def : InstRW<[WriteFPALU32, M7Slot0Only, ReadALU, ReadALU, M7Read_ISS],
  374. (instregex "VSEL.*S$")>;
  375. def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
  376. ReadALU, ReadALU, M7Read_ISS],
  377. (instregex "VSEL.*D$")>;
  378. // VMOV
  379. def : InstRW<[WriteFPMOV],
  380. (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
  381. def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
  382. (instregex "VMOVD$")>;
  383. def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
  384. (instregex "FCONSTD")>;
  385. def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
  386. (instregex "VMOV(DRR|RRD|RRS|SRR)")>;
  387. // Larger-latency overrides.
  388. def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32], (instregex "VDIVS")>;
  389. def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64], (instregex "VDIVD")>;
  390. def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32], (instregex "VSQRTS")>;
  391. def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64], (instregex "VSQRTD")>;
  392. def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
  393. (instregex "V(MUL|NMUL)D")>;
  394. def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
  395. (instregex "V(ADD|SUB)D")>;
  396. // Multiply-accumulate. Chained SP timing is correct; rest need overrides
  397. // Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
  398. // making it appear to have 3 cycle latency for scheduling.
  399. def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
  400. ReadFPMAC, ReadFPMUL, ReadFPMUL],
  401. (instregex "V(N)?ML(A|S)D$")>;
  402. // Single-precision fused MACs look like latency 5 with advance of 2.
  403. def M7WriteVFPLatOverride5 : SchedWriteRes<[]> {
  404. let Latency = 5;
  405. let NumMicroOps = 0;
  406. }
  407. def M7ReadFPMAC2 : SchedReadAdvance<2>;
  408. def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
  409. M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
  410. (instregex "VF(N)?M(A|S)S$")>;
  411. // Double-precision fused MAC stalls the pipeline behind it for 2 cycles, making
  412. // it appear to have 3 cycle latency for scheduling.
  413. def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
  414. ReadFPMAC, ReadFPMUL, ReadFPMUL],
  415. (instregex "VF(N)?M(A|S)D$")>;
  416. } // SchedModel = CortexM7Model