NVPTXInstrInfo.td 136 KB


  1. //===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file describes the PTX instructions in TableGen format.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. include "NVPTXInstrFormats.td"
  // A NOP instruction: no outputs, no inputs, an empty assembly string and no
  // selection pattern.
  let hasSideEffects = false in
  def NOP : NVPTXInst<(outs), (ins), "", []>;
  // f16-typed operand, tagged as an immediate via OperandType.
  def f16imm : Operand<f16> {
    let OperandType = "OPERAND_IMMEDIATE";
  }
  // List of vector specific properties. Each def instantiates VecInstTypeEnum
  // (declared outside this chunk) with a distinct integer tag; the tags run
  // 1..7, with 15 used for isVecOther.
  def isVecLD      : VecInstTypeEnum<1>;
  def isVecST      : VecInstTypeEnum<2>;
  def isVecBuild   : VecInstTypeEnum<3>;
  def isVecShuffle : VecInstTypeEnum<4>;
  def isVecExtract : VecInstTypeEnum<5>;
  def isVecInsert  : VecInstTypeEnum<6>;
  def isVecDest    : VecInstTypeEnum<7>;
  def isVecOther   : VecInstTypeEnum<15>;
  29. //===----------------------------------------------------------------------===//
  30. // NVPTX Operand Definitions.
  31. //===----------------------------------------------------------------------===//
  // Operand of value type OtherVT; judging by its name it is meant for branch
  // targets (no user is visible in this chunk).
  def brtarget : Operand<OtherVT>;
  // CVT conversion modes.
  // These must match the enum in NVPTX.h.
  //
  // Value layout as visible from the literals below: 0x0-0x9 are the base
  // modes, each *_FTZ value is its base value plus 0x10, SAT is 0x20 (0x30
  // with FTZ), and each *_RELU value is its base value plus 0x40.

  // Base modes.
  def CvtNONE : PatLeaf<(i32 0x0)>;
  def CvtRNI  : PatLeaf<(i32 0x1)>;
  def CvtRZI  : PatLeaf<(i32 0x2)>;
  def CvtRMI  : PatLeaf<(i32 0x3)>;
  def CvtRPI  : PatLeaf<(i32 0x4)>;
  def CvtRN   : PatLeaf<(i32 0x5)>;
  def CvtRZ   : PatLeaf<(i32 0x6)>;
  def CvtRM   : PatLeaf<(i32 0x7)>;
  def CvtRP   : PatLeaf<(i32 0x8)>;
  def CvtRNA  : PatLeaf<(i32 0x9)>;

  // FTZ variants of the base modes.
  def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
  def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
  def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
  def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
  def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
  def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
  def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
  def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
  def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;

  // Saturating variants.
  def CvtSAT     : PatLeaf<(i32 0x20)>;
  def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;

  // RELU variants.
  def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
  def CvtRN_RELU   : PatLeaf<(i32 0x45)>;
  def CvtRZ_RELU   : PatLeaf<(i32 0x46)>;

  // i32 operand holding one of the values above; printed by printCvtMode.
  def CvtMode : Operand<i32> {
    let PrintMethod = "printCvtMode";
  }
  // Compare modes.
  // These must match the enum in NVPTX.h.
  //
  // As visible from the literals below, every *_FTZ value is the matching
  // base value plus 0x100.

  // Base comparisons.
  def CmpEQ : PatLeaf<(i32 0)>;
  def CmpNE : PatLeaf<(i32 1)>;
  def CmpLT : PatLeaf<(i32 2)>;
  def CmpLE : PatLeaf<(i32 3)>;
  def CmpGT : PatLeaf<(i32 4)>;
  def CmpGE : PatLeaf<(i32 5)>;

  // "U"-suffixed comparisons plus the NUM / NAN tests.
  def CmpEQU : PatLeaf<(i32 10)>;
  def CmpNEU : PatLeaf<(i32 11)>;
  def CmpLTU : PatLeaf<(i32 12)>;
  def CmpLEU : PatLeaf<(i32 13)>;
  def CmpGTU : PatLeaf<(i32 14)>;
  def CmpGEU : PatLeaf<(i32 15)>;
  def CmpNUM : PatLeaf<(i32 16)>;
  def CmpNAN : PatLeaf<(i32 17)>;

  // FTZ variants of all of the above.
  def CmpEQ_FTZ  : PatLeaf<(i32 0x100)>;
  def CmpNE_FTZ  : PatLeaf<(i32 0x101)>;
  def CmpLT_FTZ  : PatLeaf<(i32 0x102)>;
  def CmpLE_FTZ  : PatLeaf<(i32 0x103)>;
  def CmpGT_FTZ  : PatLeaf<(i32 0x104)>;
  def CmpGE_FTZ  : PatLeaf<(i32 0x105)>;
  def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
  def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
  def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
  def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
  def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
  def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
  def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
  def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;

  // i32 operand holding one of the values above; printed by printCmpMode.
  def CmpMode : Operand<i32> {
    let PrintMethod = "printCmpMode";
  }
  // i32 operand rendered through the printVecElement print method.
  def VecElement : Operand<i32> {
    let PrintMethod = "printVecElement";
  }
  98. //===----------------------------------------------------------------------===//
  99. // NVPTX Instruction Predicate Definitions
  100. //===----------------------------------------------------------------------===//
  // Each Predicate wraps a C++ condition string. The functions and members the
  // strings refer to (Subtarget, useF32FTZ(), doMulWide, ...) are defined
  // outside this file.

  // Subtarget feature queries.
  def hasAtomAddF64    : Predicate<"Subtarget->hasAtomAddF64()">;
  def hasAtomScope     : Predicate<"Subtarget->hasAtomScope()">;
  def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
  def hasAtomMinMax64  : Predicate<"Subtarget->hasAtomMinMax64()">;
  def hasVote          : Predicate<"Subtarget->hasVote()">;
  def hasDouble        : Predicate<"Subtarget->hasDouble()">;
  def hasLDG           : Predicate<"Subtarget->hasLDG()">;
  def hasLDU           : Predicate<"Subtarget->hasLDU()">;

  // Code-generation mode queries; several come as positive/negative pairs.
  def doF32FTZ          : Predicate<"useF32FTZ()">;
  def doNoF32FTZ        : Predicate<"!useF32FTZ()">;
  def doMulWide         : Predicate<"doMulWide">;
  def allowFMA          : Predicate<"allowFMA()">;
  def noFMA             : Predicate<"!allowFMA()">;
  def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
  def do_DIVF32_APPROX  : Predicate<"getDivF32Level()==0">;
  def do_DIVF32_FULL    : Predicate<"getDivF32Level()==1">;
  def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
  def do_SQRTF32_RN     : Predicate<"usePrecSqrtF32()">;
  def hasHWROT32        : Predicate<"Subtarget->hasHWROT32()">;
  def noHWROT32         : Predicate<"!Subtarget->hasHWROT32()">;

  // Predicate that always holds.
  def True : Predicate<"true">;

  // Minimum PTX version checks.
  def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
  def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
  def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
  def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
  def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
  def hasPTX65 : Predicate<"Subtarget->getPTXVersion() >= 65">;
  def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
  def hasPTX71 : Predicate<"Subtarget->getPTXVersion() >= 71">;

  // Minimum SM version checks.
  def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
  def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
  def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
  def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
  def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;

  // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
  // (the two adjacent string literals form a single condition string).
  def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
                          "&& Subtarget->getPTXVersion() >= 64)">;

  def useShortPtr : Predicate<"useShortPointers()">;
  def useFP16Math : Predicate<"Subtarget->allowFP16Math()">;
  140. //===----------------------------------------------------------------------===//
  141. // Some Common Instruction Class Templates
  142. //===----------------------------------------------------------------------===//
  // Integer binary-operation template (one destination, two sources). Every
  // instantiation yields a register-register (rr) and a register-immediate
  // (ri) instruction for 64-, 32- and 16-bit registers. The bit width is
  // appended straight onto OpcStr, so the instructions are named
  // "<OpcStr><Width>" (e.g. "add.s64").
  multiclass I3<string OpcStr, SDNode OpNode> {
    // 64-bit forms.
    def i64rr : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
        OpcStr # "64 \t$dst, $a, $b;",
        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
    def i64ri : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
        OpcStr # "64 \t$dst, $a, $b;",
        [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;

    // 32-bit forms.
    def i32rr : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
        OpcStr # "32 \t$dst, $a, $b;",
        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def i32ri : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
        OpcStr # "32 \t$dst, $a, $b;",
        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;

    // 16-bit forms.
    def i16rr : NVPTXInst<
        (outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
        OpcStr # "16 \t$dst, $a, $b;",
        [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
    def i16ri : NVPTXInst<
        (outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
        OpcStr # "16 \t$dst, $a, $b;",
        [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
  }
  // 32-bit-only integer template (one destination, two sources) in rr and ri
  // forms. ".s32" is appended to OpcStr, so the instructions are named
  // "<OpcStr>.s32" (e.g. "addc.cc.s32").
  multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
    def i32rr : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
        OpcStr # ".s32 \t$dst, $a, $b;",
        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def i32ri : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
        OpcStr # ".s32 \t$dst, $a, $b;",
        [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
  }
  // Floating-point binary-operation template (one destination, two sources).
  // Instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64") and are
  // produced for f64, f32, f16 and f16x2 registers; f64 and f32 also get a
  // register-immediate (ri) form.
  //
  // The f32, f16 and f16x2 instructions additionally come in an ".ftz" flavour
  // (flush subnormal inputs and results to sign-preserving zero) that requires
  // doF32FTZ. All f16/f16x2 instructions require useFP16Math. Each ftz def is
  // placed before its non-ftz counterpart; that order is kept as written.
  //
  // This multiclass should be used for nodes that cannot be folded into FMAs.
  // For nodes that can be folded into FMAs (i.e. adds and muls), use
  // F3_fma_component.
  multiclass F3<string OpcStr, SDNode OpNode> {
    // f64
    def f64rr : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
        OpcStr # ".f64 \t$dst, $a, $b;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
    def f64ri : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
        OpcStr # ".f64 \t$dst, $a, $b;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;

    // f32, ftz first
    def f32rr_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
        OpcStr # ".ftz.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
        Requires<[doF32FTZ]>;
    def f32ri_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
        OpcStr # ".ftz.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
        Requires<[doF32FTZ]>;
    def f32rr : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
        OpcStr # ".f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
    def f32ri : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
        OpcStr # ".f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

    // f16 (register-register only)
    def f16rr_ftz : NVPTXInst<
        (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
        OpcStr # ".ftz.f16 \t$dst, $a, $b;",
        [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
        Requires<[useFP16Math, doF32FTZ]>;
    def f16rr : NVPTXInst<
        (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
        OpcStr # ".f16 \t$dst, $a, $b;",
        [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
        Requires<[useFP16Math]>;

    // f16x2 (register-register only)
    def f16x2rr_ftz : NVPTXInst<
        (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
        OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
        [(set Float16x2Regs:$dst,
              (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
        Requires<[useFP16Math, doF32FTZ]>;
    def f16x2rr : NVPTXInst<
        (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
        OpcStr # ".f16x2 \t$dst, $a, $b;",
        [(set Float16x2Regs:$dst,
              (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
        Requires<[useFP16Math]>;
  }
  // Floating-point binary-operation template (one destination, two sources)
  // for nodes that can be folded to make fma ops. Instructions are named
  // "<OpcStr>.f<Width>" (e.g. "add.f64") and cover f64, f32, f16 and f16x2.
  //
  // Also defines ftz (flush subnormal inputs and results to sign-preserving
  // zero) variants for the fp32/fp16 instructions.
  //
  // Two families are generated:
  //   * plain names, guarded by allowFMA, printing "<OpcStr>[.ftz].<type>";
  //   * "_rn"-prefixed names, guarded by noFMA, printing
  //     "<OpcStr>.rn[.ftz].<type>". The ".rn" variant behaves just like the
  //     non ".rn" op, but prevents ptxas from creating FMAs.
  multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
    //
    // FMA allowed: plain opcodes.
    //
    def f64rr : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
        OpcStr # ".f64 \t$dst, $a, $b;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
        Requires<[allowFMA]>;
    def f64ri : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
        OpcStr # ".f64 \t$dst, $a, $b;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
        Requires<[allowFMA]>;

    def f32rr_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
        OpcStr # ".ftz.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
        Requires<[allowFMA, doF32FTZ]>;
    def f32ri_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
        OpcStr # ".ftz.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
        Requires<[allowFMA, doF32FTZ]>;
    def f32rr : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
        OpcStr # ".f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
        Requires<[allowFMA]>;
    def f32ri : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
        OpcStr # ".f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
        Requires<[allowFMA]>;

    def f16rr_ftz : NVPTXInst<
        (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
        OpcStr # ".ftz.f16 \t$dst, $a, $b;",
        [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
        Requires<[useFP16Math, allowFMA, doF32FTZ]>;
    def f16rr : NVPTXInst<
        (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
        OpcStr # ".f16 \t$dst, $a, $b;",
        [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
        Requires<[useFP16Math, allowFMA]>;

    def f16x2rr_ftz : NVPTXInst<
        (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
        OpcStr # ".ftz.f16x2 \t$dst, $a, $b;",
        [(set Float16x2Regs:$dst,
              (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
        Requires<[useFP16Math, allowFMA, doF32FTZ]>;
    def f16x2rr : NVPTXInst<
        (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
        OpcStr # ".f16x2 \t$dst, $a, $b;",
        [(set Float16x2Regs:$dst,
              (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
        Requires<[useFP16Math, allowFMA]>;

    //
    // FMA disallowed: ".rn" opcodes.
    // These have strange names so we don't perturb existing mir tests.
    //
    def _rnf64rr : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a, Float64Regs:$b),
        OpcStr # ".rn.f64 \t$dst, $a, $b;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
        Requires<[noFMA]>;
    def _rnf64ri : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
        OpcStr # ".rn.f64 \t$dst, $a, $b;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
        Requires<[noFMA]>;

    def _rnf32rr_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
        OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
        Requires<[noFMA, doF32FTZ]>;
    def _rnf32ri_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
        OpcStr # ".rn.ftz.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
        Requires<[noFMA, doF32FTZ]>;
    def _rnf32rr : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, Float32Regs:$b),
        OpcStr # ".rn.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
        Requires<[noFMA]>;
    def _rnf32ri : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
        OpcStr # ".rn.f32 \t$dst, $a, $b;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
        Requires<[noFMA]>;

    def _rnf16rr_ftz : NVPTXInst<
        (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
        OpcStr # ".rn.ftz.f16 \t$dst, $a, $b;",
        [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
        Requires<[useFP16Math, noFMA, doF32FTZ]>;
    def _rnf16rr : NVPTXInst<
        (outs Float16Regs:$dst), (ins Float16Regs:$a, Float16Regs:$b),
        OpcStr # ".rn.f16 \t$dst, $a, $b;",
        [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
        Requires<[useFP16Math, noFMA]>;

    def _rnf16x2rr_ftz : NVPTXInst<
        (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
        OpcStr # ".rn.ftz.f16x2 \t$dst, $a, $b;",
        [(set Float16x2Regs:$dst,
              (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
        Requires<[useFP16Math, noFMA, doF32FTZ]>;
    def _rnf16x2rr : NVPTXInst<
        (outs Float16x2Regs:$dst), (ins Float16x2Regs:$a, Float16x2Regs:$b),
        OpcStr # ".rn.f16x2 \t$dst, $a, $b;",
        [(set Float16x2Regs:$dst,
              (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
        Requires<[useFP16Math, noFMA]>;
  }
  // Floating-point unary-operation template: two operands in total, one
  // destination and one source, of type f32 or f64. Provides three
  // instructions: <OpcStr>.f64, <OpcStr>.ftz.f32 (flush subnormal inputs and
  // results to zero; requires doF32FTZ) and <OpcStr>.f32. The ftz def precedes
  // the plain f32 def, exactly as in the original ordering.
  multiclass F2<string OpcStr, SDNode OpNode> {
    def f64 : NVPTXInst<
        (outs Float64Regs:$dst), (ins Float64Regs:$a),
        OpcStr # ".f64 \t$dst, $a;",
        [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
    def f32_ftz : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a),
        OpcStr # ".ftz.f32 \t$dst, $a;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
        Requires<[doF32FTZ]>;
    def f32 : NVPTXInst<
        (outs Float32Regs:$dst), (ins Float32Regs:$a),
        OpcStr # ".f32 \t$dst, $a;",
        [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
  }
  397. //===----------------------------------------------------------------------===//
  398. // NVPTX Instructions.
  399. //===----------------------------------------------------------------------===//
  400. //-----------------------------------
  401. // Type Conversion
  402. //-----------------------------------
  // All conversion instructions below are declared free of side effects.
  let hasSideEffects = false in {
    // Generate a cvt to the given type from all possible types. Each instance
    // takes a CvtMode immediate that defines the conversion mode to use. It
    // can be CvtNONE to omit a conversion mode.
    //
    // Note on naming: FromName is printed as the FIRST type suffix of the
    // instruction and RC is the register class of $dst, while the def suffix
    // (_s8, _u8, ...) supplies the SECOND type suffix and selects the register
    // class of $src. None of these instructions carries a selection pattern.
    multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
      // 8- and 16-bit integer sources all live in Int16Regs.
      def _s8 : NVPTXInst<
          (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".s8 \t$dst, $src;",
          []>;
      def _u8 : NVPTXInst<
          (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".u8 \t$dst, $src;",
          []>;
      def _s16 : NVPTXInst<
          (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".s16 \t$dst, $src;",
          []>;
      def _u16 : NVPTXInst<
          (outs RC:$dst), (ins Int16Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".u16 \t$dst, $src;",
          []>;

      // 32-bit integer sources.
      def _s32 : NVPTXInst<
          (outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".s32 \t$dst, $src;",
          []>;
      def _u32 : NVPTXInst<
          (outs RC:$dst), (ins Int32Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".u32 \t$dst, $src;",
          []>;

      // 64-bit integer sources.
      def _s64 : NVPTXInst<
          (outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".s64 \t$dst, $src;",
          []>;
      def _u64 : NVPTXInst<
          (outs RC:$dst), (ins Int64Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".u64 \t$dst, $src;",
          []>;

      // Floating-point sources.
      def _f16 : NVPTXInst<
          (outs RC:$dst), (ins Float16Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".f16 \t$dst, $src;",
          []>;
      def _f32 : NVPTXInst<
          (outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".f32 \t$dst, $src;",
          []>;
      def _f64 : NVPTXInst<
          (outs RC:$dst), (ins Float64Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:ftz}${mode:sat}." # FromName #
              ".f64 \t$dst, $src;",
          []>;
    }

    // Generate cvts from all types to all types.
    defm CVT_s8  : CVT_FROM_ALL<"s8",  Int16Regs>;
    defm CVT_u8  : CVT_FROM_ALL<"u8",  Int16Regs>;
    defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
    defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
    defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
    defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
    defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
    defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
    defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
    defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
    defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;

    // These cvts are different from those above: The source and dest registers
    // are of the same type. They also take no CvtMode operand.
    def CVT_INREG_s16_s8 : NVPTXInst<
        (outs Int16Regs:$dst), (ins Int16Regs:$src),
        "cvt.s16.s8 \t$dst, $src;", []>;
    def CVT_INREG_s32_s8 : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$src),
        "cvt.s32.s8 \t$dst, $src;", []>;
    def CVT_INREG_s32_s16 : NVPTXInst<
        (outs Int32Regs:$dst), (ins Int32Regs:$src),
        "cvt.s32.s16 \t$dst, $src;", []>;
    def CVT_INREG_s64_s8 : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$src),
        "cvt.s64.s8 \t$dst, $src;", []>;
    def CVT_INREG_s64_s16 : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$src),
        "cvt.s64.s16 \t$dst, $src;", []>;
    def CVT_INREG_s64_s32 : NVPTXInst<
        (outs Int64Regs:$dst), (ins Int64Regs:$src),
        "cvt.s64.s32 \t$dst, $src;", []>;

    // Single-source cvt from f32 that prints the base and relu parts of the
    // mode operand. Requires PTX >= 7.0 and SM >= 80.
    multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> {
      def _f32 : NVPTXInst<
          (outs RC:$dst), (ins Float32Regs:$src, CvtMode:$mode),
          "cvt${mode:base}${mode:relu}." # FromName # ".f32 \t$dst, $src;",
          []>,
          Requires<[hasPTX70, hasSM80]>;
    }

    defm CVT_bf16 : CVT_FROM_FLOAT_SM80<"bf16", Int16Regs>;

    // Two-source variant of the above: two f32 inputs produce one result
    // register. Requires PTX >= 7.0 and SM >= 80.
    multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
      def _f32 : NVPTXInst<
          (outs RC:$dst),
          (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
          "cvt${mode:base}${mode:relu}." # FromName #
              ".f32 \t$dst, $src1, $src2;",
          []>,
          Requires<[hasPTX70, hasSM80]>;
    }

    defm CVT_f16x2  : CVT_FROM_FLOAT_V2_SM80<"f16x2", Float16x2Regs>;
    defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
  } // hasSideEffects = false
  510. //-----------------------------------
  511. // Integer Arithmetic
  512. //-----------------------------------
  // Template for xor masquerading as int1 arithmetic: whatever OpNode is
  // supplied, both the rr and the ri form print "xor.pred".
  multiclass ADD_SUB_i1<SDNode OpNode> {
    def _rr : NVPTXInst<
        (outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
        "xor.pred \t$dst, $a, $b;",
        [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
    def _ri : NVPTXInst<
        (outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
        "xor.pred \t$dst, $a, $b;",
        [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
  }

  // int1 addition and subtraction are both just xor.
  defm ADD_i1 : ADD_SUB_i1<add>;
  defm SUB_i1 : ADD_SUB_i1<sub>;
  // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
  // also use these for unsigned arithmetic.
  defm ADD : I3<"add.s", add>;
  defm SUB : I3<"sub.s", sub>;

  // int32 addition and subtraction with carry-out.
  // FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
  defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
  defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;

  // int32 addition and subtraction with carry-in and carry-out.
  defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
  defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;

  // Multiplication: low half, then signed and unsigned high half.
  defm MULT   : I3<"mul.lo.s", mul>;
  defm MULTHS : I3<"mul.hi.s", mulhs>;
  defm MULTHU : I3<"mul.hi.u", mulhu>;

  // Signed and unsigned division.
  defm SDIV : I3<"div.s", sdiv>;
  defm UDIV : I3<"div.u", udiv>;

  // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
  // will lower it.
  defm SREM : I3<"rem.s", srem>;
  defm UREM : I3<"rem.u", urem>;
  // Integer absolute value. Each instantiation emits a single
  // "abs<SizeName>" instruction (e.g. "abs.s32") that matches the generic
  // `abs` node on register class RC.
  // RC is the register class of both the source and the result; SizeName is
  // the type suffix pasted directly after "abs" (so it must carry its own
  // leading dot). The single def inside is anonymous.
  multiclass ABS<RegisterClass RC, string SizeName> {
    def : NVPTXInst<
        (outs RC:$dst), (ins RC:$a),
        "abs" # SizeName # " \t$dst, $a;",
        [(set RC:$dst, (abs RC:$a))]>;
  }

  defm ABS_16 : ABS<Int16Regs, ".s16">;
  defm ABS_32 : ABS<Int32Regs, ".s32">;
  defm ABS_64 : ABS<Int64Regs, ".s64">;
  // Integer min/max, signed and unsigned, for all widths provided by I3.
  defm SMAX : I3<"max.s", smax>;
  defm UMAX : I3<"max.u", umax>;
  defm SMIN : I3<"min.s", smin>;
  defm UMIN : I3<"min.u", umin>;
  561. //
  562. // Wide multiplication
  563. //
  // mul.wide instructions. None of them carries a selection pattern of its
  // own; they are only reachable through explicit Pat definitions.
  //
  // Naming: MULWIDE<S|U><result width>[Imm[<imm width>]]. The plain "Imm"
  // forms take an immediate as wide as the source register; the "Imm64" /
  // "Imm32" forms take an immediate as wide as the result.

  // 32 x 32 -> 64, signed.
  def MULWIDES64 : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      "mul.wide.s32 \t$dst, $a, $b;", []>;
  def MULWIDES64Imm : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
      "mul.wide.s32 \t$dst, $a, $b;", []>;
  def MULWIDES64Imm64 : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
      "mul.wide.s32 \t$dst, $a, $b;", []>;

  // 32 x 32 -> 64, unsigned.
  def MULWIDEU64 : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
      "mul.wide.u32 \t$dst, $a, $b;", []>;
  def MULWIDEU64Imm : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
      "mul.wide.u32 \t$dst, $a, $b;", []>;
  def MULWIDEU64Imm64 : NVPTXInst<
      (outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
      "mul.wide.u32 \t$dst, $a, $b;", []>;

  // 16 x 16 -> 32, signed.
  def MULWIDES32 : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      "mul.wide.s16 \t$dst, $a, $b;", []>;
  def MULWIDES32Imm : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
      "mul.wide.s16 \t$dst, $a, $b;", []>;
  def MULWIDES32Imm32 : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
      "mul.wide.s16 \t$dst, $a, $b;", []>;

  // 16 x 16 -> 32, unsigned.
  def MULWIDEU32 : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
      "mul.wide.u16 \t$dst, $a, $b;", []>;
  def MULWIDEU32Imm : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
      "mul.wide.u16 \t$dst, $a, $b;", []>;
  def MULWIDEU32Imm32 : NVPTXInst<
      (outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
      "mul.wide.u16 \t$dst, $a, $b;", []>;
  // Type profile of the mul.wide nodes: one result, two operands, and the two
  // operands share a type.
  def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
  def mul_wide_signed   : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
  def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;

  // Matchers for signed, unsigned mul.wide ISD nodes. Every pattern is guarded
  // by doMulWide; the guard is applied to the whole group through Predicates,
  // the same field that Requires<> populates.
  let Predicates = [doMulWide] in {
    // 16-bit sources, 32-bit result.
    def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
              (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>;
    def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
              (MULWIDES32Imm Int16Regs:$a, imm:$b)>;
    def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
              (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>;
    def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
              (MULWIDEU32Imm Int16Regs:$a, imm:$b)>;

    // 32-bit sources, 64-bit result.
    def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
              (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>;
    def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
              (MULWIDES64Imm Int32Regs:$a, imm:$b)>;
    def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
              (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>;
    def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
              (MULWIDEU64Imm Int32Regs:$a, imm:$b)>;
  }
  // Predicates used for converting some patterns to mul.wide. Each leaf
  // matches an immediate whose value passes the stated range test.

  // Immediate representable as a signed 32-bit integer.
  def SInt32Const : PatLeaf<(imm), [{
    return N->getAPIntValue().isSignedIntN(32);
  }]>;

  // Immediate representable as an unsigned 32-bit integer.
  def UInt32Const : PatLeaf<(imm), [{
    return N->getAPIntValue().isIntN(32);
  }]>;

  // Immediate representable as a signed 16-bit integer.
  def SInt16Const : PatLeaf<(imm), [{
    return N->getAPIntValue().isSignedIntN(16);
  }]>;

  // Immediate representable as an unsigned 16-bit integer.
  def UInt16Const : PatLeaf<(imm), [{
    return N->getAPIntValue().isIntN(16);
  }]>;

  def Int5Const : PatLeaf<(imm), [{
    // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
    const APInt &Amt = N->getAPIntValue();
    return Amt.sge(0) && Amt.slt(32);
  }]>;

  def Int4Const : PatLeaf<(imm), [{
    // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
    const APInt &Amt = N->getAPIntValue();
    return Amt.sge(0) && Amt.slt(16);
  }]>;

  // Turn a shift amount v into the i32 target constant (1 << v).
  def SHL2MUL32 : SDNodeXForm<imm, [{
    APInt One(32, 1);
    return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                     MVT::i32);
  }]>;

  // Turn a shift amount v into the i16 target constant (1 << v).
  def SHL2MUL16 : SDNodeXForm<imm, [{
    APInt One(16, 1);
    return CurDAG->getTargetConstant(One.shl(N->getAPIntValue()), SDLoc(N),
                                     MVT::i16);
  }]>;
  // Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
  // The shift amount is rewritten to the multiplier (1 << amount) by
  // SHL2MUL32 / SHL2MUL16. All four patterns require doMulWide.
  let Predicates = [doMulWide] in {
    def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
              (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
    def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
              (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>;
    def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
              (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
    def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
              (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>;
  }
  // Convert "sign/zero-extend then multiply" to mul.wide.
  // Register x register forms require both operands to use the same kind of
  // extension; register x immediate forms require the immediate to fit in the
  // narrow source type (see the *Const leaves).

  // Signed, 32-bit sources -> 64-bit product.
  def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
            (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
        Requires<[doMulWide]>;
  def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
            (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
        Requires<[doMulWide]>;

  // Unsigned, 32-bit sources -> 64-bit product.
  def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
            (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
        Requires<[doMulWide]>;
  def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
            (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
        Requires<[doMulWide]>;

  // Signed, 16-bit sources -> 32-bit product.
  def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
            (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
        Requires<[doMulWide]>;
  def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
            (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
        Requires<[doMulWide]>;

  // Unsigned, 16-bit sources -> 32-bit product.
  def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
            (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
        Requires<[doMulWide]>;
  def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
            (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
        Requires<[doMulWide]>;
  703. //
  704. // Integer multiply-add
  705. //
  // Type profile of the integer multiply-add node: one result and three
  // operands, all constrained to the same integer type.
  def SDTIMAD : SDTypeProfile<1, 3, [
    SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
    SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>
  ]>;

  // DAG node for NVPTXISD::IMAD.
  def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
  // Template for the integer multiply-add instructions (mad.lo). For a single
  // operand width it defines four instructions that differ only in whether $b
  // and $c are registers or immediates:
  //   rrr - $b register,  $c register
  //   rri - $b register,  $c immediate
  //   rir - $b immediate, $c register
  //   rii - $b immediate, $c immediate
  // Every variant selects the imad node on operands of register class RC.
  //
  // TypeStr is the PTX type suffix ("s16", "s32", "s64"), RC the register
  // class of the result and register operands, ImmCls the immediate operand
  // class of the same width.
  multiclass MAD_LO<string TypeStr, RegisterClass RC, Operand ImmCls> {
    def rrr :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, RC:$c),
                !strconcat("mad.lo.", TypeStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (imad RC:$a, RC:$b, RC:$c))]>;
    def rri :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, ImmCls:$c),
                !strconcat("mad.lo.", TypeStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (imad RC:$a, RC:$b, imm:$c))]>;
    def rir :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, ImmCls:$b, RC:$c),
                !strconcat("mad.lo.", TypeStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (imad RC:$a, imm:$b, RC:$c))]>;
    def rii :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, ImmCls:$b, ImmCls:$c),
                !strconcat("mad.lo.", TypeStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (imad RC:$a, imm:$b, imm:$c))]>;
  }

  // Expands to MAD16{rrr,rri,rir,rii}, MAD32{...} and MAD64{...}.
  defm MAD16 : MAD_LO<"s16", Int16Regs, i16imm>;
  defm MAD32 : MAD_LO<"s32", Int32Regs, i32imm>;
  defm MAD64 : MAD_LO<"s64", Int64Regs, i64imm>;
  // Integer negation (neg.s16 / neg.s32 / neg.s64), selected from ineg.
  def INEG16 :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$src),
              "neg.s16 \t$dst, $src;",
              [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;

  def INEG32 :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src),
              "neg.s32 \t$dst, $src;",
              [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;

  def INEG64 :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$src),
              "neg.s64 \t$dst, $src;",
              [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
  782. //-----------------------------------
  783. // Floating Point Arithmetic
  784. //-----------------------------------
  // Matches the single-precision floating-point constant 1.0f. The semantics
  // check comes first so that constants of other FP types never match.
  def FloatConst1 : PatLeaf<(fpimm), [{
    const llvm::APFloat &Val = N->getValueAPF();
    if (&Val.getSemantics() != &llvm::APFloat::IEEEsingle())
      return false;
    return Val.convertToFloat() == 1.0f;
  }]>;

  // Matches the double-precision floating-point constant 1.0.
  def DoubleConst1 : PatLeaf<(fpimm), [{
    const llvm::APFloat &Val = N->getValueAPF();
    if (&Val.getSemantics() != &llvm::APFloat::IEEEdouble())
      return false;
    return Val.convertToDouble() == 1.0;
  }]>;
  // Materializes an FP16 constant in a register.
  //
  // ptxas has no hex representation for fp16, so fp16 immediates cannot be
  // used directly in .f16 instructions. The constant is instead moved into a
  // register with mov.b16. This instruction has no selection pattern.
  def LOAD_CONST_F16 :
    NVPTXInst<(outs Float16Regs:$dst),
              (ins f16imm:$a),
              "mov.b16 \t$dst, $a;",
              []>;
  // Two-operand arithmetic instantiated from F3_fma_component.
  defm FADD : F3_fma_component<"add", fadd>;
  defm FSUB : F3_fma_component<"sub", fsub>;
  defm FMUL : F3_fma_component<"mul", fmul>;

  // Minimum / maximum.
  defm FMIN : F3<"min", fminnum>;
  defm FMAX : F3<"max", fmaxnum>;

  // Minimum / maximum selected from fminimum / fmaximum.
  // Note: min.NaN.f64 and max.NaN.f64 do not actually exist.
  defm FMINNAN : F3<"min.NaN", fminimum>;
  defm FMAXNAN : F3<"max.NaN", fmaximum>;

  // Single-operand operations.
  defm FABS  : F2<"abs", fabs>;
  defm FNEG  : F2<"neg", fneg>;
  defm FSQRT : F2<"sqrt.rn", fsqrt>;
  //
  // F64 division
  //

  // 1.0 / b -> rcp.rn.f64. The constant operand $a is matched but does not
  // appear in the emitted instruction.
  def FDIV641r :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins f64imm:$a, Float64Regs:$b),
              "rcp.rn.f64 \t$dst, $b;",
              [(set Float64Regs:$dst,
                    (fdiv DoubleConst1:$a, Float64Regs:$b))]>;

  // a / b, register divisor.
  def FDIV64rr :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, Float64Regs:$b),
              "div.rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (fdiv Float64Regs:$a, Float64Regs:$b))]>;

  // a / b, immediate divisor.
  def FDIV64ri :
    NVPTXInst<(outs Float64Regs:$dst),
              (ins Float64Regs:$a, f64imm:$b),
              "div.rn.f64 \t$dst, $a, $b;",
              [(set Float64Regs:$dst,
                    (fdiv Float64Regs:$a, fpimm:$b))]>;
  //
  // F32 Approximate reciprocal
  //
  // Selected for 1.0f / b under the do_DIVF32_APPROX predicate; the _ftz
  // variant additionally requires doF32FTZ.
  def FDIV321r_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.ftz.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_APPROX, doF32FTZ]>;

  def FDIV321r :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_APPROX]>;

  //
  // F32 Approximate division
  //
  // Register and immediate divisor forms, each with and without .ftz.
  def FDIV32approxrr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_APPROX, doF32FTZ]>;

  def FDIV32approxri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              "div.approx.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>,
    Requires<[do_DIVF32_APPROX, doF32FTZ]>;

  def FDIV32approxrr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_APPROX]>;

  def FDIV32approxri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              "div.approx.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>,
    Requires<[do_DIVF32_APPROX]>;
  //
  // F32 Semi-accurate reciprocal
  //
  // rcp.approx gives the same result as div.full(1.0f, a) and is faster, so
  // it is what gets emitted for 1.0f / b under do_DIVF32_FULL.
  //
  def FDIV321r_approx_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.ftz.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_FULL, doF32FTZ]>;

  def FDIV321r_approx :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              "rcp.approx.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_FULL]>;

  //
  // F32 Semi-accurate division
  //
  // Register and immediate divisor forms, each with and without .ftz.
  def FDIV32rr_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_FULL, doF32FTZ]>;

  def FDIV32ri_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              "div.full.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>,
    Requires<[do_DIVF32_FULL, doF32FTZ]>;

  def FDIV32rr :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[do_DIVF32_FULL]>;

  def FDIV32ri :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              "div.full.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>,
    Requires<[do_DIVF32_FULL]>;
  //
  // F32 Accurate reciprocal
  //
  // These carry no division-mode predicate; only the _ftz variant is gated
  // (on doF32FTZ).
  def FDIV321r_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              "rcp.rn.ftz.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;

  def FDIV321r_prec :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins f32imm:$a, Float32Regs:$b),
              "rcp.rn.f32 \t$dst, $b;",
              [(set Float32Regs:$dst,
                    (fdiv FloatConst1:$a, Float32Regs:$b))]>;

  //
  // F32 Accurate division
  //
  // Register and immediate divisor forms, each with and without .ftz.
  def FDIV32rr_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>,
    Requires<[doF32FTZ]>;

  def FDIV32ri_prec_ftz :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              "div.rn.ftz.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>,
    Requires<[doF32FTZ]>;

  def FDIV32rr_prec :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, Float32Regs:$b),
              "div.rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, Float32Regs:$b))]>;

  def FDIV32ri_prec :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$a, f32imm:$b),
              "div.rn.f32 \t$dst, $a, $b;",
              [(set Float32Regs:$dst,
                    (fdiv Float32Regs:$a, fpimm:$b))]>;
  957. //
  958. // FMA
  959. //
  // Template for fused multiply-add on register class RC. Defines four
  // instructions, all gated on Pred, that differ in whether $b and $c are
  // registers or immediates of class ImmCls: rrr, rri, rir and rii.
  multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls,
                 Predicate Pred> {
    def rrr :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, RC:$c),
                !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
      Requires<[Pred]>;

    def rri :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, ImmCls:$c),
                !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
      Requires<[Pred]>;

    def rir :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, ImmCls:$b, RC:$c),
                !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
      Requires<[Pred]>;

    def rii :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, ImmCls:$b, ImmCls:$c),
                !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
      Requires<[Pred]>;
  }
  // Template for half-precision fused multiply-add. Only the all-register
  // form is defined, and it requires useFP16Math in addition to Pred.
  multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
    def rrr :
      NVPTXInst<(outs RC:$dst),
                (ins RC:$a, RC:$b, RC:$c),
                !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
      Requires<[useFP16Math, Pred]>;
  }
  // Half-precision scalar and packed (x2) FMA; _ftz variants need doF32FTZ.
  defm FMA16_ftz   : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
  defm FMA16       : FMA_F16<"fma.rn.f16", Float16Regs, True>;
  defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
  defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, True>;

  // Single- and double-precision FMA, including immediate-operand forms.
  defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
  defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
  defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
  // sin/cos
  // Approximate f32 sine and cosine; both are only selected when
  // allowUnsafeFPMath holds.
  def SINF :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$src),
              "sin.approx.f32 \t$dst, $src;",
              [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
    Requires<[allowUnsafeFPMath]>;

  def COSF :
    NVPTXInst<(outs Float32Regs:$dst),
              (ins Float32Regs:$src),
              "cos.approx.f32 \t$dst, $src;",
              [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
    Requires<[allowUnsafeFPMath]>;
  // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
  // i.e. "poor man's fmod()".
  //
  // Each pattern nests, innermost first: the division x / y, a CVT of the
  // quotient using the CvtRZI / CvtRZI_FTZ mode, a multiply by y, and finally
  // a subtraction from x.

  // frem - f32 FTZ
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32rr_ftz
                (CVT_f32_f32
                  (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y),
                  CvtRZI_FTZ),
                Float32Regs:$y))>,
        Requires<[doF32FTZ]>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (FSUBf32rr_ftz
              Float32Regs:$x,
              (FMULf32ri_ftz
                (CVT_f32_f32
                  (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y),
                  CvtRZI_FTZ),
                fpimm:$y))>,
        Requires<[doF32FTZ]>;

  // frem - f32
  def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
            (FSUBf32rr
              Float32Regs:$x,
              (FMULf32rr
                (CVT_f32_f32
                  (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y),
                  CvtRZI),
                Float32Regs:$y))>;
  def : Pat<(frem Float32Regs:$x, fpimm:$y),
            (FSUBf32rr
              Float32Regs:$x,
              (FMULf32ri
                (CVT_f32_f32
                  (FDIV32ri_prec Float32Regs:$x, fpimm:$y),
                  CvtRZI),
                fpimm:$y))>;

  // frem - f64
  def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
            (FSUBf64rr
              Float64Regs:$x,
              (FMULf64rr
                (CVT_f64_f64
                  (FDIV64rr Float64Regs:$x, Float64Regs:$y),
                  CvtRZI),
                Float64Regs:$y))>;
  def : Pat<(frem Float64Regs:$x, fpimm:$y),
            (FSUBf64rr
              Float64Regs:$x,
              (FMULf64ri
                (CVT_f64_f64
                  (FDIV64ri Float64Regs:$x, fpimm:$y),
                  CvtRZI),
                fpimm:$y))>;
  1034. //-----------------------------------
  1035. // Bitwise operations
  1036. //-----------------------------------
  // Template for three-operand bitwise operations (dest, lhs, rhs). For the
  // opcode OpcStr it creates .pred (predicate registers -- i.e., i1), .b16,
  // .b32 and .b64 versions, each in a register/register (rr) and a
  // register/immediate (ri) form.
  multiclass BITWISE<string OpcStr, SDNode OpNode> {
    // Predicate (i1) operands.
    def b1rr :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins Int1Regs:$a, Int1Regs:$b),
                !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
                [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
    def b1ri :
      NVPTXInst<(outs Int1Regs:$dst),
                (ins Int1Regs:$a, i1imm:$b),
                !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
                [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;

    // 16-bit operands.
    def b16rr :
      NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, Int16Regs:$b),
                !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
    def b16ri :
      NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, i16imm:$b),
                !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;

    // 32-bit operands.
    def b32rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def b32ri :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, i32imm:$b),
                !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;

    // 64-bit operands.
    def b64rr :
      NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, Int64Regs:$b),
                !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
    def b64ri :
      NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, i64imm:$b),
                !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
  }
  // Binary bitwise operations for every width produced by BITWISE.
  defm OR  : BITWISE<"or", or>;
  defm AND : BITWISE<"and", and>;
  defm XOR : BITWISE<"xor", xor>;

  // Bitwise complement for predicate, 16-, 32- and 64-bit registers.
  def NOT1 :
    NVPTXInst<(outs Int1Regs:$dst),
              (ins Int1Regs:$src),
              "not.pred \t$dst, $src;",
              [(set Int1Regs:$dst, (not Int1Regs:$src))]>;

  def NOT16 :
    NVPTXInst<(outs Int16Regs:$dst),
              (ins Int16Regs:$src),
              "not.b16 \t$dst, $src;",
              [(set Int16Regs:$dst, (not Int16Regs:$src))]>;

  def NOT32 :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src),
              "not.b32 \t$dst, $src;",
              [(set Int32Regs:$dst, (not Int32Regs:$src))]>;

  def NOT64 :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$src),
              "not.b64 \t$dst, $src;",
              [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
  // Template for left/right shifts. Each instruction takes three operands,
  // [dest (reg), src (reg), shift (reg or imm)].
  // dest and src may be int64, int32, or int16, but shift is always int32.
  //
  // OpcStr is the mnemonic up to, but not including, the bit width; the
  // template appends "64", "32" or "16".
  //
  // This template also defines a 32-bit shift (imm, imm) instruction.
  multiclass SHIFT<string OpcStr, SDNode OpNode> {
    // 64-bit value.
    def i64rr :
      NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
    def i64ri :
      NVPTXInst<(outs Int64Regs:$dst),
                (ins Int64Regs:$a, i32imm:$b),
                !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
                [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;

    // 32-bit value, including the immediate/immediate form.
    def i32rr :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
    def i32ri :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins Int32Regs:$a, i32imm:$b),
                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
    def i32ii :
      NVPTXInst<(outs Int32Regs:$dst),
                (ins i32imm:$a, i32imm:$b),
                !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
                [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;

    // 16-bit value.
    def i16rr :
      NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, Int32Regs:$b),
                !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
    def i16ri :
      NVPTXInst<(outs Int16Regs:$dst),
                (ins Int16Regs:$a, i32imm:$b),
                !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
                [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
  }

  // Shift left, arithmetic shift right, logical shift right.
  defm SHL : SHIFT<"shl.b", shl>;
  defm SRA : SHIFT<"shr.s", sra>;
  defm SRL : SHIFT<"shr.u", srl>;
  // Bit-reverse: brev.b32 / brev.b64, selected from bitreverse.
  def BREV32 :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$a),
              "brev.b32 \t$dst, $a;",
              [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;

  def BREV64 :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$a),
              "brev.b64 \t$dst, $a;",
              [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
  //
  // Rotate: Use ptx shf instruction if available.
  //
  // A 32-bit rotate is a funnel shift in wrap mode whose two data operands
  // are both the source register. All four forms require hasHWROT32.

  // 32 bit r2 = rotl r1, n
  //    =>
  //        r2 = shf.l r1, r1, n
  def ROTL32imm_hw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, i32imm:$amt),
              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
    Requires<[hasHWROT32]>;

  def ROTL32reg_hw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, Int32Regs:$amt),
              "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
    Requires<[hasHWROT32]>;

  // 32 bit r2 = rotr r1, n
  //    =>
  //        r2 = shf.r r1, r1, n
  def ROTR32imm_hw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, i32imm:$amt),
              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
    Requires<[hasHWROT32]>;

  def ROTR32reg_hw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, Int32Regs:$amt),
              "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
    Requires<[hasHWROT32]>;
  // 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
  // The expansion shifts $src left by $amt1 and right by $amt2 into two
  // scratch registers and adds the halves. It has no pattern of its own; the
  // rotl/rotr patterns below pick the operand order.
  def ROT32imm_sw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
              "{{\n\t"
              ".reg .b32 %lhs;\n\t"
              ".reg .b32 %rhs;\n\t"
              "shl.b32 \t%lhs, $src, $amt1;\n\t"
              "shr.b32 \t%rhs, $src, $amt2;\n\t"
              "add.u32 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              []>;

  // Transforms an immediate n into the i32 target constant (32 - n).
  def SUB_FRM_32 : SDNodeXForm<imm, [{
    uint64_t Amt = N->getZExtValue();
    return CurDAG->getTargetConstant(32 - Amt, SDLoc(N), MVT::i32);
  }]>;

  // rotl by n: left shift n, right shift 32 - n.
  def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
        Requires<[noHWROT32]>;

  // rotr by n: left shift 32 - n, right shift n.
  def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
            (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
        Requires<[noHWROT32]>;
  // 32-bit software rotate left by register. The expansion computes
  // 32 - $amt into a scratch register, shifts $src both ways and adds the
  // two halves. Selected only under noHWROT32.
  def ROTL32reg_sw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b32 %lhs;\n\t"
              ".reg .b32 %rhs;\n\t"
              ".reg .b32 %amt2;\n\t"
              "shl.b32 \t%lhs, $src, $amt;\n\t"
              "sub.s32 \t%amt2, 32, $amt;\n\t"
              "shr.b32 \t%rhs, $src, %amt2;\n\t"
              "add.u32 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
    Requires<[noHWROT32]>;

  // 32-bit software rotate right by register. Same expansion as above with
  // the shift directions swapped.
  def ROTR32reg_sw :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b32 %lhs;\n\t"
              ".reg .b32 %rhs;\n\t"
              ".reg .b32 %amt2;\n\t"
              "shr.b32 \t%lhs, $src, $amt;\n\t"
              "sub.s32 \t%amt2, 32, $amt;\n\t"
              "shl.b32 \t%rhs, $src, %amt2;\n\t"
              "add.u32 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
    Requires<[noHWROT32]>;
  // 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
  // The expansion shifts $src left by $amt1 and right by $amt2 into two
  // scratch registers and adds the halves. It has no pattern of its own; the
  // rotl/rotr patterns below pick the operand order.
  def ROT64imm_sw :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
              "{{\n\t"
              ".reg .b64 %lhs;\n\t"
              ".reg .b64 %rhs;\n\t"
              "shl.b64 \t%lhs, $src, $amt1;\n\t"
              "shr.b64 \t%rhs, $src, $amt2;\n\t"
              "add.u64 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              []>;

  // Transforms an immediate n into the i32 target constant (64 - n).
  def SUB_FRM_64 : SDNodeXForm<imm, [{
    uint64_t Amt = N->getZExtValue();
    return CurDAG->getTargetConstant(64 - Amt, SDLoc(N), MVT::i32);
  }]>;

  // rotl by n: left shift n, right shift 64 - n.
  def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
            (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;

  // rotr by n: left shift 64 - n, right shift n.
  def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
            (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
  // 64-bit software rotate left by register. The expansion computes
  // 64 - $amt into a 32-bit scratch register, shifts $src both ways and adds
  // the two halves.
  def ROTL64reg_sw :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b64 %lhs;\n\t"
              ".reg .b64 %rhs;\n\t"
              ".reg .u32 %amt2;\n\t"
              "shl.b64 \t%lhs, $src, $amt;\n\t"
              "sub.u32 \t%amt2, 64, $amt;\n\t"
              "shr.b64 \t%rhs, $src, %amt2;\n\t"
              "add.u64 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;

  // 64-bit software rotate right by register. Same expansion as above with
  // the shift directions swapped.
  def ROTR64reg_sw :
    NVPTXInst<(outs Int64Regs:$dst),
              (ins Int64Regs:$src, Int32Regs:$amt),
              "{{\n\t"
              ".reg .b64 %lhs;\n\t"
              ".reg .b64 %rhs;\n\t"
              ".reg .u32 %amt2;\n\t"
              "shr.b64 \t%lhs, $src, $amt;\n\t"
              "sub.u32 \t%amt2, 64, $amt;\n\t"
              "shl.b64 \t%rhs, $src, %amt2;\n\t"
              "add.u64 \t$dst, %lhs, %rhs;\n\t"
              "}}",
              [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
  //
  // Funnel shift in clamp mode
  //

  // Create SDNodes so they can be used in the DAG code, e.g.
  // NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
  def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
  def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;

  // shf.l.clamp.b32 on ($lo, $hi) by $amt.
  def FUNSHFLCLAMP :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
              "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
              [(set Int32Regs:$dst,
                    (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi,
                                    Int32Regs:$amt))]>;

  // shf.r.clamp.b32 on ($lo, $hi) by $amt.
  def FUNSHFRCLAMP :
    NVPTXInst<(outs Int32Regs:$dst),
              (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
              "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
              [(set Int32Regs:$dst,
                    (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi,
                                    Int32Regs:$amt))]>;
  //
  // BFE - bit-field extract
  //
  // Template for BFE instructions. Each takes four operands,
  // [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
  // Start may be an imm only if end is also an imm. FIXME: Is this a
  // restriction in PTX?
  //
  // dest and src may be int32 or int64, but start and end are always int32.
  // None of the variants has a selection pattern.
  multiclass BFE<string TyStr, RegisterClass RC> {
    // start: register, end: register.
    def rrr :
      NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"),
                []>;
    // start: register, end: immediate.
    def rri :
      NVPTXInst<(outs RC:$d),
                (ins RC:$a, Int32Regs:$b, i32imm:$c),
                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"),
                []>;
    // start: immediate, end: immediate.
    def rii :
      NVPTXInst<(outs RC:$d),
                (ins RC:$a, i32imm:$b, i32imm:$c),
                !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"),
                []>;
  }

  let hasSideEffects = false in {
    defm BFE_S32 : BFE<"s32", Int32Regs>;
    defm BFE_U32 : BFE<"u32", Int32Regs>;
    defm BFE_S64 : BFE<"s64", Int64Regs>;
    defm BFE_U64 : BFE<"u64", Int64Regs>;
  }
  1305. //-----------------------------------
  1306. // Comparison instructions (setp, set)
  1307. //-----------------------------------
  // FIXME: This doesn't cover versions of set and setp that combine with a
  // boolean predicate, e.g. setp.eq.and.b16.

  // Template for setp: compares $a with $b using the comparison mode $cmp and
  // writes a predicate register. Three operand shapes are defined: rr
  // (reg, reg), ri (reg, imm) and ir (imm, reg). No selection patterns are
  // attached.
  let hasSideEffects = false in {
    multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
      def rr :
        NVPTXInst<(outs Int1Regs:$dst),
                  (ins RC:$a, RC:$b, CmpMode:$cmp),
                  !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                             " \t$dst, $a, $b;"),
                  []>;
      def ri :
        NVPTXInst<(outs Int1Regs:$dst),
                  (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                  !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                             " \t$dst, $a, $b;"),
                  []>;
      def ir :
        NVPTXInst<(outs Int1Regs:$dst),
                  (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                  !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
                             " \t$dst, $a, $b;"),
                  []>;
    }
  }
  // setp for 16-bit integers.
  defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
  defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
  defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;

  // setp for 32-bit integers.
  defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
  defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
  defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;

  // setp for 64-bit integers.
  defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
  defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
  defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;

  // setp for single- and double-precision floating point.
  defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
  defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
  // Half-precision setp, register/register only; requires useFP16Math.
  def SETP_f16rr :
    NVPTXInst<(outs Int1Regs:$dst),
              (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
              "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
              []>,
    Requires<[useFP16Math]>;

  // Packed half-precision setp. Writes two predicate registers, $p and $q,
  // printed as "$p|$q".
  def SETP_f16x2rr :
    NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
              (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
              "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
              []>,
    Requires<[useFP16Math]>;
  // FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
  // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
  // reg, either u32, s32, or f32.  Anyway these aren't used at the moment.

  // Template for set: like setp, but the result goes to a 32-bit integer
  // register. Defines rr (reg, reg), ri (reg, imm) and ir (imm, reg) forms
  // without selection patterns.
  let hasSideEffects = false in {
    multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
      def rr :
        NVPTXInst<(outs Int32Regs:$dst),
                  (ins RC:$a, RC:$b, CmpMode:$cmp),
                  !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"),
                  []>;
      def ri :
        NVPTXInst<(outs Int32Regs:$dst),
                  (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
                  !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"),
                  []>;
      def ir :
        NVPTXInst<(outs Int32Regs:$dst),
                  (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
                  !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"),
                  []>;
    }
  }
  // set for 16-bit integers.
  defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
  defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
  defm SET_u16 : SET<"u16", Int16Regs, i16imm>;

  // set for 32-bit integers.
  defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
  defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
  defm SET_u32 : SET<"u32", Int32Regs, i32imm>;

  // set for 64-bit integers.
  defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
  defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
  defm SET_u64 : SET<"u64", Int64Regs, i64imm>;

  // set for half-, single- and double-precision floating point.
  defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
  defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
  defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
  1376. //-----------------------------------
  1377. // Selection instructions (selp)
  1378. //-----------------------------------
  // FIXME: Missing slct

  let hasSideEffects = false in {
    // selp instructions that don't have any pattern matches; we explicitly use
    // them within this file. The four variants cover every register/immediate
    // combination of the two data operands $a and $b; $p is the predicate.
    multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
      def rr :
        NVPTXInst<(outs RC:$dst),
                  (ins RC:$a, RC:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  []>;
      def ri :
        NVPTXInst<(outs RC:$dst),
                  (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  []>;
      def ir :
        NVPTXInst<(outs RC:$dst),
                  (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  []>;
      def ii :
        NVPTXInst<(outs RC:$dst),
                  (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  []>;
    }

    // Same four selp variants, but each one is selected from a select node.
    // ImmNode is the DAG node used to match immediate operands.
    multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
                            SDNode ImmNode> {
      def rr :
        NVPTXInst<(outs RC:$dst),
                  (ins RC:$a, RC:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
      def ri :
        NVPTXInst<(outs RC:$dst),
                  (ins RC:$a, ImmCls:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
      def ir :
        NVPTXInst<(outs RC:$dst),
                  (ins ImmCls:$a, RC:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
      def ii :
        NVPTXInst<(outs RC:$dst),
                  (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
                  !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
                  [(set RC:$dst,
                        (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
    }
  }
  // Only the .b{16,32,64} integer variants carry a "select" pattern; matching
  // on selp.{s,u}{16,32,64} as well would gain nothing, so those are plain SELP
  // instantiations.
  defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
  defm SELP_s16 : SELP        <"s16", Int16Regs, i16imm>;
  defm SELP_u16 : SELP        <"u16", Int16Regs, i16imm>;

  defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
  defm SELP_s32 : SELP        <"s32", Int32Regs, i32imm>;
  defm SELP_u32 : SELP        <"u32", Int32Regs, i32imm>;

  defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
  defm SELP_s64 : SELP        <"s64", Int64Regs, i64imm>;
  defm SELP_u64 : SELP        <"u64", Int64Regs, i64imm>;

  // Floating-point selects. Note that the f16 variant is emitted as selp.b16.
  defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
  defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
  defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
  // Register/register select on a Float16x2Regs value, emitted as selp.b32.
  def SELP_f16x2rr : NVPTXInst<
      (outs Float16x2Regs:$dst),
      (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
      "selp.b32 \t$dst, $a, $b, $p;",
      [(set Float16x2Regs:$dst,
            (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
  //-----------------------------------
  // Data Movement (Load / Store, Move)
  //-----------------------------------

  // Addressing-mode complex patterns. Each names the C++ selector routine that
  // performs the match; the ri forms yield two operands and are rooted at
  // frameindex nodes, ADDRvar yields a single operand.
  def ADDRri   : ComplexPattern<i32, 2, "SelectADDRri",
                                [frameindex], [SDNPWantRoot]>;
  def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64",
                                [frameindex], [SDNPWantRoot]>;
  def ADDRvar  : ComplexPattern<iPTR, 1, "SelectDirectAddr", [], []>;
  // Register + immediate memory operands (32- and 64-bit base register),
  // printed through printMemOperand.
  def MEMri : Operand<i32> {
    let PrintMethod   = "printMemOperand";
    let MIOperandInfo = (ops Int32Regs, i32imm);
  }
  def MEMri64 : Operand<i64> {
    let PrintMethod   = "printMemOperand";
    let MIOperandInfo = (ops Int64Regs, i64imm);
  }

  // Pointer-typed operands printed through the generic printOperand.
  def imem : Operand<iPTR> {
    let PrintMethod = "printOperand";
  }
  def imemAny : Operand<iPTRAny> {
    let PrintMethod = "printOperand";
  }

  // i32 code operands, each with its own dedicated print method.
  def LdStCode : Operand<i32> {
    let PrintMethod = "printLdStCode";
  }
  def MmaCode : Operand<i32> {
    let PrintMethod = "printMmaCode";
  }
  // NVPTXISD::Wrapper: one result and one operand of the same pointer type.
  def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
  def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

  // Materialize a wrapped global address into a 32-bit or 64-bit register.
  def MOV_ADDR : NVPTXInst<
      (outs Int32Regs:$dst), (ins imem:$a),
      "mov.u32 \t$dst, $a;",
      [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
  def MOV_ADDR64 : NVPTXInst<
      (outs Int64Regs:$dst), (ins imem:$a),
      "mov.u64 \t$dst, $a;",
      [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
  // Obtain a pointer to the local stack: moves the symbol __local_depot<num>
  // into a 32-bit or 64-bit register. No selection pattern is attached.
  let hasSideEffects = false in {
    def MOV_DEPOT_ADDR : NVPTXInst<
        (outs Int32Regs:$d), (ins i32imm:$num),
        "mov.u32 \t$d, __local_depot$num;",
        []>;
    def MOV_DEPOT_ADDR_64 : NVPTXInst<
        (outs Int64Regs:$d), (ins i32imm:$num),
        "mov.u64 \t$d, __local_depot$num;",
        []>;
  }
  // Register-to-register moves, flagged IsSimpleMove. They have no selection
  // patterns; copyPhysReg in NVPTXInstrInfo.cpp is hard-coded to use them.
  let IsSimpleMove=1, hasSideEffects=0 in {
    // Predicate and integer registers.
    def IMOV1rr  : NVPTXInst<(outs Int1Regs:$dst),  (ins Int1Regs:$sss),
                             "mov.pred \t$dst, $sss;", []>;
    def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                             "mov.u16 \t$dst, $sss;", []>;
    def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
                             "mov.u32 \t$dst, $sss;", []>;
    def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
                             "mov.u64 \t$dst, $sss;", []>;

    // Floating-point registers. There is no mov.f16, so the f16 move has to be
    // spelled mov.b16.
    def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
                             "mov.b16 \t$dst, $src;", []>;
    def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                             "mov.f32 \t$dst, $src;", []>;
    def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
                             "mov.f64 \t$dst, $src;", []>;
  }
  // Immediate-to-register moves. Each one is selected for a constant of the
  // matching type (imm for integers, fpimm for floating point).
  def IMOV1ri : NVPTXInst<
      (outs Int1Regs:$dst), (ins i1imm:$src),
      "mov.pred \t$dst, $src;",
      [(set Int1Regs:$dst, imm:$src)]>;
  def IMOV16ri : NVPTXInst<
      (outs Int16Regs:$dst), (ins i16imm:$src),
      "mov.u16 \t$dst, $src;",
      [(set Int16Regs:$dst, imm:$src)]>;
  def IMOV32ri : NVPTXInst<
      (outs Int32Regs:$dst), (ins i32imm:$src),
      "mov.u32 \t$dst, $src;",
      [(set Int32Regs:$dst, imm:$src)]>;
  def IMOV64i : NVPTXInst<
      (outs Int64Regs:$dst), (ins i64imm:$src),
      "mov.u64 \t$dst, $src;",
      [(set Int64Regs:$dst, imm:$src)]>;
  def FMOV32ri : NVPTXInst<
      (outs Float32Regs:$dst), (ins f32imm:$src),
      "mov.f32 \t$dst, $src;",
      [(set Float32Regs:$dst, fpimm:$src)]>;
  def FMOV64ri : NVPTXInst<
      (outs Float64Regs:$dst), (ins f64imm:$src),
      "mov.f64 \t$dst, $src;",
      [(set Float64Regs:$dst, fpimm:$src)]>;

  // A wrapped external symbol of type i32 is materialized with IMOV32ri.
  def : Pat<(i32 (Wrapper texternalsym:$dst)),
            (IMOV32ri texternalsym:$dst)>;
  //---- Copy Frame Index ----
  // Matches an ADDRri / ADDRri64 address and emits an "add" whose source
  // operands are printed by the memory operand's "add" modifier.
  def LEA_ADDRi : NVPTXInst<
      (outs Int32Regs:$dst), (ins MEMri:$addr),
      "add.u32 \t$dst, ${addr:add};",
      [(set Int32Regs:$dst, ADDRri:$addr)]>;
  def LEA_ADDRi64 : NVPTXInst<
      (outs Int64Regs:$dst), (ins MEMri64:$addr),
      "add.u64 \t$dst, ${addr:add};",
      [(set Int64Regs:$dst, ADDRri64:$addr)]>;
  //-----------------------------------
  // Comparison and Selection
  //-----------------------------------

  // ISET_FORMAT: selection patterns for one integer comparison node.
  //
  //   OpNode    - the comparison PatFrag being matched (e.g. setgt).
  //   Mode      - the CmpMode leaf passed as the last instruction operand.
  //   setp_*    - instructions used when the comparison result is an i1.
  //   set_*     - instructions used when the comparison result is an i32.
  //
  // For each operand width (16/32/64) three operand forms are matched:
  // reg/reg (rr), reg/imm (ri) and imm/reg (ir).
  multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
                         Instruction setp_16rr,
                         Instruction setp_16ri,
                         Instruction setp_16ir,
                         Instruction setp_32rr,
                         Instruction setp_32ri,
                         Instruction setp_32ir,
                         Instruction setp_64rr,
                         Instruction setp_64ri,
                         Instruction setp_64ir,
                         Instruction set_16rr,
                         Instruction set_16ri,
                         Instruction set_16ir,
                         Instruction set_32rr,
                         Instruction set_32ri,
                         Instruction set_32ir,
                         Instruction set_64rr,
                         Instruction set_64ri,
                         Instruction set_64ir> {
    //---- Results produced as a predicate (i1) ----

    // i16 -> pred
    def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
              (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
    def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
              (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
    def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
              (setp_16ir imm:$a, Int16Regs:$b, Mode)>;

    // i32 -> pred
    def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
              (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
    def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
              (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
    def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
              (setp_32ir imm:$a, Int32Regs:$b, Mode)>;

    // i64 -> pred
    def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
              (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
    def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
              (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
    def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
              (setp_64ir imm:$a, Int64Regs:$b, Mode)>;

    //---- Results produced as an i32 ----

    // i16 -> i32
    def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
              (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
    def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
              (set_16ri Int16Regs:$a, imm:$b, Mode)>;
    def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
              (set_16ir imm:$a, Int16Regs:$b, Mode)>;

    // i32 -> i32
    def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
              (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
    def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
              (set_32ri Int32Regs:$a, imm:$b, Mode)>;
    def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
              (set_32ir imm:$a, Int32Regs:$b, Mode)>;

    // i64 -> i32
    def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
              (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
    def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
              (set_64ri Int64Regs:$a, imm:$b, Mode)>;
    def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
              (set_64ir imm:$a, Int64Regs:$b, Mode)>;
  }
  // ISET_FORMAT bound to the signed (.s16/.s32/.s64) setp and set instructions.
  multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
    : ISET_FORMAT<OpNode, Mode,
                  SETP_s16rr, SETP_s16ri, SETP_s16ir,
                  SETP_s32rr, SETP_s32ri, SETP_s32ir,
                  SETP_s64rr, SETP_s64ri, SETP_s64ir,
                  SET_s16rr,  SET_s16ri,  SET_s16ir,
                  SET_s32rr,  SET_s32ri,  SET_s32ir,
                  SET_s64rr,  SET_s64ri,  SET_s64ir> {
    // Placeholder record: TableGen doesn't like empty multiclasses.
    def : PatLeaf<(i32 0)>;
  }

  // ISET_FORMAT bound to the unsigned (.u16/.u32/.u64) setp and set
  // instructions.
  multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
    : ISET_FORMAT<OpNode, Mode,
                  SETP_u16rr, SETP_u16ri, SETP_u16ir,
                  SETP_u32rr, SETP_u32ri, SETP_u32ir,
                  SETP_u64rr, SETP_u64ri, SETP_u64ir,
                  SET_u16rr,  SET_u16ri,  SET_u16ir,
                  SET_u32rr,  SET_u32ri,  SET_u32ir,
                  SET_u64rr,  SET_u64ri,  SET_u64ir> {
    // Placeholder record: TableGen doesn't like empty multiclasses.
    def : PatLeaf<(i32 0)>;
  }
  // Integer comparisons routed to the signed instruction variants.
  defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
  defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
  defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
  defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
  defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
  defm : ISET_FORMAT_SIGNED<setne, CmpNE>;

  // Integer comparisons routed to the unsigned instruction variants.
  defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
  defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
  defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
  defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
  defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
  defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
  // i1 compares
  //
  // Two predicates differ exactly when their XOR is true, so (in)equality on
  // i1 values is expressed with XORb1rr, negated through NOT1 for equality.
  def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
            (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
  def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
            (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
  def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
            (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
  def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
            (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;

  // i1 compare -> i32
  //
  // The XOR of the operands is used as the selp predicate, choosing between
  // the immediates -1 and 0:
  //   setne: operands differ -> -1, otherwise  0
  //   seteq: operands differ ->  0, otherwise -1
  // (The second pattern previously repeated setne with the immediates swapped,
  // which contradicted the first pattern and left seteq without an i32 form.)
  def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
            (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
  def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
            (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
  // FSET_FORMAT: selection patterns for one floating-point comparison node.
  //
  //   OpNode  - the comparison PatFrag being matched (e.g. setogt).
  //   Mode    - CmpMode leaf used by the default patterns.
  //   ModeFTZ - CmpMode leaf used by the patterns guarded by doF32FTZ.
  //
  // Layout:
  //   * "-> pred" patterns produce an i1 through the SETP_* instructions,
  //     "-> i32" patterns produce an i32 through the SET_* instructions.
  //   * f16 and f32 have a doF32FTZ-guarded pattern next to each default one;
  //     f64 only has the default form.
  //   * All f16 patterns additionally require useFP16Math.
  //   * f16 immediates are never passed as immediates: they are wrapped in
  //     LOAD_CONST_F16 and fed to the register/register (rr) instruction form.
  //     The immediate-lhs "-> i32" patterns used to name SET_f16ir here, which
  //     put the LOAD_CONST_F16 result into an f16imm operand slot; they now use
  //     SET_f16rr like every other f16 pattern in this multiclass.
  multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
    // f16 -> pred
    def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
              (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
          Requires<[useFP16Math,doF32FTZ]>;
    def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
              (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
          Requires<[useFP16Math]>;
    def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
              (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
          Requires<[useFP16Math,doF32FTZ]>;
    def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
              (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
          Requires<[useFP16Math]>;
    def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
              (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
          Requires<[useFP16Math,doF32FTZ]>;
    def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
              (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
          Requires<[useFP16Math]>;

    // f32 -> pred
    def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
              (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
          Requires<[doF32FTZ]>;
    def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
              (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
    def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
              (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
          Requires<[doF32FTZ]>;
    def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
              (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
    def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
              (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
          Requires<[doF32FTZ]>;
    def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
              (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

    // f64 -> pred
    def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
              (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
    def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
              (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
    def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
              (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

    // f16 -> i32
    def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
              (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
          Requires<[useFP16Math, doF32FTZ]>;
    def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
              (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
          Requires<[useFP16Math]>;
    def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
              (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
          Requires<[useFP16Math, doF32FTZ]>;
    def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
              (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
          Requires<[useFP16Math]>;
    // The constant is materialized into a register first, so the rr form is
    // the one whose operand classes match.
    def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
              (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
          Requires<[useFP16Math, doF32FTZ]>;
    def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
              (SET_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
          Requires<[useFP16Math]>;

    // f32 -> i32
    def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
              (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
          Requires<[doF32FTZ]>;
    def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
              (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
    def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
              (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
          Requires<[doF32FTZ]>;
    def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
              (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
    def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
              (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
          Requires<[doF32FTZ]>;
    def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
              (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;

    // f64 -> i32
    def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
              (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
    def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
              (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
    def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
              (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
  }
  // Ordered comparisons.
  defm FSetOGT : FSET_FORMAT<setogt, CmpGT,  CmpGT_FTZ>;
  defm FSetOLT : FSET_FORMAT<setolt, CmpLT,  CmpLT_FTZ>;
  defm FSetOGE : FSET_FORMAT<setoge, CmpGE,  CmpGE_FTZ>;
  defm FSetOLE : FSET_FORMAT<setole, CmpLE,  CmpLE_FTZ>;
  defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ,  CmpEQ_FTZ>;
  defm FSetONE : FSET_FORMAT<setone, CmpNE,  CmpNE_FTZ>;

  // Unordered comparisons, mapped to the *U compare modes.
  defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
  defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
  defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
  defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
  defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
  defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;

  // Comparisons without an ordered/unordered prefix use the same modes as the
  // ordered ones above.
  defm FSetGT  : FSET_FORMAT<setgt,  CmpGT,  CmpGT_FTZ>;
  defm FSetLT  : FSET_FORMAT<setlt,  CmpLT,  CmpLT_FTZ>;
  defm FSetGE  : FSET_FORMAT<setge,  CmpGE,  CmpGE_FTZ>;
  defm FSetLE  : FSET_FORMAT<setle,  CmpLE,  CmpLE_FTZ>;
  defm FSetEQ  : FSET_FORMAT<seteq,  CmpEQ,  CmpEQ_FTZ>;
  defm FSetNE  : FSET_FORMAT<setne,  CmpNE,  CmpNE_FTZ>;

  // seto / setuo map to the NUM / NAN compare modes.
  defm FSetNUM : FSET_FORMAT<seto,   CmpNUM, CmpNUM_FTZ>;
  defm FSetNAN : FSET_FORMAT<setuo,  CmpNAN, CmpNAN_FTZ>;
  // FIXME: What is this doing here? Can it be deleted?
  // def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
  //                       [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

  // Type profiles for the NVPTXISD call-lowering nodes defined in this file.
  // SDTypeProfile<NumResults, NumOperands, [constraints]>.

  // Parameter / return-value declarations.
  def SDTDeclareParamProfile :
    SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
  def SDTDeclareScalarParamProfile :
    SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;

  // Parameter loads (scalar, 2-element and 4-element vector).
  def SDTLoadParamProfile :
    SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
  def SDTLoadParamV2Profile :
    SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
  def SDTLoadParamV4Profile :
    SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;

  // Call printing.
  def SDTPrintCallProfile :
    SDTypeProfile<0, 1, [SDTCisInt<0>]>;
  def SDTPrintCallUniProfile :
    SDTypeProfile<0, 1, [SDTCisInt<0>]>;

  // Parameter stores.
  def SDTStoreParamProfile :
    SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
  def SDTStoreParamV2Profile :
    SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
  def SDTStoreParamV4Profile :
    SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
  def SDTStoreParam32Profile :
    SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;

  // Call arguments and call markers.
  def SDTCallArgProfile :
    SDTypeProfile<0, 2, [SDTCisInt<0>]>;
  def SDTCallArgMarkProfile :
    SDTypeProfile<0, 0, []>;
  def SDTCallVoidProfile :
    SDTypeProfile<0, 1, []>;
  def SDTCallValProfile :
    SDTypeProfile<1, 0, []>;
  def SDTMoveParamProfile :
    SDTypeProfile<1, 1, []>;

  // Return-value stores.
  def SDTStoreRetvalProfile :
    SDTypeProfile<0, 2, [SDTCisInt<0>]>;
  def SDTStoreRetvalV2Profile :
    SDTypeProfile<0, 3, [SDTCisInt<0>]>;
  def SDTStoreRetvalV4Profile :
    SDTypeProfile<0, 5, [SDTCisInt<0>]>;

  // Miscellaneous.
  def SDTPseudoUseParamProfile :
    SDTypeProfile<0, 1, []>;
  def SDTProxyRegProfile :
    SDTypeProfile<1, 1, []>;
  // SelectionDAG nodes used for call lowering. Each binds an NVPTXISD opcode to
  // one of the type profiles and lists the node properties it carries.

  //---- Parameter / return declarations ----
  def DeclareParam : SDNode<
      "NVPTXISD::DeclareParam", SDTDeclareParamProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def DeclareScalarParam : SDNode<
      "NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def DeclareRetParam : SDNode<
      "NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def DeclareRet : SDNode<
      "NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

  //---- Parameter loads ----
  def LoadParam : SDNode<
      "NVPTXISD::LoadParam", SDTLoadParamProfile,
      [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
  def LoadParamV2 : SDNode<
      "NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
      [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
  def LoadParamV4 : SDNode<
      "NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
      [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;

  //---- Call printing ----
  def PrintCall : SDNode<
      "NVPTXISD::PrintCall", SDTPrintCallProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def PrintConvergentCall : SDNode<
      "NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def PrintCallUni : SDNode<
      "NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def PrintConvergentCallUni : SDNode<
      "NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

  //---- Parameter stores ----
  def StoreParam : SDNode<
      "NVPTXISD::StoreParam", SDTStoreParamProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def StoreParamV2 : SDNode<
      "NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def StoreParamV4 : SDNode<
      "NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def StoreParamU32 : SDNode<
      "NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def StoreParamS32 : SDNode<
      "NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

  //---- Call arguments and call markers ----
  def CallArgBegin : SDNode<
      "NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def CallArg : SDNode<
      "NVPTXISD::CallArg", SDTCallArgProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def LastCallArg : SDNode<
      "NVPTXISD::LastCallArg", SDTCallArgProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def CallArgEnd : SDNode<
      "NVPTXISD::CallArgEnd", SDTCallVoidProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def CallVoid : SDNode<
      "NVPTXISD::CallVoid", SDTCallVoidProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def Prototype : SDNode<
      "NVPTXISD::Prototype", SDTCallVoidProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def CallVal : SDNode<
      "NVPTXISD::CallVal", SDTCallValProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

  // MoveParam is the only node here declared with an empty property list.
  def MoveParam : SDNode<
      "NVPTXISD::MoveParam", SDTMoveParamProfile, []>;

  //---- Return-value stores (chained, not glued) ----
  def StoreRetval : SDNode<
      "NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
      [SDNPHasChain, SDNPSideEffect]>;
  def StoreRetvalV2 : SDNode<
      "NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
      [SDNPHasChain, SDNPSideEffect]>;
  def StoreRetvalV4 : SDNode<
      "NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
      [SDNPHasChain, SDNPSideEffect]>;

  //---- Miscellaneous ----
  def PseudoUseParam : SDNode<
      "NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  def RETURNNode : SDNode<
      "NVPTXISD::RETURN", SDTCallArgMarkProfile,
      [SDNPHasChain, SDNPSideEffect]>;
  def ProxyReg : SDNode<
      "NVPTXISD::ProxyReg", SDTProxyRegProfile,
      [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
  // Instruction classes that read a call's return value back from param space.
  // "opstr" is the type suffix appended to the mnemonic (e.g. ".b32"); $b is
  // printed as the byte offset added to retval0. No selection patterns are
  // attached to the memory forms.
  let mayLoad = true in {
    // Scalar load: ld.param<opstr> $dst, [retval0+$b];
    class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst),
                (ins i32imm:$b),
                "ld.param" # opstr # " \t$dst, [retval0+$b];",
                []>;

    // Two-element vector load.
    class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst, regclass:$dst2),
                (ins i32imm:$b),
                "ld.param.v2" # opstr # " \t{{$dst, $dst2}}, [retval0+$b];",
                []>;

    // Four-element vector load.
    class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
                      regclass:$dst4),
                (ins i32imm:$b),
                "ld.param.v4" # opstr
                  # " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];",
                []>;
  }

  // Register form: a "mov" from retval<$b>, selected for LoadParam whose first
  // operand is the constant 0.
  class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
    NVPTXInst<(outs regclass:$dst),
              (ins i32imm:$b),
              "mov" # opstr # " \t$dst, retval$b;",
              [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
  // Instruction classes that write outgoing call parameters and function return
  // values with st.param. "opstr" is the type suffix appended to the mnemonic.
  // None of them has a selection pattern attached.
  let mayStore = true in {
    //---- Outgoing call parameters: destination is [param<$a>+<$b>] ----

    class StoreParamInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, i32imm:$a, i32imm:$b),
                "st.param" # opstr # " \t[param$a+$b], $val;",
                []>;

    class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2,
                     i32imm:$a, i32imm:$b),
                "st.param.v2" # opstr # " \t[param$a+$b], {{$val, $val2}};",
                []>;

    class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a, i32imm:$b),
                "st.param.v4" # opstr
                  # " \t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                []>;

    //---- Function return value: destination is [func_retval0+<$a>] ----

    class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, i32imm:$a),
                "st.param" # opstr # " \t[func_retval0+$a], $val;",
                []>;

    class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, i32imm:$a),
                "st.param.v2" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2}};",
                []>;

    class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
      NVPTXInst<(outs),
                (ins regclass:$val, regclass:$val2, regclass:$val3,
                     regclass:$val4, i32imm:$a),
                "st.param.v4" # opstr
                  # " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};",
                []>;
  }
  // CALL: the opening fragment of a call instruction. OpcStr is the mnemonic
  // ("call", "call.uni"); OpNode is the print-call SDNode to match. One record
  // is generated per return-value count 0..8, selected by the i32 constant
  // operand of OpNode, and each prints the matching "(retval0, ...), " list.
  let isCall=1 in {
    multiclass CALL<string OpcStr, SDNode OpNode> {
      def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
          OpcStr # " ",
          [(OpNode (i32 0))]>;
      def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0), ",
          [(OpNode (i32 1))]>;
      def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1), ",
          [(OpNode (i32 2))]>;
      def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1, retval2), ",
          [(OpNode (i32 3))]>;
      def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1, retval2, retval3), ",
          [(OpNode (i32 4))]>;
      def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1, retval2, retval3, retval4), ",
          [(OpNode (i32 5))]>;
      def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1, retval2, retval3, retval4, retval5), ",
          [(OpNode (i32 6))]>;
      def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
          [(OpNode (i32 7))]>;
      def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
          OpcStr # " (retval0, retval1, retval2, retval3, retval4, retval5, retval6, retval7), ",
          [(OpNode (i32 8))]>;
    }
  }
  // Regular call instructions.
  defm Call    : CALL<"call",     PrintCall>;
  defm CallUni : CALL<"call.uni", PrintCallUni>;

  // Convergent call instructions: the same mnemonics as the regular calls, but
  // matched from the convergent print-call nodes and with isConvergent set.
  let isConvergent=1 in {
    defm ConvergentCall    : CALL<"call",     PrintConvergentCall>;
    defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
  }
  // ld.param instantiations. The 8-bit integer variants use Int16Regs, and the
  // f16 / f16x2 variants use the untyped .b16 / .b32 suffixes.

  // Integer, scalar.
  def LoadParamMemI64     : LoadParamMemInst<Int64Regs, ".b64">;
  def LoadParamMemI32     : LoadParamMemInst<Int32Regs, ".b32">;
  def LoadParamMemI16     : LoadParamMemInst<Int16Regs, ".b16">;
  def LoadParamMemI8      : LoadParamMemInst<Int16Regs, ".b8">;
  // Integer, v2.
  def LoadParamMemV2I64   : LoadParamV2MemInst<Int64Regs, ".b64">;
  def LoadParamMemV2I32   : LoadParamV2MemInst<Int32Regs, ".b32">;
  def LoadParamMemV2I16   : LoadParamV2MemInst<Int16Regs, ".b16">;
  def LoadParamMemV2I8    : LoadParamV2MemInst<Int16Regs, ".b8">;
  // Integer, v4.
  def LoadParamMemV4I32   : LoadParamV4MemInst<Int32Regs, ".b32">;
  def LoadParamMemV4I16   : LoadParamV4MemInst<Int16Regs, ".b16">;
  def LoadParamMemV4I8    : LoadParamV4MemInst<Int16Regs, ".b8">;
  // Floating point, scalar.
  def LoadParamMemF16     : LoadParamMemInst<Float16Regs, ".b16">;
  def LoadParamMemF16x2   : LoadParamMemInst<Float16x2Regs, ".b32">;
  def LoadParamMemF32     : LoadParamMemInst<Float32Regs, ".f32">;
  def LoadParamMemF64     : LoadParamMemInst<Float64Regs, ".f64">;
  // Floating point, v2.
  def LoadParamMemV2F16   : LoadParamV2MemInst<Float16Regs, ".b16">;
  def LoadParamMemV2F16x2 : LoadParamV2MemInst<Float16x2Regs, ".b32">;
  def LoadParamMemV2F32   : LoadParamV2MemInst<Float32Regs, ".f32">;
  def LoadParamMemV2F64   : LoadParamV2MemInst<Float64Regs, ".f64">;
  // Floating point, v4.
  def LoadParamMemV4F16   : LoadParamV4MemInst<Float16Regs, ".b16">;
  def LoadParamMemV4F16x2 : LoadParamV4MemInst<Float16x2Regs, ".b32">;
  def LoadParamMemV4F32   : LoadParamV4MemInst<Float32Regs, ".f32">;
  // st.param instantiations for outgoing call parameters. The 8-bit integer
  // variants use Int16Regs, and the f16 / f16x2 variants use the untyped
  // .b16 / .b32 suffixes.

  // Integer, scalar.
  def StoreParamI64     : StoreParamInst<Int64Regs, ".b64">;
  def StoreParamI32     : StoreParamInst<Int32Regs, ".b32">;
  def StoreParamI16     : StoreParamInst<Int16Regs, ".b16">;
  def StoreParamI8      : StoreParamInst<Int16Regs, ".b8">;
  // Integer, v2.
  def StoreParamV2I64   : StoreParamV2Inst<Int64Regs, ".b64">;
  def StoreParamV2I32   : StoreParamV2Inst<Int32Regs, ".b32">;
  def StoreParamV2I16   : StoreParamV2Inst<Int16Regs, ".b16">;
  def StoreParamV2I8    : StoreParamV2Inst<Int16Regs, ".b8">;
  // Integer, v4.
  def StoreParamV4I32   : StoreParamV4Inst<Int32Regs, ".b32">;
  def StoreParamV4I16   : StoreParamV4Inst<Int16Regs, ".b16">;
  def StoreParamV4I8    : StoreParamV4Inst<Int16Regs, ".b8">;
  // Floating point, scalar.
  def StoreParamF16     : StoreParamInst<Float16Regs, ".b16">;
  def StoreParamF16x2   : StoreParamInst<Float16x2Regs, ".b32">;
  def StoreParamF32     : StoreParamInst<Float32Regs, ".f32">;
  def StoreParamF64     : StoreParamInst<Float64Regs, ".f64">;
  // Floating point, v2.
  def StoreParamV2F16   : StoreParamV2Inst<Float16Regs, ".b16">;
  def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
  def StoreParamV2F32   : StoreParamV2Inst<Float32Regs, ".f32">;
  def StoreParamV2F64   : StoreParamV2Inst<Float64Regs, ".f64">;
  // Floating point, v4.
  def StoreParamV4F16   : StoreParamV4Inst<Float16Regs, ".b16">;
  def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
  def StoreParamV4F32   : StoreParamV4Inst<Float32Regs, ".f32">;
  // st.param instantiations for function return values. The 8-bit integer
  // variants use Int16Regs, and the f16 / f16x2 variants use the untyped
  // .b16 / .b32 suffixes.

  // Integer, scalar.
  def StoreRetvalI64     : StoreRetvalInst<Int64Regs, ".b64">;
  def StoreRetvalI32     : StoreRetvalInst<Int32Regs, ".b32">;
  def StoreRetvalI16     : StoreRetvalInst<Int16Regs, ".b16">;
  def StoreRetvalI8      : StoreRetvalInst<Int16Regs, ".b8">;
  // Integer, v2.
  def StoreRetvalV2I64   : StoreRetvalV2Inst<Int64Regs, ".b64">;
  def StoreRetvalV2I32   : StoreRetvalV2Inst<Int32Regs, ".b32">;
  def StoreRetvalV2I16   : StoreRetvalV2Inst<Int16Regs, ".b16">;
  def StoreRetvalV2I8    : StoreRetvalV2Inst<Int16Regs, ".b8">;
  // Integer, v4.
  def StoreRetvalV4I32   : StoreRetvalV4Inst<Int32Regs, ".b32">;
  def StoreRetvalV4I16   : StoreRetvalV4Inst<Int16Regs, ".b16">;
  def StoreRetvalV4I8    : StoreRetvalV4Inst<Int16Regs, ".b8">;
  // Floating point, scalar.
  def StoreRetvalF64     : StoreRetvalInst<Float64Regs, ".f64">;
  def StoreRetvalF32     : StoreRetvalInst<Float32Regs, ".f32">;
  def StoreRetvalF16     : StoreRetvalInst<Float16Regs, ".b16">;
  def StoreRetvalF16x2   : StoreRetvalInst<Float16x2Regs, ".b32">;
  // Floating point, v2.
  def StoreRetvalV2F64   : StoreRetvalV2Inst<Float64Regs, ".f64">;
  def StoreRetvalV2F32   : StoreRetvalV2Inst<Float32Regs, ".f32">;
  def StoreRetvalV2F16   : StoreRetvalV2Inst<Float16Regs, ".b16">;
  def StoreRetvalV2F16x2 : StoreRetvalV2Inst<Float16x2Regs, ".b32">;
  // Floating point, v4.
  def StoreRetvalV4F32   : StoreRetvalV4Inst<Float32Regs, ".f32">;
  def StoreRetvalV4F16   : StoreRetvalV4Inst<Float16Regs, ".b16">;
  def StoreRetvalV4F16x2 : StoreRetvalV4Inst<Float16x2Regs, ".b32">;
  2023. def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
  2024. def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
  2025. def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
  2026. def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
  2027. class CallArgInst<NVPTXRegClass regclass> :
  2028. NVPTXInst<(outs), (ins regclass:$a), "$a, ",
  2029. [(CallArg (i32 0), regclass:$a)]>;
  2030. class LastCallArgInst<NVPTXRegClass regclass> :
  2031. NVPTXInst<(outs), (ins regclass:$a), "$a",
  2032. [(LastCallArg (i32 0), regclass:$a)]>;
  2033. def CallArgI64 : CallArgInst<Int64Regs>;
  2034. def CallArgI32 : CallArgInst<Int32Regs>;
  2035. def CallArgI16 : CallArgInst<Int16Regs>;
  2036. def CallArgF64 : CallArgInst<Float64Regs>;
  2037. def CallArgF32 : CallArgInst<Float32Regs>;
  2038. def LastCallArgI64 : LastCallArgInst<Int64Regs>;
  2039. def LastCallArgI32 : LastCallArgInst<Int32Regs>;
  2040. def LastCallArgI16 : LastCallArgInst<Int16Regs>;
  2041. def LastCallArgF64 : LastCallArgInst<Float64Regs>;
  2042. def LastCallArgF32 : LastCallArgInst<Float32Regs>;
  2043. def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
  2044. [(CallArg (i32 0), (i32 imm:$a))]>;
  2045. def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
  2046. [(LastCallArg (i32 0), (i32 imm:$a))]>;
  2047. def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
  2048. [(CallArg (i32 1), (i32 imm:$a))]>;
  2049. def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
  2050. [(LastCallArg (i32 1), (i32 imm:$a))]>;
  2051. def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
  2052. [(CallVoid (Wrapper tglobaladdr:$addr))]>;
  2053. def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
  2054. [(CallVoid Int32Regs:$addr)]>;
  2055. def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
  2056. [(CallVoid Int64Regs:$addr)]>;
  2057. def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
  2058. [(Prototype (i32 imm:$val))]>;
  2059. def DeclareRetMemInst :
  2060. NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
  2061. ".param .align $align .b8 retval$num[$size];",
  2062. [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
  2063. def DeclareRetScalarInst :
  2064. NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
  2065. ".param .b$size retval$num;",
  2066. [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
  2067. def DeclareRetRegInst :
  2068. NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
  2069. ".reg .b$size retval$num;",
  2070. [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
  2071. def DeclareParamInst :
  2072. NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
  2073. ".param .align $align .b8 param$a[$size];",
  2074. [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
  2075. def DeclareScalarParamInst :
  2076. NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
  2077. ".param .b$size param$a;",
  2078. [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
  2079. def DeclareScalarRegInst :
  2080. NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
  2081. ".reg .b$size param$a;",
  2082. [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
  2083. class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
  2084. NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
  2085. !strconcat("mov", asmstr, " \t$dst, $src;"),
  2086. [(set regclass:$dst, (MoveParam regclass:$src))]>;
  2087. class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,
  2088. string asmstr> :
  2089. NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
  2090. !strconcat("mov", asmstr, " \t$dst, $src;"),
  2091. [(set regclass:$dst, (MoveParam texternalsym:$src))]>;
  2092. def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
  2093. def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
  2094. def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
  2095. def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;
  2096. def MoveParamI16 :
  2097. NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
  2098. "cvt.u16.u32 \t$dst, $src;",
  2099. [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
  2100. def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
  2101. def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
  2102. def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
  2103. class PseudoUseParamInst<NVPTXRegClass regclass> :
  2104. NVPTXInst<(outs), (ins regclass:$src),
  2105. "// Pseudo use of $src",
  2106. [(PseudoUseParam regclass:$src)]>;
  2107. def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
  2108. def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
  2109. def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
  2110. def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
  2111. def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
  2112. class ProxyRegInst<string SzStr, NVPTXRegClass regclass> :
  2113. NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
  2114. !strconcat("mov.", SzStr, " \t$dst, $src;"),
  2115. [(set regclass:$dst, (ProxyReg regclass:$src))]>;
  2116. let isCodeGenOnly=1, isPseudo=1 in {
  2117. def ProxyRegI1 : ProxyRegInst<"pred", Int1Regs>;
  2118. def ProxyRegI16 : ProxyRegInst<"b16", Int16Regs>;
  2119. def ProxyRegI32 : ProxyRegInst<"b32", Int32Regs>;
  2120. def ProxyRegI64 : ProxyRegInst<"b64", Int64Regs>;
  2121. def ProxyRegF16 : ProxyRegInst<"b16", Float16Regs>;
  2122. def ProxyRegF32 : ProxyRegInst<"f32", Float32Regs>;
  2123. def ProxyRegF64 : ProxyRegInst<"f64", Float64Regs>;
  2124. def ProxyRegF16x2 : ProxyRegInst<"b32", Float16x2Regs>;
  2125. }
  2126. //
  2127. // Load / Store Handling
  2128. //
  2129. multiclass LD<NVPTXRegClass regclass> {
  2130. def _avar : NVPTXInst<
  2131. (outs regclass:$dst),
  2132. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2133. i32imm:$fromWidth, imem:$addr),
  2134. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2135. "\t$dst, [$addr];", []>;
  2136. def _areg : NVPTXInst<
  2137. (outs regclass:$dst),
  2138. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2139. i32imm:$fromWidth, Int32Regs:$addr),
  2140. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2141. "\t$dst, [$addr];", []>;
  2142. def _areg_64 : NVPTXInst<
  2143. (outs regclass:$dst),
  2144. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2145. i32imm:$fromWidth, Int64Regs:$addr),
  2146. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2147. "\t$dst, [$addr];", []>;
  2148. def _ari : NVPTXInst<
  2149. (outs regclass:$dst),
  2150. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2151. i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
  2152. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2153. "\t$dst, [$addr+$offset];", []>;
  2154. def _ari_64 : NVPTXInst<
  2155. (outs regclass:$dst),
  2156. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2157. LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
  2158. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2159. "\t$dst, [$addr+$offset];", []>;
  2160. def _asi : NVPTXInst<
  2161. (outs regclass:$dst),
  2162. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2163. LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
  2164. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2165. "\t$dst, [$addr+$offset];", []>;
  2166. }
  2167. let mayLoad=1, hasSideEffects=0 in {
  2168. defm LD_i8 : LD<Int16Regs>;
  2169. defm LD_i16 : LD<Int16Regs>;
  2170. defm LD_i32 : LD<Int32Regs>;
  2171. defm LD_i64 : LD<Int64Regs>;
  2172. defm LD_f16 : LD<Float16Regs>;
  2173. defm LD_f16x2 : LD<Float16x2Regs>;
  2174. defm LD_f32 : LD<Float32Regs>;
  2175. defm LD_f64 : LD<Float64Regs>;
  2176. }
  2177. multiclass ST<NVPTXRegClass regclass> {
  2178. def _avar : NVPTXInst<
  2179. (outs),
  2180. (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2181. LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
  2182. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
  2183. " \t[$addr], $src;", []>;
  2184. def _areg : NVPTXInst<
  2185. (outs),
  2186. (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
  2187. LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
  2188. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
  2189. " \t[$addr], $src;", []>;
  2190. def _areg_64 : NVPTXInst<
  2191. (outs),
  2192. (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2193. LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
  2194. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
  2195. " \t[$addr], $src;", []>;
  2196. def _ari : NVPTXInst<
  2197. (outs),
  2198. (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2199. LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
  2200. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
  2201. " \t[$addr+$offset], $src;", []>;
  2202. def _ari_64 : NVPTXInst<
  2203. (outs),
  2204. (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2205. LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
  2206. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
  2207. " \t[$addr+$offset], $src;", []>;
  2208. def _asi : NVPTXInst<
  2209. (outs),
  2210. (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
  2211. LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
  2212. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
  2213. " \t[$addr+$offset], $src;", []>;
  2214. }
  2215. let mayStore=1, hasSideEffects=0 in {
  2216. defm ST_i8 : ST<Int16Regs>;
  2217. defm ST_i16 : ST<Int16Regs>;
  2218. defm ST_i32 : ST<Int32Regs>;
  2219. defm ST_i64 : ST<Int64Regs>;
  2220. defm ST_f16 : ST<Float16Regs>;
  2221. defm ST_f16x2 : ST<Float16x2Regs>;
  2222. defm ST_f32 : ST<Float32Regs>;
  2223. defm ST_f64 : ST<Float64Regs>;
  2224. }
  2225. // The following is used only in and after vector elementizations. Vector
  2226. // elementization happens at the machine instruction level, so the following
  2227. // instructions never appear in the DAG.
  2228. multiclass LD_VEC<NVPTXRegClass regclass> {
  2229. def _v2_avar : NVPTXInst<
  2230. (outs regclass:$dst1, regclass:$dst2),
  2231. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2232. i32imm:$fromWidth, imem:$addr),
  2233. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2234. "\t{{$dst1, $dst2}}, [$addr];", []>;
  2235. def _v2_areg : NVPTXInst<
  2236. (outs regclass:$dst1, regclass:$dst2),
  2237. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2238. i32imm:$fromWidth, Int32Regs:$addr),
  2239. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2240. "\t{{$dst1, $dst2}}, [$addr];", []>;
  2241. def _v2_areg_64 : NVPTXInst<
  2242. (outs regclass:$dst1, regclass:$dst2),
  2243. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2244. i32imm:$fromWidth, Int64Regs:$addr),
  2245. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2246. "\t{{$dst1, $dst2}}, [$addr];", []>;
  2247. def _v2_ari : NVPTXInst<
  2248. (outs regclass:$dst1, regclass:$dst2),
  2249. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2250. i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
  2251. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2252. "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  2253. def _v2_ari_64 : NVPTXInst<
  2254. (outs regclass:$dst1, regclass:$dst2),
  2255. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2256. i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
  2257. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2258. "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  2259. def _v2_asi : NVPTXInst<
  2260. (outs regclass:$dst1, regclass:$dst2),
  2261. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2262. i32imm:$fromWidth, imem:$addr, i32imm:$offset),
  2263. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2264. "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
  2265. def _v4_avar : NVPTXInst<
  2266. (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
  2267. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2268. i32imm:$fromWidth, imem:$addr),
  2269. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2270. "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  2271. def _v4_areg : NVPTXInst<
  2272. (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
  2273. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2274. i32imm:$fromWidth, Int32Regs:$addr),
  2275. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2276. "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  2277. def _v4_areg_64 : NVPTXInst<
  2278. (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
  2279. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2280. i32imm:$fromWidth, Int64Regs:$addr),
  2281. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2282. "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
  2283. def _v4_ari : NVPTXInst<
  2284. (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
  2285. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2286. i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
  2287. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2288. "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  2289. def _v4_ari_64 : NVPTXInst<
  2290. (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
  2291. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2292. i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
  2293. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2294. "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  2295. def _v4_asi : NVPTXInst<
  2296. (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
  2297. (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2298. i32imm:$fromWidth, imem:$addr, i32imm:$offset),
  2299. "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2300. "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
  2301. }
  2302. let mayLoad=1, hasSideEffects=0 in {
  2303. defm LDV_i8 : LD_VEC<Int16Regs>;
  2304. defm LDV_i16 : LD_VEC<Int16Regs>;
  2305. defm LDV_i32 : LD_VEC<Int32Regs>;
  2306. defm LDV_i64 : LD_VEC<Int64Regs>;
  2307. defm LDV_f16 : LD_VEC<Float16Regs>;
  2308. defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
  2309. defm LDV_f32 : LD_VEC<Float32Regs>;
  2310. defm LDV_f64 : LD_VEC<Float64Regs>;
  2311. }
  2312. multiclass ST_VEC<NVPTXRegClass regclass> {
  2313. def _v2_avar : NVPTXInst<
  2314. (outs),
  2315. (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
  2316. LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
  2317. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2318. "\t[$addr], {{$src1, $src2}};", []>;
  2319. def _v2_areg : NVPTXInst<
  2320. (outs),
  2321. (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
  2322. LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
  2323. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2324. "\t[$addr], {{$src1, $src2}};", []>;
  2325. def _v2_areg_64 : NVPTXInst<
  2326. (outs),
  2327. (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
  2328. LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
  2329. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2330. "\t[$addr], {{$src1, $src2}};", []>;
  2331. def _v2_ari : NVPTXInst<
  2332. (outs),
  2333. (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
  2334. LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
  2335. i32imm:$offset),
  2336. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2337. "\t[$addr+$offset], {{$src1, $src2}};", []>;
  2338. def _v2_ari_64 : NVPTXInst<
  2339. (outs),
  2340. (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
  2341. LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
  2342. i32imm:$offset),
  2343. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2344. "\t[$addr+$offset], {{$src1, $src2}};", []>;
  2345. def _v2_asi : NVPTXInst<
  2346. (outs),
  2347. (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
  2348. LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
  2349. i32imm:$offset),
  2350. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2351. "\t[$addr+$offset], {{$src1, $src2}};", []>;
  2352. def _v4_avar : NVPTXInst<
  2353. (outs),
  2354. (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
  2355. LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2356. i32imm:$fromWidth, imem:$addr),
  2357. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2358. "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  2359. def _v4_areg : NVPTXInst<
  2360. (outs),
  2361. (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
  2362. LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2363. i32imm:$fromWidth, Int32Regs:$addr),
  2364. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2365. "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  2366. def _v4_areg_64 : NVPTXInst<
  2367. (outs),
  2368. (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
  2369. LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2370. i32imm:$fromWidth, Int64Regs:$addr),
  2371. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2372. "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
  2373. def _v4_ari : NVPTXInst<
  2374. (outs),
  2375. (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
  2376. LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2377. i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
  2378. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2379. "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  2380. def _v4_ari_64 : NVPTXInst<
  2381. (outs),
  2382. (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
  2383. LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2384. i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
  2385. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
  2386. "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  2387. def _v4_asi : NVPTXInst<
  2388. (outs),
  2389. (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
  2390. LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
  2391. i32imm:$fromWidth, imem:$addr, i32imm:$offset),
  2392. "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
  2393. "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
  2394. }
  2395. let mayStore=1, hasSideEffects=0 in {
  2396. defm STV_i8 : ST_VEC<Int16Regs>;
  2397. defm STV_i16 : ST_VEC<Int16Regs>;
  2398. defm STV_i32 : ST_VEC<Int32Regs>;
  2399. defm STV_i64 : ST_VEC<Int64Regs>;
  2400. defm STV_f16 : ST_VEC<Float16Regs>;
  2401. defm STV_f16x2 : ST_VEC<Float16x2Regs>;
  2402. defm STV_f32 : ST_VEC<Float32Regs>;
  2403. defm STV_f64 : ST_VEC<Float64Regs>;
  2404. }
  2405. //---- Conversion ----
  2406. class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
  2407. NVPTXRegClass regclassOut> :
  2408. NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
  2409. !strconcat("mov.b", SzStr, " \t$d, $a;"),
  2410. [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
  2411. def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
  2412. def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
  2413. def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
  2414. def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
  2415. def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
  2416. def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
  2417. def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
  2418. def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
  2419. // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
  2420. // we cannot specify floating-point literals in isel patterns. Therefore, we
  2421. // use an integer selp to select either 1 or 0 and then cvt to floating-point.
  2422. // sint -> f16
  2423. def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
  2424. (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  2425. def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
  2426. (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
  2427. def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
  2428. (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
  2429. def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
  2430. (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
  2431. // uint -> f16
  2432. def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
  2433. (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  2434. def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
  2435. (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
  2436. def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
  2437. (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
  2438. def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
  2439. (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
  2440. // sint -> f32
  2441. def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
  2442. (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  2443. def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
  2444. (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
  2445. def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
  2446. (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
  2447. def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
  2448. (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
  2449. // uint -> f32
  2450. def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
  2451. (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  2452. def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
  2453. (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
  2454. def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
  2455. (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
  2456. def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
  2457. (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
  2458. // sint -> f64
  2459. def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
  2460. (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  2461. def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
  2462. (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
  2463. def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
  2464. (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
  2465. def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
  2466. (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
  2467. // uint -> f64
  2468. def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
  2469. (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
  2470. def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
  2471. (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
  2472. def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
  2473. (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
  2474. def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
  2475. (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
  2476. // f16 -> sint
  2477. def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
  2478. (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
  2479. def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
  2480. (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
  2481. def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
  2482. (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
  2483. def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
  2484. (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
  2485. // f16 -> uint
  2486. def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
  2487. (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
  2488. def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
  2489. (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
  2490. def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
  2491. (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
  2492. def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
  2493. (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
  2494. // f32 -> sint
  2495. def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
  2496. (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
  2497. def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
  2498. (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
  2499. def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
  2500. (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
  2501. def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
  2502. (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
  2503. def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
  2504. (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
  2505. def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
  2506. (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
  2507. def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
  2508. (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
  2509. // f32 -> uint
  2510. def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
  2511. (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
  2512. def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
  2513. (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
  2514. def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
  2515. (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
  2516. def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
  2517. (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
  2518. def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
  2519. (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
  2520. def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
  2521. (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
  2522. def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
  2523. (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
  2524. // f64 -> sint
  2525. def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
  2526. (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
  2527. def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
  2528. (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
  2529. def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
  2530. (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
  2531. def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
  2532. (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
  2533. // f64 -> uint
  2534. def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
  2535. (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
  2536. def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
  2537. (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
  2538. def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
  2539. (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
  2540. def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
  2541. (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
  2542. // sext i1
  2543. def : Pat<(i16 (sext Int1Regs:$a)),
  2544. (SELP_s16ii -1, 0, Int1Regs:$a)>;
  2545. def : Pat<(i32 (sext Int1Regs:$a)),
  2546. (SELP_s32ii -1, 0, Int1Regs:$a)>;
  2547. def : Pat<(i64 (sext Int1Regs:$a)),
  2548. (SELP_s64ii -1, 0, Int1Regs:$a)>;
  2549. // zext i1
  2550. def : Pat<(i16 (zext Int1Regs:$a)),
  2551. (SELP_u16ii 1, 0, Int1Regs:$a)>;
  2552. def : Pat<(i32 (zext Int1Regs:$a)),
  2553. (SELP_u32ii 1, 0, Int1Regs:$a)>;
  2554. def : Pat<(i64 (zext Int1Regs:$a)),
  2555. (SELP_u64ii 1, 0, Int1Regs:$a)>;
  2556. // anyext i1
  2557. def : Pat<(i16 (anyext Int1Regs:$a)),
  2558. (SELP_u16ii -1, 0, Int1Regs:$a)>;
  2559. def : Pat<(i32 (anyext Int1Regs:$a)),
  2560. (SELP_u32ii -1, 0, Int1Regs:$a)>;
  2561. def : Pat<(i64 (anyext Int1Regs:$a)),
  2562. (SELP_u64ii -1, 0, Int1Regs:$a)>;
  2563. // sext i16
  2564. def : Pat<(i32 (sext Int16Regs:$a)),
  2565. (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
  2566. def : Pat<(i64 (sext Int16Regs:$a)),
  2567. (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
  2568. // zext i16
  2569. def : Pat<(i32 (zext Int16Regs:$a)),
  2570. (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
  2571. def : Pat<(i64 (zext Int16Regs:$a)),
  2572. (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
  2573. // anyext i16
  2574. def : Pat<(i32 (anyext Int16Regs:$a)),
  2575. (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
  2576. def : Pat<(i64 (anyext Int16Regs:$a)),
  2577. (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
  2578. // sext i32
  2579. def : Pat<(i64 (sext Int32Regs:$a)),
  2580. (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
  2581. // zext i32
  2582. def : Pat<(i64 (zext Int32Regs:$a)),
  2583. (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
  2584. // anyext i32
  2585. def : Pat<(i64 (anyext Int32Regs:$a)),
  2586. (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
  2587. // truncate i64
  2588. def : Pat<(i32 (trunc Int64Regs:$a)),
  2589. (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
  2590. def : Pat<(i16 (trunc Int64Regs:$a)),
  2591. (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
  2592. def : Pat<(i1 (trunc Int64Regs:$a)),
  2593. (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
  2594. // truncate i32
  2595. def : Pat<(i16 (trunc Int32Regs:$a)),
  2596. (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
  2597. def : Pat<(i1 (trunc Int32Regs:$a)),
  2598. (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
  2599. // truncate i16
  2600. def : Pat<(i1 (trunc Int16Regs:$a)),
  2601. (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
  2602. // sext_inreg
  2603. def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
  2604. def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
  2605. def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
  2606. def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
  2607. def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
  2608. def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
  // Select instructions with 32-bit predicates
  //
  // When the select condition is an i32 rather than a predicate register, only
  // its least-significant bit is tested: the condition is masked with 1 and
  // compared for equality with 1, and the resulting predicate drives SELP.
  // RC is the register class of the two selected values and Selp is the
  // matching register-register SELP instruction.
  class SelectOnI32PredPat<RegisterClass RC, Instruction Selp>
    : Pat<(select Int32Regs:$pred, RC:$a, RC:$b),
          (Selp RC:$a, RC:$b,
                (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;

  def : SelectOnI32PredPat<Int16Regs,   SELP_b16rr>;
  def : SelectOnI32PredPat<Int32Regs,   SELP_b32rr>;
  def : SelectOnI32PredPat<Int64Regs,   SELP_b64rr>;
  def : SelectOnI32PredPat<Float16Regs, SELP_f16rr>;
  def : SelectOnI32PredPat<Float32Regs, SELP_f32rr>;
  def : SelectOnI32PredPat<Float64Regs, SELP_f64rr>;
  // Vector-operand moves between one wide register and several narrow ones.
  // None of these carries a selection pattern and none has side effects.
  let hasSideEffects = false in {
    // Pack a set of smaller registers into one larger register.
    def V4I16toI64 :
      NVPTXInst<(outs Int64Regs:$d),
                (ins Int16Regs:$s1, Int16Regs:$s2,
                     Int16Regs:$s3, Int16Regs:$s4),
                "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
    def V2I16toI32 :
      NVPTXInst<(outs Int32Regs:$d),
                (ins Int16Regs:$s1, Int16Regs:$s2),
                "mov.b32 \t$d, {{$s1, $s2}};", []>;
    def V2I32toI64 :
      NVPTXInst<(outs Int64Regs:$d),
                (ins Int32Regs:$s1, Int32Regs:$s2),
                "mov.b64 \t$d, {{$s1, $s2}};", []>;
    def V2F32toF64 :
      NVPTXInst<(outs Float64Regs:$d),
                (ins Float32Regs:$s1, Float32Regs:$s2),
                "mov.b64 \t$d, {{$s1, $s2}};", []>;

    // Unpack one larger register into a set of smaller registers.
    def I64toV4I16 :
      NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
                      Int16Regs:$d3, Int16Regs:$d4),
                (ins Int64Regs:$s),
                "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
    def I32toV2I16 :
      NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
                (ins Int32Regs:$s),
                "mov.b32 \t{{$d1, $d2}}, $s;", []>;
    def I64toV2I32 :
      NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
                (ins Int64Regs:$s),
                "mov.b64 \t{{$d1, $d2}}, $s;", []>;
    def F64toV2F32 :
      NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                (ins Float64Regs:$s),
                "mov.b64 \t{{$d1, $d2}}, $s;", []>;
  }
  // Moves between f16x2 registers and their f16 (or raw 32-bit) components.
  let hasSideEffects = false in {
    // Extract one element of an f16x2 register.  PTX does not provide any way
    // to access the elements of an f16x2 vector directly, so the register is
    // unpacked with a vector mov and the unwanted half is written to a
    // temporary register declared inside the emitted scope.
    def F16x2toF16_0 :
      NVPTXInst<(outs Float16Regs:$dst),
                (ins Float16x2Regs:$src),
                "{{ .reg .b16 \t%tmp_hi;\n\t"
                " mov.b32 \t{$dst, %tmp_hi}, $src; }}",
                [(set Float16Regs:$dst,
                      (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
    def F16x2toF16_1 :
      NVPTXInst<(outs Float16Regs:$dst),
                (ins Float16x2Regs:$src),
                "{{ .reg .b16 \t%tmp_lo;\n\t"
                " mov.b32 \t{%tmp_lo, $dst}, $src; }}",
                [(set Float16Regs:$dst,
                      (extractelt (v2f16 Float16x2Regs:$src), 1))]>;

    // Coalesce two f16 registers into an f16x2 register.
    def BuildF16x2 :
      NVPTXInst<(outs Float16x2Regs:$dst),
                (ins Float16Regs:$a, Float16Regs:$b),
                "mov.b32 \t$dst, {{$a, $b}};",
                [(set Float16x2Regs:$dst,
                      (build_vector (f16 Float16Regs:$a),
                                    (f16 Float16Regs:$b)))]>;

    // Directly initializing the underlying b32 register from an immediate is
    // one less SASS instruction than a vector-packing move.
    def BuildF16x2i :
      NVPTXInst<(outs Float16x2Regs:$dst),
                (ins i32imm:$src),
                "mov.b32 \t$dst, $src;",
                []>;

    // Split an f16x2 register into two f16 registers.
    def SplitF16x2 :
      NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
                (ins Float16x2Regs:$src),
                "mov.b32 \t{{$lo, $hi}}, $src;",
                []>;

    // Split an i32 register into two f16 registers.
    def SplitI32toF16x2 :
      NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
                (ins Int32Regs:$src),
                "mov.b32 \t{{$lo, $hi}}, $src;",
                []>;
  }
  // Count leading zeros
  //
  // Both forms write an Int32Regs destination, including the one that takes a
  // 64-bit source.  Selection patterns are supplied separately.
  let hasSideEffects = false in {
    def CLZr32 :
      NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                "clz.b32 \t$d, $a;", []>;
    def CLZr64 :
      NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                "clz.b64 \t$d, $a;", []>;
  }
  // 32-bit has a direct PTX instruction
  def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;

  // The return type of the ctlz ISD node is the same as its input, but the PTX
  // clz instruction always returns a 32-bit value.  For ctlz.i64, convert the
  // PTX value to 64 bits to match the ISD node's semantics, unless we know
  // we're truncating back down to 32 bits.
  def : Pat<(i64 (ctlz Int64Regs:$a)),
            (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
  def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
  // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc
  // the result back to 16-bits if necessary.  We also need to subtract 16
  // because the high-order 16 zeros were counted.
  //
  // TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
  // use to save one SASS instruction (on sm_35 anyway):
  //
  //   mov.b32 $tmp, {0xffff, $a}
  //   clz.b32 $result, $tmp
  //
  // That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
  // and then count the leading zeros of that value.  This way we don't have to
  // subtract 16 from the result.  Unfortunately today we don't have a way to
  // generate "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.

  // i16 result: count in 32 bits, narrow back to 16 bits, then subtract 16.
  def : Pat<(i16 (ctlz Int16Regs:$a)),
            (SUBi16ri
              (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
                           CvtNONE),
              16)>;

  // Result immediately zero-extended to i32: skip the narrowing and subtract
  // 16 from the 32-bit count directly.
  def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
            (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
  // Population count
  //
  // Both forms write an Int32Regs destination, including the one that takes a
  // 64-bit source.  Selection patterns are supplied separately.
  let hasSideEffects = false in {
    def POPCr32 :
      NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
                "popc.b32 \t$d, $a;", []>;
    def POPCr64 :
      NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
                "popc.b64 \t$d, $a;", []>;
  }
  // 32-bit has a direct PTX instruction
  def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;

  // For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it to
  // 64-bit to match the LLVM semantics.  Just as with ctlz.i64, a second
  // pattern avoids the type conversion when the result is truncated to i32
  // anyway.
  def : Pat<(ctpop Int64Regs:$a),
            (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
  def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;

  // For 16-bit, we zero-extend to 32-bit, count, then trunc the result back to
  // 16-bits.  If we know the result is being zero-extended into an i32, the
  // final trunc is skipped.
  def : Pat<(ctpop Int16Regs:$a),
            (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
                         CvtNONE)>;
  def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
            (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
  // Floating-point narrowing (fpround) and widening (fpextend).
  //
  // Narrowing conversions use round-to-nearest (CvtRN); widening conversions
  // use no conversion mode (CvtNONE).  Where an f32 value is involved there
  // are two variants of each pattern, an FTZ one and a non-FTZ one.  They are
  // guarded by the complementary predicates doF32FTZ / doNoF32FTZ so that
  // exactly one of them is ever eligible, rather than leaving the non-FTZ
  // variant unguarded and depending on how the matcher breaks the tie between
  // two otherwise identical patterns.  This matches how the f32 fceil /
  // ffloor / ftrunc patterns in this file are written.

  // fpround f32 -> f16
  def : Pat<(f16 (fpround Float32Regs:$a)),
            (CVT_f16_f32 Float32Regs:$a, CvtRN)>;

  // fpround f64 -> f16
  def : Pat<(f16 (fpround Float64Regs:$a)),
            (CVT_f16_f64 Float64Regs:$a, CvtRN)>;

  // fpround f64 -> f32
  def : Pat<(f32 (fpround Float64Regs:$a)),
            (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(f32 (fpround Float64Regs:$a)),
            (CVT_f32_f64 Float64Regs:$a, CvtRN)>, Requires<[doNoF32FTZ]>;

  // fpextend f16 -> f32
  def : Pat<(f32 (fpextend Float16Regs:$a)),
            (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(f32 (fpextend Float16Regs:$a)),
            (CVT_f32_f16 Float16Regs:$a, CvtNONE)>, Requires<[doNoF32FTZ]>;

  // fpextend f16 -> f64
  def : Pat<(f64 (fpextend Float16Regs:$a)),
            (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

  // fpextend f32 -> f64
  def : Pat<(f64 (fpextend Float32Regs:$a)),
            (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
  def : Pat<(f64 (fpextend Float32Regs:$a)),
            (CVT_f64_f32 Float32Regs:$a, CvtNONE)>, Requires<[doNoF32FTZ]>;
  // Selection DAG node for NVPTXISD::RET_FLAG.  It uses the SDTNone type
  // profile, is chained, and may optionally take incoming glue.
  def retflag :
    SDNode<"NVPTXISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
  // fceil, ffloor, ftrunc, fnearbyint, frint.
  //
  // Each of these rounds a floating-point value to an integral value of the
  // same floating-point type, and is selected as the same-type CVT (Cvt) on
  // register class RC with the integer-rounding conversion mode Mode.  For f32
  // there is an FTZ and a non-FTZ form, chosen by the doF32FTZ / doNoF32FTZ
  // predicates.
  class FPRoundToIntegralPat<SDPatternOperator Op, RegisterClass RC,
                             Instruction Cvt, PatLeaf Mode>
    : Pat<(Op RC:$a), (Cvt RC:$a, Mode)>;

  // fceil uses the RPI mode.
  def : FPRoundToIntegralPat<fceil, Float16Regs, CVT_f16_f16, CvtRPI>;
  def : FPRoundToIntegralPat<fceil, Float32Regs, CVT_f32_f32, CvtRPI_FTZ>,
        Requires<[doF32FTZ]>;
  def : FPRoundToIntegralPat<fceil, Float32Regs, CVT_f32_f32, CvtRPI>,
        Requires<[doNoF32FTZ]>;
  def : FPRoundToIntegralPat<fceil, Float64Regs, CVT_f64_f64, CvtRPI>;

  // ffloor uses the RMI mode.
  def : FPRoundToIntegralPat<ffloor, Float16Regs, CVT_f16_f16, CvtRMI>;
  def : FPRoundToIntegralPat<ffloor, Float32Regs, CVT_f32_f32, CvtRMI_FTZ>,
        Requires<[doF32FTZ]>;
  def : FPRoundToIntegralPat<ffloor, Float32Regs, CVT_f32_f32, CvtRMI>,
        Requires<[doNoF32FTZ]>;
  def : FPRoundToIntegralPat<ffloor, Float64Regs, CVT_f64_f64, CvtRMI>;

  // ftrunc uses the RZI mode.
  def : FPRoundToIntegralPat<ftrunc, Float16Regs, CVT_f16_f16, CvtRZI>;
  def : FPRoundToIntegralPat<ftrunc, Float32Regs, CVT_f32_f32, CvtRZI_FTZ>,
        Requires<[doF32FTZ]>;
  def : FPRoundToIntegralPat<ftrunc, Float32Regs, CVT_f32_f32, CvtRZI>,
        Requires<[doNoF32FTZ]>;
  def : FPRoundToIntegralPat<ftrunc, Float64Regs, CVT_f64_f64, CvtRZI>;

  // nearbyint and rint are implemented as rounding to nearest even (RNI).
  // This isn't strictly correct, because it causes us to ignore the current
  // rounding mode, but it matches what CUDA's "libm" does.
  def : FPRoundToIntegralPat<fnearbyint, Float16Regs, CVT_f16_f16, CvtRNI>;
  def : FPRoundToIntegralPat<fnearbyint, Float32Regs, CVT_f32_f32, CvtRNI_FTZ>,
        Requires<[doF32FTZ]>;
  def : FPRoundToIntegralPat<fnearbyint, Float32Regs, CVT_f32_f32, CvtRNI>,
        Requires<[doNoF32FTZ]>;
  def : FPRoundToIntegralPat<fnearbyint, Float64Regs, CVT_f64_f64, CvtRNI>;

  def : FPRoundToIntegralPat<frint, Float16Regs, CVT_f16_f16, CvtRNI>;
  def : FPRoundToIntegralPat<frint, Float32Regs, CVT_f32_f32, CvtRNI_FTZ>,
        Requires<[doF32FTZ]>;
  def : FPRoundToIntegralPat<frint, Float32Regs, CVT_f32_f32, CvtRNI>,
        Requires<[doNoF32FTZ]>;
  def : FPRoundToIntegralPat<frint, Float64Regs, CVT_f64_f64, CvtRNI>;
  //-----------------------------------
  // Control-flow
  //-----------------------------------

  // Every instruction in this group is a block terminator.
  let isTerminator = 1 in {
    // Return from the function; matched from the retflag node.
    let isReturn = 1, isBarrier = 1 in
    def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;

    let isBranch = 1 in {
      // Conditional branch, taken when predicate $a is true.
      def CBranch :
        NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                  "@$a bra \t$target;",
                  [(brcond Int1Regs:$a, bb:$target)]>;

      // Conditional branch on the negated predicate.  It has no pattern of
      // its own; it is only reachable through a separate Pat.
      def CBranchOther :
        NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
                  "@!$a bra \t$target;", []>;
    }

    // Unconditional branch.
    let isBranch = 1, isBarrier = 1 in
    def GOTO :
      NVPTXInst<(outs), (ins brtarget:$target),
                "bra.uni \t$target;", [(br bb:$target)]>;
  }
  // Branch on an i32 condition: compare it against zero (not-equal) to form
  // the predicate that CBranch expects.
  def : Pat<(brcond Int32Regs:$a, bb:$target),
            (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;

  // SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
  // conditional branch if the target block is the next block, so that the code
  // can fall through to the target block.  The inversion is done by
  // 'xor condition, 1', which will be translated to (setne condition, -1).
  // Since PTX supports '@!pred bra target', we should use it.
  def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
            (CBranchOther Int1Regs:$a, bb:$target)>;
  // Call
  //
  // Type profiles and SelectionDAG nodes that bracket a call sequence.  Both
  // the start and the end marker take two i32 operands.
  def SDT_NVPTXCallSeqStart :
    SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
  def SDT_NVPTXCallSeqEnd :
    SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;

  def callseq_start :
    SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
  def callseq_end :
    SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPSideEffect]>;
  // The NVPTXISD::CALL node: no results, one i32 operand (the callee).
  def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
  def call :
    SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

  // Operand type used for the callee of CALL.
  def calltarget : Operand<i32>;

  let isCall = 1 in
  def CALL :
    NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;

  // The callee may be either a global address or an external symbol.
  def : Pat<(call tglobaladdr:$dst),  (CALL tglobaladdr:$dst)>;
  def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
  // Pseudo instructions.
  //
  // Base class for pseudo instructions; it forwards all four of its arguments
  // to NVPTXInst unchanged.
  class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
    : NVPTXInst<outs, ins, asmstr, pattern>;
  // Markers emitted around a call sequence.  The start marker opens a brace
  // scope, annotates it with both amounts in a comment, and declares
  // temp_param_reg; the end marker closes the scope again.
  def Callseq_Start :
    NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
              "\\{ // callseq $amt1, $amt2\n"
              "\t.reg .b32 temp_param_reg;",
              [(callseq_start timm:$amt1, timm:$amt2)]>;
  def Callseq_End :
    NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
              "\\} // callseq $amt1",
              [(callseq_end timm:$amt1, timm:$amt2)]>;
  // trap instruction: selected for the generic trap node, takes no operands.
  def trapinst :
    NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
  // Call prototype wrapper
  //
  // The CallPrototype node takes a single integer operand and produces no
  // results.  CALL_PROTOTYPE matches it when that operand is an external
  // symbol and prints the symbol through the printProtoIdent method.
  def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
  def CallPrototype :
    SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;

  def ProtoIdent : Operand<i32> {
    let PrintMethod = "printProtoIdent";
  }

  def CALL_PROTOTYPE :
    NVPTXInst<(outs), (ins ProtoIdent:$ident),
              "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
  2890. include "NVPTXIntrinsics.td"
  2891. //-----------------------------------
  2892. // Notes
  2893. //-----------------------------------
  // BSWAP is currently expanded. The following would be more efficient:
  // - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
  //   registers
  // - for sm_20, use prmt (use vector scalar mov to get the pack and
  //   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
  //   registers.