dfmul.S 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. //===----------------------Hexagon builtin routine ------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. // Double Precision Multiply
  9. #define A r1:0
  10. #define AH r1
  11. #define AL r0
  12. #define B r3:2
  13. #define BH r3
  14. #define BL r2
  15. #define BTMP r5:4
  16. #define BTMPH r5
  17. #define BTMPL r4
  18. #define PP_ODD r7:6
  19. #define PP_ODD_H r7
  20. #define PP_ODD_L r6
  21. #define ONE r9:8
  22. #define S_ONE r8
  23. #define S_ZERO r9
  24. #define PP_HH r11:10
  25. #define PP_HH_H r11
  26. #define PP_HH_L r10
  27. #define ATMP r13:12
  28. #define ATMPH r13
  29. #define ATMPL r12
  30. #define PP_LL r15:14
  31. #define PP_LL_H r15
  32. #define PP_LL_L r14
  33. #define TMP r28
  34. #define MANTBITS 52
  35. #define HI_MANTBITS 20
  36. #define EXPBITS 11
  37. #define BIAS 1024
  38. #define MANTISSA_TO_INT_BIAS 52
  39. // Some constant to adjust normalization amount in error code
  40. // Amount to right shift the partial product to get to a denorm
  41. #define FUDGE 5
  42. #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
  43. #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
  44. #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
  45. #define END(TAG) .size TAG,.-TAG
  46. #define SR_ROUND_OFF 22
  47. .text
  48. .global __hexagon_muldf3
  49. .type __hexagon_muldf3,@function
  50. Q6_ALIAS(muldf3)
  51. FAST_ALIAS(muldf3)
  52. FAST2_ALIAS(muldf3)
  53. .p2align 5
  54. __hexagon_muldf3:
  55. {
  56. p0 = dfclass(A,#2)
  57. p0 = dfclass(B,#2)
  58. ATMP = combine(##0x40000000,#0)
  59. }
  60. {
  61. ATMP = insert(A,#MANTBITS,#EXPBITS-1)
  62. BTMP = asl(B,#EXPBITS-1)
  63. TMP = #-BIAS
  64. ONE = #1
  65. }
  66. {
  67. PP_ODD = mpyu(BTMPL,ATMPH)
  68. BTMP = insert(ONE,#2,#62)
  69. }
  70. // since we know that the MSB of the H registers is zero, we should never carry
  71. // H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1
  72. // Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
  73. // Therefore, we can add 3 2^32-1 values safely without carry. We only need one.
  74. {
  75. PP_LL = mpyu(ATMPL,BTMPL)
  76. PP_ODD += mpyu(ATMPL,BTMPH)
  77. }
  78. {
  79. PP_ODD += lsr(PP_LL,#32)
  80. PP_HH = mpyu(ATMPH,BTMPH)
  81. BTMP = combine(##BIAS+BIAS-4,#0)
  82. }
  83. {
  84. PP_HH += lsr(PP_ODD,#32)
  85. if (!p0) jump .Lmul_abnormal
  86. p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
  87. p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
  88. }
  89. // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
  90. // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
  91. #undef PP_ODD
  92. #undef PP_ODD_H
  93. #undef PP_ODD_L
  94. #define EXP10 r7:6
  95. #define EXP1 r7
  96. #define EXP0 r6
  97. {
  98. if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
  99. EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
  100. EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
  101. }
  102. {
  103. PP_LL = neg(PP_HH)
  104. EXP0 += add(TMP,EXP1)
  105. TMP = xor(AH,BH)
  106. }
  107. {
  108. if (!p2.new) PP_HH = PP_LL
  109. p2 = cmp.gt(TMP,#-1)
  110. p0 = !cmp.gt(EXP0,BTMPH)
  111. p0 = cmp.gt(EXP0,BTMPL)
  112. if (!p0.new) jump:nt .Lmul_ovf_unf
  113. }
  114. {
  115. A = convert_d2df(PP_HH)
  116. EXP0 = add(EXP0,#-BIAS-58)
  117. }
  118. {
  119. AH += asl(EXP0,#HI_MANTBITS)
  120. jumpr r31
  121. }
  122. .falign
  123. .Lpossible_unf:
  124. // We end up with a positive exponent
  125. // But we may have rounded up to an exponent of 1.
  126. // If the exponent is 1, if we rounded up to it
  127. // we need to also raise underflow
  128. // Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
  129. // And the PP should also have more than one bit set
  130. //
  131. // Note: ATMP should have abs(PP_HH)
  132. // Note: BTMPL should have 0x7FEFFFFF
  133. {
  134. p0 = cmp.eq(AL,#0)
  135. p0 = bitsclr(AH,BTMPL)
  136. if (!p0.new) jumpr:t r31
  137. BTMPH = #0x7fff
  138. }
  139. {
  140. p0 = bitsset(ATMPH,BTMPH)
  141. BTMPL = USR
  142. BTMPH = #0x030
  143. }
  144. {
  145. if (p0) BTMPL = or(BTMPL,BTMPH)
  146. }
  147. {
  148. USR = BTMPL
  149. }
  150. {
  151. p0 = dfcmp.eq(A,A)
  152. jumpr r31
  153. }
  154. .falign
  155. .Lmul_ovf_unf:
  156. {
  157. A = convert_d2df(PP_HH)
  158. ATMP = abs(PP_HH) // take absolute value
  159. EXP1 = add(EXP0,#-BIAS-58)
  160. }
  161. {
  162. AH += asl(EXP1,#HI_MANTBITS)
  163. EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
  164. BTMPL = ##0x7FEFFFFF
  165. }
  166. {
  167. EXP1 += add(EXP0,##-BIAS-58)
  168. //BTMPH = add(clb(ATMP),#-2)
  169. BTMPH = #0
  170. }
  171. {
  172. p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
  173. if (p0.new) jump:nt .Lmul_ovf
  174. }
  175. {
  176. p0 = cmp.gt(EXP1,#0)
  177. if (p0.new) jump:nt .Lpossible_unf
  178. BTMPH = sub(EXP0,BTMPH)
  179. TMP = #63 // max amount to shift
  180. }
  181. // Underflow
  182. //
  183. // PP_HH has the partial product with sticky LSB.
  184. // PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
  185. // PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
  186. // The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
  187. // That's the exponent that happens after the normalization
  188. //
  189. // EXP0 has the exponent that, when added to the normalized value, is out of range.
  190. //
  191. // Strategy:
  192. //
  193. // * Shift down bits, with sticky bit, such that the bits are aligned according
  194. // to the LZ count and appropriate exponent, but not all the way to mantissa
  195. // field, keep around the last few bits.
  196. // * Put a 1 near the MSB
  197. // * Check the LSBs for inexact; if inexact also set underflow
  198. // * Convert [u]d2df -- will correctly round according to rounding mode
  199. // * Replace exponent field with zero
  200. {
  201. BTMPL = #0 // offset for extract
  202. BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
  203. }
  204. {
  205. p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
  206. BTMPH = min(BTMPH,TMP) // Don't shift more than 63
  207. PP_HH = ATMP
  208. }
  209. {
  210. TMP = USR
  211. PP_LL = extractu(PP_HH,BTMP)
  212. }
  213. {
  214. PP_HH = asr(PP_HH,BTMPH)
  215. BTMPL = #0x0030 // underflow flag
  216. AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
  217. }
  218. {
  219. p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
  220. if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
  221. PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
  222. }
  223. {
  224. PP_LL = neg(PP_HH)
  225. p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
  226. if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
  227. }
  228. {
  229. if (!p3) PP_HH = PP_LL
  230. USR = TMP
  231. }
  232. {
  233. A = convert_d2df(PP_HH) // Do rounding
  234. p0 = dfcmp.eq(A,A) // realize exception
  235. }
  236. {
  237. AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
  238. jumpr r31
  239. }
  240. .falign
  241. .Lmul_ovf:
  242. // We get either max finite value or infinity. Either way, overflow+inexact
  243. {
  244. TMP = USR
  245. ATMP = combine(##0x7fefffff,#-1) // positive max finite
  246. A = PP_HH
  247. }
  248. {
  249. PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
  250. TMP = or(TMP,#0x28) // inexact + overflow
  251. BTMP = combine(##0x7ff00000,#0) // positive infinity
  252. }
  253. {
  254. USR = TMP
  255. PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
  256. TMP = PP_LL_L // unmodified rounding mode
  257. }
  258. {
  259. p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
  260. p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
  261. if (p0.new) ATMP = BTMP // we should get infinity
  262. p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
  263. }
  264. {
  265. A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
  266. jumpr r31
  267. }
  268. .Lmul_abnormal:
  269. {
  270. ATMP = extractu(A,#63,#0) // strip off sign
  271. BTMP = extractu(B,#63,#0) // strip off sign
  272. }
  273. {
  274. p3 = cmp.gtu(ATMP,BTMP)
  275. if (!p3.new) A = B // sort values
  276. if (!p3.new) B = A // sort values
  277. }
  278. {
  279. // Any NaN --> NaN, possibly raise invalid if sNaN
  280. p0 = dfclass(A,#0x0f) // A not NaN?
  281. if (!p0.new) jump:nt .Linvalid_nan
  282. if (!p3) ATMP = BTMP
  283. if (!p3) BTMP = ATMP
  284. }
  285. {
  286. // Infinity * nonzero number is infinity
  287. p1 = dfclass(A,#0x08) // A is infinity
  288. p1 = dfclass(B,#0x0e) // B is nonzero
  289. }
  290. {
  291. // Infinity * zero --> NaN, raise invalid
  292. // Other zeros return zero
  293. p0 = dfclass(A,#0x08) // A is infinity
  294. p0 = dfclass(B,#0x01) // B is zero
  295. }
  296. {
  297. if (p1) jump .Ltrue_inf
  298. p2 = dfclass(B,#0x01)
  299. }
  300. {
  301. if (p0) jump .Linvalid_zeroinf
  302. if (p2) jump .Ltrue_zero // so return zero
  303. TMP = ##0x7c000000
  304. }
  305. // We are left with a normal or subnormal times a subnormal. A > B
  306. // If A and B are both very small (exp(a) < BIAS-MANTBITS),
  307. // we go to a single sticky bit, which we can round easily.
  308. // If A and B might multiply to something bigger, decrease A exponent and increase
  309. // B exponent and try again
  310. {
  311. p0 = bitsclr(AH,TMP)
  312. if (p0.new) jump:nt .Lmul_tiny
  313. }
  314. {
  315. TMP = cl0(BTMP)
  316. }
  317. {
  318. TMP = add(TMP,#-EXPBITS)
  319. }
  320. {
  321. BTMP = asl(BTMP,TMP)
  322. }
  323. {
  324. B = insert(BTMP,#63,#0)
  325. AH -= asl(TMP,#HI_MANTBITS)
  326. }
  327. jump __hexagon_muldf3
  328. .Lmul_tiny:
  329. {
  330. TMP = USR
  331. A = xor(A,B) // get sign bit
  332. }
  333. {
  334. TMP = or(TMP,#0x30) // Inexact + Underflow
  335. A = insert(ONE,#63,#0) // put in rounded up value
  336. BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
  337. }
  338. {
  339. USR = TMP
  340. p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
  341. if (!p0.new) AL = #0 // If not, zero
  342. BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
  343. }
  344. {
  345. p0 = cmp.eq(BTMPH,#3) // if rounding towards right inf
  346. if (!p0.new) AL = #0 // don't go to zero
  347. jumpr r31
  348. }
  349. .Linvalid_zeroinf:
  350. {
  351. TMP = USR
  352. }
  353. {
  354. A = #-1
  355. TMP = or(TMP,#2)
  356. }
  357. {
  358. USR = TMP
  359. }
  360. {
  361. p0 = dfcmp.uo(A,A) // force exception if enabled
  362. jumpr r31
  363. }
  364. .Linvalid_nan:
  365. {
  366. p0 = dfclass(B,#0x0f) // if B is not NaN
  367. TMP = convert_df2sf(A) // will generate invalid if sNaN
  368. if (p0.new) B = A // make it whatever A is
  369. }
  370. {
  371. BL = convert_df2sf(B) // will generate invalid if sNaN
  372. A = #-1
  373. jumpr r31
  374. }
  375. .falign
  376. .Ltrue_zero:
  377. {
  378. A = B
  379. B = A
  380. }
  381. .Ltrue_inf:
  382. {
  383. BH = extract(BH,#1,#31)
  384. }
  385. {
  386. AH ^= asl(BH,#31)
  387. jumpr r31
  388. }
  389. END(__hexagon_muldf3)
  390. #undef ATMP
  391. #undef ATMPL
  392. #undef ATMPH
  393. #undef BTMP
  394. #undef BTMPL
  395. #undef BTMPH