dfsqrt.S 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. //===----------------------Hexagon builtin routine ------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. // Double Precision square root
  9. #define EXP r28
  10. #define A r1:0
  11. #define AH r1
  12. #define AL r0
  13. #define SFSH r3:2
  14. #define SF_S r3
  15. #define SF_H r2
  16. #define SFHALF_SONE r5:4
  17. #define S_ONE r4
  18. #define SFHALF r5
  19. #define SF_D r6
  20. #define SF_E r7
  21. #define RECIPEST r8
  22. #define SFRAD r9
  23. #define FRACRAD r11:10
  24. #define FRACRADH r11
  25. #define FRACRADL r10
  26. #define ROOT r13:12
  27. #define ROOTHI r13
  28. #define ROOTLO r12
  29. #define PROD r15:14
  30. #define PRODHI r15
  31. #define PRODLO r14
  32. #define P_TMP p0
  33. #define P_EXP1 p1
  34. #define NORMAL p2
  35. #define SF_EXPBITS 8
  36. #define SF_MANTBITS 23
  37. #define DF_EXPBITS 11
  38. #define DF_MANTBITS 52
  39. #define DF_BIAS 0x3ff
  40. #define DFCLASS_ZERO 0x01
  41. #define DFCLASS_NORMAL 0x02
  42. #define DFCLASS_DENORMAL 0x02
  43. #define DFCLASS_INFINITE 0x08
  44. #define DFCLASS_NAN 0x10
  45. #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
  46. #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
  47. #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
  48. #define END(TAG) .size TAG,.-TAG
  49. .text
  50. .global __hexagon_sqrtdf2
  51. .type __hexagon_sqrtdf2,@function
  52. .global __hexagon_sqrt
  53. .type __hexagon_sqrt,@function
  54. Q6_ALIAS(sqrtdf2)
  55. Q6_ALIAS(sqrt)
  56. FAST_ALIAS(sqrtdf2)
  57. FAST_ALIAS(sqrt)
  58. FAST2_ALIAS(sqrtdf2)
  59. FAST2_ALIAS(sqrt)
  60. .type sqrt,@function
  61. .p2align 5
  62. __hexagon_sqrtdf2:
  63. __hexagon_sqrt:
  64. {
  65. PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
  66. EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
  67. SFHALF_SONE = combine(##0x3f000004,#1)
  68. }
  69. {
  70. NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
  71. NORMAL = cmp.gt(AH,#-1) // and positive?
  72. if (!NORMAL.new) jump:nt .Lsqrt_abnormal
  73. SFRAD = or(SFHALF,PRODLO)
  74. }
  75. #undef NORMAL
  76. .Ldenormal_restart:
  77. {
  78. FRACRAD = A
  79. SF_E,P_TMP = sfinvsqrta(SFRAD)
  80. SFHALF = and(SFHALF,#-16)
  81. SFSH = #0
  82. }
  83. #undef A
  84. #undef AH
  85. #undef AL
  86. #define ERROR r1:0
  87. #define ERRORHI r1
  88. #define ERRORLO r0
  89. // SF_E : reciprocal square root
  90. // SF_H : half rsqrt
  91. // sf_S : square root
  92. // SF_D : error term
  93. // SFHALF: 0.5
  94. {
  95. SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
  96. SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
  97. SF_D = SFHALF
  98. #undef SFRAD
  99. #define SHIFTAMT r9
  100. SHIFTAMT = and(EXP,#1)
  101. }
  102. {
  103. SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
  104. FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
  105. P_EXP1 = cmp.gtu(SHIFTAMT,#0)
  106. }
  107. {
  108. SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
  109. SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
  110. SF_D = SFHALF
  111. SHIFTAMT = mux(P_EXP1,#8,#9)
  112. }
  113. {
  114. SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
  115. FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
  116. SHIFTAMT = mux(P_EXP1,#3,#2)
  117. }
  118. {
  119. SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
  120. // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
  121. PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
  122. }
  123. {
  124. SF_H = and(SF_H,##0x007fffff)
  125. }
  126. {
  127. SF_H = add(SF_H,##0x00800000 - 3)
  128. SHIFTAMT = mux(P_EXP1,#7,#8)
  129. }
  130. {
  131. RECIPEST = asl(SF_H,SHIFTAMT)
  132. SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
  133. }
  134. {
  135. ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
  136. }
  137. #undef SFSH // r3:2
  138. #undef SF_H // r2
  139. #undef SF_S // r3
  140. #undef S_ONE // r4
  141. #undef SFHALF // r5
  142. #undef SFHALF_SONE // r5:4
  143. #undef SF_D // r6
  144. #undef SF_E // r7
  145. #define HL r3:2
  146. #define LL r5:4
  147. #define HH r7:6
  148. #undef P_EXP1
  149. #define P_CARRY0 p1
  150. #define P_CARRY1 p2
  151. #define P_CARRY2 p3
  152. // Iteration 0
  153. // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply
  154. // We can shift and subtract instead of shift and add?
  155. {
  156. ERROR = asl(FRACRAD,#15)
  157. PROD = mpyu(ROOTHI,ROOTHI)
  158. P_CARRY0 = cmp.eq(r0,r0)
  159. }
  160. {
  161. ERROR -= asl(PROD,#15)
  162. PROD = mpyu(ROOTHI,ROOTLO)
  163. P_CARRY1 = cmp.eq(r0,r0)
  164. }
  165. {
  166. ERROR -= lsr(PROD,#16)
  167. P_CARRY2 = cmp.eq(r0,r0)
  168. }
  169. {
  170. ERROR = mpyu(ERRORHI,RECIPEST)
  171. }
  172. {
  173. ROOT += lsr(ERROR,SHIFTAMT)
  174. SHIFTAMT = add(SHIFTAMT,#16)
  175. ERROR = asl(FRACRAD,#31) // for next iter
  176. }
  177. // Iteration 1
  178. {
  179. PROD = mpyu(ROOTHI,ROOTHI)
  180. ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
  181. }
  182. {
  183. ERROR -= asl(PROD,#31)
  184. PROD = mpyu(ROOTLO,ROOTLO)
  185. }
  186. {
  187. ERROR -= lsr(PROD,#33)
  188. }
  189. {
  190. ERROR = mpyu(ERRORHI,RECIPEST)
  191. }
  192. {
  193. ROOT += lsr(ERROR,SHIFTAMT)
  194. SHIFTAMT = add(SHIFTAMT,#16)
  195. ERROR = asl(FRACRAD,#47) // for next iter
  196. }
  197. // Iteration 2
  198. {
  199. PROD = mpyu(ROOTHI,ROOTHI)
  200. }
  201. {
  202. ERROR -= asl(PROD,#47)
  203. PROD = mpyu(ROOTHI,ROOTLO)
  204. }
  205. {
  206. ERROR -= asl(PROD,#16) // bidir shr 31-47
  207. PROD = mpyu(ROOTLO,ROOTLO)
  208. }
  209. {
  210. ERROR -= lsr(PROD,#17) // 64-47
  211. }
  212. {
  213. ERROR = mpyu(ERRORHI,RECIPEST)
  214. }
  215. {
  216. ROOT += lsr(ERROR,SHIFTAMT)
  217. }
  218. #undef ERROR
  219. #undef PROD
  220. #undef PRODHI
  221. #undef PRODLO
  222. #define REM_HI r15:14
  223. #define REM_HI_HI r15
  224. #define REM_LO r1:0
  225. #undef RECIPEST
  226. #undef SHIFTAMT
  227. #define TWOROOT_LO r9:8
  228. // Adjust Root
  229. {
  230. HL = mpyu(ROOTHI,ROOTLO)
  231. LL = mpyu(ROOTLO,ROOTLO)
  232. REM_HI = #0
  233. REM_LO = #0
  234. }
  235. {
  236. HL += lsr(LL,#33)
  237. LL += asl(HL,#33)
  238. P_CARRY0 = cmp.eq(r0,r0)
  239. }
  240. {
  241. HH = mpyu(ROOTHI,ROOTHI)
  242. REM_LO = sub(REM_LO,LL,P_CARRY0):carry
  243. TWOROOT_LO = #1
  244. }
  245. {
  246. HH += lsr(HL,#31)
  247. TWOROOT_LO += asl(ROOT,#1)
  248. }
  249. #undef HL
  250. #undef LL
  251. #define REM_HI_TMP r3:2
  252. #define REM_HI_TMP_HI r3
  253. #define REM_LO_TMP r5:4
  254. {
  255. REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
  256. REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
  257. #undef FRACRAD
  258. #undef HH
  259. #define ZERO r11:10
  260. #define ONE r7:6
  261. ONE = #1
  262. ZERO = #0
  263. }
  264. {
  265. REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
  266. ONE = add(ROOT,ONE)
  267. EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
  268. }
  269. {
  270. // If carry set, no borrow: result was still positive
  271. if (P_CARRY1) ROOT = ONE
  272. if (P_CARRY1) REM_LO = REM_LO_TMP
  273. if (P_CARRY1) REM_HI = REM_HI_TMP
  274. }
  275. {
  276. REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
  277. ONE = #1
  278. EXP = asr(EXP,#1) // divide signed exp by 2
  279. }
  280. {
  281. REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
  282. ONE = add(ROOT,ONE)
  283. }
  284. {
  285. if (P_CARRY2) ROOT = ONE
  286. if (P_CARRY2) REM_LO = REM_LO_TMP
  287. // since tworoot <= 2^32, remhi must be zero
  288. #undef REM_HI_TMP
  289. #undef REM_HI_TMP_HI
  290. #define S_ONE r2
  291. #define ADJ r3
  292. S_ONE = #1
  293. }
  294. {
  295. P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
  296. if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully
  297. ADJ = cl0(ROOT)
  298. EXP = add(EXP,#-63)
  299. }
  300. #undef REM_LO
  301. #define RET r1:0
  302. #define RETHI r1
  303. {
  304. RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
  305. EXP = add(EXP,ADJ) // add back bias
  306. }
  307. {
  308. RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
  309. jumpr r31
  310. }
  311. #undef REM_LO_TMP
  312. #undef REM_HI_TMP
  313. #undef REM_HI_TMP_HI
  314. #undef REM_LO
  315. #undef REM_HI
  316. #undef TWOROOT_LO
  317. #undef RET
  318. #define A r1:0
  319. #define AH r1
  320. #define AL r1
  321. #undef S_ONE
  322. #define TMP r3:2
  323. #define TMPHI r3
  324. #define TMPLO r2
  325. #undef P_CARRY0
  326. #define P_NEG p1
  327. #define SFHALF r5
  328. #define SFRAD r9
  329. .Lsqrt_abnormal:
  330. {
  331. P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
  332. if (P_TMP.new) jumpr:t r31
  333. }
  334. {
  335. P_TMP = dfclass(A,#DFCLASS_NAN)
  336. if (P_TMP.new) jump:nt .Lsqrt_nan
  337. }
  338. {
  339. P_TMP = cmp.gt(AH,#-1)
  340. if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
  341. if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
  342. }
  343. {
  344. P_TMP = dfclass(A,#DFCLASS_INFINITE)
  345. if (P_TMP.new) jumpr:nt r31
  346. }
  347. // If we got here, we're denormal
  348. // prepare to restart
  349. {
  350. A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
  351. }
  352. {
  353. EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
  354. }
  355. {
  356. A = asl(A,EXP) // Shift mantissa
  357. EXP = sub(#1,EXP) // Form exponent
  358. }
  359. {
  360. AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
  361. }
  362. {
  363. TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
  364. SFHALF = ##0x3f000004 // form half constant
  365. }
  366. {
  367. SFRAD = or(SFHALF,TMPLO) // form sf value
  368. SFHALF = and(SFHALF,#-16)
  369. jump .Ldenormal_restart // restart
  370. }
  371. .Lsqrt_nan:
  372. {
  373. EXP = convert_df2sf(A) // if sNaN, get invalid
  374. A = #-1 // qNaN
  375. jumpr r31
  376. }
  377. .Lsqrt_invalid_neg:
  378. {
  379. A = convert_sf2df(EXP) // Invalid,NaNval
  380. jumpr r31
  381. }
  382. END(__hexagon_sqrt)
  383. END(__hexagon_sqrtdf2)