123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405 |
- //===----------------------Hexagon builtin routine ------------------------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- // Double Precision square root
- #define EXP r28
- #define A r1:0
- #define AH r1
- #define AL r0
- #define SFSH r3:2
- #define SF_S r3
- #define SF_H r2
- #define SFHALF_SONE r5:4
- #define S_ONE r4
- #define SFHALF r5
- #define SF_D r6
- #define SF_E r7
- #define RECIPEST r8
- #define SFRAD r9
- #define FRACRAD r11:10
- #define FRACRADH r11
- #define FRACRADL r10
- #define ROOT r13:12
- #define ROOTHI r13
- #define ROOTLO r12
- #define PROD r15:14
- #define PRODHI r15
- #define PRODLO r14
- #define P_TMP p0
- #define P_EXP1 p1
- #define NORMAL p2
- #define SF_EXPBITS 8
- #define SF_MANTBITS 23
- #define DF_EXPBITS 11
- #define DF_MANTBITS 52
- #define DF_BIAS 0x3ff
- #define DFCLASS_ZERO 0x01
- #define DFCLASS_NORMAL 0x02
- #define DFCLASS_DENORMAL 0x02
- #define DFCLASS_INFINITE 0x08
- #define DFCLASS_NAN 0x10
- #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function
- #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function
- #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function
- #define END(TAG) .size TAG,.-TAG
- .text
- .global __hexagon_sqrtdf2
- .type __hexagon_sqrtdf2,@function
- .global __hexagon_sqrt
- .type __hexagon_sqrt,@function
- Q6_ALIAS(sqrtdf2)
- Q6_ALIAS(sqrt)
- FAST_ALIAS(sqrtdf2)
- FAST_ALIAS(sqrt)
- FAST2_ALIAS(sqrtdf2)
- FAST2_ALIAS(sqrt)
- .type sqrt,@function
- .p2align 5
- __hexagon_sqrtdf2:
- __hexagon_sqrt:
- {
- PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS)
- EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32)
- SFHALF_SONE = combine(##0x3f000004,#1)
- }
- {
- NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal
- NORMAL = cmp.gt(AH,#-1) // and positive?
- if (!NORMAL.new) jump:nt .Lsqrt_abnormal
- SFRAD = or(SFHALF,PRODLO)
- }
- #undef NORMAL
- .Ldenormal_restart:
- {
- FRACRAD = A
- SF_E,P_TMP = sfinvsqrta(SFRAD)
- SFHALF = and(SFHALF,#-16)
- SFSH = #0
- }
- #undef A
- #undef AH
- #undef AL
- #define ERROR r1:0
- #define ERRORHI r1
- #define ERRORLO r0
- // SF_E : reciprocal square root
- // SF_H : half rsqrt
- // sf_S : square root
- // SF_D : error term
- // SFHALF: 0.5
- {
- SF_S += sfmpy(SF_E,SFRAD):lib // s0: root
- SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent...
- SF_D = SFHALF
- #undef SFRAD
- #define SHIFTAMT r9
- SHIFTAMT = and(EXP,#1)
- }
- {
- SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1
- FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden
- P_EXP1 = cmp.gtu(SHIFTAMT,#0)
- }
- {
- SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt
- SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip
- SF_D = SFHALF
- SHIFTAMT = mux(P_EXP1,#8,#9)
- }
- {
- SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term
- FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place
- SHIFTAMT = mux(P_EXP1,#3,#2)
- }
- {
- SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt
- // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x).
- PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1)
- }
- {
- SF_H = and(SF_H,##0x007fffff)
- }
- {
- SF_H = add(SF_H,##0x00800000 - 3)
- SHIFTAMT = mux(P_EXP1,#7,#8)
- }
- {
- RECIPEST = asl(SF_H,SHIFTAMT)
- SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0))
- }
- {
- ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1)))
- }
- #undef SFSH // r3:2
- #undef SF_H // r2
- #undef SF_S // r3
- #undef S_ONE // r4
- #undef SFHALF // r5
- #undef SFHALF_SONE // r5:4
- #undef SF_D // r6
- #undef SF_E // r7
- #define HL r3:2
- #define LL r5:4
- #define HH r7:6
- #undef P_EXP1
- #define P_CARRY0 p1
- #define P_CARRY1 p2
- #define P_CARRY2 p3
- // Iteration 0
- // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply
- // We can shift and subtract instead of shift and add?
- {
- ERROR = asl(FRACRAD,#15)
- PROD = mpyu(ROOTHI,ROOTHI)
- P_CARRY0 = cmp.eq(r0,r0)
- }
- {
- ERROR -= asl(PROD,#15)
- PROD = mpyu(ROOTHI,ROOTLO)
- P_CARRY1 = cmp.eq(r0,r0)
- }
- {
- ERROR -= lsr(PROD,#16)
- P_CARRY2 = cmp.eq(r0,r0)
- }
- {
- ERROR = mpyu(ERRORHI,RECIPEST)
- }
- {
- ROOT += lsr(ERROR,SHIFTAMT)
- SHIFTAMT = add(SHIFTAMT,#16)
- ERROR = asl(FRACRAD,#31) // for next iter
- }
- // Iteration 1
- {
- PROD = mpyu(ROOTHI,ROOTHI)
- ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed
- }
- {
- ERROR -= asl(PROD,#31)
- PROD = mpyu(ROOTLO,ROOTLO)
- }
- {
- ERROR -= lsr(PROD,#33)
- }
- {
- ERROR = mpyu(ERRORHI,RECIPEST)
- }
- {
- ROOT += lsr(ERROR,SHIFTAMT)
- SHIFTAMT = add(SHIFTAMT,#16)
- ERROR = asl(FRACRAD,#47) // for next iter
- }
- // Iteration 2
- {
- PROD = mpyu(ROOTHI,ROOTHI)
- }
- {
- ERROR -= asl(PROD,#47)
- PROD = mpyu(ROOTHI,ROOTLO)
- }
- {
- ERROR -= asl(PROD,#16) // bidir shr 31-47
- PROD = mpyu(ROOTLO,ROOTLO)
- }
- {
- ERROR -= lsr(PROD,#17) // 64-47
- }
- {
- ERROR = mpyu(ERRORHI,RECIPEST)
- }
- {
- ROOT += lsr(ERROR,SHIFTAMT)
- }
- #undef ERROR
- #undef PROD
- #undef PRODHI
- #undef PRODLO
- #define REM_HI r15:14
- #define REM_HI_HI r15
- #define REM_LO r1:0
- #undef RECIPEST
- #undef SHIFTAMT
- #define TWOROOT_LO r9:8
- // Adjust Root
- {
- HL = mpyu(ROOTHI,ROOTLO)
- LL = mpyu(ROOTLO,ROOTLO)
- REM_HI = #0
- REM_LO = #0
- }
- {
- HL += lsr(LL,#33)
- LL += asl(HL,#33)
- P_CARRY0 = cmp.eq(r0,r0)
- }
- {
- HH = mpyu(ROOTHI,ROOTHI)
- REM_LO = sub(REM_LO,LL,P_CARRY0):carry
- TWOROOT_LO = #1
- }
- {
- HH += lsr(HL,#31)
- TWOROOT_LO += asl(ROOT,#1)
- }
- #undef HL
- #undef LL
- #define REM_HI_TMP r3:2
- #define REM_HI_TMP_HI r3
- #define REM_LO_TMP r5:4
- {
- REM_HI = sub(FRACRAD,HH,P_CARRY0):carry
- REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry
- #undef FRACRAD
- #undef HH
- #define ZERO r11:10
- #define ONE r7:6
- ONE = #1
- ZERO = #0
- }
- {
- REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry
- ONE = add(ROOT,ONE)
- EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp
- }
- {
- // If carry set, no borrow: result was still positive
- if (P_CARRY1) ROOT = ONE
- if (P_CARRY1) REM_LO = REM_LO_TMP
- if (P_CARRY1) REM_HI = REM_HI_TMP
- }
- {
- REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry
- ONE = #1
- EXP = asr(EXP,#1) // divide signed exp by 2
- }
- {
- REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry
- ONE = add(ROOT,ONE)
- }
- {
- if (P_CARRY2) ROOT = ONE
- if (P_CARRY2) REM_LO = REM_LO_TMP
- // since tworoot <= 2^32, remhi must be zero
- #undef REM_HI_TMP
- #undef REM_HI_TMP_HI
- #define S_ONE r2
- #define ADJ r3
- S_ONE = #1
- }
- {
- P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero
- if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully
- ADJ = cl0(ROOT)
- EXP = add(EXP,#-63)
- }
- #undef REM_LO
- #define RET r1:0
- #define RETHI r1
- {
- RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag
- EXP = add(EXP,ADJ) // add back bias
- }
- {
- RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust
- jumpr r31
- }
- #undef REM_LO_TMP
- #undef REM_HI_TMP
- #undef REM_HI_TMP_HI
- #undef REM_LO
- #undef REM_HI
- #undef TWOROOT_LO
- #undef RET
- #define A r1:0
- #define AH r1
- #define AL r1
- #undef S_ONE
- #define TMP r3:2
- #define TMPHI r3
- #define TMPLO r2
- #undef P_CARRY0
- #define P_NEG p1
- #define SFHALF r5
- #define SFRAD r9
- .Lsqrt_abnormal:
- {
- P_TMP = dfclass(A,#DFCLASS_ZERO) // zero?
- if (P_TMP.new) jumpr:t r31
- }
- {
- P_TMP = dfclass(A,#DFCLASS_NAN)
- if (P_TMP.new) jump:nt .Lsqrt_nan
- }
- {
- P_TMP = cmp.gt(AH,#-1)
- if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg
- if (!P_TMP.new) EXP = ##0x7F800001 // sNaN
- }
- {
- P_TMP = dfclass(A,#DFCLASS_INFINITE)
- if (P_TMP.new) jumpr:nt r31
- }
- // If we got here, we're denormal
- // prepare to restart
- {
- A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa
- }
- {
- EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize?
- }
- {
- A = asl(A,EXP) // Shift mantissa
- EXP = sub(#1,EXP) // Form exponent
- }
- {
- AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent
- }
- {
- TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1)
- SFHALF = ##0x3f000004 // form half constant
- }
- {
- SFRAD = or(SFHALF,TMPLO) // form sf value
- SFHALF = and(SFHALF,#-16)
- jump .Ldenormal_restart // restart
- }
- .Lsqrt_nan:
- {
- EXP = convert_df2sf(A) // if sNaN, get invalid
- A = #-1 // qNaN
- jumpr r31
- }
- .Lsqrt_invalid_neg:
- {
- A = convert_sf2df(EXP) // Invalid,NaNval
- jumpr r31
- }
- END(__hexagon_sqrt)
- END(__hexagon_sqrtdf2)
|