- //===----------------------Hexagon builtin routine ------------------------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
- #define END(TAG) .size TAG,.-TAG
- // Double Precision Multiply
- #define A r1:0
- #define AH r1
- #define AL r0
- #define B r3:2
- #define BH r3
- #define BL r2
- #define C r5:4
- #define CH r5
- #define CL r4
- #define BTMP r15:14
- #define BTMPH r15
- #define BTMPL r14
- #define ATMP r13:12
- #define ATMPH r13
- #define ATMPL r12
- #define CTMP r11:10
- #define CTMPH r11
- #define CTMPL r10
- #define PP_LL r9:8
- #define PP_LL_H r9
- #define PP_LL_L r8
- #define PP_ODD r7:6
- #define PP_ODD_H r7
- #define PP_ODD_L r6
- #define PP_HH r17:16
- #define PP_HH_H r17
- #define PP_HH_L r16
- #define EXPA r18
- #define EXPB r19
- #define EXPBA r19:18
- #define TMP r28
- #define P_TMP p0
- #define PROD_NEG p3
- #define EXACT p2
- #define SWAP p1
- #define MANTBITS 52
- #define HI_MANTBITS 20
- #define EXPBITS 11
- #define BIAS 1023
- #define STACKSPACE 32
- #define ADJUST 4
- #define FUDGE 7
- #define FUDGE2 3
- #ifndef SR_ROUND_OFF
- #define SR_ROUND_OFF 22
- #endif
- // First, classify for normal values, and abort if abnormal
- //
- // Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
- //
- // Since we know that the 2 MSBs of the H registers are zero, the partial
- // products that involve the H registers can never generate a carry
- //
- // Try to buy X slots, at the expense of latency if needed
- //
- // We will have PP_HH with the upper bits of the product, PP_LL with the lower
- // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
- // PP_HH can have a minimum of 0x0100_0000_0000_0000
- //
- // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
- //
- // We need to align CTMP.
- // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
- // If CTMP << PP align CTMP and add 128 bits. Then compute sticky
- // If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
- //
- // Convert partial product and CTMP to 2's complement prior to addition
- //
- // After we add, we need to normalize into upper 64 bits, then compute sticky.
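- //
- // A rough C sketch of the unpacking and 128-bit product described above.
- // Illustration only (helper names such as unpack64() and mul128() are
- // hypothetical), not the exact bit-for-bit behavior of the assembly below:
- //
- //   #include <stdint.h>
- //   #include <string.h>
- //
- //   // 0x1000_0000_0000_0000 | (mant << 8): the implicit 1 lands in bit 60,
- //   // so the two MSBs of the high word stay clear and the 32x32 partial
- //   // products of the high halves cannot generate a carry.
- //   static uint64_t unpack64(double x) {
- //       uint64_t bits;
- //       memcpy(&bits, &x, sizeof bits);
- //       uint64_t mant = bits & 0x000FFFFFFFFFFFFFull;  // 52 mantissa bits
- //       return 0x1000000000000000ull | (mant << 8);
- //   }
- //
- //   // 64x64 -> 128 multiply from four 32x32 partial products, mirroring
- //   // PP_LL / PP_ODD / PP_HH below.
- //   static void mul128(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
- //       uint64_t al = (uint32_t)a, ah = a >> 32;
- //       uint64_t bl = (uint32_t)b, bh = b >> 32;
- //       uint64_t pp_ll  = al * bl;
- //       uint64_t pp_odd = ah * bl + al * bh + (pp_ll >> 32); // no overflow:
- //                                                            // ah, bh < 2^29
- //       uint64_t pp_hh  = ah * bh + (pp_odd >> 32);
- //       *lo = (pp_odd << 32) | (uint32_t)pp_ll;
- //       *hi = pp_hh;
- //   }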
- .text
- .global __hexagon_fmadf4
- .type __hexagon_fmadf4,@function
- .global __hexagon_fmadf5
- .type __hexagon_fmadf5,@function
- Q6_ALIAS(fmadf5)
- .p2align 5
- __hexagon_fmadf4:
- __hexagon_fmadf5:
- .Lfma_begin:
- {
- P_TMP = dfclass(A,#2)
- P_TMP = dfclass(B,#2)
- ATMP = #0
- BTMP = #0
- }
- {
- ATMP = insert(A,#MANTBITS,#EXPBITS-3)
- BTMP = insert(B,#MANTBITS,#EXPBITS-3)
- PP_ODD_H = ##0x10000000
- allocframe(#STACKSPACE)
- }
- {
- PP_LL = mpyu(ATMPL,BTMPL)
- if (!P_TMP) jump .Lfma_abnormal_ab
- ATMPH = or(ATMPH,PP_ODD_H)
- BTMPH = or(BTMPH,PP_ODD_H)
- }
- {
- P_TMP = dfclass(C,#2)
- if (!P_TMP.new) jump:nt .Lfma_abnormal_c
- CTMP = combine(PP_ODD_H,#0)
- PP_ODD = combine(#0,PP_LL_H)
- }
- .Lfma_abnormal_c_restart:
- {
- PP_ODD += mpyu(BTMPL,ATMPH)
- CTMP = insert(C,#MANTBITS,#EXPBITS-3)
- memd(r29+#0) = PP_HH
- memd(r29+#8) = EXPBA
- }
- {
- PP_ODD += mpyu(ATMPL,BTMPH)
- EXPBA = neg(CTMP)
- P_TMP = cmp.gt(CH,#-1)
- TMP = xor(AH,BH)
- }
- {
- EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
- EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
- PP_HH = combine(#0,PP_ODD_H)
- if (!P_TMP) CTMP = EXPBA
- }
- {
- PP_HH += mpyu(ATMPH,BTMPH)
- PP_LL = combine(PP_ODD_L,PP_LL_L)
- #undef PP_ODD
- #undef PP_ODD_H
- #undef PP_ODD_L
- #undef ATMP
- #undef ATMPL
- #undef ATMPH
- #undef BTMP
- #undef BTMPL
- #undef BTMPH
- #define RIGHTLEFTSHIFT r13:12
- #define RIGHTSHIFT r13
- #define LEFTSHIFT r12
- EXPA = add(EXPA,EXPB)
- #undef EXPB
- #undef EXPBA
- #define EXPC r19
- #define EXPCA r19:18
- EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
- }
- // PP_HH:PP_LL now has product
- // CTMP is negated
- // EXPA,B,C are extracted
- // We need to negate PP
- // Since we will be adding with carry later, if we need to negate,
- // just invert all bits now, which we can do conditionally and in parallel
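- //
- // Equivalent C sketch of the conditional two's-complement negation (a
- // hypothetical helper, for illustration only):
- //
- //   static void neg128_if(int negate, uint64_t *hi, uint64_t *lo) {
- //       if (negate) {
- //           *lo = ~*lo + 1;            // low word: invert and add one
- //           *hi = ~*hi + (*lo == 0);   // propagate the carry out of the low word
- //       }
- //   }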
- #define PP_HH_TMP r15:14
- #define PP_LL_TMP r7:6
- {
- EXPA = add(EXPA,#-BIAS+(ADJUST))
- PROD_NEG = !cmp.gt(TMP,#-1)
- PP_LL_TMP = #0
- PP_HH_TMP = #0
- }
- {
- PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
- P_TMP = !cmp.gt(TMP,#-1)
- SWAP = cmp.gt(EXPC,EXPA) // If C >> PP
- if (SWAP.new) EXPCA = combine(EXPA,EXPC)
- }
- {
- PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
- if (P_TMP) PP_LL = PP_LL_TMP
- #undef PP_LL_TMP
- #define CTMP2 r7:6
- #define CTMP2H r7
- #define CTMP2L r6
- CTMP2 = #0
- EXPC = sub(EXPA,EXPC)
- }
- {
- if (P_TMP) PP_HH = PP_HH_TMP
- P_TMP = cmp.gt(EXPC,#63)
- if (SWAP) PP_LL = CTMP2
- if (SWAP) CTMP2 = PP_LL
- }
- #undef PP_HH_TMP
- //#define ONE r15:14
- //#define S_ONE r14
- #define ZERO r15:14
- #define S_ZERO r15
- #undef PROD_NEG
- #define P_CARRY p3
- {
- if (SWAP) PP_HH = CTMP // Swap C and PP
- if (SWAP) CTMP = PP_HH
- if (P_TMP) EXPC = add(EXPC,#-64)
- TMP = #63
- }
- {
- // If diff > 63, pre-shift-right by 64...
- if (P_TMP) CTMP2 = CTMP
- TMP = asr(CTMPH,#31)
- RIGHTSHIFT = min(EXPC,TMP)
- LEFTSHIFT = #0
- }
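- //
- // C sketch of the pre-shift above (illustrative only): when the exponent
- // difference exceeds 63, the addend is first shifted right by a whole 64-bit
- // word, and the remaining 0..63-bit alignment is done by the normal path that
- // follows (the sign fill itself is applied a packet later).
- //
- //   static void asr128_by_64(int64_t *hi, uint64_t *lo) {
- //       *lo = (uint64_t)*hi;           // pre-shift right by 64
- //       *hi = (*hi < 0) ? -1 : 0;      // high word becomes all sign bits
- //   }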
- #undef C
- #undef CH
- #undef CL
- #define STICKIES r5:4
- #define STICKIESH r5
- #define STICKIESL r4
- {
- if (P_TMP) CTMP = combine(TMP,TMP) // sign extension of pre-shift-right-64
- STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
- CTMP2 = lsr(CTMP2,RIGHTSHIFT)
- LEFTSHIFT = sub(#64,RIGHTSHIFT)
- }
- {
- ZERO = #0
- TMP = #-2
- CTMP2 |= lsl(CTMP,LEFTSHIFT)
- CTMP = asr(CTMP,RIGHTSHIFT)
- }
- {
- P_CARRY = cmp.gtu(STICKIES,ZERO) // If we have sticky bits from C shift
- if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
- #undef ZERO
- #define ONE r15:14
- #define S_ONE r14
- ONE = #1
- STICKIES = #0
- }
- {
- PP_LL = add(CTMP2,PP_LL,P_CARRY):carry // use the carry to add the sticky
- }
- {
- PP_HH = add(CTMP,PP_HH,P_CARRY):carry
- TMP = #62
- }
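- //
- // The trick above, as a C sketch (illustration only): when C contributed
- // sticky bits, bit 0 of the aligned low word is cleared first, so adding the
- // carry-in of 1 behaves exactly like OR-ing in a sticky 1 and cannot ripple.
- //
- //   static uint64_t lo_with_sticky(uint64_t lo_aligned, int sticky) {
- //       if (sticky)
- //           lo_aligned &= ~1ull;       // make "+1" equivalent to "|1"
- //       return lo_aligned + (sticky != 0);
- //   }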
- // PP_HH:PP_LL now holds the sum
- // We may need to normalize left, up to ??? bits.
- //
- // I think that if we have massive cancellation, the range we normalize by
- // is still limited
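- //
- // The normalization below is a 64-bit funnel shift; whatever stays below the
- // kept 64 bits is folded into the sticky accumulator. C sketch (illustrative;
- // assumes a shift amount of 1..63 -- the all-sign-bits case is handled by the
- // separate 62-bit pre-shift just below):
- //
- //   static uint64_t shl128_keep_hi(uint64_t hi, uint64_t lo, unsigned n,
- //                                  uint64_t *sticky) {
- //       *sticky |= lo << n;            // bits that stay below the kept window
- //       return (hi << n) | (lo >> (64 - n));
- //   }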
- {
- LEFTSHIFT = add(clb(PP_HH),#-2)
- if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
- }
- // We had all sign bits, shift left by 62.
- {
- CTMP = extractu(PP_LL,#62,#2)
- PP_LL = asl(PP_LL,#62)
- EXPA = add(EXPA,#-62) // And adjust exponent of result
- }
- {
- PP_HH = insert(CTMP,#62,#0) // Then shift 63
- }
- {
- LEFTSHIFT = add(clb(PP_HH),#-2)
- }
- .falign
- 1:
- {
- CTMP = asl(PP_HH,LEFTSHIFT)
- STICKIES |= asl(PP_LL,LEFTSHIFT)
- RIGHTSHIFT = sub(#64,LEFTSHIFT)
- EXPA = sub(EXPA,LEFTSHIFT)
- }
- {
- CTMP |= lsr(PP_LL,RIGHTSHIFT)
- EXACT = cmp.gtu(ONE,STICKIES)
- TMP = #BIAS+BIAS-2
- }
- {
- if (!EXACT) CTMPL = or(CTMPL,S_ONE)
- // If EXPA is overflow/underflow, jump to ovf_unf
- P_TMP = !cmp.gt(EXPA,TMP)
- P_TMP = cmp.gt(EXPA,#1)
- if (!P_TMP.new) jump:nt .Lfma_ovf_unf
- }
- {
- // XXX: FIXME: should PP_HH for check of zero be CTMP?
- P_TMP = cmp.gtu(ONE,CTMP) // is result true zero?
- A = convert_d2df(CTMP)
- EXPA = add(EXPA,#-BIAS-60)
- PP_HH = memd(r29+#0)
- }
- {
- AH += asl(EXPA,#HI_MANTBITS)
- EXPCA = memd(r29+#8)
- if (!P_TMP) dealloc_return // not zero, return
- }
- .Ladd_yields_zero:
- // We had full cancellation. Return +/- zero (-0 when round-down)
- {
- TMP = USR
- A = #0
- }
- {
- TMP = extractu(TMP,#2,#SR_ROUND_OFF)
- PP_HH = memd(r29+#0)
- EXPCA = memd(r29+#8)
- }
- {
- p0 = cmp.eq(TMP,#2)
- if (p0.new) AH = ##0x80000000
- dealloc_return
- }
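- //
- // Rule applied above, as a C sketch (read_round_mode() is a hypothetical
- // stand-in for reading the two rounding-control bits out of USR): an exactly
- // cancelled sum is +0 in every rounding mode except round-toward-negative
- // (encoding 2 here), which yields -0.
- //
- //   double cancelled_to_zero(void) {
- //       return (read_round_mode() == 2) ? -0.0 : 0.0;
- //   }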
- #undef RIGHTLEFTSHIFT
- #undef RIGHTSHIFT
- #undef LEFTSHIFT
- #undef CTMP2
- #undef CTMP2H
- #undef CTMP2L
- .Lfma_ovf_unf:
- {
- p0 = cmp.gtu(ONE,CTMP)
- if (p0.new) jump:nt .Ladd_yields_zero
- }
- {
- A = convert_d2df(CTMP)
- EXPA = add(EXPA,#-BIAS-60)
- TMP = EXPA
- }
- #define NEW_EXPB r7
- #define NEW_EXPA r6
- {
- AH += asl(EXPA,#HI_MANTBITS)
- NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
- }
- {
- NEW_EXPA = add(EXPA,NEW_EXPB)
- PP_HH = memd(r29+#0)
- EXPCA = memd(r29+#8)
- #undef PP_HH
- #undef PP_HH_H
- #undef PP_HH_L
- #undef EXPCA
- #undef EXPC
- #undef EXPA
- #undef PP_LL
- #undef PP_LL_H
- #undef PP_LL_L
- #define EXPA r6
- #define EXPB r7
- #define EXPBA r7:6
- #define ATMP r9:8
- #define ATMPH r9
- #define ATMPL r8
- #undef NEW_EXPB
- #undef NEW_EXPA
- ATMP = abs(CTMP)
- }
- {
- p0 = cmp.gt(EXPA,##BIAS+BIAS)
- if (p0.new) jump:nt .Lfma_ovf
- }
- {
- p0 = cmp.gt(EXPA,#0)
- if (p0.new) jump:nt .Lpossible_unf
- }
- {
- // TMP has original EXPA.
- // ATMP is corresponding value
- // Normalize ATMP and shift right to correct location
- EXPB = add(clb(ATMP),#-2) // Amount to left shift to normalize
- EXPA = sub(#1+5,TMP) // Amount to right shift to denormalize
- p3 = cmp.gt(CTMPH,#-1)
- }
- // Underflow
- // We know that the infinite-range exponent should be EXPA
- // CTMP is 2's complement, ATMP is abs(CTMP)
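- //
- // The denormalization below shifts the result right and records the discarded
- // bits as a sticky bit in the LSB. C sketch (illustration only; the real code
- // also raises the underflow/inexact flags in USR):
- //
- //   static uint64_t shr_sticky(uint64_t x, unsigned n) {  // assumes 0 < n < 64
- //       uint64_t lost = x & ((1ull << n) - 1);
- //       return (x >> n) | (lost != 0);
- //   }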
- {
- EXPA = add(EXPA,EXPB) // how much to shift back right
- ATMP = asl(ATMP,EXPB) // shift left
- AH = USR
- TMP = #63
- }
- {
- EXPB = min(EXPA,TMP)
- EXPA = #0
- AL = #0x0030
- }
- {
- B = extractu(ATMP,EXPBA)
- ATMP = asr(ATMP,EXPB)
- }
- {
- p0 = cmp.gtu(ONE,B)
- if (!p0.new) ATMPL = or(ATMPL,S_ONE)
- ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
- }
- {
- CTMP = neg(ATMP)
- p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
- if (!p1.new) AH = or(AH,AL)
- B = #0
- }
- {
- if (p3) CTMP = ATMP
- USR = AH
- TMP = #-BIAS-(MANTBITS+FUDGE2)
- }
- {
- A = convert_d2df(CTMP)
- }
- {
- AH += asl(TMP,#HI_MANTBITS)
- dealloc_return
- }
- .Lpossible_unf:
- {
- TMP = ##0x7fefffff
- ATMP = abs(CTMP)
- }
- {
- p0 = cmp.eq(AL,#0)
- p0 = bitsclr(AH,TMP)
- if (!p0.new) dealloc_return:t
- TMP = #0x7fff
- }
- {
- p0 = bitsset(ATMPH,TMP)
- BH = USR
- BL = #0x0030
- }
- {
- if (p0) BH = or(BH,BL)
- }
- {
- USR = BH
- }
- {
- p0 = dfcmp.eq(A,A)
- dealloc_return
- }
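- //
- // .Lfma_ovf below applies the usual IEEE 754 overflow rule: the result is
- // +/- infinity unless the rounding direction points back toward zero for this
- // sign, in which case the largest finite value of that sign is returned.
- // C sketch (illustrative; the helper and its flag argument are assumptions):
- //
- //   double overflow_value(int negative, int rounds_toward_zero_for_sign) {
- //       double mag = rounds_toward_zero_for_sign
- //                  ? 0x1.fffffffffffffp+1023     // 0x7FEFFFFF_FFFFFFFF, DBL_MAX
- //                  : __builtin_inf();
- //       return negative ? -mag : mag;
- //   }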
- .Lfma_ovf:
- {
- TMP = USR
- CTMP = combine(##0x7fefffff,#-1)
- A = CTMP
- }
- {
- ATMP = combine(##0x7ff00000,#0)
- BH = extractu(TMP,#2,#SR_ROUND_OFF)
- TMP = or(TMP,#0x28)
- }
- {
- USR = TMP
- BH ^= lsr(AH,#31)
- BL = BH
- }
- {
- p0 = !cmp.eq(BL,#1)
- p0 = !cmp.eq(BH,#2)
- }
- {
- p0 = dfcmp.eq(ATMP,ATMP)
- if (p0.new) CTMP = ATMP
- }
- {
- A = insert(CTMP,#63,#0)
- dealloc_return
- }
- #undef CTMP
- #undef CTMPH
- #undef CTMPL
- #define BTMP r11:10
- #define BTMPH r11
- #define BTMPL r10
- #undef STICKIES
- #undef STICKIESH
- #undef STICKIESL
- #define C r5:4
- #define CH r5
- #define CL r4
- .Lfma_abnormal_ab:
- {
- ATMP = extractu(A,#63,#0)
- BTMP = extractu(B,#63,#0)
- deallocframe
- }
- {
- p3 = cmp.gtu(ATMP,BTMP)
- if (!p3.new) A = B // sort values
- if (!p3.new) B = A
- }
- {
- p0 = dfclass(A,#0x0f) // A NaN?
- if (!p0.new) jump:nt .Lnan
- if (!p3) ATMP = BTMP
- if (!p3) BTMP = ATMP
- }
- {
- p1 = dfclass(A,#0x08) // A is infinity
- p1 = dfclass(B,#0x0e) // B is nonzero
- }
- {
- p0 = dfclass(A,#0x08) // a is inf
- p0 = dfclass(B,#0x01) // b is zero
- }
- {
- if (p1) jump .Lab_inf
- p2 = dfclass(B,#0x01)
- }
- {
- if (p0) jump .Linvalid
- if (p2) jump .Lab_true_zero
- TMP = ##0x7c000000
- }
- // We are left with a normal or subnormal times a subnormal, with A > B.
- // If A and B are both very small, we will go to a single sticky bit; replace
- // the lower 63 bits of A and B with 0x0010_0000_0000_0000, which yields
- // equivalent results. If A and B might multiply to something bigger, decrease
- // A's exponent and increase B's exponent by the same amount and start over
- // (see the C sketch after the retry jump below).
- {
- p0 = bitsclr(AH,TMP)
- if (p0.new) jump:nt .Lfma_ab_tiny
- }
- {
- TMP = add(clb(BTMP),#-EXPBITS)
- }
- {
- BTMP = asl(BTMP,TMP)
- }
- {
- B = insert(BTMP,#63,#0)
- AH -= asl(TMP,#HI_MANTBITS)
- }
- jump .Lfma_begin
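- //
- // C sketch of the exponent rebalancing above (hypothetical helper, sketch
- // only; assumes B is a nonzero subnormal and that A's exponent stays in
- // range, which the bitsclr test above guarantees): normalize B's significand
- // and take the same amount off A's exponent so the product A*B is unchanged.
- //
- //   static void rebalance(uint64_t *a_bits, uint64_t *b_bits) {
- //       uint64_t bmag  = *b_bits & 0x7FFFFFFFFFFFFFFFull;   // drop sign bit
- //       int      shift = __builtin_clzll(bmag) - 11;        // 11 exponent bits
- //       *b_bits = (*b_bits & 0x8000000000000000ull) | (bmag << shift);
- //       *a_bits -= (uint64_t)shift << 52;                   // lower A's exponent
- //   }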
- .Lfma_ab_tiny:
- ATMP = combine(##0x00100000,#0)
- {
- A = insert(ATMP,#63,#0)
- B = insert(ATMP,#63,#0)
- }
- jump .Lfma_begin
- .Lab_inf:
- {
- B = lsr(B,#63)
- p0 = dfclass(C,#0x10)
- }
- {
- A ^= asl(B,#63)
- if (p0) jump .Lnan
- }
- {
- p1 = dfclass(C,#0x08)
- if (p1.new) jump:nt .Lfma_inf_plus_inf
- }
- // A*B is +/- inf, C is finite. Return A
- {
- jumpr r31
- }
- .falign
- .Lfma_inf_plus_inf:
- { // adding infinities of different signs is invalid
- p0 = dfcmp.eq(A,C)
- if (!p0.new) jump:nt .Linvalid
- }
- {
- jumpr r31
- }
- .Lnan:
- {
- p0 = dfclass(B,#0x10)
- p1 = dfclass(C,#0x10)
- if (!p0.new) B = A
- if (!p1.new) C = A
- }
- { // find sNaNs
- BH = convert_df2sf(B)
- BL = convert_df2sf(C)
- }
- {
- BH = convert_df2sf(A)
- A = #-1
- jumpr r31
- }
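- //
- // The NaN result above is r1:0 = all ones, a quiet NaN pattern; the
- // convert_df2sf instructions exist only to raise the invalid flag when an
- // operand is a signaling NaN. C sketch of the returned pattern:
- //
- //   double default_qnan(void) {
- //       uint64_t bits = ~0ull;   // sign=1, exponent all ones, mantissa MSB set
- //       double d;
- //       __builtin_memcpy(&d, &bits, sizeof d);
- //       return d;
- //   }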
- .Linvalid:
- {
- TMP = ##0x7f800001 // sp snan
- }
- {
- A = convert_sf2df(TMP)
- jumpr r31
- }
- .Lab_true_zero:
- // B is zero, A is finite number
- {
- p0 = dfclass(C,#0x10)
- if (p0.new) jump:nt .Lnan
- if (p0.new) A = C
- }
- {
- p0 = dfcmp.eq(B,C) // is C also zero?
- AH = lsr(AH,#31) // get sign
- }
- {
- BH ^= asl(AH,#31) // form correctly signed zero in B
- if (!p0) A = C // If C is not zero, return C
- if (!p0) jumpr r31
- }
- // B has correctly signed zero, C is also zero
- .Lzero_plus_zero:
- {
- p0 = cmp.eq(B,C) // yes, scalar equals. +0++0 or -0+-0
- if (p0.new) jumpr:t r31
- A = B
- }
- {
- TMP = USR
- }
- {
- TMP = extractu(TMP,#2,#SR_ROUND_OFF)
- A = #0
- }
- {
- p0 = cmp.eq(TMP,#2)
- if (p0.new) AH = ##0x80000000
- jumpr r31
- }
- #undef BTMP
- #undef BTMPH
- #undef BTMPL
- #define CTMP r11:10
- .falign
- .Lfma_abnormal_c:
- // We know that AB is normal * normal
- // C is not normal: zero, subnormal, inf, or NaN.
- {
- p0 = dfclass(C,#0x10) // is C NaN?
- if (p0.new) jump:nt .Lnan
- if (p0.new) A = C // move NaN to A
- deallocframe
- }
- {
- p0 = dfclass(C,#0x08) // is C inf?
- if (p0.new) A = C // return C
- if (p0.new) jumpr:nt r31
- }
- // zero or subnormal
- // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
- {
- p0 = dfclass(C,#0x01) // is C zero?
- if (p0.new) jump:nt __hexagon_muldf3
- TMP = #1
- }
- // Left with: subnormal
- // Adjust C and jump back to restart
- {
- allocframe(#STACKSPACE) // oops, deallocated above, re-allocate frame
- CTMP = #0
- CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
- jump .Lfma_abnormal_c_restart
- }
- END(__hexagon_fmadf4)