- .text
- .globl bn_mul_mont
- .type bn_mul_mont,%function
- .align 5
- bn_mul_mont:
- tst x5,#7
- b.eq __bn_sqr8x_mont
- tst x5,#3
- b.eq __bn_mul4x_mont
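- // The generic .Lmul_mont path below computes rp[] = ap[]*bp[]*R^-1 mod np[],
- // with R = 2^(64*num), one word of bp[] at a time and with the Montgomery
- // reduction interleaved.  A minimal C sketch of the same algorithm follows
- // (illustrative only: the function name, the caller-provided zeroed tp[]
- // scratch of num+1 words, and the use of <stdint.h>/unsigned __int128 are
- // assumptions of the sketch, not this file's interface):
- //
- //   static void mont_mul_sketch(uint64_t *tp, const uint64_t *ap,
- //                               const uint64_t *bp, const uint64_t *np,
- //                               uint64_t n0, size_t num)
- //   {
- //       for (size_t i = 0; i < num; i++) {
- //           unsigned __int128 t; uint64_t c = 0;
- //           for (size_t j = 0; j < num; j++) {       // tp += ap*bp[i]
- //               t = (unsigned __int128)ap[j]*bp[i] + tp[j] + c;
- //               tp[j] = (uint64_t)t;  c = (uint64_t)(t >> 64);
- //           }
- //           unsigned __int128 top = (unsigned __int128)tp[num] + c;
- //           uint64_t m = tp[0]*n0;                   // n0 = -np[0]^-1 mod 2^64
- //           t = (unsigned __int128)np[0]*m + tp[0];  // low word becomes zero
- //           c = (uint64_t)(t >> 64);
- //           for (size_t j = 1; j < num; j++) {       // tp = (tp + m*np) >> 64
- //               t = (unsigned __int128)np[j]*m + tp[j] + c;
- //               tp[j-1] = (uint64_t)t;  c = (uint64_t)(t >> 64);
- //           }
- //           top += c;
- //           tp[num-1] = (uint64_t)top;  tp[num] = (uint64_t)(top >> 64);
- //       }
- //       // tp[] now holds ap*bp*R^-1 mod np, possibly plus one extra np;
- //       // the conditional subtraction at .Lsub/.Lcond_copy removes it
- //       // and writes the result to rp[].
- //   }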
- .Lmul_mont:
- stp x29,x30,[sp,#-64]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- ldr x9,[x2],#8 // bp[0]
- sub x22,sp,x5,lsl#3
- ldp x7,x8,[x1],#16 // ap[0..1]
- lsl x5,x5,#3
- ldr x4,[x4] // *n0
- and x22,x22,#-16 // ABI says so
- ldp x13,x14,[x3],#16 // np[0..1]
- mul x6,x7,x9 // ap[0]*bp[0]
- sub x21,x5,#16 // j=num-2
- umulh x7,x7,x9
- mul x10,x8,x9 // ap[1]*bp[0]
- umulh x11,x8,x9
- mul x15,x6,x4 // "tp[0]"*n0
- mov sp,x22 // alloca
- // (*) mul x12,x13,x15 // np[0]*m1
- umulh x13,x13,x15
- mul x16,x14,x15 // np[1]*m1
- // (*) adds x12,x12,x6 // discarded
- // (*) On the removal of the first multiplication and addition:
- // the outcome of the first addition is guaranteed to be zero,
- // so the only computationally significant outcome is whether
- // it carries or not. When does it carry? Is there another way
- // to deduce it? If you follow the operations, the condition
- // for a carry turns out to be quite simple: x6 being non-zero.
- // The carry can therefore be calculated by adding -1 to x6,
- // which is what the next instruction does.
- subs xzr,x6,#1 // (*)
- umulh x17,x14,x15
- adc x13,x13,xzr
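- // (A hedged restatement of the identity relied on above, assuming
- //  n0 = -np[0]^-1 mod 2^64: m1 was chosen so that
- //  lo64(np[0]*m1) == (uint64_t)(0 - x6), hence the omitted addition
- //  "lo64(np[0]*m1) + x6" is zero mod 2^64 and produces a carry exactly
- //  when x6 != 0.  "subs xzr,x6,#1" sets the carry flag, meaning "no
- //  borrow", precisely when x6 >= 1, the same condition, which the
- //  "adc" above then consumes.)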
- cbz x21,.L1st_skip
- .L1st:
- ldr x8,[x1],#8
- adds x6,x10,x7
- sub x21,x21,#8 // j--
- adc x7,x11,xzr
- ldr x14,[x3],#8
- adds x12,x16,x13
- mul x10,x8,x9 // ap[j]*bp[0]
- adc x13,x17,xzr
- umulh x11,x8,x9
- adds x12,x12,x6
- mul x16,x14,x15 // np[j]*m1
- adc x13,x13,xzr
- umulh x17,x14,x15
- str x12,[x22],#8 // tp[j-1]
- cbnz x21,.L1st
- .L1st_skip:
- adds x6,x10,x7
- sub x1,x1,x5 // rewind x1
- adc x7,x11,xzr
- adds x12,x16,x13
- sub x3,x3,x5 // rewind x3
- adc x13,x17,xzr
- adds x12,x12,x6
- sub x20,x5,#8 // i=num-1
- adcs x13,x13,x7
- adc x19,xzr,xzr // upmost overflow bit
- stp x12,x13,[x22]
- .Louter:
- ldr x9,[x2],#8 // bp[i]
- ldp x7,x8,[x1],#16
- ldr x23,[sp] // tp[0]
- add x22,sp,#8
- mul x6,x7,x9 // ap[0]*bp[i]
- sub x21,x5,#16 // j=num-2
- umulh x7,x7,x9
- ldp x13,x14,[x3],#16
- mul x10,x8,x9 // ap[1]*bp[i]
- adds x6,x6,x23
- umulh x11,x8,x9
- adc x7,x7,xzr
- mul x15,x6,x4
- sub x20,x20,#8 // i--
- // (*) mul x12,x13,x15 // np[0]*m1
- umulh x13,x13,x15
- mul x16,x14,x15 // np[1]*m1
- // (*) adds x12,x12,x6
- subs xzr,x6,#1 // (*)
- umulh x17,x14,x15
- cbz x21,.Linner_skip
- .Linner:
- ldr x8,[x1],#8
- adc x13,x13,xzr
- ldr x23,[x22],#8 // tp[j]
- adds x6,x10,x7
- sub x21,x21,#8 // j--
- adc x7,x11,xzr
- adds x12,x16,x13
- ldr x14,[x3],#8
- adc x13,x17,xzr
- mul x10,x8,x9 // ap[j]*bp[i]
- adds x6,x6,x23
- umulh x11,x8,x9
- adc x7,x7,xzr
- mul x16,x14,x15 // np[j]*m1
- adds x12,x12,x6
- umulh x17,x14,x15
- str x12,[x22,#-16] // tp[j-1]
- cbnz x21,.Linner
- .Linner_skip:
- ldr x23,[x22],#8 // tp[j]
- adc x13,x13,xzr
- adds x6,x10,x7
- sub x1,x1,x5 // rewind x1
- adc x7,x11,xzr
- adds x12,x16,x13
- sub x3,x3,x5 // rewind x3
- adcs x13,x17,x19
- adc x19,xzr,xzr
- adds x6,x6,x23
- adc x7,x7,xzr
- adds x12,x12,x6
- adcs x13,x13,x7
- adc x19,x19,xzr // upmost overflow bit
- stp x12,x13,[x22,#-16]
- cbnz x20,.Louter
- // Final step. We check whether the result is larger than the
- // modulus and, if it is, subtract the modulus. But comparison
- // implies subtraction, so we simply subtract the modulus, check
- // whether the subtraction borrowed, and conditionally copy the
- // original value back.
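- // A C sketch of this tail (illustrative only; tp[num] stands for the
- // "upmost overflow bit" kept in x19):
- //
- //   uint64_t borrow = 0;
- //   for (size_t j = 0; j < num; j++) {            // rp = tp - np
- //       uint64_t b = (tp[j] < np[j]) | ((tp[j] == np[j]) & borrow);
- //       rp[j] = tp[j] - np[j] - borrow;
- //       borrow = b;
- //   }
- //   borrow = tp[num] < borrow;                    // did the whole thing borrow?
- //   for (size_t j = 0; j < num; j++)              // if so, keep tp (tp < np);
- //       rp[j] = borrow ? tp[j] : rp[j];           // done with csel, not a branch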
- ldr x23,[sp] // tp[0]
- add x22,sp,#8
- ldr x14,[x3],#8 // np[0]
- subs x21,x5,#8 // j=num-1 and clear borrow
- mov x1,x0
- .Lsub:
- sbcs x8,x23,x14 // tp[j]-np[j]
- ldr x23,[x22],#8
- sub x21,x21,#8 // j--
- ldr x14,[x3],#8
- str x8,[x1],#8 // rp[j]=tp[j]-np[j]
- cbnz x21,.Lsub
- sbcs x8,x23,x14
- sbcs x19,x19,xzr // did it borrow?
- str x8,[x1],#8 // rp[num-1]
- ldr x23,[sp] // tp[0]
- add x22,sp,#8
- ldr x8,[x0],#8 // rp[0]
- sub x5,x5,#8 // num--
- nop
- .Lcond_copy:
- sub x5,x5,#8 // num--
- csel x14,x23,x8,lo // did it borrow?
- ldr x23,[x22],#8
- ldr x8,[x0],#8
- str xzr,[x22,#-16] // wipe tp
- str x14,[x0,#-16]
- cbnz x5,.Lcond_copy
- csel x14,x23,x8,lo
- str xzr,[x22,#-8] // wipe tp
- str x14,[x0,#-8]
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldr x29,[sp],#64
- ret
- .size bn_mul_mont,.-bn_mul_mont
- .type __bn_sqr8x_mont,%function
- .align 5
- __bn_sqr8x_mont:
- cmp x1,x2
- b.ne __bn_mul4x_mont
- .Lsqr8x_mont:
- .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- stp x0,x3,[sp,#96] // offload rp and np
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- ldp x10,x11,[x1,#8*4]
- ldp x12,x13,[x1,#8*6]
- sub x2,sp,x5,lsl#4
- lsl x5,x5,#3
- ldr x4,[x4] // *n0
- mov sp,x2 // alloca
- sub x27,x5,#8*8
- b .Lsqr8x_zero_start
- .Lsqr8x_zero:
- sub x27,x27,#8*8
- stp xzr,xzr,[x2,#8*0]
- stp xzr,xzr,[x2,#8*2]
- stp xzr,xzr,[x2,#8*4]
- stp xzr,xzr,[x2,#8*6]
- .Lsqr8x_zero_start:
- stp xzr,xzr,[x2,#8*8]
- stp xzr,xzr,[x2,#8*10]
- stp xzr,xzr,[x2,#8*12]
- stp xzr,xzr,[x2,#8*14]
- add x2,x2,#8*16
- cbnz x27,.Lsqr8x_zero
- add x3,x1,x5
- add x1,x1,#8*8
- mov x19,xzr
- mov x20,xzr
- mov x21,xzr
- mov x22,xzr
- mov x23,xzr
- mov x24,xzr
- mov x25,xzr
- mov x26,xzr
- mov x2,sp
- str x4,[x29,#112] // offload n0
- // Multiply everything but a[i]*a[i]
- .align 4
- .Lsqr8x_outer_loop:
- // a[1]a[0] (i)
- // a[2]a[0]
- // a[3]a[0]
- // a[4]a[0]
- // a[5]a[0]
- // a[6]a[0]
- // a[7]a[0]
- // a[2]a[1] (ii)
- // a[3]a[1]
- // a[4]a[1]
- // a[5]a[1]
- // a[6]a[1]
- // a[7]a[1]
- // a[3]a[2] (iii)
- // a[4]a[2]
- // a[5]a[2]
- // a[6]a[2]
- // a[7]a[2]
- // a[4]a[3] (iv)
- // a[5]a[3]
- // a[6]a[3]
- // a[7]a[3]
- // a[5]a[4] (v)
- // a[6]a[4]
- // a[7]a[4]
- // a[6]a[5] (vi)
- // a[7]a[5]
- // a[7]a[6] (vii)
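- // (The schedule above lists the 28 distinct off-diagonal products
- //  a[i]*a[j], j < i, of the current eight-limb window; the diagonal
- //  terms a[i]*a[i] are deferred to the doubling pass after
- //  .Lsqr8x_outer_break.)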
- mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
- mul x15,x8,x6
- mul x16,x9,x6
- mul x17,x10,x6
- adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
- mul x14,x11,x6
- adcs x21,x21,x15
- mul x15,x12,x6
- adcs x22,x22,x16
- mul x16,x13,x6
- adcs x23,x23,x17
- umulh x17,x7,x6 // hi(a[1..7]*a[0])
- adcs x24,x24,x14
- umulh x14,x8,x6
- adcs x25,x25,x15
- umulh x15,x9,x6
- adcs x26,x26,x16
- umulh x16,x10,x6
- stp x19,x20,[x2],#8*2 // t[0..1]
- adc x19,xzr,xzr // t[8]
- adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
- umulh x17,x11,x6
- adcs x22,x22,x14
- umulh x14,x12,x6
- adcs x23,x23,x15
- umulh x15,x13,x6
- adcs x24,x24,x16
- mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
- adcs x25,x25,x17
- mul x17,x9,x7
- adcs x26,x26,x14
- mul x14,x10,x7
- adc x19,x19,x15
- mul x15,x11,x7
- adds x22,x22,x16
- mul x16,x12,x7
- adcs x23,x23,x17
- mul x17,x13,x7
- adcs x24,x24,x14
- umulh x14,x8,x7 // hi(a[2..7]*a[1])
- adcs x25,x25,x15
- umulh x15,x9,x7
- adcs x26,x26,x16
- umulh x16,x10,x7
- adcs x19,x19,x17
- umulh x17,x11,x7
- stp x21,x22,[x2],#8*2 // t[2..3]
- adc x20,xzr,xzr // t[9]
- adds x23,x23,x14
- umulh x14,x12,x7
- adcs x24,x24,x15
- umulh x15,x13,x7
- adcs x25,x25,x16
- mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
- adcs x26,x26,x17
- mul x17,x10,x8
- adcs x19,x19,x14
- mul x14,x11,x8
- adc x20,x20,x15
- mul x15,x12,x8
- adds x24,x24,x16
- mul x16,x13,x8
- adcs x25,x25,x17
- umulh x17,x9,x8 // hi(a[3..7]*a[2])
- adcs x26,x26,x14
- umulh x14,x10,x8
- adcs x19,x19,x15
- umulh x15,x11,x8
- adcs x20,x20,x16
- umulh x16,x12,x8
- stp x23,x24,[x2],#8*2 // t[4..5]
- adc x21,xzr,xzr // t[10]
- adds x25,x25,x17
- umulh x17,x13,x8
- adcs x26,x26,x14
- mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
- adcs x19,x19,x15
- mul x15,x11,x9
- adcs x20,x20,x16
- mul x16,x12,x9
- adc x21,x21,x17
- mul x17,x13,x9
- adds x26,x26,x14
- umulh x14,x10,x9 // hi(a[4..7]*a[3])
- adcs x19,x19,x15
- umulh x15,x11,x9
- adcs x20,x20,x16
- umulh x16,x12,x9
- adcs x21,x21,x17
- umulh x17,x13,x9
- stp x25,x26,[x2],#8*2 // t[6..7]
- adc x22,xzr,xzr // t[11]
- adds x19,x19,x14
- mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
- adcs x20,x20,x15
- mul x15,x12,x10
- adcs x21,x21,x16
- mul x16,x13,x10
- adc x22,x22,x17
- umulh x17,x11,x10 // hi(a[5..7]*a[4])
- adds x20,x20,x14
- umulh x14,x12,x10
- adcs x21,x21,x15
- umulh x15,x13,x10
- adcs x22,x22,x16
- mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
- adc x23,xzr,xzr // t[12]
- adds x21,x21,x17
- mul x17,x13,x11
- adcs x22,x22,x14
- umulh x14,x12,x11 // hi(a[6..7]*a[5])
- adc x23,x23,x15
- umulh x15,x13,x11
- adds x22,x22,x16
- mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
- adcs x23,x23,x17
- umulh x17,x13,x12 // hi(a[7]*a[6])
- adc x24,xzr,xzr // t[13]
- adds x23,x23,x14
- sub x27,x3,x1 // done yet?
- adc x24,x24,x15
- adds x24,x24,x16
- sub x14,x3,x5 // rewound ap
- adc x25,xzr,xzr // t[14]
- add x25,x25,x17
- cbz x27,.Lsqr8x_outer_break
- mov x4,x6
- ldp x6,x7,[x2,#8*0]
- ldp x8,x9,[x2,#8*2]
- ldp x10,x11,[x2,#8*4]
- ldp x12,x13,[x2,#8*6]
- adds x19,x19,x6
- adcs x20,x20,x7
- ldp x6,x7,[x1,#8*0]
- adcs x21,x21,x8
- adcs x22,x22,x9
- ldp x8,x9,[x1,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x1,#8*4]
- adcs x25,x25,x12
- mov x0,x1
- adcs x26,xzr,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- //adc x28,xzr,xzr // moved below
- mov x27,#-8*8
- // a[8]a[0]
- // a[9]a[0]
- // a[a]a[0]
- // a[b]a[0]
- // a[c]a[0]
- // a[d]a[0]
- // a[e]a[0]
- // a[f]a[0]
- // a[8]a[1]
- // a[f]a[1]........................
- // a[8]a[2]
- // a[f]a[2]........................
- // a[8]a[3]
- // a[f]a[3]........................
- // a[8]a[4]
- // a[f]a[4]........................
- // a[8]a[5]
- // a[f]a[5]........................
- // a[8]a[6]
- // a[f]a[6]........................
- // a[8]a[7]
- // a[f]a[7]........................
- .Lsqr8x_mul:
- mul x14,x6,x4
- adc x28,xzr,xzr // carry bit, modulo-scheduled
- mul x15,x7,x4
- add x27,x27,#8
- mul x16,x8,x4
- mul x17,x9,x4
- adds x19,x19,x14
- mul x14,x10,x4
- adcs x20,x20,x15
- mul x15,x11,x4
- adcs x21,x21,x16
- mul x16,x12,x4
- adcs x22,x22,x17
- mul x17,x13,x4
- adcs x23,x23,x14
- umulh x14,x6,x4
- adcs x24,x24,x15
- umulh x15,x7,x4
- adcs x25,x25,x16
- umulh x16,x8,x4
- adcs x26,x26,x17
- umulh x17,x9,x4
- adc x28,x28,xzr
- str x19,[x2],#8
- adds x19,x20,x14
- umulh x14,x10,x4
- adcs x20,x21,x15
- umulh x15,x11,x4
- adcs x21,x22,x16
- umulh x16,x12,x4
- adcs x22,x23,x17
- umulh x17,x13,x4
- ldr x4,[x0,x27]
- adcs x23,x24,x14
- adcs x24,x25,x15
- adcs x25,x26,x16
- adcs x26,x28,x17
- //adc x28,xzr,xzr // moved above
- cbnz x27,.Lsqr8x_mul
- // note that carry flag is guaranteed
- // to be zero at this point
- cmp x1,x3 // done yet?
- b.eq .Lsqr8x_break
- ldp x6,x7,[x2,#8*0]
- ldp x8,x9,[x2,#8*2]
- ldp x10,x11,[x2,#8*4]
- ldp x12,x13,[x2,#8*6]
- adds x19,x19,x6
- ldr x4,[x0,#-8*8]
- adcs x20,x20,x7
- ldp x6,x7,[x1,#8*0]
- adcs x21,x21,x8
- adcs x22,x22,x9
- ldp x8,x9,[x1,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x1,#8*4]
- adcs x25,x25,x12
- mov x27,#-8*8
- adcs x26,x26,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- //adc x28,xzr,xzr // moved above
- b .Lsqr8x_mul
- .align 4
- .Lsqr8x_break:
- ldp x6,x7,[x0,#8*0]
- add x1,x0,#8*8
- ldp x8,x9,[x0,#8*2]
- sub x14,x3,x1 // is it last iteration?
- ldp x10,x11,[x0,#8*4]
- sub x15,x2,x14
- ldp x12,x13,[x0,#8*6]
- cbz x14,.Lsqr8x_outer_loop
- stp x19,x20,[x2,#8*0]
- ldp x19,x20,[x15,#8*0]
- stp x21,x22,[x2,#8*2]
- ldp x21,x22,[x15,#8*2]
- stp x23,x24,[x2,#8*4]
- ldp x23,x24,[x15,#8*4]
- stp x25,x26,[x2,#8*6]
- mov x2,x15
- ldp x25,x26,[x15,#8*6]
- b .Lsqr8x_outer_loop
- .align 4
- .Lsqr8x_outer_break:
- // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
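- // (A hedged worked example of the identity used here: with B = 2^64 and,
- //  say, two limbs, (a1*B + a0)^2 = a0^2 + 2*a0*a1*B + a1^2*B^2.  The
- //  off-diagonal products accumulated above supply the a0*a1 term; the
- //  extr/adcs chain below doubles them, then each a[i]^2 is added in.)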
- ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
- ldp x15,x16,[sp,#8*1]
- ldp x11,x13,[x14,#8*2]
- add x1,x14,#8*4
- ldp x17,x14,[sp,#8*3]
- stp x19,x20,[x2,#8*0]
- mul x19,x7,x7
- stp x21,x22,[x2,#8*2]
- umulh x7,x7,x7
- stp x23,x24,[x2,#8*4]
- mul x8,x9,x9
- stp x25,x26,[x2,#8*6]
- mov x2,sp
- umulh x9,x9,x9
- adds x20,x7,x15,lsl#1
- extr x15,x16,x15,#63
- sub x27,x5,#8*4
- .Lsqr4x_shift_n_add:
- adcs x21,x8,x15
- extr x16,x17,x16,#63
- sub x27,x27,#8*4
- adcs x22,x9,x16
- ldp x15,x16,[x2,#8*5]
- mul x10,x11,x11
- ldp x7,x9,[x1],#8*2
- umulh x11,x11,x11
- mul x12,x13,x13
- umulh x13,x13,x13
- extr x17,x14,x17,#63
- stp x19,x20,[x2,#8*0]
- adcs x23,x10,x17
- extr x14,x15,x14,#63
- stp x21,x22,[x2,#8*2]
- adcs x24,x11,x14
- ldp x17,x14,[x2,#8*7]
- extr x15,x16,x15,#63
- adcs x25,x12,x15
- extr x16,x17,x16,#63
- adcs x26,x13,x16
- ldp x15,x16,[x2,#8*9]
- mul x6,x7,x7
- ldp x11,x13,[x1],#8*2
- umulh x7,x7,x7
- mul x8,x9,x9
- umulh x9,x9,x9
- stp x23,x24,[x2,#8*4]
- extr x17,x14,x17,#63
- stp x25,x26,[x2,#8*6]
- add x2,x2,#8*8
- adcs x19,x6,x17
- extr x14,x15,x14,#63
- adcs x20,x7,x14
- ldp x17,x14,[x2,#8*3]
- extr x15,x16,x15,#63
- cbnz x27,.Lsqr4x_shift_n_add
- ldp x1,x4,[x29,#104] // pull np and n0
- adcs x21,x8,x15
- extr x16,x17,x16,#63
- adcs x22,x9,x16
- ldp x15,x16,[x2,#8*5]
- mul x10,x11,x11
- umulh x11,x11,x11
- stp x19,x20,[x2,#8*0]
- mul x12,x13,x13
- umulh x13,x13,x13
- stp x21,x22,[x2,#8*2]
- extr x17,x14,x17,#63
- adcs x23,x10,x17
- extr x14,x15,x14,#63
- ldp x19,x20,[sp,#8*0]
- adcs x24,x11,x14
- extr x15,x16,x15,#63
- ldp x6,x7,[x1,#8*0]
- adcs x25,x12,x15
- extr x16,xzr,x16,#63
- ldp x8,x9,[x1,#8*2]
- adc x26,x13,x16
- ldp x10,x11,[x1,#8*4]
- // Reduce by 512 bits per iteration
- mul x28,x4,x19 // t[0]*n0
- ldp x12,x13,[x1,#8*6]
- add x3,x1,x5
- ldp x21,x22,[sp,#8*2]
- stp x23,x24,[x2,#8*4]
- ldp x23,x24,[sp,#8*4]
- stp x25,x26,[x2,#8*6]
- ldp x25,x26,[sp,#8*6]
- add x1,x1,#8*8
- mov x30,xzr // initial top-most carry
- mov x2,sp
- mov x27,#8
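- // A hedged pseudocode sketch of the loop below (t denotes the full
- // accumulator; only t[0..7], held in x19-x26, live in registers during
- // the loop, the rest is folded back in by the tail code):
- //
- //   for (int k = 0; k < 8; k++) {
- //       uint64_t m = t[0] * n0;   // x28; also stashed for the tail multiplies
- //       t += m * n;               // n = n[0..7] in x6-x13; zeroes t[0]
- //       t >>= 64;                 // drop the zero limb
- //   }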
- .Lsqr8x_reduction:
- // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
- mul x15,x7,x28
- sub x27,x27,#1
- mul x16,x8,x28
- str x28,[x2],#8 // put aside t[0]*n0 for tail processing
- mul x17,x9,x28
- // (*) adds xzr,x19,x14
- subs xzr,x19,#1 // (*)
- mul x14,x10,x28
- adcs x19,x20,x15
- mul x15,x11,x28
- adcs x20,x21,x16
- mul x16,x12,x28
- adcs x21,x22,x17
- mul x17,x13,x28
- adcs x22,x23,x14
- umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
- adcs x23,x24,x15
- umulh x15,x7,x28
- adcs x24,x25,x16
- umulh x16,x8,x28
- adcs x25,x26,x17
- umulh x17,x9,x28
- adc x26,xzr,xzr
- adds x19,x19,x14
- umulh x14,x10,x28
- adcs x20,x20,x15
- umulh x15,x11,x28
- adcs x21,x21,x16
- umulh x16,x12,x28
- adcs x22,x22,x17
- umulh x17,x13,x28
- mul x28,x4,x19 // next t[0]*n0
- adcs x23,x23,x14
- adcs x24,x24,x15
- adcs x25,x25,x16
- adc x26,x26,x17
- cbnz x27,.Lsqr8x_reduction
- ldp x14,x15,[x2,#8*0]
- ldp x16,x17,[x2,#8*2]
- mov x0,x2
- sub x27,x3,x1 // done yet?
- adds x19,x19,x14
- adcs x20,x20,x15
- ldp x14,x15,[x2,#8*4]
- adcs x21,x21,x16
- adcs x22,x22,x17
- ldp x16,x17,[x2,#8*6]
- adcs x23,x23,x14
- adcs x24,x24,x15
- adcs x25,x25,x16
- adcs x26,x26,x17
- //adc x28,xzr,xzr // moved below
- cbz x27,.Lsqr8x8_post_condition
- ldr x4,[x2,#-8*8]
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- ldp x10,x11,[x1,#8*4]
- mov x27,#-8*8
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- .Lsqr8x_tail:
- mul x14,x6,x4
- adc x28,xzr,xzr // carry bit, modulo-scheduled
- mul x15,x7,x4
- add x27,x27,#8
- mul x16,x8,x4
- mul x17,x9,x4
- adds x19,x19,x14
- mul x14,x10,x4
- adcs x20,x20,x15
- mul x15,x11,x4
- adcs x21,x21,x16
- mul x16,x12,x4
- adcs x22,x22,x17
- mul x17,x13,x4
- adcs x23,x23,x14
- umulh x14,x6,x4
- adcs x24,x24,x15
- umulh x15,x7,x4
- adcs x25,x25,x16
- umulh x16,x8,x4
- adcs x26,x26,x17
- umulh x17,x9,x4
- adc x28,x28,xzr
- str x19,[x2],#8
- adds x19,x20,x14
- umulh x14,x10,x4
- adcs x20,x21,x15
- umulh x15,x11,x4
- adcs x21,x22,x16
- umulh x16,x12,x4
- adcs x22,x23,x17
- umulh x17,x13,x4
- ldr x4,[x0,x27]
- adcs x23,x24,x14
- adcs x24,x25,x15
- adcs x25,x26,x16
- adcs x26,x28,x17
- //adc x28,xzr,xzr // moved above
- cbnz x27,.Lsqr8x_tail
- // note that carry flag is guaranteed
- // to be zero at this point
- ldp x6,x7,[x2,#8*0]
- sub x27,x3,x1 // done yet?
- sub x16,x3,x5 // rewound np
- ldp x8,x9,[x2,#8*2]
- ldp x10,x11,[x2,#8*4]
- ldp x12,x13,[x2,#8*6]
- cbz x27,.Lsqr8x_tail_break
- ldr x4,[x0,#-8*8]
- adds x19,x19,x6
- adcs x20,x20,x7
- ldp x6,x7,[x1,#8*0]
- adcs x21,x21,x8
- adcs x22,x22,x9
- ldp x8,x9,[x1,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x1,#8*4]
- adcs x25,x25,x12
- mov x27,#-8*8
- adcs x26,x26,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- //adc x28,xzr,xzr // moved above
- b .Lsqr8x_tail
- .align 4
- .Lsqr8x_tail_break:
- ldr x4,[x29,#112] // pull n0
- add x27,x2,#8*8 // end of current t[num] window
- subs xzr,x30,#1 // "move" top-most carry to carry bit
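- // (x30 holds the top-most carry as 0 or 1, so subtracting 1 sets the
- //  carry flag exactly when it was 1; the same flag trick as
- //  "subs xzr,x6,#1" in bn_mul_mont above.)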
- adcs x14,x19,x6
- adcs x15,x20,x7
- ldp x19,x20,[x0,#8*0]
- adcs x21,x21,x8
- ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
- adcs x22,x22,x9
- ldp x8,x9,[x16,#8*2]
- adcs x23,x23,x10
- adcs x24,x24,x11
- ldp x10,x11,[x16,#8*4]
- adcs x25,x25,x12
- adcs x26,x26,x13
- ldp x12,x13,[x16,#8*6]
- add x1,x16,#8*8
- adc x30,xzr,xzr // top-most carry
- mul x28,x4,x19
- stp x14,x15,[x2,#8*0]
- stp x21,x22,[x2,#8*2]
- ldp x21,x22,[x0,#8*2]
- stp x23,x24,[x2,#8*4]
- ldp x23,x24,[x0,#8*4]
- cmp x27,x29 // did we hit the bottom?
- stp x25,x26,[x2,#8*6]
- mov x2,x0 // slide the window
- ldp x25,x26,[x0,#8*6]
- mov x27,#8
- b.ne .Lsqr8x_reduction
- // Final step. We check whether the result is larger than the
- // modulus and, if it is, subtract the modulus. But comparison
- // implies subtraction, so we simply subtract the modulus, check
- // whether the subtraction borrowed, and conditionally copy the
- // original value back.
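- // (Same "subtract the modulus, check the borrow, conditionally keep the
- //  original" selection as in bn_mul_mont above, here walking several
- //  limbs per pass and wiping the t[] scratch during the conditional copy.)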
- ldr x0,[x29,#96] // pull rp
- add x2,x2,#8*8
- subs x14,x19,x6
- sbcs x15,x20,x7
- sub x27,x5,#8*8
- mov x3,x0 // x0 copy
- .Lsqr8x_sub:
- sbcs x16,x21,x8
- ldp x6,x7,[x1,#8*0]
- sbcs x17,x22,x9
- stp x14,x15,[x0,#8*0]
- sbcs x14,x23,x10
- ldp x8,x9,[x1,#8*2]
- sbcs x15,x24,x11
- stp x16,x17,[x0,#8*2]
- sbcs x16,x25,x12
- ldp x10,x11,[x1,#8*4]
- sbcs x17,x26,x13
- ldp x12,x13,[x1,#8*6]
- add x1,x1,#8*8
- ldp x19,x20,[x2,#8*0]
- sub x27,x27,#8*8
- ldp x21,x22,[x2,#8*2]
- ldp x23,x24,[x2,#8*4]
- ldp x25,x26,[x2,#8*6]
- add x2,x2,#8*8
- stp x14,x15,[x0,#8*4]
- sbcs x14,x19,x6
- stp x16,x17,[x0,#8*6]
- add x0,x0,#8*8
- sbcs x15,x20,x7
- cbnz x27,.Lsqr8x_sub
- sbcs x16,x21,x8
- mov x2,sp
- add x1,sp,x5
- ldp x6,x7,[x3,#8*0]
- sbcs x17,x22,x9
- stp x14,x15,[x0,#8*0]
- sbcs x14,x23,x10
- ldp x8,x9,[x3,#8*2]
- sbcs x15,x24,x11
- stp x16,x17,[x0,#8*2]
- sbcs x16,x25,x12
- ldp x19,x20,[x1,#8*0]
- sbcs x17,x26,x13
- ldp x21,x22,[x1,#8*2]
- sbcs xzr,x30,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
- stp x14,x15,[x0,#8*4]
- stp x16,x17,[x0,#8*6]
- sub x27,x5,#8*4
- .Lsqr4x_cond_copy:
- sub x27,x27,#8*4
- csel x14,x19,x6,lo
- stp xzr,xzr,[x2,#8*0]
- csel x15,x20,x7,lo
- ldp x6,x7,[x3,#8*4]
- ldp x19,x20,[x1,#8*4]
- csel x16,x21,x8,lo
- stp xzr,xzr,[x2,#8*2]
- add x2,x2,#8*4
- csel x17,x22,x9,lo
- ldp x8,x9,[x3,#8*6]
- ldp x21,x22,[x1,#8*6]
- add x1,x1,#8*4
- stp x14,x15,[x3,#8*0]
- stp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- stp xzr,xzr,[x1,#8*0]
- stp xzr,xzr,[x1,#8*2]
- cbnz x27,.Lsqr4x_cond_copy
- csel x14,x19,x6,lo
- stp xzr,xzr,[x2,#8*0]
- csel x15,x20,x7,lo
- stp xzr,xzr,[x2,#8*2]
- csel x16,x21,x8,lo
- csel x17,x22,x9,lo
- stp x14,x15,[x3,#8*0]
- stp x16,x17,[x3,#8*2]
- b .Lsqr8x_done
- .align 4
- .Lsqr8x8_post_condition:
- adc x28,xzr,xzr
- ldr x30,[x29,#8] // pull return address
- // x19-x26,x28 hold result, x6-x13 hold modulus
- subs x6,x19,x6
- ldr x1,[x29,#96] // pull rp
- sbcs x7,x20,x7
- stp xzr,xzr,[sp,#8*0]
- sbcs x8,x21,x8
- stp xzr,xzr,[sp,#8*2]
- sbcs x9,x22,x9
- stp xzr,xzr,[sp,#8*4]
- sbcs x10,x23,x10
- stp xzr,xzr,[sp,#8*6]
- sbcs x11,x24,x11
- stp xzr,xzr,[sp,#8*8]
- sbcs x12,x25,x12
- stp xzr,xzr,[sp,#8*10]
- sbcs x13,x26,x13
- stp xzr,xzr,[sp,#8*12]
- sbcs x28,x28,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*14]
- // x6-x13 hold result-modulus
- csel x6,x19,x6,lo
- csel x7,x20,x7,lo
- csel x8,x21,x8,lo
- csel x9,x22,x9,lo
- stp x6,x7,[x1,#8*0]
- csel x10,x23,x10,lo
- csel x11,x24,x11,lo
- stp x8,x9,[x1,#8*2]
- csel x12,x25,x12,lo
- csel x13,x26,x13,lo
- stp x10,x11,[x1,#8*4]
- stp x12,x13,[x1,#8*6]
- .Lsqr8x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- .inst 0xd50323bf // autiasp
- ret
- .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
- .type __bn_mul4x_mont,%function
- .align 5
- __bn_mul4x_mont:
- .inst 0xd503233f // paciasp
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub x26,sp,x5,lsl#3
- lsl x5,x5,#3
- ldr x4,[x4] // *n0
- sub sp,x26,#8*4 // alloca
- add x10,x2,x5
- add x27,x1,x5
- stp x0,x10,[x29,#96] // offload rp and &b[num]
- ldr x24,[x2,#8*0] // b[0]
- ldp x6,x7,[x1,#8*0] // a[0..3]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- mov x19,xzr
- mov x20,xzr
- mov x21,xzr
- mov x22,xzr
- ldp x14,x15,[x3,#8*0] // n[0..3]
- ldp x16,x17,[x3,#8*2]
- adds x3,x3,#8*4 // clear carry bit
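- // (The "adds" form is used here just to clear the carry flag: the loop
- //  below begins with a modulo-scheduled "adc" that must see C==0 on its
- //  first pass, and the pointer bump itself is assumed never to carry.)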
- mov x0,xzr
- mov x28,#0
- mov x26,sp
- .Loop_mul4x_1st_reduction:
- mul x10,x6,x24 // lo(a[0..3]*b[0])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[0..3]*b[0])
- adcs x20,x20,x11
- mul x25,x19,x4 // t[0]*n0
- adcs x21,x21,x12
- umulh x11,x7,x24
- adcs x22,x22,x13
- umulh x12,x8,x24
- adc x23,xzr,xzr
- umulh x13,x9,x24
- ldr x24,[x2,x28] // next b[i] (or b[0])
- adds x20,x20,x10
- // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
- str x25,[x26],#8 // put aside t[0]*n0 for tail processing
- adcs x21,x21,x11
- mul x11,x15,x25
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- // (*) adds xzr,x19,x10
- subs xzr,x19,#1 // (*)
- umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
- adcs x19,x20,x11
- umulh x11,x15,x25
- adcs x20,x21,x12
- umulh x12,x16,x25
- adcs x21,x22,x13
- umulh x13,x17,x25
- adcs x22,x23,x0
- adc x0,xzr,xzr
- adds x19,x19,x10
- sub x10,x27,x1
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- cbnz x28,.Loop_mul4x_1st_reduction
- cbz x10,.Lmul4x4_post_condition
- ldp x6,x7,[x1,#8*0] // a[4..7]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- ldr x25,[sp] // t[0]*n0
- ldp x14,x15,[x3,#8*0] // n[4..7]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- .Loop_mul4x_1st_tail:
- mul x10,x6,x24 // lo(a[4..7]*b[i])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[4..7]*b[i])
- adcs x20,x20,x11
- umulh x11,x7,x24
- adcs x21,x21,x12
- umulh x12,x8,x24
- adcs x22,x22,x13
- umulh x13,x9,x24
- adc x23,xzr,xzr
- ldr x24,[x2,x28] // next b[i] (or b[0])
- adds x20,x20,x10
- mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
- adcs x21,x21,x11
- mul x11,x15,x25
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- adds x19,x19,x10
- umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
- adcs x20,x20,x11
- umulh x11,x15,x25
- adcs x21,x21,x12
- umulh x12,x16,x25
- adcs x22,x22,x13
- adcs x23,x23,x0
- umulh x13,x17,x25
- adc x0,xzr,xzr
- ldr x25,[sp,x28] // next t[0]*n0
- str x19,[x26],#8 // result!!!
- adds x19,x20,x10
- sub x10,x27,x1 // done yet?
- adcs x20,x21,x11
- adcs x21,x22,x12
- adcs x22,x23,x13
- //adc x0,x0,xzr
- cbnz x28,.Loop_mul4x_1st_tail
- sub x11,x27,x5 // rewound x1
- cbz x10,.Lmul4x_proceed
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- ldp x14,x15,[x3,#8*0]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- b .Loop_mul4x_1st_tail
- .align 5
- .Lmul4x_proceed:
- ldr x24,[x2,#8*4]! // *++b
- adc x30,x0,xzr
- ldp x6,x7,[x11,#8*0] // a[0..3]
- sub x3,x3,x5 // rewind np
- ldp x8,x9,[x11,#8*2]
- add x1,x11,#8*4
- stp x19,x20,[x26,#8*0] // result!!!
- ldp x19,x20,[sp,#8*4] // t[0..3]
- stp x21,x22,[x26,#8*2] // result!!!
- ldp x21,x22,[sp,#8*6]
- ldp x14,x15,[x3,#8*0] // n[0..3]
- mov x26,sp
- ldp x16,x17,[x3,#8*2]
- adds x3,x3,#8*4 // clear carry bit
- mov x0,xzr
- .align 4
- .Loop_mul4x_reduction:
- mul x10,x6,x24 // lo(a[0..3]*b[4])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[0..3]*b[4])
- adcs x20,x20,x11
- mul x25,x19,x4 // t[0]*n0
- adcs x21,x21,x12
- umulh x11,x7,x24
- adcs x22,x22,x13
- umulh x12,x8,x24
- adc x23,xzr,xzr
- umulh x13,x9,x24
- ldr x24,[x2,x28] // next b[i]
- adds x20,x20,x10
- // (*) mul x10,x14,x25
- str x25,[x26],#8 // put aside t[0]*n0 for tail processing
- adcs x21,x21,x11
- mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- // (*) adds xzr,x19,x10
- subs xzr,x19,#1 // (*)
- umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
- adcs x19,x20,x11
- umulh x11,x15,x25
- adcs x20,x21,x12
- umulh x12,x16,x25
- adcs x21,x22,x13
- umulh x13,x17,x25
- adcs x22,x23,x0
- adc x0,xzr,xzr
- adds x19,x19,x10
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- cbnz x28,.Loop_mul4x_reduction
- adc x0,x0,xzr
- ldp x10,x11,[x26,#8*4] // t[4..7]
- ldp x12,x13,[x26,#8*6]
- ldp x6,x7,[x1,#8*0] // a[4..7]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- adds x19,x19,x10
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- ldr x25,[sp] // t[0]*n0
- ldp x14,x15,[x3,#8*0] // n[4..7]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- .align 4
- .Loop_mul4x_tail:
- mul x10,x6,x24 // lo(a[4..7]*b[4])
- adc x0,x0,xzr // modulo-scheduled
- mul x11,x7,x24
- add x28,x28,#8
- mul x12,x8,x24
- and x28,x28,#31
- mul x13,x9,x24
- adds x19,x19,x10
- umulh x10,x6,x24 // hi(a[4..7]*b[4])
- adcs x20,x20,x11
- umulh x11,x7,x24
- adcs x21,x21,x12
- umulh x12,x8,x24
- adcs x22,x22,x13
- umulh x13,x9,x24
- adc x23,xzr,xzr
- ldr x24,[x2,x28] // next b[i]
- adds x20,x20,x10
- mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
- adcs x21,x21,x11
- mul x11,x15,x25
- adcs x22,x22,x12
- mul x12,x16,x25
- adc x23,x23,x13 // can't overflow
- mul x13,x17,x25
- adds x19,x19,x10
- umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
- adcs x20,x20,x11
- umulh x11,x15,x25
- adcs x21,x21,x12
- umulh x12,x16,x25
- adcs x22,x22,x13
- umulh x13,x17,x25
- adcs x23,x23,x0
- ldr x25,[sp,x28] // next t[0]*n0
- adc x0,xzr,xzr
- str x19,[x26],#8 // result!!!
- adds x19,x20,x10
- sub x10,x27,x1 // done yet?
- adcs x20,x21,x11
- adcs x21,x22,x12
- adcs x22,x23,x13
- //adc x0,x0,xzr
- cbnz x28,.Loop_mul4x_tail
- sub x11,x3,x5 // rewound np?
- adc x0,x0,xzr
- cbz x10,.Loop_mul4x_break
- ldp x10,x11,[x26,#8*4]
- ldp x12,x13,[x26,#8*6]
- ldp x6,x7,[x1,#8*0]
- ldp x8,x9,[x1,#8*2]
- add x1,x1,#8*4
- adds x19,x19,x10
- adcs x20,x20,x11
- adcs x21,x21,x12
- adcs x22,x22,x13
- //adc x0,x0,xzr
- ldp x14,x15,[x3,#8*0]
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- b .Loop_mul4x_tail
- .align 4
- .Loop_mul4x_break:
- ldp x12,x13,[x29,#96] // pull rp and &b[num]
- adds x19,x19,x30
- add x2,x2,#8*4 // bp++
- adcs x20,x20,xzr
- sub x1,x1,x5 // rewind ap
- adcs x21,x21,xzr
- stp x19,x20,[x26,#8*0] // result!!!
- adcs x22,x22,xzr
- ldp x19,x20,[sp,#8*4] // t[0..3]
- adc x30,x0,xzr
- stp x21,x22,[x26,#8*2] // result!!!
- cmp x2,x13 // done yet?
- ldp x21,x22,[sp,#8*6]
- ldp x14,x15,[x11,#8*0] // n[0..3]
- ldp x16,x17,[x11,#8*2]
- add x3,x11,#8*4
- b.eq .Lmul4x_post
- ldr x24,[x2]
- ldp x6,x7,[x1,#8*0] // a[0..3]
- ldp x8,x9,[x1,#8*2]
- adds x1,x1,#8*4 // clear carry bit
- mov x0,xzr
- mov x26,sp
- b .Loop_mul4x_reduction
- .align 4
- .Lmul4x_post:
- // Final step. We check whether the result is larger than the
- // modulus and, if it is, subtract the modulus. But comparison
- // implies subtraction, so we simply subtract the modulus, check
- // whether the subtraction borrowed, and conditionally copy the
- // original value back.
- mov x0,x12
- mov x27,x12 // x0 copy
- subs x10,x19,x14
- add x26,sp,#8*8
- sbcs x11,x20,x15
- sub x28,x5,#8*4
- .Lmul4x_sub:
- sbcs x12,x21,x16
- ldp x14,x15,[x3,#8*0]
- sub x28,x28,#8*4
- ldp x19,x20,[x26,#8*0]
- sbcs x13,x22,x17
- ldp x16,x17,[x3,#8*2]
- add x3,x3,#8*4
- ldp x21,x22,[x26,#8*2]
- add x26,x26,#8*4
- stp x10,x11,[x0,#8*0]
- sbcs x10,x19,x14
- stp x12,x13,[x0,#8*2]
- add x0,x0,#8*4
- sbcs x11,x20,x15
- cbnz x28,.Lmul4x_sub
- sbcs x12,x21,x16
- mov x26,sp
- add x1,sp,#8*4
- ldp x6,x7,[x27,#8*0]
- sbcs x13,x22,x17
- stp x10,x11,[x0,#8*0]
- ldp x8,x9,[x27,#8*2]
- stp x12,x13,[x0,#8*2]
- ldp x19,x20,[x1,#8*0]
- ldp x21,x22,[x1,#8*2]
- sbcs xzr,x30,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
- sub x28,x5,#8*4
- .Lmul4x_cond_copy:
- sub x28,x28,#8*4
- csel x10,x19,x6,lo
- stp xzr,xzr,[x26,#8*0]
- csel x11,x20,x7,lo
- ldp x6,x7,[x27,#8*4]
- ldp x19,x20,[x1,#8*4]
- csel x12,x21,x8,lo
- stp xzr,xzr,[x26,#8*2]
- add x26,x26,#8*4
- csel x13,x22,x9,lo
- ldp x8,x9,[x27,#8*6]
- ldp x21,x22,[x1,#8*6]
- add x1,x1,#8*4
- stp x10,x11,[x27,#8*0]
- stp x12,x13,[x27,#8*2]
- add x27,x27,#8*4
- cbnz x28,.Lmul4x_cond_copy
- csel x10,x19,x6,lo
- stp xzr,xzr,[x26,#8*0]
- csel x11,x20,x7,lo
- stp xzr,xzr,[x26,#8*2]
- csel x12,x21,x8,lo
- stp xzr,xzr,[x26,#8*3]
- csel x13,x22,x9,lo
- stp xzr,xzr,[x26,#8*4]
- stp x10,x11,[x27,#8*0]
- stp x12,x13,[x27,#8*2]
- b .Lmul4x_done
- .align 4
- .Lmul4x4_post_condition:
- adc x0,x0,xzr
- ldr x1,[x29,#96] // pull rp
- // x19-x22,x0 hold result, x14-x17 hold modulus
- subs x6,x19,x14
- ldr x30,[x29,#8] // pull return address
- sbcs x7,x20,x15
- stp xzr,xzr,[sp,#8*0]
- sbcs x8,x21,x16
- stp xzr,xzr,[sp,#8*2]
- sbcs x9,x22,x17
- stp xzr,xzr,[sp,#8*4]
- sbcs xzr,x0,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*6]
- // x6-x9 hold result-modulus
- csel x6,x19,x6,lo
- csel x7,x20,x7,lo
- csel x8,x21,x8,lo
- csel x9,x22,x9,lo
- stp x6,x7,[x1,#8*0]
- stp x8,x9,[x1,#8*2]
- .Lmul4x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- .inst 0xd50323bf // autiasp
- ret
- .size __bn_mul4x_mont,.-__bn_mul4x_mont
- .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .align 2
- .align 4