// armv8-mont.S
.text
.globl _bn_mul_mont
.align 5
_bn_mul_mont:
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
b.eq __bn_mul4x_mont
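// bn_mul_mont computes rp[] = ap[]*bp[]/2^(64*num) mod np[], i.e. a
// word-serial Montgomery multiplication. Judging by the comments below,
// the arguments are x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num, with
// n0 == -np[0]^-1 mod 2^64. A hedged C sketch of the scalar path that
// follows (illustrative only; tp[] is a num+1-limb accumulator on the
// stack):
//
//	for (i = 0; i < num; i++) {
//		tp = tp + ap*bp[i];	// partial product
//		m = tp[0] * n0;		// mod 2^64
//		tp = (tp + np*m) >> 64;	// low limb becomes zero, drop it
//	}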
Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldr x9,[x2],#8 // bp[0]
sub x22,sp,x5,lsl#3
ldp x7,x8,[x1],#16 // ap[0..1]
lsl x5,x5,#3
ldr x4,[x4] // *n0
and x22,x22,#-16 // ABI says so
ldp x13,x14,[x3],#16 // np[0..1]
mul x6,x7,x9 // ap[0]*bp[0]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
mul x10,x8,x9 // ap[1]*bp[0]
umulh x11,x8,x9
mul x15,x6,x4 // "tp[0]"*n0
mov sp,x22 // alloca
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6 // discarded
// (*) The starred multiplication and addition can be omitted. The
// outcome of the addition is guaranteed to be zero, which leaves only
// one computationally significant result: whether or not it carries.
// So when does it carry? Following the operations shows that the
// condition is simple: the carry is set exactly when x6 is non-zero.
// The carry can therefore be recovered by adding -1 to x6, which is
// what the next instruction does.
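// In C terms (illustrative only): m1 = x6*n0 makes lo(np[0]*m1) equal
// to -x6 mod 2^64, so the discarded sum is 0 and its carry-out is
//
//	carry = (x6 != 0);
//
// which is precisely the carry flag left by the "subs xzr,x6,#1" below.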
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
adc x13,x13,xzr
cbz x21,L1st_skip
L1st:
ldr x8,[x1],#8
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
ldr x14,[x3],#8
adds x12,x16,x13
mul x10,x8,x9 // ap[j]*bp[0]
adc x13,x17,xzr
umulh x11,x8,x9
adds x12,x12,x6
mul x16,x14,x15 // np[j]*m1
adc x13,x13,xzr
umulh x17,x14,x15
str x12,[x22],#8 // tp[j-1]
cbnz x21,L1st
L1st_skip:
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adc x13,x17,xzr
adds x12,x12,x6
sub x20,x5,#8 // i=num-1
adcs x13,x13,x7
adc x19,xzr,xzr // upmost overflow bit
stp x12,x13,[x22]
Louter:
ldr x9,[x2],#8 // bp[i]
ldp x7,x8,[x1],#16
ldr x23,[sp] // tp[0]
add x22,sp,#8
mul x6,x7,x9 // ap[0]*bp[i]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
ldp x13,x14,[x3],#16
mul x10,x8,x9 // ap[1]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x15,x6,x4
sub x20,x20,#8 // i--
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
cbz x21,Linner_skip
Linner:
ldr x8,[x1],#8
adc x13,x13,xzr
ldr x23,[x22],#8 // tp[j]
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
adds x12,x16,x13
ldr x14,[x3],#8
adc x13,x17,xzr
mul x10,x8,x9 // ap[j]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x16,x14,x15 // np[j]*m1
adds x12,x12,x6
umulh x17,x14,x15
str x12,[x22,#-16] // tp[j-1]
cbnz x21,Linner
Linner_skip:
ldr x23,[x22],#8 // tp[j]
adc x13,x13,xzr
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adcs x13,x17,x19
adc x19,xzr,xzr
adds x6,x6,x23
adc x7,x7,xzr
adds x12,x12,x6
adcs x13,x13,x7
adc x19,x19,xzr // upmost overflow bit
stp x12,x13,[x22,#-16]
cbnz x20,Louter
// Final step. Check whether the result is larger than the modulus and,
// if so, subtract the modulus. But a comparison is itself a
// subtraction, so we subtract the modulus unconditionally, check
// whether the subtraction borrowed, and conditionally copy back the
// original value.
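// A minimal C sketch of this tail (illustrative only, with "upmost"
// standing for the overflow bit accumulated in x19):
//
//	uint64_t borrow = 0;
//	for (size_t j = 0; j < num; j++) {
//		unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
//		rp[j] = (uint64_t)d;
//		borrow = -(uint64_t)(d >> 64);	// 1 if it borrowed
//	}
//	borrow = borrow > upmost;		// net borrow?
//	for (size_t j = 0; j < num; j++)	// csel below, branch-free
//		rp[j] = borrow ? tp[j] : rp[j];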
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x14,[x3],#8 // np[0]
subs x21,x5,#8 // j=num-1 and clear borrow
mov x1,x0
Lsub:
sbcs x8,x23,x14 // tp[j]-np[j]
ldr x23,[x22],#8
sub x21,x21,#8 // j--
ldr x14,[x3],#8
str x8,[x1],#8 // rp[j]=tp[j]-np[j]
cbnz x21,Lsub
sbcs x8,x23,x14
sbcs x19,x19,xzr // did it borrow?
str x8,[x1],#8 // rp[num-1]
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x8,[x0],#8 // rp[0]
sub x5,x5,#8 // num--
nop
Lcond_copy:
sub x5,x5,#8 // num--
csel x14,x23,x8,lo // did it borrow?
ldr x23,[x22],#8
ldr x8,[x0],#8
str xzr,[x22,#-16] // wipe tp
str x14,[x0,#-16]
cbnz x5,Lcond_copy
csel x14,x23,x8,lo
str xzr,[x22,#-8] // wipe tp
str x14,[x0,#-8]
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.align 5
__bn_sqr8x_mont:
cmp x1,x2
b.ne __bn_mul4x_mont
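// The squaring path is taken only when both multiplicand pointers are
// equal (a genuine squaring); anything else falls through to the
// generic 4x routine.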
Lsqr8x_mont:
.long 0xd503233f // paciasp
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x3,[sp,#96] // offload rp and np
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
ldp x12,x13,[x1,#8*6]
sub x2,sp,x5,lsl#4
lsl x5,x5,#3
ldr x4,[x4] // *n0
mov sp,x2 // alloca
sub x27,x5,#8*8
b Lsqr8x_zero_start
Lsqr8x_zero:
sub x27,x27,#8*8
stp xzr,xzr,[x2,#8*0]
stp xzr,xzr,[x2,#8*2]
stp xzr,xzr,[x2,#8*4]
stp xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
stp xzr,xzr,[x2,#8*8]
stp xzr,xzr,[x2,#8*10]
stp xzr,xzr,[x2,#8*12]
stp xzr,xzr,[x2,#8*14]
add x2,x2,#8*16
cbnz x27,Lsqr8x_zero
add x3,x1,x5
add x1,x1,#8*8
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
mov x23,xzr
mov x24,xzr
mov x25,xzr
mov x26,xzr
mov x2,sp
str x4,[x29,#112] // offload n0
// Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
// a[7]a[6] (vii)
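// Each cross product a[i]*a[j], i<j, is computed exactly once, per the
// schedule above; by the identity a^2 = sum(a[i]^2) + 2*sum(a[i]*a[j],
// i<j), the doubling and the a[i]*a[i] diagonal are folded in later,
// at Lsqr8x_outer_break.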
mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
mul x15,x8,x6
mul x16,x9,x6
mul x17,x10,x6
adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
mul x14,x11,x6
adcs x21,x21,x15
mul x15,x12,x6
adcs x22,x22,x16
mul x16,x13,x6
adcs x23,x23,x17
umulh x17,x7,x6 // hi(a[1..7]*a[0])
adcs x24,x24,x14
umulh x14,x8,x6
adcs x25,x25,x15
umulh x15,x9,x6
adcs x26,x26,x16
umulh x16,x10,x6
stp x19,x20,[x2],#8*2 // t[0..1]
adc x19,xzr,xzr // t[8]
adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
umulh x17,x11,x6
adcs x22,x22,x14
umulh x14,x12,x6
adcs x23,x23,x15
umulh x15,x13,x6
adcs x24,x24,x16
mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
adcs x25,x25,x17
mul x17,x9,x7
adcs x26,x26,x14
mul x14,x10,x7
adc x19,x19,x15
mul x15,x11,x7
adds x22,x22,x16
mul x16,x12,x7
adcs x23,x23,x17
mul x17,x13,x7
adcs x24,x24,x14
umulh x14,x8,x7 // hi(a[2..7]*a[1])
adcs x25,x25,x15
umulh x15,x9,x7
adcs x26,x26,x16
umulh x16,x10,x7
adcs x19,x19,x17
umulh x17,x11,x7
stp x21,x22,[x2],#8*2 // t[2..3]
adc x20,xzr,xzr // t[9]
adds x23,x23,x14
umulh x14,x12,x7
adcs x24,x24,x15
umulh x15,x13,x7
adcs x25,x25,x16
mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
adcs x26,x26,x17
mul x17,x10,x8
adcs x19,x19,x14
mul x14,x11,x8
adc x20,x20,x15
mul x15,x12,x8
adds x24,x24,x16
mul x16,x13,x8
adcs x25,x25,x17
umulh x17,x9,x8 // hi(a[3..7]*a[2])
adcs x26,x26,x14
umulh x14,x10,x8
adcs x19,x19,x15
umulh x15,x11,x8
adcs x20,x20,x16
umulh x16,x12,x8
stp x23,x24,[x2],#8*2 // t[4..5]
adc x21,xzr,xzr // t[10]
adds x25,x25,x17
umulh x17,x13,x8
adcs x26,x26,x14
mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
adcs x19,x19,x15
mul x15,x11,x9
adcs x20,x20,x16
mul x16,x12,x9
adc x21,x21,x17
mul x17,x13,x9
adds x26,x26,x14
umulh x14,x10,x9 // hi(a[4..7]*a[3])
adcs x19,x19,x15
umulh x15,x11,x9
adcs x20,x20,x16
umulh x16,x12,x9
adcs x21,x21,x17
umulh x17,x13,x9
stp x25,x26,[x2],#8*2 // t[6..7]
adc x22,xzr,xzr // t[11]
adds x19,x19,x14
mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
adcs x20,x20,x15
mul x15,x12,x10
adcs x21,x21,x16
mul x16,x13,x10
adc x22,x22,x17
umulh x17,x11,x10 // hi(a[5..7]*a[4])
adds x20,x20,x14
umulh x14,x12,x10
adcs x21,x21,x15
umulh x15,x13,x10
adcs x22,x22,x16
mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
adc x23,xzr,xzr // t[12]
adds x21,x21,x17
mul x17,x13,x11
adcs x22,x22,x14
umulh x14,x12,x11 // hi(a[6..7]*a[5])
adc x23,x23,x15
umulh x15,x13,x11
adds x22,x22,x16
mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
adcs x23,x23,x17
umulh x17,x13,x12 // hi(a[7]*a[6])
adc x24,xzr,xzr // t[13]
adds x23,x23,x14
sub x27,x3,x1 // done yet?
adc x24,x24,x15
adds x24,x24,x16
sub x14,x3,x5 // rewound ap
adc x25,xzr,xzr // t[14]
add x25,x25,x17
cbz x27,Lsqr8x_outer_break
mov x4,x6
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x0,x1
adcs x26,xzr,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved below
mov x27,#-8*8
// a[8]a[0]
// a[9]a[0]
// a[a]a[0]
// a[b]a[0]
// a[c]a[0]
// a[d]a[0]
// a[e]a[0]
// a[f]a[0]
// a[8]a[1]
// a[f]a[1]........................
// a[8]a[2]
// a[f]a[2]........................
// a[8]a[3]
// a[f]a[3]........................
// a[8]a[4]
// a[f]a[4]........................
// a[8]a[5]
// a[f]a[5]........................
// a[8]a[6]
// a[f]a[6]........................
// a[8]a[7]
// a[f]a[7]........................
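// The dotted rows above stand for the products elided between the
// endpoints; each pass of Lsqr8x_mul accumulates one such a[8..f]*a[i]
// row. The trailing "adc x28" of one iteration is hoisted to the top
// of the next ("modulo-scheduled") so the flag-setting additions can
// interleave with the multiplies.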
Lsqr8x_mul:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,Lsqr8x_mul
// note that carry flag is guaranteed
// to be zero at this point
cmp x1,x3 // done yet?
b.eq Lsqr8x_break
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
ldr x4,[x0,#-8*8]
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b Lsqr8x_mul
.align 4
Lsqr8x_break:
ldp x6,x7,[x0,#8*0]
add x1,x0,#8*8
ldp x8,x9,[x0,#8*2]
sub x14,x3,x1 // is it last iteration?
ldp x10,x11,[x0,#8*4]
sub x15,x2,x14
ldp x12,x13,[x0,#8*6]
cbz x14,Lsqr8x_outer_loop
stp x19,x20,[x2,#8*0]
ldp x19,x20,[x15,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x15,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x15,#8*4]
stp x25,x26,[x2,#8*6]
mov x2,x15
ldp x25,x26,[x15,#8*6]
b Lsqr8x_outer_loop
.align 4
Lsqr8x_outer_break:
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
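// The doubling is done with extr: "extr xD,xN,xM,#63" computes
// (xN<<1)|(xM>>63), i.e. limb xN shifted left by one with the top bit
// of the previous limb xM shifted in, so the whole 2*num-limb left
// shift costs one extr per limb.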
ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
ldp x15,x16,[sp,#8*1]
ldp x11,x13,[x14,#8*2]
add x1,x14,#8*4
ldp x17,x14,[sp,#8*3]
stp x19,x20,[x2,#8*0]
mul x19,x7,x7
stp x21,x22,[x2,#8*2]
umulh x7,x7,x7
stp x23,x24,[x2,#8*4]
mul x8,x9,x9
stp x25,x26,[x2,#8*6]
mov x2,sp
umulh x9,x9,x9
adds x20,x7,x15,lsl#1
extr x15,x16,x15,#63
sub x27,x5,#8*4
Lsqr4x_shift_n_add:
adcs x21,x8,x15
extr x16,x17,x16,#63
sub x27,x27,#8*4
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
ldp x7,x9,[x1],#8*2
umulh x11,x11,x11
mul x12,x13,x13
umulh x13,x13,x13
extr x17,x14,x17,#63
stp x19,x20,[x2,#8*0]
adcs x23,x10,x17
extr x14,x15,x14,#63
stp x21,x22,[x2,#8*2]
adcs x24,x11,x14
ldp x17,x14,[x2,#8*7]
extr x15,x16,x15,#63
adcs x25,x12,x15
extr x16,x17,x16,#63
adcs x26,x13,x16
ldp x15,x16,[x2,#8*9]
mul x6,x7,x7
ldp x11,x13,[x1],#8*2
umulh x7,x7,x7
mul x8,x9,x9
umulh x9,x9,x9
stp x23,x24,[x2,#8*4]
extr x17,x14,x17,#63
stp x25,x26,[x2,#8*6]
add x2,x2,#8*8
adcs x19,x6,x17
extr x14,x15,x14,#63
adcs x20,x7,x14
ldp x17,x14,[x2,#8*3]
extr x15,x16,x15,#63
cbnz x27,Lsqr4x_shift_n_add
ldp x1,x4,[x29,#104] // pull np and n0
adcs x21,x8,x15
extr x16,x17,x16,#63
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
umulh x11,x11,x11
stp x19,x20,[x2,#8*0]
mul x12,x13,x13
umulh x13,x13,x13
stp x21,x22,[x2,#8*2]
extr x17,x14,x17,#63
adcs x23,x10,x17
extr x14,x15,x14,#63
ldp x19,x20,[sp,#8*0]
adcs x24,x11,x14
extr x15,x16,x15,#63
ldp x6,x7,[x1,#8*0]
adcs x25,x12,x15
extr x16,xzr,x16,#63
ldp x8,x9,[x1,#8*2]
adc x26,x13,x16
ldp x10,x11,[x1,#8*4]
// Reduce by 512 bits per iteration
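// One Montgomery reduction round in C terms (illustrative only), with
// n0 == -n[0]^-1 mod 2^64 and t the running window:
//
//	m = t[0] * n0;		// mod 2^64
//	t = (t + m*n) >> 64;	// t[0] forced to zero, then dropped
//
// The loop below performs eight such rounds per iteration, saving each
// m (the "str x28") so that Lsqr8x_tail can add the deferred high-limb
// contributions afterwards.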
mul x28,x4,x19 // t[0]*n0
ldp x12,x13,[x1,#8*6]
add x3,x1,x5
ldp x21,x22,[sp,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[sp,#8*4]
stp x25,x26,[x2,#8*6]
ldp x25,x26,[sp,#8*6]
add x1,x1,#8*8
mov x30,xzr // initial top-most carry
mov x2,sp
mov x27,#8
Lsqr8x_reduction:
// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
mul x15,x7,x28
sub x27,x27,#1
mul x16,x8,x28
str x28,[x2],#8 // put aside t[0]*n0 for tail processing
mul x17,x9,x28
// (*) adds xzr,x19,x14
subs xzr,x19,#1 // (*)
mul x14,x10,x28
adcs x19,x20,x15
mul x15,x11,x28
adcs x20,x21,x16
mul x16,x12,x28
adcs x21,x22,x17
mul x17,x13,x28
adcs x22,x23,x14
umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
adcs x23,x24,x15
umulh x15,x7,x28
adcs x24,x25,x16
umulh x16,x8,x28
adcs x25,x26,x17
umulh x17,x9,x28
adc x26,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x28
adcs x20,x20,x15
umulh x15,x11,x28
adcs x21,x21,x16
umulh x16,x12,x28
adcs x22,x22,x17
umulh x17,x13,x28
mul x28,x4,x19 // next t[0]*n0
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adc x26,x26,x17
cbnz x27,Lsqr8x_reduction
ldp x14,x15,[x2,#8*0]
ldp x16,x17,[x2,#8*2]
mov x0,x2
sub x27,x3,x1 // done yet?
adds x19,x19,x14
adcs x20,x20,x15
ldp x14,x15,[x2,#8*4]
adcs x21,x21,x16
adcs x22,x22,x17
ldp x16,x17,[x2,#8*6]
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adcs x26,x26,x17
//adc x28,xzr,xzr // moved below
cbz x27,Lsqr8x8_post_condition
ldr x4,[x2,#-8*8]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
mov x27,#-8*8
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
Lsqr8x_tail:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,Lsqr8x_tail
// note that carry flag is guaranteed
// to be zero at this point
ldp x6,x7,[x2,#8*0]
sub x27,x3,x1 // done yet?
sub x16,x3,x5 // rewound np
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
cbz x27,Lsqr8x_tail_break
ldr x4,[x0,#-8*8]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b Lsqr8x_tail
.align 4
Lsqr8x_tail_break:
ldr x4,[x29,#112] // pull n0
add x27,x2,#8*8 // end of current t[num] window
subs xzr,x30,#1 // "move" top-most carry to carry bit
adcs x14,x19,x6
adcs x15,x20,x7
ldp x19,x20,[x0,#8*0]
adcs x21,x21,x8
ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
adcs x22,x22,x9
ldp x8,x9,[x16,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x16,#8*4]
adcs x25,x25,x12
adcs x26,x26,x13
ldp x12,x13,[x16,#8*6]
add x1,x16,#8*8
adc x30,xzr,xzr // top-most carry
mul x28,x4,x19
stp x14,x15,[x2,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x0,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x0,#8*4]
cmp x27,x29 // did we hit the bottom?
stp x25,x26,[x2,#8*6]
mov x2,x0 // slide the window
ldp x25,x26,[x0,#8*6]
mov x27,#8
b.ne Lsqr8x_reduction
// Final step. As in bn_mul_mont above: subtract the modulus from the
// result, check whether the subtraction borrowed, and conditionally
// copy back the original value.
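// In the csel instructions of the copy loop below, "lo" (carry clear
// after the sbcs) means the subtraction borrowed, i.e. the result was
// already fully reduced, so the original value is kept; otherwise
// result-modulus is kept.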
ldr x0,[x29,#96] // pull rp
add x2,x2,#8*8
subs x14,x19,x6
sbcs x15,x20,x7
sub x27,x5,#8*8
mov x3,x0 // x0 copy
Lsqr8x_sub:
sbcs x16,x21,x8
ldp x6,x7,[x1,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x1,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x10,x11,[x1,#8*4]
sbcs x17,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
ldp x19,x20,[x2,#8*0]
sub x27,x27,#8*8
ldp x21,x22,[x2,#8*2]
ldp x23,x24,[x2,#8*4]
ldp x25,x26,[x2,#8*6]
add x2,x2,#8*8
stp x14,x15,[x0,#8*4]
sbcs x14,x19,x6
stp x16,x17,[x0,#8*6]
add x0,x0,#8*8
sbcs x15,x20,x7
cbnz x27,Lsqr8x_sub
sbcs x16,x21,x8
mov x2,sp
add x1,sp,x5
ldp x6,x7,[x3,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x3,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x19,x20,[x1,#8*0]
sbcs x17,x26,x13
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
stp x14,x15,[x0,#8*4]
stp x16,x17,[x0,#8*6]
sub x27,x5,#8*4
Lsqr4x_cond_copy:
sub x27,x27,#8*4
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
ldp x6,x7,[x3,#8*4]
ldp x19,x20,[x1,#8*4]
csel x16,x21,x8,lo
stp xzr,xzr,[x2,#8*2]
add x2,x2,#8*4
csel x17,x22,x9,lo
ldp x8,x9,[x3,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
add x3,x3,#8*4
stp xzr,xzr,[x1,#8*0]
stp xzr,xzr,[x1,#8*2]
cbnz x27,Lsqr4x_cond_copy
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
stp xzr,xzr,[x2,#8*2]
csel x16,x21,x8,lo
csel x17,x22,x9,lo
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
b Lsqr8x_done
.align 4
Lsqr8x8_post_condition:
adc x28,xzr,xzr
ldr x30,[x29,#8] // pull return address
// x19-x26,x28 hold result, x6-x13 hold modulus
subs x6,x19,x6
ldr x1,[x29,#96] // pull rp
sbcs x7,x20,x7
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x8
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x9
stp xzr,xzr,[sp,#8*4]
sbcs x10,x23,x10
stp xzr,xzr,[sp,#8*6]
sbcs x11,x24,x11
stp xzr,xzr,[sp,#8*8]
sbcs x12,x25,x12
stp xzr,xzr,[sp,#8*10]
sbcs x13,x26,x13
stp xzr,xzr,[sp,#8*12]
sbcs x28,x28,xzr // did it borrow?
stp xzr,xzr,[sp,#8*14]
// x6-x13 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
csel x10,x23,x10,lo
csel x11,x24,x11,lo
stp x8,x9,[x1,#8*2]
csel x12,x25,x12,lo
csel x13,x26,x13,lo
stp x10,x11,[x1,#8*4]
stp x12,x13,[x1,#8*6]
Lsqr8x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
.long 0xd50323bf // autiasp
ret
.align 5
__bn_mul4x_mont:
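// Generic path: multiplication and Montgomery reduction interleaved
// four words at a time. Each outer pass consumes four words of b[],
// queues the four t[0]*n0 factors on the stack (the "str x25"), and
// the tail loops replay them against the remaining words of n[].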
.long 0xd503233f // paciasp
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub x26,sp,x5,lsl#3
lsl x5,x5,#3
ldr x4,[x4] // *n0
sub sp,x26,#8*4 // alloca
add x10,x2,x5
add x27,x1,x5
stp x0,x10,[x29,#96] // offload rp and &b[num]
ldr x24,[x2,#8*0] // b[0]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
ldp x14,x15,[x3,#8*0] // n[0..3]
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
mov x28,#0
mov x26,sp
Loop_mul4x_1st_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[0])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[0])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
sub x10,x27,x1
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_1st_reduction
cbz x10,Lmul4x4_post_condition
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldr x25,[sp] // a[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
Loop_mul4x_1st_tail:
mul x10,x6,x24 // lo(a[4..7]*b[i])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[i])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
adcs x23,x23,x0
umulh x13,x17,x25
adc x0,xzr,xzr
ldr x25,[sp,x28] // next t[0]*n0
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_1st_tail
sub x11,x27,x5 // rewound x1
cbz x10,Lmul4x_proceed
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b Loop_mul4x_1st_tail
.align 5
Lmul4x_proceed:
ldr x24,[x2,#8*4]! // *++b
adc x30,x0,xzr
ldp x6,x7,[x11,#8*0] // a[0..3]
sub x3,x3,x5 // rewind np
ldp x8,x9,[x11,#8*2]
add x1,x11,#8*4
stp x19,x20,[x26,#8*0] // result!!!
ldp x19,x20,[sp,#8*4] // t[0..3]
stp x21,x22,[x26,#8*2] // result!!!
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x3,#8*0] // n[0..3]
mov x26,sp
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
.align 4
Loop_mul4x_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[4])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
// (*) mul x10,x14,x25
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_reduction
adc x0,x0,xzr
ldp x10,x11,[x26,#8*4] // t[4..7]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldr x25,[sp] // t[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
.align 4
Loop_mul4x_tail:
mul x10,x6,x24 // lo(a[4..7]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[4])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
umulh x13,x17,x25
adcs x23,x23,x0
ldr x25,[sp,x28] // next a[0]*n0
adc x0,xzr,xzr
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_tail
sub x11,x3,x5 // rewound np?
adc x0,x0,xzr
cbz x10,Loop_mul4x_break
ldp x10,x11,[x26,#8*4]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b Loop_mul4x_tail
.align 4
Loop_mul4x_break:
ldp x12,x13,[x29,#96] // pull rp and &b[num]
adds x19,x19,x30
add x2,x2,#8*4 // bp++
adcs x20,x20,xzr
sub x1,x1,x5 // rewind ap
adcs x21,x21,xzr
stp x19,x20,[x26,#8*0] // result!!!
adcs x22,x22,xzr
ldp x19,x20,[sp,#8*4] // t[0..3]
adc x30,x0,xzr
stp x21,x22,[x26,#8*2] // result!!!
cmp x2,x13 // done yet?
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x11,#8*0] // n[0..3]
ldp x16,x17,[x11,#8*2]
add x3,x11,#8*4
b.eq Lmul4x_post
ldr x24,[x2]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
adds x1,x1,#8*4 // clear carry bit
mov x0,xzr
mov x26,sp
b Loop_mul4x_reduction
.align 4
Lmul4x_post:
// Final step. As above: subtract the modulus, check whether the
// subtraction borrowed, and conditionally copy back the original
// value.
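// Note that the conditional-copy loop below also wipes the temporary
// area on the stack (the stp xzr,xzr stores), so no intermediate data
// survives the call.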
mov x0,x12
mov x27,x12 // x0 copy
subs x10,x19,x14
add x26,sp,#8*8
sbcs x11,x20,x15
sub x28,x5,#8*4
Lmul4x_sub:
sbcs x12,x21,x16
ldp x14,x15,[x3,#8*0]
sub x28,x28,#8*4
ldp x19,x20,[x26,#8*0]
sbcs x13,x22,x17
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
ldp x21,x22,[x26,#8*2]
add x26,x26,#8*4
stp x10,x11,[x0,#8*0]
sbcs x10,x19,x14
stp x12,x13,[x0,#8*2]
add x0,x0,#8*4
sbcs x11,x20,x15
cbnz x28,Lmul4x_sub
sbcs x12,x21,x16
mov x26,sp
add x1,sp,#8*4
ldp x6,x7,[x27,#8*0]
sbcs x13,x22,x17
stp x10,x11,[x0,#8*0]
ldp x8,x9,[x27,#8*2]
stp x12,x13,[x0,#8*2]
ldp x19,x20,[x1,#8*0]
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
sub x28,x5,#8*4
Lmul4x_cond_copy:
sub x28,x28,#8*4
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
ldp x6,x7,[x27,#8*4]
ldp x19,x20,[x1,#8*4]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*2]
add x26,x26,#8*4
csel x13,x22,x9,lo
ldp x8,x9,[x27,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
add x27,x27,#8*4
cbnz x28,Lmul4x_cond_copy
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
stp xzr,xzr,[x26,#8*2]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*3]
csel x13,x22,x9,lo
stp xzr,xzr,[x26,#8*4]
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
b Lmul4x_done
.align 4
Lmul4x4_post_condition:
adc x0,x0,xzr
ldr x1,[x29,#96] // pull rp
// x19-x22,x0 hold result, x14-x17 hold modulus
subs x6,x19,x14
ldr x30,[x29,#8] // pull return address
sbcs x7,x20,x15
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x16
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x17
stp xzr,xzr,[sp,#8*4]
sbcs xzr,x0,xzr // did it borrow?
stp xzr,xzr,[sp,#8*6]
// x6-x9 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
stp x8,x9,[x1,#8*2]
Lmul4x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
.long 0xd50323bf // autiasp
ret
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
.align 4