armv8-mont.S

.text
.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
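// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0, int num);
//
// OpenSSL calling convention: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num.
// If num is divisible by 8, control goes to the squaring-oriented code
// (which falls back to __bn_mul4x_mont unless ap==bp); if divisible by 4,
// to the 4x code; otherwise the word-by-word loop below is used.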
bn_mul_mont:
    tst x5,#7
    b.eq __bn_sqr8x_mont
    tst x5,#3
    b.eq __bn_mul4x_mont
.Lmul_mont:
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    ldr x9,[x2],#8 // bp[0]
    sub x22,sp,x5,lsl#3
    ldp x7,x8,[x1],#16 // ap[0..1]
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    and x22,x22,#-16 // ABI says so
    ldp x13,x14,[x3],#16 // np[0..1]
    mul x6,x7,x9 // ap[0]*bp[0]
    sub x21,x5,#16 // j=num-2
    umulh x7,x7,x9
    mul x10,x8,x9 // ap[1]*bp[0]
    umulh x11,x8,x9
    mul x15,x6,x4 // "tp[0]"*n0
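// Montgomery reduction: n0 is -np[0]^-1 mod 2^64, so m1 = tp[0]*n0
// makes tp[0] + m1*np[0] divisible by 2^64 and the low limb of the
// running sum can be dropped.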
    mov sp,x22 // alloca
// (*) mul x12,x13,x15 // np[0]*m1
    umulh x13,x13,x15
    mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6 // discarded
// (*) Regarding the removal of the first multiplication and
// addition above: the outcome of the first addition is
// guaranteed to be zero, which leaves two computationally
// significant outcomes: it either carries or it doesn't.
// So when does it carry? Following the operations shows the
// condition is quite simple: x6 being non-zero. The carry can
// therefore be calculated by adding -1 to x6, which is what
// the next instruction does.
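// For example: if x6==0, then x6-1 borrows, so C=0 and nothing
// propagates; if x6!=0, there is no borrow, so C=1 and the adc
// below folds the carry into the high limb.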
    subs xzr,x6,#1 // (*)
    umulh x17,x14,x15
    adc x13,x13,xzr
    cbz x21,.L1st_skip
.L1st:
    ldr x8,[x1],#8
    adds x6,x10,x7
    sub x21,x21,#8 // j--
    adc x7,x11,xzr
    ldr x14,[x3],#8
    adds x12,x16,x13
    mul x10,x8,x9 // ap[j]*bp[0]
    adc x13,x17,xzr
    umulh x11,x8,x9
    adds x12,x12,x6
    mul x16,x14,x15 // np[j]*m1
    adc x13,x13,xzr
    umulh x17,x14,x15
    str x12,[x22],#8 // tp[j-1]
    cbnz x21,.L1st
.L1st_skip:
    adds x6,x10,x7
    sub x1,x1,x5 // rewind x1
    adc x7,x11,xzr
    adds x12,x16,x13
    sub x3,x3,x5 // rewind x3
    adc x13,x17,xzr
    adds x12,x12,x6
    sub x20,x5,#8 // i=num-1
    adcs x13,x13,x7
    adc x19,xzr,xzr // upmost overflow bit
    stp x12,x13,[x22]
.Louter:
    ldr x9,[x2],#8 // bp[i]
    ldp x7,x8,[x1],#16
    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    mul x6,x7,x9 // ap[0]*bp[i]
    sub x21,x5,#16 // j=num-2
    umulh x7,x7,x9
    ldp x13,x14,[x3],#16
    mul x10,x8,x9 // ap[1]*bp[i]
    adds x6,x6,x23
    umulh x11,x8,x9
    adc x7,x7,xzr
    mul x15,x6,x4
    sub x20,x20,#8 // i--
// (*) mul x12,x13,x15 // np[0]*m1
    umulh x13,x13,x15
    mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6
    subs xzr,x6,#1 // (*)
    umulh x17,x14,x15
    cbz x21,.Linner_skip
.Linner:
    ldr x8,[x1],#8
    adc x13,x13,xzr
    ldr x23,[x22],#8 // tp[j]
    adds x6,x10,x7
    sub x21,x21,#8 // j--
    adc x7,x11,xzr
    adds x12,x16,x13
    ldr x14,[x3],#8
    adc x13,x17,xzr
    mul x10,x8,x9 // ap[j]*bp[i]
    adds x6,x6,x23
    umulh x11,x8,x9
    adc x7,x7,xzr
    mul x16,x14,x15 // np[j]*m1
    adds x12,x12,x6
    umulh x17,x14,x15
    str x12,[x22,#-16] // tp[j-1]
    cbnz x21,.Linner
.Linner_skip:
    ldr x23,[x22],#8 // tp[j]
    adc x13,x13,xzr
    adds x6,x10,x7
    sub x1,x1,x5 // rewind x1
    adc x7,x11,xzr
    adds x12,x16,x13
    sub x3,x3,x5 // rewind x3
    adcs x13,x17,x19
    adc x19,xzr,xzr
    adds x6,x6,x23
    adc x7,x7,xzr
    adds x12,x12,x6
    adcs x13,x13,x7
    adc x19,x19,xzr // upmost overflow bit
    stp x12,x13,[x22,#-16]
    cbnz x20,.Louter
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus unconditionally,
// check whether the subtraction borrowed, and conditionally copy
// the original value back.
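// In C terms, roughly (an illustrative sketch only; sbb() stands for
// a hypothetical 64-bit subtract-with-borrow helper):
//
//   borrow = 0;
//   for (j = 0; j < num; j++)                // the .Lsub loop (sbcs)
//       rp[j] = sbb(tp[j], np[j], &borrow);
//   borrow = (upmost_overflow_bit < borrow); // sbcs x19,x19,xzr
//   for (j = 0; j < num; j++)                // the .Lcond_copy loop (csel)
//       rp[j] = borrow ? tp[j] : rp[j];      // also wipes tp[] as it goes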
    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    ldr x14,[x3],#8 // np[0]
    subs x21,x5,#8 // j=num-1 and clear borrow
    mov x1,x0
.Lsub:
    sbcs x8,x23,x14 // tp[j]-np[j]
    ldr x23,[x22],#8
    sub x21,x21,#8 // j--
    ldr x14,[x3],#8
    str x8,[x1],#8 // rp[j]=tp[j]-np[j]
    cbnz x21,.Lsub
    sbcs x8,x23,x14
    sbcs x19,x19,xzr // did it borrow?
    str x8,[x1],#8 // rp[num-1]
    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    ldr x8,[x0],#8 // rp[0]
    sub x5,x5,#8 // num--
    nop
.Lcond_copy:
    sub x5,x5,#8 // num--
    csel x14,x23,x8,lo // did it borrow?
    ldr x23,[x22],#8
    ldr x8,[x0],#8
    str xzr,[x22,#-16] // wipe tp
    str x14,[x0,#-16]
    cbnz x5,.Lcond_copy
    csel x14,x23,x8,lo
    str xzr,[x22,#-8] // wipe tp
    str x14,[x0,#-8]
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldr x29,[sp],#64
    ret
.size bn_mul_mont,.-bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
    cmp x1,x2
    b.ne __bn_mul4x_mont
.Lsqr8x_mont:
    .inst 0xd503233f // paciasp
    stp x29,x30,[sp,#-128]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    stp x0,x3,[sp,#96] // offload rp and np
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    ldp x10,x11,[x1,#8*4]
    ldp x12,x13,[x1,#8*6]
    sub x2,sp,x5,lsl#4
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    mov sp,x2 // alloca
    sub x27,x5,#8*8
    b .Lsqr8x_zero_start
.Lsqr8x_zero:
    sub x27,x27,#8*8
    stp xzr,xzr,[x2,#8*0]
    stp xzr,xzr,[x2,#8*2]
    stp xzr,xzr,[x2,#8*4]
    stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
    stp xzr,xzr,[x2,#8*8]
    stp xzr,xzr,[x2,#8*10]
    stp xzr,xzr,[x2,#8*12]
    stp xzr,xzr,[x2,#8*14]
    add x2,x2,#8*16
    cbnz x27,.Lsqr8x_zero
    add x3,x1,x5
    add x1,x1,#8*8
    mov x19,xzr
    mov x20,xzr
    mov x21,xzr
    mov x22,xzr
    mov x23,xzr
    mov x24,xzr
    mov x25,xzr
    mov x26,xzr
    mov x2,sp
    str x4,[x29,#112] // offload n0
// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
// a[7]a[6] (vii)
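// Only the products a[j]*a[i] with j>i are accumulated in this pass;
// the triangle is doubled and the squares a[i]*a[i] added in after
// .Lsqr8x_outer_break below.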
    mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
    mul x15,x8,x6
    mul x16,x9,x6
    mul x17,x10,x6
    adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
    mul x14,x11,x6
    adcs x21,x21,x15
    mul x15,x12,x6
    adcs x22,x22,x16
    mul x16,x13,x6
    adcs x23,x23,x17
    umulh x17,x7,x6 // hi(a[1..7]*a[0])
    adcs x24,x24,x14
    umulh x14,x8,x6
    adcs x25,x25,x15
    umulh x15,x9,x6
    adcs x26,x26,x16
    umulh x16,x10,x6
    stp x19,x20,[x2],#8*2 // t[0..1]
    adc x19,xzr,xzr // t[8]
    adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
    umulh x17,x11,x6
    adcs x22,x22,x14
    umulh x14,x12,x6
    adcs x23,x23,x15
    umulh x15,x13,x6
    adcs x24,x24,x16
    mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
    adcs x25,x25,x17
    mul x17,x9,x7
    adcs x26,x26,x14
    mul x14,x10,x7
    adc x19,x19,x15
    mul x15,x11,x7
    adds x22,x22,x16
    mul x16,x12,x7
    adcs x23,x23,x17
    mul x17,x13,x7
    adcs x24,x24,x14
    umulh x14,x8,x7 // hi(a[2..7]*a[1])
    adcs x25,x25,x15
    umulh x15,x9,x7
    adcs x26,x26,x16
    umulh x16,x10,x7
    adcs x19,x19,x17
    umulh x17,x11,x7
    stp x21,x22,[x2],#8*2 // t[2..3]
    adc x20,xzr,xzr // t[9]
    adds x23,x23,x14
    umulh x14,x12,x7
    adcs x24,x24,x15
    umulh x15,x13,x7
    adcs x25,x25,x16
    mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
    adcs x26,x26,x17
    mul x17,x10,x8
    adcs x19,x19,x14
    mul x14,x11,x8
    adc x20,x20,x15
    mul x15,x12,x8
    adds x24,x24,x16
    mul x16,x13,x8
    adcs x25,x25,x17
    umulh x17,x9,x8 // hi(a[3..7]*a[2])
    adcs x26,x26,x14
    umulh x14,x10,x8
    adcs x19,x19,x15
    umulh x15,x11,x8
    adcs x20,x20,x16
    umulh x16,x12,x8
    stp x23,x24,[x2],#8*2 // t[4..5]
    adc x21,xzr,xzr // t[10]
    adds x25,x25,x17
    umulh x17,x13,x8
    adcs x26,x26,x14
    mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
    adcs x19,x19,x15
    mul x15,x11,x9
    adcs x20,x20,x16
    mul x16,x12,x9
    adc x21,x21,x17
    mul x17,x13,x9
    adds x26,x26,x14
    umulh x14,x10,x9 // hi(a[4..7]*a[3])
    adcs x19,x19,x15
    umulh x15,x11,x9
    adcs x20,x20,x16
    umulh x16,x12,x9
    adcs x21,x21,x17
    umulh x17,x13,x9
    stp x25,x26,[x2],#8*2 // t[6..7]
    adc x22,xzr,xzr // t[11]
    adds x19,x19,x14
    mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
    adcs x20,x20,x15
    mul x15,x12,x10
    adcs x21,x21,x16
    mul x16,x13,x10
    adc x22,x22,x17
    umulh x17,x11,x10 // hi(a[5..7]*a[4])
    adds x20,x20,x14
    umulh x14,x12,x10
    adcs x21,x21,x15
    umulh x15,x13,x10
    adcs x22,x22,x16
    mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
    adc x23,xzr,xzr // t[12]
    adds x21,x21,x17
    mul x17,x13,x11
    adcs x22,x22,x14
    umulh x14,x12,x11 // hi(a[6..7]*a[5])
    adc x23,x23,x15
    umulh x15,x13,x11
    adds x22,x22,x16
    mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
    adcs x23,x23,x17
    umulh x17,x13,x12 // hi(a[7]*a[6])
    adc x24,xzr,xzr // t[13]
    adds x23,x23,x14
    sub x27,x3,x1 // done yet?
    adc x24,x24,x15
    adds x24,x24,x16
    sub x14,x3,x5 // rewinded ap
    adc x25,xzr,xzr // t[14]
    add x25,x25,x17
    cbz x27,.Lsqr8x_outer_break
    mov x4,x6
    ldp x6,x7,[x2,#8*0]
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    adds x19,x19,x6
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x0,x1
    adcs x26,xzr,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved below
    mov x27,#-8*8
// a[8]a[0]
// a[9]a[0]
// a[a]a[0]
// a[b]a[0]
// a[c]a[0]
// a[d]a[0]
// a[e]a[0]
// a[f]a[0]
// a[8]a[1]
// a[f]a[1]........................
// a[8]a[2]
// a[f]a[2]........................
// a[8]a[3]
// a[f]a[3]........................
// a[8]a[4]
// a[f]a[4]........................
// a[8]a[5]
// a[f]a[5]........................
// a[8]a[6]
// a[f]a[6]........................
// a[8]a[7]
// a[f]a[7]........................
.Lsqr8x_mul:
    mul x14,x6,x4
    adc x28,xzr,xzr // carry bit, modulo-scheduled
    mul x15,x7,x4
    add x27,x27,#8
    mul x16,x8,x4
    mul x17,x9,x4
    adds x19,x19,x14
    mul x14,x10,x4
    adcs x20,x20,x15
    mul x15,x11,x4
    adcs x21,x21,x16
    mul x16,x12,x4
    adcs x22,x22,x17
    mul x17,x13,x4
    adcs x23,x23,x14
    umulh x14,x6,x4
    adcs x24,x24,x15
    umulh x15,x7,x4
    adcs x25,x25,x16
    umulh x16,x8,x4
    adcs x26,x26,x17
    umulh x17,x9,x4
    adc x28,x28,xzr
    str x19,[x2],#8
    adds x19,x20,x14
    umulh x14,x10,x4
    adcs x20,x21,x15
    umulh x15,x11,x4
    adcs x21,x22,x16
    umulh x16,x12,x4
    adcs x22,x23,x17
    umulh x17,x13,x4
    ldr x4,[x0,x27]
    adcs x23,x24,x14
    adcs x24,x25,x15
    adcs x25,x26,x16
    adcs x26,x28,x17
    //adc x28,xzr,xzr // moved above
    cbnz x27,.Lsqr8x_mul
// note that carry flag is guaranteed
// to be zero at this point
    cmp x1,x3 // done yet?
    b.eq .Lsqr8x_break
    ldp x6,x7,[x2,#8*0]
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    adds x19,x19,x6
    ldr x4,[x0,#-8*8]
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x27,#-8*8
    adcs x26,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved above
    b .Lsqr8x_mul
.align 4
.Lsqr8x_break:
    ldp x6,x7,[x0,#8*0]
    add x1,x0,#8*8
    ldp x8,x9,[x0,#8*2]
    sub x14,x3,x1 // is it last iteration?
    ldp x10,x11,[x0,#8*4]
    sub x15,x2,x14
    ldp x12,x13,[x0,#8*6]
    cbz x14,.Lsqr8x_outer_loop
    stp x19,x20,[x2,#8*0]
    ldp x19,x20,[x15,#8*0]
    stp x21,x22,[x2,#8*2]
    ldp x21,x22,[x15,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[x15,#8*4]
    stp x25,x26,[x2,#8*6]
    mov x2,x15
    ldp x25,x26,[x15,#8*6]
    b .Lsqr8x_outer_loop
.align 4
.Lsqr8x_outer_break:
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
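// The doubling is done with extr: "extr xA,xB,xA,#63" computes
// (xB:xA)>>63 = (xB<<1)|(xA>>63), i.e. limb xB doubled with the bit
// shifted out of the previous limb xA carried in.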
    ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
    ldp x15,x16,[sp,#8*1]
    ldp x11,x13,[x14,#8*2]
    add x1,x14,#8*4
    ldp x17,x14,[sp,#8*3]
    stp x19,x20,[x2,#8*0]
    mul x19,x7,x7
    stp x21,x22,[x2,#8*2]
    umulh x7,x7,x7
    stp x23,x24,[x2,#8*4]
    mul x8,x9,x9
    stp x25,x26,[x2,#8*6]
    mov x2,sp
    umulh x9,x9,x9
    adds x20,x7,x15,lsl#1
    extr x15,x16,x15,#63
    sub x27,x5,#8*4
.Lsqr4x_shift_n_add:
    adcs x21,x8,x15
    extr x16,x17,x16,#63
    sub x27,x27,#8*4
    adcs x22,x9,x16
    ldp x15,x16,[x2,#8*5]
    mul x10,x11,x11
    ldp x7,x9,[x1],#8*2
    umulh x11,x11,x11
    mul x12,x13,x13
    umulh x13,x13,x13
    extr x17,x14,x17,#63
    stp x19,x20,[x2,#8*0]
    adcs x23,x10,x17
    extr x14,x15,x14,#63
    stp x21,x22,[x2,#8*2]
    adcs x24,x11,x14
    ldp x17,x14,[x2,#8*7]
    extr x15,x16,x15,#63
    adcs x25,x12,x15
    extr x16,x17,x16,#63
    adcs x26,x13,x16
    ldp x15,x16,[x2,#8*9]
    mul x6,x7,x7
    ldp x11,x13,[x1],#8*2
    umulh x7,x7,x7
    mul x8,x9,x9
    umulh x9,x9,x9
    stp x23,x24,[x2,#8*4]
    extr x17,x14,x17,#63
    stp x25,x26,[x2,#8*6]
    add x2,x2,#8*8
    adcs x19,x6,x17
    extr x14,x15,x14,#63
    adcs x20,x7,x14
    ldp x17,x14,[x2,#8*3]
    extr x15,x16,x15,#63
    cbnz x27,.Lsqr4x_shift_n_add
    ldp x1,x4,[x29,#104] // pull np and n0
    adcs x21,x8,x15
    extr x16,x17,x16,#63
    adcs x22,x9,x16
    ldp x15,x16,[x2,#8*5]
    mul x10,x11,x11
    umulh x11,x11,x11
    stp x19,x20,[x2,#8*0]
    mul x12,x13,x13
    umulh x13,x13,x13
    stp x21,x22,[x2,#8*2]
    extr x17,x14,x17,#63
    adcs x23,x10,x17
    extr x14,x15,x14,#63
    ldp x19,x20,[sp,#8*0]
    adcs x24,x11,x14
    extr x15,x16,x15,#63
    ldp x6,x7,[x1,#8*0]
    adcs x25,x12,x15
    extr x16,xzr,x16,#63
    ldp x8,x9,[x1,#8*2]
    adc x26,x13,x16
    ldp x10,x11,[x1,#8*4]
// Reduce by 512 bits per iteration
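// (512 bits = eight 64-bit limbs: .Lsqr8x_reduction below runs with
// x27=8, folding one limb of t[] per pass)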
    mul x28,x4,x19 // t[0]*n0
    ldp x12,x13,[x1,#8*6]
    add x3,x1,x5
    ldp x21,x22,[sp,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[sp,#8*4]
    stp x25,x26,[x2,#8*6]
    ldp x25,x26,[sp,#8*6]
    add x1,x1,#8*8
    mov x30,xzr // initial top-most carry
    mov x2,sp
    mov x27,#8
.Lsqr8x_reduction:
// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
    mul x15,x7,x28
    sub x27,x27,#1
    mul x16,x8,x28
    str x28,[x2],#8 // put aside t[0]*n0 for tail processing
    mul x17,x9,x28
// (*) adds xzr,x19,x14
    subs xzr,x19,#1 // (*)
    mul x14,x10,x28
    adcs x19,x20,x15
    mul x15,x11,x28
    adcs x20,x21,x16
    mul x16,x12,x28
    adcs x21,x22,x17
    mul x17,x13,x28
    adcs x22,x23,x14
    umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
    adcs x23,x24,x15
    umulh x15,x7,x28
    adcs x24,x25,x16
    umulh x16,x8,x28
    adcs x25,x26,x17
    umulh x17,x9,x28
    adc x26,xzr,xzr
    adds x19,x19,x14
    umulh x14,x10,x28
    adcs x20,x20,x15
    umulh x15,x11,x28
    adcs x21,x21,x16
    umulh x16,x12,x28
    adcs x22,x22,x17
    umulh x17,x13,x28
    mul x28,x4,x19 // next t[0]*n0
    adcs x23,x23,x14
    adcs x24,x24,x15
    adcs x25,x25,x16
    adc x26,x26,x17
    cbnz x27,.Lsqr8x_reduction
    ldp x14,x15,[x2,#8*0]
    ldp x16,x17,[x2,#8*2]
    mov x0,x2
    sub x27,x3,x1 // done yet?
    adds x19,x19,x14
    adcs x20,x20,x15
    ldp x14,x15,[x2,#8*4]
    adcs x21,x21,x16
    adcs x22,x22,x17
    ldp x16,x17,[x2,#8*6]
    adcs x23,x23,x14
    adcs x24,x24,x15
    adcs x25,x25,x16
    adcs x26,x26,x17
    //adc x28,xzr,xzr // moved below
    cbz x27,.Lsqr8x8_post_condition
    ldr x4,[x2,#-8*8]
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    ldp x10,x11,[x1,#8*4]
    mov x27,#-8*8
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
.Lsqr8x_tail:
    mul x14,x6,x4
    adc x28,xzr,xzr // carry bit, modulo-scheduled
    mul x15,x7,x4
    add x27,x27,#8
    mul x16,x8,x4
    mul x17,x9,x4
    adds x19,x19,x14
    mul x14,x10,x4
    adcs x20,x20,x15
    mul x15,x11,x4
    adcs x21,x21,x16
    mul x16,x12,x4
    adcs x22,x22,x17
    mul x17,x13,x4
    adcs x23,x23,x14
    umulh x14,x6,x4
    adcs x24,x24,x15
    umulh x15,x7,x4
    adcs x25,x25,x16
    umulh x16,x8,x4
    adcs x26,x26,x17
    umulh x17,x9,x4
    adc x28,x28,xzr
    str x19,[x2],#8
    adds x19,x20,x14
    umulh x14,x10,x4
    adcs x20,x21,x15
    umulh x15,x11,x4
    adcs x21,x22,x16
    umulh x16,x12,x4
    adcs x22,x23,x17
    umulh x17,x13,x4
    ldr x4,[x0,x27]
    adcs x23,x24,x14
    adcs x24,x25,x15
    adcs x25,x26,x16
    adcs x26,x28,x17
    //adc x28,xzr,xzr // moved above
    cbnz x27,.Lsqr8x_tail
// note that carry flag is guaranteed
// to be zero at this point
    ldp x6,x7,[x2,#8*0]
    sub x27,x3,x1 // done yet?
    sub x16,x3,x5 // rewinded np
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    cbz x27,.Lsqr8x_tail_break
    ldr x4,[x0,#-8*8]
    adds x19,x19,x6
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x27,#-8*8
    adcs x26,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved above
    b .Lsqr8x_tail
.align 4
.Lsqr8x_tail_break:
    ldr x4,[x29,#112] // pull n0
    add x27,x2,#8*8 // end of current t[num] window
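// x30 holds the saved top-most carry (0 or 1); subtracting 1 sets the
// C flag exactly when x30 is non-zero, reloading that carry into the
// flags for the adcs chain below.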
    subs xzr,x30,#1 // "move" top-most carry to carry bit
    adcs x14,x19,x6
    adcs x15,x20,x7
    ldp x19,x20,[x0,#8*0]
    adcs x21,x21,x8
    ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
    adcs x22,x22,x9
    ldp x8,x9,[x16,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x16,#8*4]
    adcs x25,x25,x12
    adcs x26,x26,x13
    ldp x12,x13,[x16,#8*6]
    add x1,x16,#8*8
    adc x30,xzr,xzr // top-most carry
    mul x28,x4,x19
    stp x14,x15,[x2,#8*0]
    stp x21,x22,[x2,#8*2]
    ldp x21,x22,[x0,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[x0,#8*4]
    cmp x27,x29 // did we hit the bottom?
    stp x25,x26,[x2,#8*6]
    mov x2,x0 // slide the window
    ldp x25,x26,[x0,#8*6]
    mov x27,#8
    b.ne .Lsqr8x_reduction
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus unconditionally,
// check whether the subtraction borrowed, and conditionally copy
// the original value back.
    ldr x0,[x29,#96] // pull rp
    add x2,x2,#8*8
    subs x14,x19,x6
    sbcs x15,x20,x7
    sub x27,x5,#8*8
    mov x3,x0 // x0 copy
.Lsqr8x_sub:
    sbcs x16,x21,x8
    ldp x6,x7,[x1,#8*0]
    sbcs x17,x22,x9
    stp x14,x15,[x0,#8*0]
    sbcs x14,x23,x10
    ldp x8,x9,[x1,#8*2]
    sbcs x15,x24,x11
    stp x16,x17,[x0,#8*2]
    sbcs x16,x25,x12
    ldp x10,x11,[x1,#8*4]
    sbcs x17,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    ldp x19,x20,[x2,#8*0]
    sub x27,x27,#8*8
    ldp x21,x22,[x2,#8*2]
    ldp x23,x24,[x2,#8*4]
    ldp x25,x26,[x2,#8*6]
    add x2,x2,#8*8
    stp x14,x15,[x0,#8*4]
    sbcs x14,x19,x6
    stp x16,x17,[x0,#8*6]
    add x0,x0,#8*8
    sbcs x15,x20,x7
    cbnz x27,.Lsqr8x_sub
    sbcs x16,x21,x8
    mov x2,sp
    add x1,sp,x5
    ldp x6,x7,[x3,#8*0]
    sbcs x17,x22,x9
    stp x14,x15,[x0,#8*0]
    sbcs x14,x23,x10
    ldp x8,x9,[x3,#8*2]
    sbcs x15,x24,x11
    stp x16,x17,[x0,#8*2]
    sbcs x16,x25,x12
    ldp x19,x20,[x1,#8*0]
    sbcs x17,x26,x13
    ldp x21,x22,[x1,#8*2]
    sbcs xzr,x30,xzr // did it borrow?
    ldr x30,[x29,#8] // pull return address
    stp x14,x15,[x0,#8*4]
    stp x16,x17,[x0,#8*6]
    sub x27,x5,#8*4
.Lsqr4x_cond_copy:
    sub x27,x27,#8*4
    csel x14,x19,x6,lo
    stp xzr,xzr,[x2,#8*0]
    csel x15,x20,x7,lo
    ldp x6,x7,[x3,#8*4]
    ldp x19,x20,[x1,#8*4]
    csel x16,x21,x8,lo
    stp xzr,xzr,[x2,#8*2]
    add x2,x2,#8*4
    csel x17,x22,x9,lo
    ldp x8,x9,[x3,#8*6]
    ldp x21,x22,[x1,#8*6]
    add x1,x1,#8*4
    stp x14,x15,[x3,#8*0]
    stp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    stp xzr,xzr,[x1,#8*0]
    stp xzr,xzr,[x1,#8*2]
    cbnz x27,.Lsqr4x_cond_copy
    csel x14,x19,x6,lo
    stp xzr,xzr,[x2,#8*0]
    csel x15,x20,x7,lo
    stp xzr,xzr,[x2,#8*2]
    csel x16,x21,x8,lo
    csel x17,x22,x9,lo
    stp x14,x15,[x3,#8*0]
    stp x16,x17,[x3,#8*2]
    b .Lsqr8x_done
.align 4
.Lsqr8x8_post_condition:
    adc x28,xzr,xzr
    ldr x30,[x29,#8] // pull return address
// x19-x26,x28 hold result, x6-x13 hold modulus
    subs x6,x19,x6
    ldr x1,[x29,#96] // pull rp
    sbcs x7,x20,x7
    stp xzr,xzr,[sp,#8*0]
    sbcs x8,x21,x8
    stp xzr,xzr,[sp,#8*2]
    sbcs x9,x22,x9
    stp xzr,xzr,[sp,#8*4]
    sbcs x10,x23,x10
    stp xzr,xzr,[sp,#8*6]
    sbcs x11,x24,x11
    stp xzr,xzr,[sp,#8*8]
    sbcs x12,x25,x12
    stp xzr,xzr,[sp,#8*10]
    sbcs x13,x26,x13
    stp xzr,xzr,[sp,#8*12]
    sbcs x28,x28,xzr // did it borrow?
    stp xzr,xzr,[sp,#8*14]
// x6-x13 hold result-modulus
    csel x6,x19,x6,lo
    csel x7,x20,x7,lo
    csel x8,x21,x8,lo
    csel x9,x22,x9,lo
    stp x6,x7,[x1,#8*0]
    csel x10,x23,x10,lo
    csel x11,x24,x11,lo
    stp x8,x9,[x1,#8*2]
    csel x12,x25,x12,lo
    csel x13,x26,x13,lo
    stp x10,x11,[x1,#8*4]
    stp x12,x13,[x1,#8*6]
.Lsqr8x_done:
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldr x29,[sp],#128
    .inst 0xd50323bf // autiasp
    ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
    .inst 0xd503233f // paciasp
    stp x29,x30,[sp,#-128]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    sub x26,sp,x5,lsl#3
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    sub sp,x26,#8*4 // alloca
    add x10,x2,x5
    add x27,x1,x5
    stp x0,x10,[x29,#96] // offload rp and &b[num]
    ldr x24,[x2,#8*0] // b[0]
    ldp x6,x7,[x1,#8*0] // a[0..3]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    mov x19,xzr
    mov x20,xzr
    mov x21,xzr
    mov x22,xzr
    ldp x14,x15,[x3,#8*0] // n[0..3]
    ldp x16,x17,[x3,#8*2]
    adds x3,x3,#8*4 // clear carry bit
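// (the adds above cannot carry out of 64 bits for a valid pointer, so
// it reliably enters the loop with C=0)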
    mov x0,xzr
    mov x28,#0
    mov x26,sp
.Loop_mul4x_1st_reduction:
    mul x10,x6,x24 // lo(a[0..3]*b[0])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
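// x28 cycles through 8,16,24,0, so the "ldr x24,[x2,x28]" below picks
// up b[1..3] and then wraps to b[0] for the next reduction round.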
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[0..3]*b[0])
    adcs x20,x20,x11
    mul x25,x19,x4 // t[0]*n0
    adcs x21,x21,x12
    umulh x11,x7,x24
    adcs x22,x22,x13
    umulh x12,x8,x24
    adc x23,xzr,xzr
    umulh x13,x9,x24
    ldr x24,[x2,x28] // next b[i] (or b[0])
    adds x20,x20,x10
// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
    str x25,[x26],#8 // put aside t[0]*n0 for tail processing
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
// (*) adds xzr,x19,x10
    subs xzr,x19,#1 // (*)
    umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
    adcs x19,x20,x11
    umulh x11,x15,x25
    adcs x20,x21,x12
    umulh x12,x16,x25
    adcs x21,x22,x13
    umulh x13,x17,x25
    adcs x22,x23,x0
    adc x0,xzr,xzr
    adds x19,x19,x10
    sub x10,x27,x1
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_1st_reduction
    cbz x10,.Lmul4x4_post_condition
    ldp x6,x7,[x1,#8*0] // a[4..7]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    ldr x25,[sp] // a[0]*n0
    ldp x14,x15,[x3,#8*0] // n[4..7]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
.Loop_mul4x_1st_tail:
    mul x10,x6,x24 // lo(a[4..7]*b[i])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[4..7]*b[i])
    adcs x20,x20,x11
    umulh x11,x7,x24
    adcs x21,x21,x12
    umulh x12,x8,x24
    adcs x22,x22,x13
    umulh x13,x9,x24
    adc x23,xzr,xzr
    ldr x24,[x2,x28] // next b[i] (or b[0])
    adds x20,x20,x10
    mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    adds x19,x19,x10
    umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
    adcs x20,x20,x11
    umulh x11,x15,x25
    adcs x21,x21,x12
    umulh x12,x16,x25
    adcs x22,x22,x13
    adcs x23,x23,x0
    umulh x13,x17,x25
    adc x0,xzr,xzr
    ldr x25,[sp,x28] // next t[0]*n0
    str x19,[x26],#8 // result!!!
    adds x19,x20,x10
    sub x10,x27,x1 // done yet?
    adcs x20,x21,x11
    adcs x21,x22,x12
    adcs x22,x23,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_1st_tail
    sub x11,x27,x5 // rewinded x1
    cbz x10,.Lmul4x_proceed
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    ldp x14,x15,[x3,#8*0]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    b .Loop_mul4x_1st_tail
.align 5
.Lmul4x_proceed:
    ldr x24,[x2,#8*4]! // *++b
    adc x30,x0,xzr
    ldp x6,x7,[x11,#8*0] // a[0..3]
    sub x3,x3,x5 // rewind np
    ldp x8,x9,[x11,#8*2]
    add x1,x11,#8*4
    stp x19,x20,[x26,#8*0] // result!!!
    ldp x19,x20,[sp,#8*4] // t[0..3]
    stp x21,x22,[x26,#8*2] // result!!!
    ldp x21,x22,[sp,#8*6]
    ldp x14,x15,[x3,#8*0] // n[0..3]
    mov x26,sp
    ldp x16,x17,[x3,#8*2]
    adds x3,x3,#8*4 // clear carry bit
    mov x0,xzr
.align 4
.Loop_mul4x_reduction:
    mul x10,x6,x24 // lo(a[0..3]*b[4])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[0..3]*b[4])
    adcs x20,x20,x11
    mul x25,x19,x4 // t[0]*n0
    adcs x21,x21,x12
    umulh x11,x7,x24
    adcs x22,x22,x13
    umulh x12,x8,x24
    adc x23,xzr,xzr
    umulh x13,x9,x24
    ldr x24,[x2,x28] // next b[i]
    adds x20,x20,x10
// (*) mul x10,x14,x25
    str x25,[x26],#8 // put aside t[0]*n0 for tail processing
    adcs x21,x21,x11
    mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
// (*) adds xzr,x19,x10
    subs xzr,x19,#1 // (*)
    umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
    adcs x19,x20,x11
    umulh x11,x15,x25
    adcs x20,x21,x12
    umulh x12,x16,x25
    adcs x21,x22,x13
    umulh x13,x17,x25
    adcs x22,x23,x0
    adc x0,xzr,xzr
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_reduction
    adc x0,x0,xzr
    ldp x10,x11,[x26,#8*4] // t[4..7]
    ldp x12,x13,[x26,#8*6]
    ldp x6,x7,[x1,#8*0] // a[4..7]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    ldr x25,[sp] // t[0]*n0
    ldp x14,x15,[x3,#8*0] // n[4..7]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
.align 4
.Loop_mul4x_tail:
    mul x10,x6,x24 // lo(a[4..7]*b[4])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[4..7]*b[4])
    adcs x20,x20,x11
    umulh x11,x7,x24
    adcs x21,x21,x12
    umulh x12,x8,x24
    adcs x22,x22,x13
    umulh x13,x9,x24
    adc x23,xzr,xzr
    ldr x24,[x2,x28] // next b[i]
    adds x20,x20,x10
    mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    adds x19,x19,x10
    umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
    adcs x20,x20,x11
    umulh x11,x15,x25
    adcs x21,x21,x12
    umulh x12,x16,x25
    adcs x22,x22,x13
    umulh x13,x17,x25
    adcs x23,x23,x0
    ldr x25,[sp,x28] // next a[0]*n0
    adc x0,xzr,xzr
    str x19,[x26],#8 // result!!!
    adds x19,x20,x10
    sub x10,x27,x1 // done yet?
    adcs x20,x21,x11
    adcs x21,x22,x12
    adcs x22,x23,x13
    //adc x0,x0,xzr
    cbnz x28,.Loop_mul4x_tail
    sub x11,x3,x5 // rewinded np?
    adc x0,x0,xzr
    cbz x10,.Loop_mul4x_break
    ldp x10,x11,[x26,#8*4]
    ldp x12,x13,[x26,#8*6]
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    ldp x14,x15,[x3,#8*0]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    b .Loop_mul4x_tail
.align 4
.Loop_mul4x_break:
    ldp x12,x13,[x29,#96] // pull rp and &b[num]
    adds x19,x19,x30
    add x2,x2,#8*4 // bp++
    adcs x20,x20,xzr
    sub x1,x1,x5 // rewind ap
    adcs x21,x21,xzr
    stp x19,x20,[x26,#8*0] // result!!!
    adcs x22,x22,xzr
    ldp x19,x20,[sp,#8*4] // t[0..3]
    adc x30,x0,xzr
    stp x21,x22,[x26,#8*2] // result!!!
    cmp x2,x13 // done yet?
    ldp x21,x22,[sp,#8*6]
    ldp x14,x15,[x11,#8*0] // n[0..3]
    ldp x16,x17,[x11,#8*2]
    add x3,x11,#8*4
    b.eq .Lmul4x_post
    ldr x24,[x2]
    ldp x6,x7,[x1,#8*0] // a[0..3]
    ldp x8,x9,[x1,#8*2]
    adds x1,x1,#8*4 // clear carry bit
    mov x0,xzr
    mov x26,sp
    b .Loop_mul4x_reduction
.align 4
.Lmul4x_post:
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus unconditionally,
// check whether the subtraction borrowed, and conditionally copy
// the original value back.
    mov x0,x12
    mov x27,x12 // x0 copy
    subs x10,x19,x14
    add x26,sp,#8*8
    sbcs x11,x20,x15
    sub x28,x5,#8*4
.Lmul4x_sub:
    sbcs x12,x21,x16
    ldp x14,x15,[x3,#8*0]
    sub x28,x28,#8*4
    ldp x19,x20,[x26,#8*0]
    sbcs x13,x22,x17
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    ldp x21,x22,[x26,#8*2]
    add x26,x26,#8*4
    stp x10,x11,[x0,#8*0]
    sbcs x10,x19,x14
    stp x12,x13,[x0,#8*2]
    add x0,x0,#8*4
    sbcs x11,x20,x15
    cbnz x28,.Lmul4x_sub
    sbcs x12,x21,x16
    mov x26,sp
    add x1,sp,#8*4
    ldp x6,x7,[x27,#8*0]
    sbcs x13,x22,x17
    stp x10,x11,[x0,#8*0]
    ldp x8,x9,[x27,#8*2]
    stp x12,x13,[x0,#8*2]
    ldp x19,x20,[x1,#8*0]
    ldp x21,x22,[x1,#8*2]
    sbcs xzr,x30,xzr // did it borrow?
    ldr x30,[x29,#8] // pull return address
    sub x28,x5,#8*4
.Lmul4x_cond_copy:
    sub x28,x28,#8*4
    csel x10,x19,x6,lo
    stp xzr,xzr,[x26,#8*0]
    csel x11,x20,x7,lo
    ldp x6,x7,[x27,#8*4]
    ldp x19,x20,[x1,#8*4]
    csel x12,x21,x8,lo
    stp xzr,xzr,[x26,#8*2]
    add x26,x26,#8*4
    csel x13,x22,x9,lo
    ldp x8,x9,[x27,#8*6]
    ldp x21,x22,[x1,#8*6]
    add x1,x1,#8*4
    stp x10,x11,[x27,#8*0]
    stp x12,x13,[x27,#8*2]
    add x27,x27,#8*4
    cbnz x28,.Lmul4x_cond_copy
    csel x10,x19,x6,lo
    stp xzr,xzr,[x26,#8*0]
    csel x11,x20,x7,lo
    stp xzr,xzr,[x26,#8*2]
    csel x12,x21,x8,lo
    stp xzr,xzr,[x26,#8*3]
    csel x13,x22,x9,lo
    stp xzr,xzr,[x26,#8*4]
    stp x10,x11,[x27,#8*0]
    stp x12,x13,[x27,#8*2]
    b .Lmul4x_done
.align 4
.Lmul4x4_post_condition:
    adc x0,x0,xzr
    ldr x1,[x29,#96] // pull rp
// x19-x22,x0 hold result, x14-x17 hold modulus
    subs x6,x19,x14
    ldr x30,[x29,#8] // pull return address
    sbcs x7,x20,x15
    stp xzr,xzr,[sp,#8*0]
    sbcs x8,x21,x16
    stp xzr,xzr,[sp,#8*2]
    sbcs x9,x22,x17
    stp xzr,xzr,[sp,#8*4]
    sbcs xzr,x0,xzr // did it borrow?
    stp xzr,xzr,[sp,#8*6]
// x6-x9 hold result-modulus
    csel x6,x19,x6,lo
    csel x7,x20,x7,lo
    csel x8,x21,x8,lo
    csel x9,x22,x9,lo
    stp x6,x7,[x1,#8*0]
    stp x8,x9,[x1,#8*2]
.Lmul4x_done:
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldr x29,[sp],#128
    .inst 0xd50323bf // autiasp
    ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
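// (the .byte string above decodes to "Montgomery Multiplication for
// ARMv8, CRYPTOGAMS by <appro@openssl.org>")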
.align 2
.align 4