sha512-armv8.S 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618
  1. // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
  2. //
  3. // Licensed under the OpenSSL license (the "License"). You may not use
  4. // this file except in compliance with the License. You can obtain a copy
  5. // in the file LICENSE in the source distribution or at
  6. // https://www.openssl.org/source/license.html
  7. // ====================================================================
  8. // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  9. // project. The module is, however, dual licensed under OpenSSL and
  10. // CRYPTOGAMS licenses depending on where you obtain it. For further
  11. // details see http://www.openssl.org/~appro/cryptogams/.
  12. //
  13. // Permission to use under GPLv2 terms is granted.
  14. // ====================================================================
  15. //
  16. // SHA256/512 for ARMv8.
  17. //
  18. // Performance in cycles per processed byte and improvement coefficient
  19. // over code generated with "default" compiler:
  20. //
  21. //             SHA256-hw   SHA256(*)      SHA512
  22. // Apple A7    1.97        10.5 (+33%)    6.73 (-1%(**))
  23. // Cortex-A53  2.38        15.5 (+115%)   10.0 (+150%(***))
  24. // Cortex-A57  2.31        11.6 (+86%)    7.51 (+260%(***))
  25. // Denver      2.01        10.5 (+26%)    6.70 (+8%)
  26. // X-Gene                  20.0 (+100%)   12.8 (+300%(***))
  27. // Mongoose    2.36        13.0 (+50%)    8.36 (+33%)
  28. // Kryo        1.92        17.4 (+30%)    11.2 (+8%)
  29. //
  30. // (*) Software SHA256 results are of lesser relevance, presented
  31. // mostly for informational purposes.
  32. // (**) The result is a trade-off: it's possible to improve it by
  33. // 10% (or by 1 cycle per round), but at the cost of a 20% loss
  34. // on Cortex-A53 (or 4 cycles per round).
  35. // (***) Super-impressive coefficients over gcc-generated code are
  36. // an indication of some compiler "pathology"; most notably, code
  37. // generated with -mgeneral-regs-only is significantly faster,
  38. // and the gap is then only 40-90%.
  39. //
  40. // October 2016.
  41. //
  42. // Originally it was reckoned that it makes no sense to implement a NEON
  43. // version of SHA256 for 64-bit processors. This is because the performance
  44. // improvement on the most widespread Cortex-A5x processors was observed
  45. // to be marginal: the same on Cortex-A53 and ~10% better on A57. But then
  46. // it was observed that the 32-bit NEON SHA256 performs significantly better
  47. // than the 64-bit scalar version on *some* of the more recent processors. As
  48. // a result, a 64-bit NEON version of SHA256 was added to provide the best
  49. // all-round performance. For example, it executes ~30% faster on X-Gene
  50. // and Mongoose. [For reference, a NEON version of SHA512 is bound to
  51. // deliver much less improvement, likely *negative* on Cortex-A5x,
  52. // which is why NEON support is limited to SHA256.]
  53. #ifndef __KERNEL__ // the header below is included only when __KERNEL__ is not defined
  54. # include "arm_arch.h" // NOTE(review): presumably provides ARMV8_SHA512 tested at function entry - confirm
  55. #endif
  56. .text // everything that follows is emitted into the code section
  57. .private_extern _OPENSSL_armcap_P // capability-word symbol; the entry code reaches it indirectly through the LOPENSSL_armcap_P label (NOTE(review): presumably an offset stored there - confirm)
  58. .globl _sha512_block_data_order // make the entry point visible to other object files
  59. .align 6 // NOTE(review): presumably a power-of-two alignment (2^6 = 64 bytes) on this assembler - confirm
  60. _sha512_block_data_order:
  61. #ifndef __KERNEL__
  62. # ifdef __ILP32__
  63. ldrsw x16,LOPENSSL_armcap_P
  64. # else
  65. ldr x16,LOPENSSL_armcap_P
  66. # endif
  67. adr x17,LOPENSSL_armcap_P
  68. add x16,x16,x17
  69. ldr w16,[x16]
  70. tst w16,#ARMV8_SHA512
  71. b.ne Lv8_entry
  72. #endif
  73. .long 0xd503233f // paciasp
  74. stp x29,x30,[sp,#-128]!
  75. add x29,sp,#0
  76. stp x19,x20,[sp,#16]
  77. stp x21,x22,[sp,#32]
  78. stp x23,x24,[sp,#48]
  79. stp x25,x26,[sp,#64]
  80. stp x27,x28,[sp,#80]
  81. sub sp,sp,#4*8
  82. ldp x20,x21,[x0] // load context
  83. ldp x22,x23,[x0,#2*8]
  84. ldp x24,x25,[x0,#4*8]
  85. add x2,x1,x2,lsl#7 // end of input
  86. ldp x26,x27,[x0,#6*8]
  87. adr x30,LK512
  88. stp x0,x2,[x29,#96]
  89. Loop:
  90. ldp x3,x4,[x1],#2*8
  91. ldr x19,[x30],#8 // *K++
  92. eor x28,x21,x22 // magic seed
  93. str x1,[x29,#112]
  94. #ifndef __AARCH64EB__
  95. rev x3,x3 // 0
  96. #endif
  97. ror x16,x24,#14
  98. add x27,x27,x19 // h+=K[i]
  99. eor x6,x24,x24,ror#23
  100. and x17,x25,x24
  101. bic x19,x26,x24
  102. add x27,x27,x3 // h+=X[i]
  103. orr x17,x17,x19 // Ch(e,f,g)
  104. eor x19,x20,x21 // a^b, b^c in next round
  105. eor x16,x16,x6,ror#18 // Sigma1(e)
  106. ror x6,x20,#28
  107. add x27,x27,x17 // h+=Ch(e,f,g)
  108. eor x17,x20,x20,ror#5
  109. add x27,x27,x16 // h+=Sigma1(e)
  110. and x28,x28,x19 // (b^c)&=(a^b)
  111. add x23,x23,x27 // d+=h
  112. eor x28,x28,x21 // Maj(a,b,c)
  113. eor x17,x6,x17,ror#34 // Sigma0(a)
  114. add x27,x27,x28 // h+=Maj(a,b,c)
  115. ldr x28,[x30],#8 // *K++, x19 in next round
  116. //add x27,x27,x17 // h+=Sigma0(a)
  117. #ifndef __AARCH64EB__
  118. rev x4,x4 // 1
  119. #endif
  120. ldp x5,x6,[x1],#2*8
  121. add x27,x27,x17 // h+=Sigma0(a)
  122. ror x16,x23,#14
  123. add x26,x26,x28 // h+=K[i]
  124. eor x7,x23,x23,ror#23
  125. and x17,x24,x23
  126. bic x28,x25,x23
  127. add x26,x26,x4 // h+=X[i]
  128. orr x17,x17,x28 // Ch(e,f,g)
  129. eor x28,x27,x20 // a^b, b^c in next round
  130. eor x16,x16,x7,ror#18 // Sigma1(e)
  131. ror x7,x27,#28
  132. add x26,x26,x17 // h+=Ch(e,f,g)
  133. eor x17,x27,x27,ror#5
  134. add x26,x26,x16 // h+=Sigma1(e)
  135. and x19,x19,x28 // (b^c)&=(a^b)
  136. add x22,x22,x26 // d+=h
  137. eor x19,x19,x20 // Maj(a,b,c)
  138. eor x17,x7,x17,ror#34 // Sigma0(a)
  139. add x26,x26,x19 // h+=Maj(a,b,c)
  140. ldr x19,[x30],#8 // *K++, x28 in next round
  141. //add x26,x26,x17 // h+=Sigma0(a)
  142. #ifndef __AARCH64EB__
  143. rev x5,x5 // 2
  144. #endif
  145. add x26,x26,x17 // h+=Sigma0(a)
  146. ror x16,x22,#14
  147. add x25,x25,x19 // h+=K[i]
  148. eor x8,x22,x22,ror#23
  149. and x17,x23,x22
  150. bic x19,x24,x22
  151. add x25,x25,x5 // h+=X[i]
  152. orr x17,x17,x19 // Ch(e,f,g)
  153. eor x19,x26,x27 // a^b, b^c in next round
  154. eor x16,x16,x8,ror#18 // Sigma1(e)
  155. ror x8,x26,#28
  156. add x25,x25,x17 // h+=Ch(e,f,g)
  157. eor x17,x26,x26,ror#5
  158. add x25,x25,x16 // h+=Sigma1(e)
  159. and x28,x28,x19 // (b^c)&=(a^b)
  160. add x21,x21,x25 // d+=h
  161. eor x28,x28,x27 // Maj(a,b,c)
  162. eor x17,x8,x17,ror#34 // Sigma0(a)
  163. add x25,x25,x28 // h+=Maj(a,b,c)
  164. ldr x28,[x30],#8 // *K++, x19 in next round
  165. //add x25,x25,x17 // h+=Sigma0(a)
  166. #ifndef __AARCH64EB__
  167. rev x6,x6 // 3
  168. #endif
  169. ldp x7,x8,[x1],#2*8
  170. add x25,x25,x17 // h+=Sigma0(a)
  171. ror x16,x21,#14
  172. add x24,x24,x28 // h+=K[i]
  173. eor x9,x21,x21,ror#23
  174. and x17,x22,x21
  175. bic x28,x23,x21
  176. add x24,x24,x6 // h+=X[i]
  177. orr x17,x17,x28 // Ch(e,f,g)
  178. eor x28,x25,x26 // a^b, b^c in next round
  179. eor x16,x16,x9,ror#18 // Sigma1(e)
  180. ror x9,x25,#28
  181. add x24,x24,x17 // h+=Ch(e,f,g)
  182. eor x17,x25,x25,ror#5
  183. add x24,x24,x16 // h+=Sigma1(e)
  184. and x19,x19,x28 // (b^c)&=(a^b)
  185. add x20,x20,x24 // d+=h
  186. eor x19,x19,x26 // Maj(a,b,c)
  187. eor x17,x9,x17,ror#34 // Sigma0(a)
  188. add x24,x24,x19 // h+=Maj(a,b,c)
  189. ldr x19,[x30],#8 // *K++, x28 in next round
  190. //add x24,x24,x17 // h+=Sigma0(a)
  191. #ifndef __AARCH64EB__
  192. rev x7,x7 // 4
  193. #endif
  194. add x24,x24,x17 // h+=Sigma0(a)
  195. ror x16,x20,#14
  196. add x23,x23,x19 // h+=K[i]
  197. eor x10,x20,x20,ror#23
  198. and x17,x21,x20
  199. bic x19,x22,x20
  200. add x23,x23,x7 // h+=X[i]
  201. orr x17,x17,x19 // Ch(e,f,g)
  202. eor x19,x24,x25 // a^b, b^c in next round
  203. eor x16,x16,x10,ror#18 // Sigma1(e)
  204. ror x10,x24,#28
  205. add x23,x23,x17 // h+=Ch(e,f,g)
  206. eor x17,x24,x24,ror#5
  207. add x23,x23,x16 // h+=Sigma1(e)
  208. and x28,x28,x19 // (b^c)&=(a^b)
  209. add x27,x27,x23 // d+=h
  210. eor x28,x28,x25 // Maj(a,b,c)
  211. eor x17,x10,x17,ror#34 // Sigma0(a)
  212. add x23,x23,x28 // h+=Maj(a,b,c)
  213. ldr x28,[x30],#8 // *K++, x19 in next round
  214. //add x23,x23,x17 // h+=Sigma0(a)
  215. #ifndef __AARCH64EB__
  216. rev x8,x8 // 5
  217. #endif
  218. ldp x9,x10,[x1],#2*8
  219. add x23,x23,x17 // h+=Sigma0(a)
  220. ror x16,x27,#14
  221. add x22,x22,x28 // h+=K[i]
  222. eor x11,x27,x27,ror#23
  223. and x17,x20,x27
  224. bic x28,x21,x27
  225. add x22,x22,x8 // h+=X[i]
  226. orr x17,x17,x28 // Ch(e,f,g)
  227. eor x28,x23,x24 // a^b, b^c in next round
  228. eor x16,x16,x11,ror#18 // Sigma1(e)
  229. ror x11,x23,#28
  230. add x22,x22,x17 // h+=Ch(e,f,g)
  231. eor x17,x23,x23,ror#5
  232. add x22,x22,x16 // h+=Sigma1(e)
  233. and x19,x19,x28 // (b^c)&=(a^b)
  234. add x26,x26,x22 // d+=h
  235. eor x19,x19,x24 // Maj(a,b,c)
  236. eor x17,x11,x17,ror#34 // Sigma0(a)
  237. add x22,x22,x19 // h+=Maj(a,b,c)
  238. ldr x19,[x30],#8 // *K++, x28 in next round
  239. //add x22,x22,x17 // h+=Sigma0(a)
  240. #ifndef __AARCH64EB__
  241. rev x9,x9 // 6
  242. #endif
  243. add x22,x22,x17 // h+=Sigma0(a)
  244. ror x16,x26,#14
  245. add x21,x21,x19 // h+=K[i]
  246. eor x12,x26,x26,ror#23
  247. and x17,x27,x26
  248. bic x19,x20,x26
  249. add x21,x21,x9 // h+=X[i]
  250. orr x17,x17,x19 // Ch(e,f,g)
  251. eor x19,x22,x23 // a^b, b^c in next round
  252. eor x16,x16,x12,ror#18 // Sigma1(e)
  253. ror x12,x22,#28
  254. add x21,x21,x17 // h+=Ch(e,f,g)
  255. eor x17,x22,x22,ror#5
  256. add x21,x21,x16 // h+=Sigma1(e)
  257. and x28,x28,x19 // (b^c)&=(a^b)
  258. add x25,x25,x21 // d+=h
  259. eor x28,x28,x23 // Maj(a,b,c)
  260. eor x17,x12,x17,ror#34 // Sigma0(a)
  261. add x21,x21,x28 // h+=Maj(a,b,c)
  262. ldr x28,[x30],#8 // *K++, x19 in next round
  263. //add x21,x21,x17 // h+=Sigma0(a)
  264. #ifndef __AARCH64EB__
  265. rev x10,x10 // 7
  266. #endif
  267. ldp x11,x12,[x1],#2*8
  268. add x21,x21,x17 // h+=Sigma0(a)
  269. ror x16,x25,#14
  270. add x20,x20,x28 // h+=K[i]
  271. eor x13,x25,x25,ror#23
  272. and x17,x26,x25
  273. bic x28,x27,x25
  274. add x20,x20,x10 // h+=X[i]
  275. orr x17,x17,x28 // Ch(e,f,g)
  276. eor x28,x21,x22 // a^b, b^c in next round
  277. eor x16,x16,x13,ror#18 // Sigma1(e)
  278. ror x13,x21,#28
  279. add x20,x20,x17 // h+=Ch(e,f,g)
  280. eor x17,x21,x21,ror#5
  281. add x20,x20,x16 // h+=Sigma1(e)
  282. and x19,x19,x28 // (b^c)&=(a^b)
  283. add x24,x24,x20 // d+=h
  284. eor x19,x19,x22 // Maj(a,b,c)
  285. eor x17,x13,x17,ror#34 // Sigma0(a)
  286. add x20,x20,x19 // h+=Maj(a,b,c)
  287. ldr x19,[x30],#8 // *K++, x28 in next round
  288. //add x20,x20,x17 // h+=Sigma0(a)
  289. #ifndef __AARCH64EB__
  290. rev x11,x11 // 8
  291. #endif
  292. add x20,x20,x17 // h+=Sigma0(a)
  293. ror x16,x24,#14
  294. add x27,x27,x19 // h+=K[i]
  295. eor x14,x24,x24,ror#23
  296. and x17,x25,x24
  297. bic x19,x26,x24
  298. add x27,x27,x11 // h+=X[i]
  299. orr x17,x17,x19 // Ch(e,f,g)
  300. eor x19,x20,x21 // a^b, b^c in next round
  301. eor x16,x16,x14,ror#18 // Sigma1(e)
  302. ror x14,x20,#28
  303. add x27,x27,x17 // h+=Ch(e,f,g)
  304. eor x17,x20,x20,ror#5
  305. add x27,x27,x16 // h+=Sigma1(e)
  306. and x28,x28,x19 // (b^c)&=(a^b)
  307. add x23,x23,x27 // d+=h
  308. eor x28,x28,x21 // Maj(a,b,c)
  309. eor x17,x14,x17,ror#34 // Sigma0(a)
  310. add x27,x27,x28 // h+=Maj(a,b,c)
  311. ldr x28,[x30],#8 // *K++, x19 in next round
  312. //add x27,x27,x17 // h+=Sigma0(a)
  313. #ifndef __AARCH64EB__
  314. rev x12,x12 // 9
  315. #endif
  316. ldp x13,x14,[x1],#2*8
  317. add x27,x27,x17 // h+=Sigma0(a)
  318. ror x16,x23,#14
  319. add x26,x26,x28 // h+=K[i]
  320. eor x15,x23,x23,ror#23
  321. and x17,x24,x23
  322. bic x28,x25,x23
  323. add x26,x26,x12 // h+=X[i]
  324. orr x17,x17,x28 // Ch(e,f,g)
  325. eor x28,x27,x20 // a^b, b^c in next round
  326. eor x16,x16,x15,ror#18 // Sigma1(e)
  327. ror x15,x27,#28
  328. add x26,x26,x17 // h+=Ch(e,f,g)
  329. eor x17,x27,x27,ror#5
  330. add x26,x26,x16 // h+=Sigma1(e)
  331. and x19,x19,x28 // (b^c)&=(a^b)
  332. add x22,x22,x26 // d+=h
  333. eor x19,x19,x20 // Maj(a,b,c)
  334. eor x17,x15,x17,ror#34 // Sigma0(a)
  335. add x26,x26,x19 // h+=Maj(a,b,c)
  336. ldr x19,[x30],#8 // *K++, x28 in next round
  337. //add x26,x26,x17 // h+=Sigma0(a)
  338. #ifndef __AARCH64EB__
  339. rev x13,x13 // 10
  340. #endif
  341. add x26,x26,x17 // h+=Sigma0(a)
  342. ror x16,x22,#14
  343. add x25,x25,x19 // h+=K[i]
  344. eor x0,x22,x22,ror#23
  345. and x17,x23,x22
  346. bic x19,x24,x22
  347. add x25,x25,x13 // h+=X[i]
  348. orr x17,x17,x19 // Ch(e,f,g)
  349. eor x19,x26,x27 // a^b, b^c in next round
  350. eor x16,x16,x0,ror#18 // Sigma1(e)
  351. ror x0,x26,#28
  352. add x25,x25,x17 // h+=Ch(e,f,g)
  353. eor x17,x26,x26,ror#5
  354. add x25,x25,x16 // h+=Sigma1(e)
  355. and x28,x28,x19 // (b^c)&=(a^b)
  356. add x21,x21,x25 // d+=h
  357. eor x28,x28,x27 // Maj(a,b,c)
  358. eor x17,x0,x17,ror#34 // Sigma0(a)
  359. add x25,x25,x28 // h+=Maj(a,b,c)
  360. ldr x28,[x30],#8 // *K++, x19 in next round
  361. //add x25,x25,x17 // h+=Sigma0(a)
  362. #ifndef __AARCH64EB__
  363. rev x14,x14 // 11
  364. #endif
  365. ldp x15,x0,[x1],#2*8
  366. add x25,x25,x17 // h+=Sigma0(a)
  367. str x6,[sp,#24]
  368. ror x16,x21,#14
  369. add x24,x24,x28 // h+=K[i]
  370. eor x6,x21,x21,ror#23
  371. and x17,x22,x21
  372. bic x28,x23,x21
  373. add x24,x24,x14 // h+=X[i]
  374. orr x17,x17,x28 // Ch(e,f,g)
  375. eor x28,x25,x26 // a^b, b^c in next round
  376. eor x16,x16,x6,ror#18 // Sigma1(e)
  377. ror x6,x25,#28
  378. add x24,x24,x17 // h+=Ch(e,f,g)
  379. eor x17,x25,x25,ror#5
  380. add x24,x24,x16 // h+=Sigma1(e)
  381. and x19,x19,x28 // (b^c)&=(a^b)
  382. add x20,x20,x24 // d+=h
  383. eor x19,x19,x26 // Maj(a,b,c)
  384. eor x17,x6,x17,ror#34 // Sigma0(a)
  385. add x24,x24,x19 // h+=Maj(a,b,c)
  386. ldr x19,[x30],#8 // *K++, x28 in next round
  387. //add x24,x24,x17 // h+=Sigma0(a)
  388. #ifndef __AARCH64EB__
  389. rev x15,x15 // 12
  390. #endif
  391. add x24,x24,x17 // h+=Sigma0(a)
  392. str x7,[sp,#0]
  393. ror x16,x20,#14
  394. add x23,x23,x19 // h+=K[i]
  395. eor x7,x20,x20,ror#23
  396. and x17,x21,x20
  397. bic x19,x22,x20
  398. add x23,x23,x15 // h+=X[i]
  399. orr x17,x17,x19 // Ch(e,f,g)
  400. eor x19,x24,x25 // a^b, b^c in next round
  401. eor x16,x16,x7,ror#18 // Sigma1(e)
  402. ror x7,x24,#28
  403. add x23,x23,x17 // h+=Ch(e,f,g)
  404. eor x17,x24,x24,ror#5
  405. add x23,x23,x16 // h+=Sigma1(e)
  406. and x28,x28,x19 // (b^c)&=(a^b)
  407. add x27,x27,x23 // d+=h
  408. eor x28,x28,x25 // Maj(a,b,c)
  409. eor x17,x7,x17,ror#34 // Sigma0(a)
  410. add x23,x23,x28 // h+=Maj(a,b,c)
  411. ldr x28,[x30],#8 // *K++, x19 in next round
  412. //add x23,x23,x17 // h+=Sigma0(a)
  413. #ifndef __AARCH64EB__
  414. rev x0,x0 // 13
  415. #endif
  416. ldp x1,x2,[x1]
  417. add x23,x23,x17 // h+=Sigma0(a)
  418. str x8,[sp,#8]
  419. ror x16,x27,#14
  420. add x22,x22,x28 // h+=K[i]
  421. eor x8,x27,x27,ror#23
  422. and x17,x20,x27
  423. bic x28,x21,x27
  424. add x22,x22,x0 // h+=X[i]
  425. orr x17,x17,x28 // Ch(e,f,g)
  426. eor x28,x23,x24 // a^b, b^c in next round
  427. eor x16,x16,x8,ror#18 // Sigma1(e)
  428. ror x8,x23,#28
  429. add x22,x22,x17 // h+=Ch(e,f,g)
  430. eor x17,x23,x23,ror#5
  431. add x22,x22,x16 // h+=Sigma1(e)
  432. and x19,x19,x28 // (b^c)&=(a^b)
  433. add x26,x26,x22 // d+=h
  434. eor x19,x19,x24 // Maj(a,b,c)
  435. eor x17,x8,x17,ror#34 // Sigma0(a)
  436. add x22,x22,x19 // h+=Maj(a,b,c)
  437. ldr x19,[x30],#8 // *K++, x28 in next round
  438. //add x22,x22,x17 // h+=Sigma0(a)
  439. #ifndef __AARCH64EB__
  440. rev x1,x1 // 14
  441. #endif
  442. ldr x6,[sp,#24]
  443. add x22,x22,x17 // h+=Sigma0(a)
  444. str x9,[sp,#16]
  445. ror x16,x26,#14
  446. add x21,x21,x19 // h+=K[i]
  447. eor x9,x26,x26,ror#23
  448. and x17,x27,x26
  449. bic x19,x20,x26
  450. add x21,x21,x1 // h+=X[i]
  451. orr x17,x17,x19 // Ch(e,f,g)
  452. eor x19,x22,x23 // a^b, b^c in next round
  453. eor x16,x16,x9,ror#18 // Sigma1(e)
  454. ror x9,x22,#28
  455. add x21,x21,x17 // h+=Ch(e,f,g)
  456. eor x17,x22,x22,ror#5
  457. add x21,x21,x16 // h+=Sigma1(e)
  458. and x28,x28,x19 // (b^c)&=(a^b)
  459. add x25,x25,x21 // d+=h
  460. eor x28,x28,x23 // Maj(a,b,c)
  461. eor x17,x9,x17,ror#34 // Sigma0(a)
  462. add x21,x21,x28 // h+=Maj(a,b,c)
  463. ldr x28,[x30],#8 // *K++, x19 in next round
  464. //add x21,x21,x17 // h+=Sigma0(a)
  465. #ifndef __AARCH64EB__
  466. rev x2,x2 // 15
  467. #endif
  468. ldr x7,[sp,#0]
  469. add x21,x21,x17 // h+=Sigma0(a)
  470. str x10,[sp,#24]
  471. ror x16,x25,#14
  472. add x20,x20,x28 // h+=K[i]
  473. ror x9,x4,#1
  474. and x17,x26,x25
  475. ror x8,x1,#19
  476. bic x28,x27,x25
  477. ror x10,x21,#28
  478. add x20,x20,x2 // h+=X[i]
  479. eor x16,x16,x25,ror#18
  480. eor x9,x9,x4,ror#8
  481. orr x17,x17,x28 // Ch(e,f,g)
  482. eor x28,x21,x22 // a^b, b^c in next round
  483. eor x16,x16,x25,ror#41 // Sigma1(e)
  484. eor x10,x10,x21,ror#34
  485. add x20,x20,x17 // h+=Ch(e,f,g)
  486. and x19,x19,x28 // (b^c)&=(a^b)
  487. eor x8,x8,x1,ror#61
  488. eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
  489. add x20,x20,x16 // h+=Sigma1(e)
  490. eor x19,x19,x22 // Maj(a,b,c)
  491. eor x17,x10,x21,ror#39 // Sigma0(a)
  492. eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
  493. add x3,x3,x12
  494. add x24,x24,x20 // d+=h
  495. add x20,x20,x19 // h+=Maj(a,b,c)
  496. ldr x19,[x30],#8 // *K++, x28 in next round
  497. add x3,x3,x9
  498. add x20,x20,x17 // h+=Sigma0(a)
  499. add x3,x3,x8
  500. Loop_16_xx:
  501. ldr x8,[sp,#8]
  502. str x11,[sp,#0]
  503. ror x16,x24,#14
  504. add x27,x27,x19 // h+=K[i]
  505. ror x10,x5,#1
  506. and x17,x25,x24
  507. ror x9,x2,#19
  508. bic x19,x26,x24
  509. ror x11,x20,#28
  510. add x27,x27,x3 // h+=X[i]
  511. eor x16,x16,x24,ror#18
  512. eor x10,x10,x5,ror#8
  513. orr x17,x17,x19 // Ch(e,f,g)
  514. eor x19,x20,x21 // a^b, b^c in next round
  515. eor x16,x16,x24,ror#41 // Sigma1(e)
  516. eor x11,x11,x20,ror#34
  517. add x27,x27,x17 // h+=Ch(e,f,g)
  518. and x28,x28,x19 // (b^c)&=(a^b)
  519. eor x9,x9,x2,ror#61
  520. eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
  521. add x27,x27,x16 // h+=Sigma1(e)
  522. eor x28,x28,x21 // Maj(a,b,c)
  523. eor x17,x11,x20,ror#39 // Sigma0(a)
  524. eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
  525. add x4,x4,x13
  526. add x23,x23,x27 // d+=h
  527. add x27,x27,x28 // h+=Maj(a,b,c)
  528. ldr x28,[x30],#8 // *K++, x19 in next round
  529. add x4,x4,x10
  530. add x27,x27,x17 // h+=Sigma0(a)
  531. add x4,x4,x9
  532. ldr x9,[sp,#16]
  533. str x12,[sp,#8]
  534. ror x16,x23,#14
  535. add x26,x26,x28 // h+=K[i]
  536. ror x11,x6,#1
  537. and x17,x24,x23
  538. ror x10,x3,#19
  539. bic x28,x25,x23
  540. ror x12,x27,#28
  541. add x26,x26,x4 // h+=X[i]
  542. eor x16,x16,x23,ror#18
  543. eor x11,x11,x6,ror#8
  544. orr x17,x17,x28 // Ch(e,f,g)
  545. eor x28,x27,x20 // a^b, b^c in next round
  546. eor x16,x16,x23,ror#41 // Sigma1(e)
  547. eor x12,x12,x27,ror#34
  548. add x26,x26,x17 // h+=Ch(e,f,g)
  549. and x19,x19,x28 // (b^c)&=(a^b)
  550. eor x10,x10,x3,ror#61
  551. eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
  552. add x26,x26,x16 // h+=Sigma1(e)
  553. eor x19,x19,x20 // Maj(a,b,c)
  554. eor x17,x12,x27,ror#39 // Sigma0(a)
  555. eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
  556. add x5,x5,x14
  557. add x22,x22,x26 // d+=h
  558. add x26,x26,x19 // h+=Maj(a,b,c)
  559. ldr x19,[x30],#8 // *K++, x28 in next round
  560. add x5,x5,x11
  561. add x26,x26,x17 // h+=Sigma0(a)
  562. add x5,x5,x10
  563. ldr x10,[sp,#24]
  564. str x13,[sp,#16]
  565. ror x16,x22,#14
  566. add x25,x25,x19 // h+=K[i]
  567. ror x12,x7,#1
  568. and x17,x23,x22
  569. ror x11,x4,#19
  570. bic x19,x24,x22
  571. ror x13,x26,#28
  572. add x25,x25,x5 // h+=X[i]
  573. eor x16,x16,x22,ror#18
  574. eor x12,x12,x7,ror#8
  575. orr x17,x17,x19 // Ch(e,f,g)
  576. eor x19,x26,x27 // a^b, b^c in next round
  577. eor x16,x16,x22,ror#41 // Sigma1(e)
  578. eor x13,x13,x26,ror#34
  579. add x25,x25,x17 // h+=Ch(e,f,g)
  580. and x28,x28,x19 // (b^c)&=(a^b)
  581. eor x11,x11,x4,ror#61
  582. eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
  583. add x25,x25,x16 // h+=Sigma1(e)
  584. eor x28,x28,x27 // Maj(a,b,c)
  585. eor x17,x13,x26,ror#39 // Sigma0(a)
  586. eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
  587. add x6,x6,x15
  588. add x21,x21,x25 // d+=h
  589. add x25,x25,x28 // h+=Maj(a,b,c)
  590. ldr x28,[x30],#8 // *K++, x19 in next round
  591. add x6,x6,x12
  592. add x25,x25,x17 // h+=Sigma0(a)
  593. add x6,x6,x11
  594. ldr x11,[sp,#0]
  595. str x14,[sp,#24]
  596. ror x16,x21,#14
  597. add x24,x24,x28 // h+=K[i]
  598. ror x13,x8,#1
  599. and x17,x22,x21
  600. ror x12,x5,#19
  601. bic x28,x23,x21
  602. ror x14,x25,#28
  603. add x24,x24,x6 // h+=X[i]
  604. eor x16,x16,x21,ror#18
  605. eor x13,x13,x8,ror#8
  606. orr x17,x17,x28 // Ch(e,f,g)
  607. eor x28,x25,x26 // a^b, b^c in next round
  608. eor x16,x16,x21,ror#41 // Sigma1(e)
  609. eor x14,x14,x25,ror#34
  610. add x24,x24,x17 // h+=Ch(e,f,g)
  611. and x19,x19,x28 // (b^c)&=(a^b)
  612. eor x12,x12,x5,ror#61
  613. eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
  614. add x24,x24,x16 // h+=Sigma1(e)
  615. eor x19,x19,x26 // Maj(a,b,c)
  616. eor x17,x14,x25,ror#39 // Sigma0(a)
  617. eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
  618. add x7,x7,x0
  619. add x20,x20,x24 // d+=h
  620. add x24,x24,x19 // h+=Maj(a,b,c)
  621. ldr x19,[x30],#8 // *K++, x28 in next round
  622. add x7,x7,x13
  623. add x24,x24,x17 // h+=Sigma0(a)
  624. add x7,x7,x12
  625. ldr x12,[sp,#8]
  626. str x15,[sp,#0]
  627. ror x16,x20,#14
  628. add x23,x23,x19 // h+=K[i]
  629. ror x14,x9,#1
  630. and x17,x21,x20
  631. ror x13,x6,#19
  632. bic x19,x22,x20
  633. ror x15,x24,#28
  634. add x23,x23,x7 // h+=X[i]
  635. eor x16,x16,x20,ror#18
  636. eor x14,x14,x9,ror#8
  637. orr x17,x17,x19 // Ch(e,f,g)
  638. eor x19,x24,x25 // a^b, b^c in next round
  639. eor x16,x16,x20,ror#41 // Sigma1(e)
  640. eor x15,x15,x24,ror#34
  641. add x23,x23,x17 // h+=Ch(e,f,g)
  642. and x28,x28,x19 // (b^c)&=(a^b)
  643. eor x13,x13,x6,ror#61
  644. eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
  645. add x23,x23,x16 // h+=Sigma1(e)
  646. eor x28,x28,x25 // Maj(a,b,c)
  647. eor x17,x15,x24,ror#39 // Sigma0(a)
  648. eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
  649. add x8,x8,x1
  650. add x27,x27,x23 // d+=h
  651. add x23,x23,x28 // h+=Maj(a,b,c)
  652. ldr x28,[x30],#8 // *K++, x19 in next round
  653. add x8,x8,x14
  654. add x23,x23,x17 // h+=Sigma0(a)
  655. add x8,x8,x13
  656. ldr x13,[sp,#16]
  657. str x0,[sp,#8]
  658. ror x16,x27,#14
  659. add x22,x22,x28 // h+=K[i]
  660. ror x15,x10,#1
  661. and x17,x20,x27
  662. ror x14,x7,#19
  663. bic x28,x21,x27
  664. ror x0,x23,#28
  665. add x22,x22,x8 // h+=X[i]
  666. eor x16,x16,x27,ror#18
  667. eor x15,x15,x10,ror#8
  668. orr x17,x17,x28 // Ch(e,f,g)
  669. eor x28,x23,x24 // a^b, b^c in next round
  670. eor x16,x16,x27,ror#41 // Sigma1(e)
  671. eor x0,x0,x23,ror#34
  672. add x22,x22,x17 // h+=Ch(e,f,g)
  673. and x19,x19,x28 // (b^c)&=(a^b)
  674. eor x14,x14,x7,ror#61
  675. eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
  676. add x22,x22,x16 // h+=Sigma1(e)
  677. eor x19,x19,x24 // Maj(a,b,c)
  678. eor x17,x0,x23,ror#39 // Sigma0(a)
  679. eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
  680. add x9,x9,x2
  681. add x26,x26,x22 // d+=h
  682. add x22,x22,x19 // h+=Maj(a,b,c)
  683. ldr x19,[x30],#8 // *K++, x28 in next round
  684. add x9,x9,x15
  685. add x22,x22,x17 // h+=Sigma0(a)
  686. add x9,x9,x14
  687. ldr x14,[sp,#24]
  688. str x1,[sp,#16]
  689. ror x16,x26,#14
  690. add x21,x21,x19 // h+=K[i]
  691. ror x0,x11,#1
  692. and x17,x27,x26
  693. ror x15,x8,#19
  694. bic x19,x20,x26
  695. ror x1,x22,#28
  696. add x21,x21,x9 // h+=X[i]
  697. eor x16,x16,x26,ror#18
  698. eor x0,x0,x11,ror#8
  699. orr x17,x17,x19 // Ch(e,f,g)
  700. eor x19,x22,x23 // a^b, b^c in next round
  701. eor x16,x16,x26,ror#41 // Sigma1(e)
  702. eor x1,x1,x22,ror#34
  703. add x21,x21,x17 // h+=Ch(e,f,g)
  704. and x28,x28,x19 // (b^c)&=(a^b)
  705. eor x15,x15,x8,ror#61
  706. eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
  707. add x21,x21,x16 // h+=Sigma1(e)
  708. eor x28,x28,x23 // Maj(a,b,c)
  709. eor x17,x1,x22,ror#39 // Sigma0(a)
  710. eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
  711. add x10,x10,x3
  712. add x25,x25,x21 // d+=h
  713. add x21,x21,x28 // h+=Maj(a,b,c)
  714. ldr x28,[x30],#8 // *K++, x19 in next round
  715. add x10,x10,x0
  716. add x21,x21,x17 // h+=Sigma0(a)
  717. add x10,x10,x15
  718. ldr x15,[sp,#0]
  719. str x2,[sp,#24]
  720. ror x16,x25,#14
  721. add x20,x20,x28 // h+=K[i]
  722. ror x1,x12,#1
  723. and x17,x26,x25
  724. ror x0,x9,#19
  725. bic x28,x27,x25
  726. ror x2,x21,#28
  727. add x20,x20,x10 // h+=X[i]
  728. eor x16,x16,x25,ror#18
  729. eor x1,x1,x12,ror#8
  730. orr x17,x17,x28 // Ch(e,f,g)
  731. eor x28,x21,x22 // a^b, b^c in next round
  732. eor x16,x16,x25,ror#41 // Sigma1(e)
  733. eor x2,x2,x21,ror#34
  734. add x20,x20,x17 // h+=Ch(e,f,g)
  735. and x19,x19,x28 // (b^c)&=(a^b)
  736. eor x0,x0,x9,ror#61
  737. eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
  738. add x20,x20,x16 // h+=Sigma1(e)
  739. eor x19,x19,x22 // Maj(a,b,c)
  740. eor x17,x2,x21,ror#39 // Sigma0(a)
  741. eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
  742. add x11,x11,x4
  743. add x24,x24,x20 // d+=h
  744. add x20,x20,x19 // h+=Maj(a,b,c)
  745. ldr x19,[x30],#8 // *K++, x28 in next round
  746. add x11,x11,x1
  747. add x20,x20,x17 // h+=Sigma0(a)
  748. add x11,x11,x0
  749. ldr x0,[sp,#8]
  750. str x3,[sp,#0]
  751. ror x16,x24,#14
  752. add x27,x27,x19 // h+=K[i]
  753. ror x2,x13,#1
  754. and x17,x25,x24
  755. ror x1,x10,#19
  756. bic x19,x26,x24
  757. ror x3,x20,#28
  758. add x27,x27,x11 // h+=X[i]
  759. eor x16,x16,x24,ror#18
  760. eor x2,x2,x13,ror#8
  761. orr x17,x17,x19 // Ch(e,f,g)
  762. eor x19,x20,x21 // a^b, b^c in next round
  763. eor x16,x16,x24,ror#41 // Sigma1(e)
  764. eor x3,x3,x20,ror#34
  765. add x27,x27,x17 // h+=Ch(e,f,g)
  766. and x28,x28,x19 // (b^c)&=(a^b)
  767. eor x1,x1,x10,ror#61
  768. eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
  769. add x27,x27,x16 // h+=Sigma1(e)
  770. eor x28,x28,x21 // Maj(a,b,c)
  771. eor x17,x3,x20,ror#39 // Sigma0(a)
  772. eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
  773. add x12,x12,x5
  774. add x23,x23,x27 // d+=h
  775. add x27,x27,x28 // h+=Maj(a,b,c)
  776. ldr x28,[x30],#8 // *K++, x19 in next round
  777. add x12,x12,x2
  778. add x27,x27,x17 // h+=Sigma0(a)
  779. add x12,x12,x1
  780. ldr x1,[sp,#16]
  781. str x4,[sp,#8]
  782. ror x16,x23,#14
  783. add x26,x26,x28 // h+=K[i]
  784. ror x3,x14,#1
  785. and x17,x24,x23
  786. ror x2,x11,#19
  787. bic x28,x25,x23
  788. ror x4,x27,#28
  789. add x26,x26,x12 // h+=X[i]
  790. eor x16,x16,x23,ror#18
  791. eor x3,x3,x14,ror#8
  792. orr x17,x17,x28 // Ch(e,f,g)
  793. eor x28,x27,x20 // a^b, b^c in next round
  794. eor x16,x16,x23,ror#41 // Sigma1(e)
  795. eor x4,x4,x27,ror#34
  796. add x26,x26,x17 // h+=Ch(e,f,g)
  797. and x19,x19,x28 // (b^c)&=(a^b)
  798. eor x2,x2,x11,ror#61
  799. eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
  800. add x26,x26,x16 // h+=Sigma1(e)
  801. eor x19,x19,x20 // Maj(a,b,c)
  802. eor x17,x4,x27,ror#39 // Sigma0(a)
  803. eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
  804. add x13,x13,x6
  805. add x22,x22,x26 // d+=h
  806. add x26,x26,x19 // h+=Maj(a,b,c)
  807. ldr x19,[x30],#8 // *K++, x28 in next round
  808. add x13,x13,x3
  809. add x26,x26,x17 // h+=Sigma0(a)
  810. add x13,x13,x2
  811. ldr x2,[sp,#24]
  812. str x5,[sp,#16]
  813. ror x16,x22,#14
  814. add x25,x25,x19 // h+=K[i]
  815. ror x4,x15,#1
  816. and x17,x23,x22
  817. ror x3,x12,#19
  818. bic x19,x24,x22
  819. ror x5,x26,#28
  820. add x25,x25,x13 // h+=X[i]
  821. eor x16,x16,x22,ror#18
  822. eor x4,x4,x15,ror#8
  823. orr x17,x17,x19 // Ch(e,f,g)
  824. eor x19,x26,x27 // a^b, b^c in next round
  825. eor x16,x16,x22,ror#41 // Sigma1(e)
  826. eor x5,x5,x26,ror#34
  827. add x25,x25,x17 // h+=Ch(e,f,g)
  828. and x28,x28,x19 // (b^c)&=(a^b)
  829. eor x3,x3,x12,ror#61
  830. eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
  831. add x25,x25,x16 // h+=Sigma1(e)
  832. eor x28,x28,x27 // Maj(a,b,c)
  833. eor x17,x5,x26,ror#39 // Sigma0(a)
  834. eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
  835. add x14,x14,x7
  836. add x21,x21,x25 // d+=h
  837. add x25,x25,x28 // h+=Maj(a,b,c)
  838. ldr x28,[x30],#8 // *K++, x19 in next round
  839. add x14,x14,x4
  840. add x25,x25,x17 // h+=Sigma0(a)
  841. add x14,x14,x3
  842. ldr x3,[sp,#0]
  843. str x6,[sp,#24]
  844. ror x16,x21,#14
  845. add x24,x24,x28 // h+=K[i]
  846. ror x5,x0,#1
  847. and x17,x22,x21
  848. ror x4,x13,#19
  849. bic x28,x23,x21
  850. ror x6,x25,#28
  851. add x24,x24,x14 // h+=X[i]
  852. eor x16,x16,x21,ror#18
  853. eor x5,x5,x0,ror#8
  854. orr x17,x17,x28 // Ch(e,f,g)
  855. eor x28,x25,x26 // a^b, b^c in next round
  856. eor x16,x16,x21,ror#41 // Sigma1(e)
  857. eor x6,x6,x25,ror#34
  858. add x24,x24,x17 // h+=Ch(e,f,g)
  859. and x19,x19,x28 // (b^c)&=(a^b)
  860. eor x4,x4,x13,ror#61
  861. eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
  862. add x24,x24,x16 // h+=Sigma1(e)
  863. eor x19,x19,x26 // Maj(a,b,c)
  864. eor x17,x6,x25,ror#39 // Sigma0(a)
  865. eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
  866. add x15,x15,x8
  867. add x20,x20,x24 // d+=h
  868. add x24,x24,x19 // h+=Maj(a,b,c)
  869. ldr x19,[x30],#8 // *K++, x28 in next round
  870. add x15,x15,x5
  871. add x24,x24,x17 // h+=Sigma0(a)
  872. add x15,x15,x4
  873. ldr x4,[sp,#8]
  874. str x7,[sp,#0]
  875. ror x16,x20,#14
  876. add x23,x23,x19 // h+=K[i]
  877. ror x6,x1,#1
  878. and x17,x21,x20
  879. ror x5,x14,#19
  880. bic x19,x22,x20
  881. ror x7,x24,#28
  882. add x23,x23,x15 // h+=X[i]
  883. eor x16,x16,x20,ror#18
  884. eor x6,x6,x1,ror#8
  885. orr x17,x17,x19 // Ch(e,f,g)
  886. eor x19,x24,x25 // a^b, b^c in next round
  887. eor x16,x16,x20,ror#41 // Sigma1(e)
  888. eor x7,x7,x24,ror#34
  889. add x23,x23,x17 // h+=Ch(e,f,g)
  890. and x28,x28,x19 // (b^c)&=(a^b)
  891. eor x5,x5,x14,ror#61
  892. eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
  893. add x23,x23,x16 // h+=Sigma1(e)
  894. eor x28,x28,x25 // Maj(a,b,c)
  895. eor x17,x7,x24,ror#39 // Sigma0(a)
  896. eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
  897. add x0,x0,x9
  898. add x27,x27,x23 // d+=h
  899. add x23,x23,x28 // h+=Maj(a,b,c)
  900. ldr x28,[x30],#8 // *K++, x19 in next round
  901. add x0,x0,x6
  902. add x23,x23,x17 // h+=Sigma0(a)
  903. add x0,x0,x5
  904. ldr x5,[sp,#16]
  905. str x8,[sp,#8]
  906. ror x16,x27,#14
  907. add x22,x22,x28 // h+=K[i]
  908. ror x7,x2,#1
  909. and x17,x20,x27
  910. ror x6,x15,#19
  911. bic x28,x21,x27
  912. ror x8,x23,#28
  913. add x22,x22,x0 // h+=X[i]
  914. eor x16,x16,x27,ror#18
  915. eor x7,x7,x2,ror#8
  916. orr x17,x17,x28 // Ch(e,f,g)
  917. eor x28,x23,x24 // a^b, b^c in next round
  918. eor x16,x16,x27,ror#41 // Sigma1(e)
  919. eor x8,x8,x23,ror#34
  920. add x22,x22,x17 // h+=Ch(e,f,g)
  921. and x19,x19,x28 // (b^c)&=(a^b)
  922. eor x6,x6,x15,ror#61
  923. eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
  924. add x22,x22,x16 // h+=Sigma1(e)
  925. eor x19,x19,x24 // Maj(a,b,c)
  926. eor x17,x8,x23,ror#39 // Sigma0(a)
  927. eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
  928. add x1,x1,x10
  929. add x26,x26,x22 // d+=h
  930. add x22,x22,x19 // h+=Maj(a,b,c)
  931. ldr x19,[x30],#8 // *K++, x28 in next round
  932. add x1,x1,x7
  933. add x22,x22,x17 // h+=Sigma0(a)
  934. add x1,x1,x6
  935. ldr x6,[sp,#24]
  936. str x9,[sp,#16]
  937. ror x16,x26,#14
  938. add x21,x21,x19 // h+=K[i]
  939. ror x8,x3,#1
  940. and x17,x27,x26
  941. ror x7,x0,#19
  942. bic x19,x20,x26
  943. ror x9,x22,#28
  944. add x21,x21,x1 // h+=X[i]
  945. eor x16,x16,x26,ror#18
  946. eor x8,x8,x3,ror#8
  947. orr x17,x17,x19 // Ch(e,f,g)
  948. eor x19,x22,x23 // a^b, b^c in next round
  949. eor x16,x16,x26,ror#41 // Sigma1(e)
  950. eor x9,x9,x22,ror#34
  951. add x21,x21,x17 // h+=Ch(e,f,g)
  952. and x28,x28,x19 // (b^c)&=(a^b)
  953. eor x7,x7,x0,ror#61
  954. eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
  955. add x21,x21,x16 // h+=Sigma1(e)
  956. eor x28,x28,x23 // Maj(a,b,c)
  957. eor x17,x9,x22,ror#39 // Sigma0(a)
  958. eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
  959. add x2,x2,x11
  960. add x25,x25,x21 // d+=h
  961. add x21,x21,x28 // h+=Maj(a,b,c)
  962. ldr x28,[x30],#8 // *K++, x19 in next round
  963. add x2,x2,x8
  964. add x21,x21,x17 // h+=Sigma0(a)
  965. add x2,x2,x7
  966. ldr x7,[sp,#0]
  967. str x10,[sp,#24]
  968. ror x16,x25,#14
  969. add x20,x20,x28 // h+=K[i]
  970. ror x9,x4,#1
  971. and x17,x26,x25
  972. ror x8,x1,#19
  973. bic x28,x27,x25
  974. ror x10,x21,#28
  975. add x20,x20,x2 // h+=X[i]
  976. eor x16,x16,x25,ror#18
  977. eor x9,x9,x4,ror#8
  978. orr x17,x17,x28 // Ch(e,f,g)
  979. eor x28,x21,x22 // a^b, b^c in next round
  980. eor x16,x16,x25,ror#41 // Sigma1(e)
  981. eor x10,x10,x21,ror#34
  982. add x20,x20,x17 // h+=Ch(e,f,g)
  983. and x19,x19,x28 // (b^c)&=(a^b)
  984. eor x8,x8,x1,ror#61
  985. eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
  986. add x20,x20,x16 // h+=Sigma1(e)
  987. eor x19,x19,x22 // Maj(a,b,c)
  988. eor x17,x10,x21,ror#39 // Sigma0(a)
  989. eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
  990. add x3,x3,x12
  991. add x24,x24,x20 // d+=h
  992. add x20,x20,x19 // h+=Maj(a,b,c)
  993. ldr x19,[x30],#8 // *K++, x28 in next round
  994. add x3,x3,x9
  995. add x20,x20,x17 // h+=Sigma0(a)
  996. add x3,x3,x8
  997. cbnz x19,Loop_16_xx
  998. ldp x0,x2,[x29,#96]
  999. ldr x1,[x29,#112]
  1000. sub x30,x30,#648 // rewind
  1001. ldp x3,x4,[x0]
  1002. ldp x5,x6,[x0,#2*8]
  1003. add x1,x1,#14*8 // advance input pointer
  1004. ldp x7,x8,[x0,#4*8]
  1005. add x20,x20,x3
  1006. ldp x9,x10,[x0,#6*8]
  1007. add x21,x21,x4
  1008. add x22,x22,x5
  1009. add x23,x23,x6
  1010. stp x20,x21,[x0]
  1011. add x24,x24,x7
  1012. add x25,x25,x8
  1013. stp x22,x23,[x0,#2*8]
  1014. add x26,x26,x9
  1015. add x27,x27,x10
  1016. cmp x1,x2
  1017. stp x24,x25,[x0,#4*8]
  1018. stp x26,x27,[x0,#6*8]
  1019. b.ne Loop
  1020. ldp x19,x20,[x29,#16]
  1021. add sp,sp,#4*8
  1022. ldp x21,x22,[x29,#32]
  1023. ldp x23,x24,[x29,#48]
  1024. ldp x25,x26,[x29,#64]
  1025. ldp x27,x28,[x29,#80]
  1026. ldp x29,x30,[sp],#128
  1027. .long 0xd50323bf // autiasp
  1028. ret
  // LK512: the 80 SHA-512 round constants K[0..79] (64 bits each, two per
  // .quad line), followed by one zero quad.
  //
  // Two consumers are visible in this file:
  //  * the hardware path (sha512_block_armv8) takes the table address with
  //    `adr x3,LK512`, reads it 16 bytes at a time and rewinds by 80*8 bytes
  //    per block, so it never reads the terminator;
  //  * the scalar loop loads one quad at a time via `ldr ...,[x30],#8`,
  //    leaves Loop_16_xx when the loaded value is zero, and rewinds by
  //    648 = 81*8 bytes (80 constants plus the terminator).
  //    NOTE(review): the set-up of x30 is outside this chunk; presumably it
  //    points at LK512 - confirm.
  //
  // `.align 6` requests 2^6 = 64-byte alignment for the table start.
  1029. .align 6
  1030. LK512:
  1031. .quad 0x428a2f98d728ae22,0x7137449123ef65cd // K[0], K[1]
  1032. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc // K[2], K[3]
  1033. .quad 0x3956c25bf348b538,0x59f111f1b605d019 // K[4], K[5]
  1034. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 // K[6], K[7]
  1035. .quad 0xd807aa98a3030242,0x12835b0145706fbe // K[8], K[9]
  1036. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 // K[10], K[11]
  1037. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 // K[12], K[13]
  1038. .quad 0x9bdc06a725c71235,0xc19bf174cf692694 // K[14], K[15]
  1039. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 // K[16], K[17]
  1040. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 // K[18], K[19]
  1041. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 // K[20], K[21]
  1042. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 // K[22], K[23]
  1043. .quad 0x983e5152ee66dfab,0xa831c66d2db43210 // K[24], K[25]
  1044. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 // K[26], K[27]
  1045. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 // K[28], K[29]
  1046. .quad 0x06ca6351e003826f,0x142929670a0e6e70 // K[30], K[31]
  1047. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 // K[32], K[33]
  1048. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df // K[34], K[35]
  1049. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 // K[36], K[37]
  1050. .quad 0x81c2c92e47edaee6,0x92722c851482353b // K[38], K[39]
  1051. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 // K[40], K[41]
  1052. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 // K[42], K[43]
  1053. .quad 0xd192e819d6ef5218,0xd69906245565a910 // K[44], K[45]
  1054. .quad 0xf40e35855771202a,0x106aa07032bbd1b8 // K[46], K[47]
  1055. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 // K[48], K[49]
  1056. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 // K[50], K[51]
  1057. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb // K[52], K[53]
  1058. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 // K[54], K[55]
  1059. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 // K[56], K[57]
  1060. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec // K[58], K[59]
  1061. .quad 0x90befffa23631e28,0xa4506cebde82bde9 // K[60], K[61]
  1062. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b // K[62], K[63]
  1063. .quad 0xca273eceea26619c,0xd186b8c721c0c207 // K[64], K[65]
  1064. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 // K[66], K[67]
  1065. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 // K[68], K[69]
  1066. .quad 0x113f9804bef90dae,0x1b710b35131c471b // K[70], K[71]
  1067. .quad 0x28db77f523047d84,0x32caab7b40c72493 // K[72], K[73]
  1068. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c // K[74], K[75]
  1069. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a // K[76], K[77]
  1070. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 // K[78], K[79]
  1071. .quad 0 // terminator
  // LOPENSSL_armcap_P: a self-relative offset to the external symbol
  // _OPENSSL_armcap_P, i.e. the stored value is (_OPENSSL_armcap_P - address
  // of this cell). The cell is 32 bits wide under __ILP32__ and 64 bits wide
  // otherwise, and is 8-byte aligned (`.align 3`). It is omitted entirely when
  // __KERNEL__ is defined.
  // NOTE(review): the reader of this cell is outside this chunk; presumably the
  // entry code of the scalar routine adds the cell's own address to recover
  // the capability word used to pick the hardware path - confirm.
  1072. #ifndef __KERNEL__
  1073. .align 3
  1074. LOPENSSL_armcap_P:
  1075. # ifdef __ILP32__
  1076. .long _OPENSSL_armcap_P-.
  1077. # else
  1078. .quad _OPENSSL_armcap_P-.
  1079. # endif
  1080. #endif
  // NUL-terminated ASCII identification string embedded in the object:
  // "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  1081. .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  1082. .align 2 // back to 4-byte (2^2) alignment after the byte string
  1083. .align 2 // repeated directive; already aligned, so this one emits no padding
  // sha512_block_armv8 (alias Lv8_entry): SHA-512 block function built on the
  // SHA512 crypto-extension instructions sha512h, sha512h2, sha512su0 and
  // sha512su1. Those four are emitted as raw `.long` encodings with the
  // intended mnemonic in the trailing comment. Assembled only when __KERNEL__
  // is not defined.
  //
  // Register roles as used below:
  //   x0       address of the 64-byte hash state (loaded into v0-v3 on entry,
  //            stored back from v0-v3 on exit)
  //   x1       input pointer; 128 bytes are consumed per block
  //   x2       number of 128-byte blocks; decremented once per Loop_hw pass
  //   x3       cursor into LK512, post-incremented by 16 per constant pair
  //   x4       scratch: x1 - 128, used for the conditional rewind
  //   v0-v4    working state; the role of each register rotates with period 5
  //            from one two-round group to the next
  //   v5-v7    per-group temporaries produced with ext
  //   v16-v23  the 16 message words of the current block (after rev64)
  //   v24/v25  alternating K pair + message pair, halves swapped by ext #8
  //   v26-v29  copy of v0-v3 taken at the top of the block and added back
  //            after the last round
  // v8-v15 are not touched.
  //
  // The frame is 16 bytes: x29/x30 are pushed on entry; only x29 is reloaded
  // on exit because x30 is never modified in this routine.
  //
  // NOTE(review): there is no check for x2 == 0. The first 128 input bytes are
  // loaded unconditionally and `subs`/`cbnz` would wrap, so callers must pass
  // at least one block - confirm this is guaranteed by every caller.
  1084. #ifndef __KERNEL__
  1085. .align 6
  1086. sha512_block_armv8:
  1087. Lv8_entry:
  1088. stp x29,x30,[sp,#-16]!
  1089. add x29,sp,#0
  1090. ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
  1091. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  1092. ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
  1093. adr x3,LK512
  // Byte-reverse every 64-bit lane of the eight message registers.
  1094. rev64 v16.16b,v16.16b
  1095. rev64 v17.16b,v17.16b
  1096. rev64 v18.16b,v18.16b
  1097. rev64 v19.16b,v19.16b
  1098. rev64 v20.16b,v20.16b
  1099. rev64 v21.16b,v21.16b
  1100. rev64 v22.16b,v22.16b
  1101. rev64 v23.16b,v23.16b
  // Jump over any padding inserted by the `.align 4` that follows.
  1102. b Loop_hw
  1103. .align 4
  // One pass of Loop_hw processes one 128-byte block. On entry to each pass
  // v16-v23 already hold the block to process and x1 points 128 bytes past it.
  1104. Loop_hw:
  1105. ld1 {v24.2d},[x3],#16
  1106. subs x2,x2,#1
  1107. sub x4,x1,#128
  1108. orr v26.16b,v0.16b,v0.16b // offload
  1109. orr v27.16b,v1.16b,v1.16b
  1110. orr v28.16b,v2.16b,v2.16b
  1111. orr v29.16b,v3.16b,v3.16b
  // If this is the last block (x2 reached 0) step x1 back by 128 so that the
  // "load next input" loads near the end of the pass re-read the current block
  // instead of reading beyond it.
  1112. csel x1,x1,x4,ne // conditional rewind
  // Groups 0-31 (12 instructions each): each group consumes one 128-bit K pair
  // (two rounds) and also extends the message schedule with sha512su0/su1.
  // Rounds 0-15 (groups 0-7).
  1113. add v24.2d,v24.2d,v16.2d
  1114. ld1 {v25.2d},[x3],#16
  1115. ext v24.16b,v24.16b,v24.16b,#8
  1116. ext v5.16b,v2.16b,v3.16b,#8
  1117. ext v6.16b,v1.16b,v2.16b,#8
  1118. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1119. .long 0xcec08230 //sha512su0 v16.16b,v17.16b
  1120. ext v7.16b,v20.16b,v21.16b,#8
  1121. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1122. .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1123. add v4.2d,v1.2d,v3.2d // "D + T1"
  1124. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1125. add v25.2d,v25.2d,v17.2d
  1126. ld1 {v24.2d},[x3],#16
  1127. ext v25.16b,v25.16b,v25.16b,#8
  1128. ext v5.16b,v4.16b,v2.16b,#8
  1129. ext v6.16b,v0.16b,v4.16b,#8
  1130. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1131. .long 0xcec08251 //sha512su0 v17.16b,v18.16b
  1132. ext v7.16b,v21.16b,v22.16b,#8
  1133. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1134. .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1135. add v1.2d,v0.2d,v2.2d // "D + T1"
  1136. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1137. add v24.2d,v24.2d,v18.2d
  1138. ld1 {v25.2d},[x3],#16
  1139. ext v24.16b,v24.16b,v24.16b,#8
  1140. ext v5.16b,v1.16b,v4.16b,#8
  1141. ext v6.16b,v3.16b,v1.16b,#8
  1142. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1143. .long 0xcec08272 //sha512su0 v18.16b,v19.16b
  1144. ext v7.16b,v22.16b,v23.16b,#8
  1145. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1146. .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1147. add v0.2d,v3.2d,v4.2d // "D + T1"
  1148. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1149. add v25.2d,v25.2d,v19.2d
  1150. ld1 {v24.2d},[x3],#16
  1151. ext v25.16b,v25.16b,v25.16b,#8
  1152. ext v5.16b,v0.16b,v1.16b,#8
  1153. ext v6.16b,v2.16b,v0.16b,#8
  1154. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1155. .long 0xcec08293 //sha512su0 v19.16b,v20.16b
  1156. ext v7.16b,v23.16b,v16.16b,#8
  1157. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1158. .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1159. add v3.2d,v2.2d,v1.2d // "D + T1"
  1160. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1161. add v24.2d,v24.2d,v20.2d
  1162. ld1 {v25.2d},[x3],#16
  1163. ext v24.16b,v24.16b,v24.16b,#8
  1164. ext v5.16b,v3.16b,v0.16b,#8
  1165. ext v6.16b,v4.16b,v3.16b,#8
  1166. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1167. .long 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1168. ext v7.16b,v16.16b,v17.16b,#8
  1169. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1170. .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1171. add v2.2d,v4.2d,v0.2d // "D + T1"
  1172. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1173. add v25.2d,v25.2d,v21.2d
  1174. ld1 {v24.2d},[x3],#16
  1175. ext v25.16b,v25.16b,v25.16b,#8
  1176. ext v5.16b,v2.16b,v3.16b,#8
  1177. ext v6.16b,v1.16b,v2.16b,#8
  1178. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1179. .long 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1180. ext v7.16b,v17.16b,v18.16b,#8
  1181. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1182. .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1183. add v4.2d,v1.2d,v3.2d // "D + T1"
  1184. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1185. add v24.2d,v24.2d,v22.2d
  1186. ld1 {v25.2d},[x3],#16
  1187. ext v24.16b,v24.16b,v24.16b,#8
  1188. ext v5.16b,v4.16b,v2.16b,#8
  1189. ext v6.16b,v0.16b,v4.16b,#8
  1190. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1191. .long 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1192. ext v7.16b,v18.16b,v19.16b,#8
  1193. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1194. .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1195. add v1.2d,v0.2d,v2.2d // "D + T1"
  1196. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1197. add v25.2d,v25.2d,v23.2d
  1198. ld1 {v24.2d},[x3],#16
  1199. ext v25.16b,v25.16b,v25.16b,#8
  1200. ext v5.16b,v1.16b,v4.16b,#8
  1201. ext v6.16b,v3.16b,v1.16b,#8
  1202. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1203. .long 0xcec08217 //sha512su0 v23.16b,v16.16b
  1204. ext v7.16b,v19.16b,v20.16b,#8
  1205. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1206. .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1207. add v0.2d,v3.2d,v4.2d // "D + T1"
  1208. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  // Rounds 16-31 (groups 8-15).
  1209. add v24.2d,v24.2d,v16.2d
  1210. ld1 {v25.2d},[x3],#16
  1211. ext v24.16b,v24.16b,v24.16b,#8
  1212. ext v5.16b,v0.16b,v1.16b,#8
  1213. ext v6.16b,v2.16b,v0.16b,#8
  1214. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1215. .long 0xcec08230 //sha512su0 v16.16b,v17.16b
  1216. ext v7.16b,v20.16b,v21.16b,#8
  1217. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1218. .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1219. add v3.2d,v2.2d,v1.2d // "D + T1"
  1220. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1221. add v25.2d,v25.2d,v17.2d
  1222. ld1 {v24.2d},[x3],#16
  1223. ext v25.16b,v25.16b,v25.16b,#8
  1224. ext v5.16b,v3.16b,v0.16b,#8
  1225. ext v6.16b,v4.16b,v3.16b,#8
  1226. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1227. .long 0xcec08251 //sha512su0 v17.16b,v18.16b
  1228. ext v7.16b,v21.16b,v22.16b,#8
  1229. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1230. .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1231. add v2.2d,v4.2d,v0.2d // "D + T1"
  1232. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1233. add v24.2d,v24.2d,v18.2d
  1234. ld1 {v25.2d},[x3],#16
  1235. ext v24.16b,v24.16b,v24.16b,#8
  1236. ext v5.16b,v2.16b,v3.16b,#8
  1237. ext v6.16b,v1.16b,v2.16b,#8
  1238. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1239. .long 0xcec08272 //sha512su0 v18.16b,v19.16b
  1240. ext v7.16b,v22.16b,v23.16b,#8
  1241. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1242. .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1243. add v4.2d,v1.2d,v3.2d // "D + T1"
  1244. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1245. add v25.2d,v25.2d,v19.2d
  1246. ld1 {v24.2d},[x3],#16
  1247. ext v25.16b,v25.16b,v25.16b,#8
  1248. ext v5.16b,v4.16b,v2.16b,#8
  1249. ext v6.16b,v0.16b,v4.16b,#8
  1250. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1251. .long 0xcec08293 //sha512su0 v19.16b,v20.16b
  1252. ext v7.16b,v23.16b,v16.16b,#8
  1253. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1254. .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1255. add v1.2d,v0.2d,v2.2d // "D + T1"
  1256. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1257. add v24.2d,v24.2d,v20.2d
  1258. ld1 {v25.2d},[x3],#16
  1259. ext v24.16b,v24.16b,v24.16b,#8
  1260. ext v5.16b,v1.16b,v4.16b,#8
  1261. ext v6.16b,v3.16b,v1.16b,#8
  1262. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1263. .long 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1264. ext v7.16b,v16.16b,v17.16b,#8
  1265. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1266. .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1267. add v0.2d,v3.2d,v4.2d // "D + T1"
  1268. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1269. add v25.2d,v25.2d,v21.2d
  1270. ld1 {v24.2d},[x3],#16
  1271. ext v25.16b,v25.16b,v25.16b,#8
  1272. ext v5.16b,v0.16b,v1.16b,#8
  1273. ext v6.16b,v2.16b,v0.16b,#8
  1274. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1275. .long 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1276. ext v7.16b,v17.16b,v18.16b,#8
  1277. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1278. .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1279. add v3.2d,v2.2d,v1.2d // "D + T1"
  1280. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1281. add v24.2d,v24.2d,v22.2d
  1282. ld1 {v25.2d},[x3],#16
  1283. ext v24.16b,v24.16b,v24.16b,#8
  1284. ext v5.16b,v3.16b,v0.16b,#8
  1285. ext v6.16b,v4.16b,v3.16b,#8
  1286. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1287. .long 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1288. ext v7.16b,v18.16b,v19.16b,#8
  1289. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1290. .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1291. add v2.2d,v4.2d,v0.2d // "D + T1"
  1292. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1293. add v25.2d,v25.2d,v23.2d
  1294. ld1 {v24.2d},[x3],#16
  1295. ext v25.16b,v25.16b,v25.16b,#8
  1296. ext v5.16b,v2.16b,v3.16b,#8
  1297. ext v6.16b,v1.16b,v2.16b,#8
  1298. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1299. .long 0xcec08217 //sha512su0 v23.16b,v16.16b
  1300. ext v7.16b,v19.16b,v20.16b,#8
  1301. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1302. .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1303. add v4.2d,v1.2d,v3.2d // "D + T1"
  1304. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  // Rounds 32-47 (groups 16-23).
  1305. add v24.2d,v24.2d,v16.2d
  1306. ld1 {v25.2d},[x3],#16
  1307. ext v24.16b,v24.16b,v24.16b,#8
  1308. ext v5.16b,v4.16b,v2.16b,#8
  1309. ext v6.16b,v0.16b,v4.16b,#8
  1310. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1311. .long 0xcec08230 //sha512su0 v16.16b,v17.16b
  1312. ext v7.16b,v20.16b,v21.16b,#8
  1313. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1314. .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1315. add v1.2d,v0.2d,v2.2d // "D + T1"
  1316. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1317. add v25.2d,v25.2d,v17.2d
  1318. ld1 {v24.2d},[x3],#16
  1319. ext v25.16b,v25.16b,v25.16b,#8
  1320. ext v5.16b,v1.16b,v4.16b,#8
  1321. ext v6.16b,v3.16b,v1.16b,#8
  1322. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1323. .long 0xcec08251 //sha512su0 v17.16b,v18.16b
  1324. ext v7.16b,v21.16b,v22.16b,#8
  1325. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1326. .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1327. add v0.2d,v3.2d,v4.2d // "D + T1"
  1328. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1329. add v24.2d,v24.2d,v18.2d
  1330. ld1 {v25.2d},[x3],#16
  1331. ext v24.16b,v24.16b,v24.16b,#8
  1332. ext v5.16b,v0.16b,v1.16b,#8
  1333. ext v6.16b,v2.16b,v0.16b,#8
  1334. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1335. .long 0xcec08272 //sha512su0 v18.16b,v19.16b
  1336. ext v7.16b,v22.16b,v23.16b,#8
  1337. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1338. .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1339. add v3.2d,v2.2d,v1.2d // "D + T1"
  1340. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1341. add v25.2d,v25.2d,v19.2d
  1342. ld1 {v24.2d},[x3],#16
  1343. ext v25.16b,v25.16b,v25.16b,#8
  1344. ext v5.16b,v3.16b,v0.16b,#8
  1345. ext v6.16b,v4.16b,v3.16b,#8
  1346. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1347. .long 0xcec08293 //sha512su0 v19.16b,v20.16b
  1348. ext v7.16b,v23.16b,v16.16b,#8
  1349. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1350. .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1351. add v2.2d,v4.2d,v0.2d // "D + T1"
  1352. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1353. add v24.2d,v24.2d,v20.2d
  1354. ld1 {v25.2d},[x3],#16
  1355. ext v24.16b,v24.16b,v24.16b,#8
  1356. ext v5.16b,v2.16b,v3.16b,#8
  1357. ext v6.16b,v1.16b,v2.16b,#8
  1358. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1359. .long 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1360. ext v7.16b,v16.16b,v17.16b,#8
  1361. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1362. .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1363. add v4.2d,v1.2d,v3.2d // "D + T1"
  1364. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1365. add v25.2d,v25.2d,v21.2d
  1366. ld1 {v24.2d},[x3],#16
  1367. ext v25.16b,v25.16b,v25.16b,#8
  1368. ext v5.16b,v4.16b,v2.16b,#8
  1369. ext v6.16b,v0.16b,v4.16b,#8
  1370. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1371. .long 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1372. ext v7.16b,v17.16b,v18.16b,#8
  1373. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1374. .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1375. add v1.2d,v0.2d,v2.2d // "D + T1"
  1376. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1377. add v24.2d,v24.2d,v22.2d
  1378. ld1 {v25.2d},[x3],#16
  1379. ext v24.16b,v24.16b,v24.16b,#8
  1380. ext v5.16b,v1.16b,v4.16b,#8
  1381. ext v6.16b,v3.16b,v1.16b,#8
  1382. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1383. .long 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1384. ext v7.16b,v18.16b,v19.16b,#8
  1385. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1386. .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1387. add v0.2d,v3.2d,v4.2d // "D + T1"
  1388. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1389. add v25.2d,v25.2d,v23.2d
  1390. ld1 {v24.2d},[x3],#16
  1391. ext v25.16b,v25.16b,v25.16b,#8
  1392. ext v5.16b,v0.16b,v1.16b,#8
  1393. ext v6.16b,v2.16b,v0.16b,#8
  1394. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1395. .long 0xcec08217 //sha512su0 v23.16b,v16.16b
  1396. ext v7.16b,v19.16b,v20.16b,#8
  1397. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1398. .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1399. add v3.2d,v2.2d,v1.2d // "D + T1"
  1400. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  // Rounds 48-63 (groups 24-31); the last groups that extend the schedule.
  1401. add v24.2d,v24.2d,v16.2d
  1402. ld1 {v25.2d},[x3],#16
  1403. ext v24.16b,v24.16b,v24.16b,#8
  1404. ext v5.16b,v3.16b,v0.16b,#8
  1405. ext v6.16b,v4.16b,v3.16b,#8
  1406. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1407. .long 0xcec08230 //sha512su0 v16.16b,v17.16b
  1408. ext v7.16b,v20.16b,v21.16b,#8
  1409. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1410. .long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1411. add v2.2d,v4.2d,v0.2d // "D + T1"
  1412. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1413. add v25.2d,v25.2d,v17.2d
  1414. ld1 {v24.2d},[x3],#16
  1415. ext v25.16b,v25.16b,v25.16b,#8
  1416. ext v5.16b,v2.16b,v3.16b,#8
  1417. ext v6.16b,v1.16b,v2.16b,#8
  1418. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1419. .long 0xcec08251 //sha512su0 v17.16b,v18.16b
  1420. ext v7.16b,v21.16b,v22.16b,#8
  1421. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1422. .long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1423. add v4.2d,v1.2d,v3.2d // "D + T1"
  1424. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1425. add v24.2d,v24.2d,v18.2d
  1426. ld1 {v25.2d},[x3],#16
  1427. ext v24.16b,v24.16b,v24.16b,#8
  1428. ext v5.16b,v4.16b,v2.16b,#8
  1429. ext v6.16b,v0.16b,v4.16b,#8
  1430. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1431. .long 0xcec08272 //sha512su0 v18.16b,v19.16b
  1432. ext v7.16b,v22.16b,v23.16b,#8
  1433. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1434. .long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1435. add v1.2d,v0.2d,v2.2d // "D + T1"
  1436. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1437. add v25.2d,v25.2d,v19.2d
  1438. ld1 {v24.2d},[x3],#16
  1439. ext v25.16b,v25.16b,v25.16b,#8
  1440. ext v5.16b,v1.16b,v4.16b,#8
  1441. ext v6.16b,v3.16b,v1.16b,#8
  1442. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1443. .long 0xcec08293 //sha512su0 v19.16b,v20.16b
  1444. ext v7.16b,v23.16b,v16.16b,#8
  1445. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1446. .long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1447. add v0.2d,v3.2d,v4.2d // "D + T1"
  1448. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1449. add v24.2d,v24.2d,v20.2d
  1450. ld1 {v25.2d},[x3],#16
  1451. ext v24.16b,v24.16b,v24.16b,#8
  1452. ext v5.16b,v0.16b,v1.16b,#8
  1453. ext v6.16b,v2.16b,v0.16b,#8
  1454. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1455. .long 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1456. ext v7.16b,v16.16b,v17.16b,#8
  1457. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1458. .long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1459. add v3.2d,v2.2d,v1.2d // "D + T1"
  1460. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1461. add v25.2d,v25.2d,v21.2d
  1462. ld1 {v24.2d},[x3],#16
  1463. ext v25.16b,v25.16b,v25.16b,#8
  1464. ext v5.16b,v3.16b,v0.16b,#8
  1465. ext v6.16b,v4.16b,v3.16b,#8
  1466. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1467. .long 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1468. ext v7.16b,v17.16b,v18.16b,#8
  1469. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1470. .long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1471. add v2.2d,v4.2d,v0.2d // "D + T1"
  1472. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1473. add v24.2d,v24.2d,v22.2d
  1474. ld1 {v25.2d},[x3],#16
  1475. ext v24.16b,v24.16b,v24.16b,#8
  1476. ext v5.16b,v2.16b,v3.16b,#8
  1477. ext v6.16b,v1.16b,v2.16b,#8
  1478. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1479. .long 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1480. ext v7.16b,v18.16b,v19.16b,#8
  1481. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1482. .long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1483. add v4.2d,v1.2d,v3.2d // "D + T1"
  1484. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1485. add v25.2d,v25.2d,v23.2d
  1486. ld1 {v24.2d},[x3],#16
  1487. ext v25.16b,v25.16b,v25.16b,#8
  1488. ext v5.16b,v4.16b,v2.16b,#8
  1489. ext v6.16b,v0.16b,v4.16b,#8
  1490. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1491. .long 0xcec08217 //sha512su0 v23.16b,v16.16b
  1492. ext v7.16b,v19.16b,v20.16b,#8
  1493. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1494. .long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1495. add v1.2d,v0.2d,v2.2d // "D + T1"
  1496. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  // Rounds 64-79 (groups 32-39, 11 instructions each): no further schedule
  // extension. As soon as a message register has been added to its K pair it
  // is refilled with 16 bytes of input from x1 and byte-reversed, so v16-v23
  // are ready for the next pass of Loop_hw.
  1497. ld1 {v25.2d},[x3],#16
  1498. add v24.2d,v24.2d,v16.2d
  1499. ld1 {v16.16b},[x1],#16 // load next input
  1500. ext v24.16b,v24.16b,v24.16b,#8
  1501. ext v5.16b,v1.16b,v4.16b,#8
  1502. ext v6.16b,v3.16b,v1.16b,#8
  1503. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1504. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1505. rev64 v16.16b,v16.16b
  1506. add v0.2d,v3.2d,v4.2d // "D + T1"
  1507. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1508. ld1 {v24.2d},[x3],#16
  1509. add v25.2d,v25.2d,v17.2d
  1510. ld1 {v17.16b},[x1],#16 // load next input
  1511. ext v25.16b,v25.16b,v25.16b,#8
  1512. ext v5.16b,v0.16b,v1.16b,#8
  1513. ext v6.16b,v2.16b,v0.16b,#8
  1514. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1515. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1516. rev64 v17.16b,v17.16b
  1517. add v3.2d,v2.2d,v1.2d // "D + T1"
  1518. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1519. ld1 {v25.2d},[x3],#16
  1520. add v24.2d,v24.2d,v18.2d
  1521. ld1 {v18.16b},[x1],#16 // load next input
  1522. ext v24.16b,v24.16b,v24.16b,#8
  1523. ext v5.16b,v3.16b,v0.16b,#8
  1524. ext v6.16b,v4.16b,v3.16b,#8
  1525. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1526. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1527. rev64 v18.16b,v18.16b
  1528. add v2.2d,v4.2d,v0.2d // "D + T1"
  1529. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1530. ld1 {v24.2d},[x3],#16
  1531. add v25.2d,v25.2d,v19.2d
  1532. ld1 {v19.16b},[x1],#16 // load next input
  1533. ext v25.16b,v25.16b,v25.16b,#8
  1534. ext v5.16b,v2.16b,v3.16b,#8
  1535. ext v6.16b,v1.16b,v2.16b,#8
  1536. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1537. .long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1538. rev64 v19.16b,v19.16b
  1539. add v4.2d,v1.2d,v3.2d // "D + T1"
  1540. .long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1541. ld1 {v25.2d},[x3],#16
  1542. add v24.2d,v24.2d,v20.2d
  1543. ld1 {v20.16b},[x1],#16 // load next input
  1544. ext v24.16b,v24.16b,v24.16b,#8
  1545. ext v5.16b,v4.16b,v2.16b,#8
  1546. ext v6.16b,v0.16b,v4.16b,#8
  1547. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1548. .long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1549. rev64 v20.16b,v20.16b
  1550. add v1.2d,v0.2d,v2.2d // "D + T1"
  1551. .long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1552. ld1 {v24.2d},[x3],#16
  1553. add v25.2d,v25.2d,v21.2d
  1554. ld1 {v21.16b},[x1],#16 // load next input
  1555. ext v25.16b,v25.16b,v25.16b,#8
  1556. ext v5.16b,v1.16b,v4.16b,#8
  1557. ext v6.16b,v3.16b,v1.16b,#8
  1558. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1559. .long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1560. rev64 v21.16b,v21.16b
  1561. add v0.2d,v3.2d,v4.2d // "D + T1"
  1562. .long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1563. ld1 {v25.2d},[x3],#16
  1564. add v24.2d,v24.2d,v22.2d
  1565. ld1 {v22.16b},[x1],#16 // load next input
  1566. ext v24.16b,v24.16b,v24.16b,#8
  1567. ext v5.16b,v0.16b,v1.16b,#8
  1568. ext v6.16b,v2.16b,v0.16b,#8
  1569. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1570. .long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1571. rev64 v22.16b,v22.16b
  1572. add v3.2d,v2.2d,v1.2d // "D + T1"
  1573. .long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  // All 40 constant pairs (40 * 16 = 80*8 bytes) have been read by now, so
  // step x3 back to the start of LK512 for the next block.
  1574. sub x3,x3,#80*8 // rewind
  1575. add v25.2d,v25.2d,v23.2d
  1576. ld1 {v23.16b},[x1],#16 // load next input
  1577. ext v25.16b,v25.16b,v25.16b,#8
  1578. ext v5.16b,v3.16b,v0.16b,#8
  1579. ext v6.16b,v4.16b,v3.16b,#8
  1580. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1581. .long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1582. rev64 v23.16b,v23.16b
  1583. add v2.2d,v4.2d,v0.2d // "D + T1"
  1584. .long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  // Add the state saved in v26-v29 at the top of the pass to the round output.
  1585. add v0.2d,v0.2d,v26.2d // accumulate
  1586. add v1.2d,v1.2d,v27.2d
  1587. add v2.2d,v2.2d,v28.2d
  1588. add v3.2d,v3.2d,v29.2d
  // x2 was decremented by the subs at the top of the pass; loop while blocks remain.
  1589. cbnz x2,Loop_hw
  1590. st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
  // Pop the 16-byte frame; x30 still holds the return address, so only x29
  // needs to be reloaded.
  1591. ldr x29,[sp],#16
  1592. ret
  1593. #endif