sha512-armv8.S 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618
  1. // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
  2. //
  3. // Licensed under the OpenSSL license (the "License"). You may not use
  4. // this file except in compliance with the License. You can obtain a copy
  5. // in the file LICENSE in the source distribution or at
  6. // https://www.openssl.org/source/license.html
  7. // ====================================================================
  8. // Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  9. // project. The module is, however, dual licensed under OpenSSL and
  10. // CRYPTOGAMS licenses depending on where you obtain it. For further
  11. // details see http://www.openssl.org/~appro/cryptogams/.
  12. //
  13. // Permission to use under GPLv2 terms is granted.
  14. // ====================================================================
  15. //
  16. // SHA256/512 for ARMv8.
  17. //
  18. // Performance in cycles per processed byte and improvement coefficient
  19. // over code generated with "default" compiler:
  20. //
  21. //              SHA256-hw   SHA256(*)      SHA512
  22. // Apple A7     1.97        10.5 (+33%)    6.73 (-1%(**))
  23. // Cortex-A53   2.38        15.5 (+115%)   10.0 (+150%(***))
  24. // Cortex-A57   2.31        11.6 (+86%)    7.51 (+260%(***))
  25. // Denver       2.01        10.5 (+26%)    6.70 (+8%)
  26. // X-Gene       -           20.0 (+100%)   12.8 (+300%(***))
  27. // Mongoose     2.36        13.0 (+50%)    8.36 (+33%)
  28. // Kryo         1.92        17.4 (+30%)    11.2 (+8%)
  29. //
  30. // (*) Software SHA256 results are of lesser relevance, presented
  31. // mostly for informational purposes.
  32. // (**) The result is a trade-off: it's possible to improve it by
  33. // 10% (or by 1 cycle per round), but at the cost of 20% loss
  34. // on Cortex-A53 (or by 4 cycles per round).
  35. // (***) Super-impressive coefficients over gcc-generated code are
  36. // an indication of some compiler "pathology"; most notably, code
  37. // generated with -mgeneral-regs-only is significantly faster,
  38. // and the gap is then only 40-90%.
  39. //
  40. // October 2016.
  41. //
  42. // Originally it was reckoned that it makes no sense to implement a NEON
  43. // version of SHA256 for 64-bit processors. This is because the performance
  44. // improvement on the most widespread Cortex-A5x processors was observed
  45. // to be marginal: same on Cortex-A53 and ~10% on A57. But then it was
  46. // observed that 32-bit NEON SHA256 performs significantly better than
  47. // the 64-bit scalar version on *some* of the more recent processors. As
  48. // a result, a 64-bit NEON version of SHA256 was added to provide the best
  49. // all-round performance. For example, it executes ~30% faster on X-Gene
  50. // and Mongoose. [For reference, a NEON version of SHA512 is bound to
  51. // deliver much less improvement, likely *negative* on Cortex-A5x,
  52. // which is why NEON support is limited to SHA256.]
  53. #ifndef __KERNEL__
  54. # include "arm_arch.h"
  55. #endif
  56. .text
  57. .hidden OPENSSL_armcap_P
  58. .globl sha512_block_data_order
  59. .type sha512_block_data_order,%function
  60. .align 6
  61. sha512_block_data_order:
  62. #ifndef __KERNEL__
  63. # ifdef __ILP32__
  64. ldrsw x16,.LOPENSSL_armcap_P
  65. # else
  66. ldr x16,.LOPENSSL_armcap_P
  67. # endif
  68. adr x17,.LOPENSSL_armcap_P
  69. add x16,x16,x17
  70. ldr w16,[x16]
  71. tst w16,#ARMV8_SHA512
  72. b.ne .Lv8_entry
  73. #endif
  74. .inst 0xd503233f // paciasp
  75. stp x29,x30,[sp,#-128]!
  76. add x29,sp,#0
  77. stp x19,x20,[sp,#16]
  78. stp x21,x22,[sp,#32]
  79. stp x23,x24,[sp,#48]
  80. stp x25,x26,[sp,#64]
  81. stp x27,x28,[sp,#80]
  82. sub sp,sp,#4*8
  83. ldp x20,x21,[x0] // load context
  84. ldp x22,x23,[x0,#2*8]
  85. ldp x24,x25,[x0,#4*8]
  86. add x2,x1,x2,lsl#7 // end of input
  87. ldp x26,x27,[x0,#6*8]
  88. adr x30,.LK512
  89. stp x0,x2,[x29,#96]
  90. .Loop:
  91. ldp x3,x4,[x1],#2*8
  92. ldr x19,[x30],#8 // *K++
  93. eor x28,x21,x22 // magic seed
  94. str x1,[x29,#112]
  95. #ifndef __AARCH64EB__
  96. rev x3,x3 // 0
  97. #endif
  98. ror x16,x24,#14
  99. add x27,x27,x19 // h+=K[i]
  100. eor x6,x24,x24,ror#23
  101. and x17,x25,x24
  102. bic x19,x26,x24
  103. add x27,x27,x3 // h+=X[i]
  104. orr x17,x17,x19 // Ch(e,f,g)
  105. eor x19,x20,x21 // a^b, b^c in next round
  106. eor x16,x16,x6,ror#18 // Sigma1(e)
  107. ror x6,x20,#28
  108. add x27,x27,x17 // h+=Ch(e,f,g)
  109. eor x17,x20,x20,ror#5
  110. add x27,x27,x16 // h+=Sigma1(e)
  111. and x28,x28,x19 // (b^c)&=(a^b)
  112. add x23,x23,x27 // d+=h
  113. eor x28,x28,x21 // Maj(a,b,c)
  114. eor x17,x6,x17,ror#34 // Sigma0(a)
  115. add x27,x27,x28 // h+=Maj(a,b,c)
  116. ldr x28,[x30],#8 // *K++, x19 in next round
  117. //add x27,x27,x17 // h+=Sigma0(a)
  118. #ifndef __AARCH64EB__
  119. rev x4,x4 // 1
  120. #endif
  121. ldp x5,x6,[x1],#2*8
  122. add x27,x27,x17 // h+=Sigma0(a)
  123. ror x16,x23,#14
  124. add x26,x26,x28 // h+=K[i]
  125. eor x7,x23,x23,ror#23
  126. and x17,x24,x23
  127. bic x28,x25,x23
  128. add x26,x26,x4 // h+=X[i]
  129. orr x17,x17,x28 // Ch(e,f,g)
  130. eor x28,x27,x20 // a^b, b^c in next round
  131. eor x16,x16,x7,ror#18 // Sigma1(e)
  132. ror x7,x27,#28
  133. add x26,x26,x17 // h+=Ch(e,f,g)
  134. eor x17,x27,x27,ror#5
  135. add x26,x26,x16 // h+=Sigma1(e)
  136. and x19,x19,x28 // (b^c)&=(a^b)
  137. add x22,x22,x26 // d+=h
  138. eor x19,x19,x20 // Maj(a,b,c)
  139. eor x17,x7,x17,ror#34 // Sigma0(a)
  140. add x26,x26,x19 // h+=Maj(a,b,c)
  141. ldr x19,[x30],#8 // *K++, x28 in next round
  142. //add x26,x26,x17 // h+=Sigma0(a)
  143. #ifndef __AARCH64EB__
  144. rev x5,x5 // 2
  145. #endif
  146. add x26,x26,x17 // h+=Sigma0(a)
  147. ror x16,x22,#14
  148. add x25,x25,x19 // h+=K[i]
  149. eor x8,x22,x22,ror#23
  150. and x17,x23,x22
  151. bic x19,x24,x22
  152. add x25,x25,x5 // h+=X[i]
  153. orr x17,x17,x19 // Ch(e,f,g)
  154. eor x19,x26,x27 // a^b, b^c in next round
  155. eor x16,x16,x8,ror#18 // Sigma1(e)
  156. ror x8,x26,#28
  157. add x25,x25,x17 // h+=Ch(e,f,g)
  158. eor x17,x26,x26,ror#5
  159. add x25,x25,x16 // h+=Sigma1(e)
  160. and x28,x28,x19 // (b^c)&=(a^b)
  161. add x21,x21,x25 // d+=h
  162. eor x28,x28,x27 // Maj(a,b,c)
  163. eor x17,x8,x17,ror#34 // Sigma0(a)
  164. add x25,x25,x28 // h+=Maj(a,b,c)
  165. ldr x28,[x30],#8 // *K++, x19 in next round
  166. //add x25,x25,x17 // h+=Sigma0(a)
  167. #ifndef __AARCH64EB__
  168. rev x6,x6 // 3
  169. #endif
  170. ldp x7,x8,[x1],#2*8
  171. add x25,x25,x17 // h+=Sigma0(a)
  172. ror x16,x21,#14
  173. add x24,x24,x28 // h+=K[i]
  174. eor x9,x21,x21,ror#23
  175. and x17,x22,x21
  176. bic x28,x23,x21
  177. add x24,x24,x6 // h+=X[i]
  178. orr x17,x17,x28 // Ch(e,f,g)
  179. eor x28,x25,x26 // a^b, b^c in next round
  180. eor x16,x16,x9,ror#18 // Sigma1(e)
  181. ror x9,x25,#28
  182. add x24,x24,x17 // h+=Ch(e,f,g)
  183. eor x17,x25,x25,ror#5
  184. add x24,x24,x16 // h+=Sigma1(e)
  185. and x19,x19,x28 // (b^c)&=(a^b)
  186. add x20,x20,x24 // d+=h
  187. eor x19,x19,x26 // Maj(a,b,c)
  188. eor x17,x9,x17,ror#34 // Sigma0(a)
  189. add x24,x24,x19 // h+=Maj(a,b,c)
  190. ldr x19,[x30],#8 // *K++, x28 in next round
  191. //add x24,x24,x17 // h+=Sigma0(a)
  192. #ifndef __AARCH64EB__
  193. rev x7,x7 // 4
  194. #endif
  195. add x24,x24,x17 // h+=Sigma0(a)
  196. ror x16,x20,#14
  197. add x23,x23,x19 // h+=K[i]
  198. eor x10,x20,x20,ror#23
  199. and x17,x21,x20
  200. bic x19,x22,x20
  201. add x23,x23,x7 // h+=X[i]
  202. orr x17,x17,x19 // Ch(e,f,g)
  203. eor x19,x24,x25 // a^b, b^c in next round
  204. eor x16,x16,x10,ror#18 // Sigma1(e)
  205. ror x10,x24,#28
  206. add x23,x23,x17 // h+=Ch(e,f,g)
  207. eor x17,x24,x24,ror#5
  208. add x23,x23,x16 // h+=Sigma1(e)
  209. and x28,x28,x19 // (b^c)&=(a^b)
  210. add x27,x27,x23 // d+=h
  211. eor x28,x28,x25 // Maj(a,b,c)
  212. eor x17,x10,x17,ror#34 // Sigma0(a)
  213. add x23,x23,x28 // h+=Maj(a,b,c)
  214. ldr x28,[x30],#8 // *K++, x19 in next round
  215. //add x23,x23,x17 // h+=Sigma0(a)
  216. #ifndef __AARCH64EB__
  217. rev x8,x8 // 5
  218. #endif
  219. ldp x9,x10,[x1],#2*8
  220. add x23,x23,x17 // h+=Sigma0(a)
  221. ror x16,x27,#14
  222. add x22,x22,x28 // h+=K[i]
  223. eor x11,x27,x27,ror#23
  224. and x17,x20,x27
  225. bic x28,x21,x27
  226. add x22,x22,x8 // h+=X[i]
  227. orr x17,x17,x28 // Ch(e,f,g)
  228. eor x28,x23,x24 // a^b, b^c in next round
  229. eor x16,x16,x11,ror#18 // Sigma1(e)
  230. ror x11,x23,#28
  231. add x22,x22,x17 // h+=Ch(e,f,g)
  232. eor x17,x23,x23,ror#5
  233. add x22,x22,x16 // h+=Sigma1(e)
  234. and x19,x19,x28 // (b^c)&=(a^b)
  235. add x26,x26,x22 // d+=h
  236. eor x19,x19,x24 // Maj(a,b,c)
  237. eor x17,x11,x17,ror#34 // Sigma0(a)
  238. add x22,x22,x19 // h+=Maj(a,b,c)
  239. ldr x19,[x30],#8 // *K++, x28 in next round
  240. //add x22,x22,x17 // h+=Sigma0(a)
  241. #ifndef __AARCH64EB__
  242. rev x9,x9 // 6
  243. #endif
  244. add x22,x22,x17 // h+=Sigma0(a)
  245. ror x16,x26,#14
  246. add x21,x21,x19 // h+=K[i]
  247. eor x12,x26,x26,ror#23
  248. and x17,x27,x26
  249. bic x19,x20,x26
  250. add x21,x21,x9 // h+=X[i]
  251. orr x17,x17,x19 // Ch(e,f,g)
  252. eor x19,x22,x23 // a^b, b^c in next round
  253. eor x16,x16,x12,ror#18 // Sigma1(e)
  254. ror x12,x22,#28
  255. add x21,x21,x17 // h+=Ch(e,f,g)
  256. eor x17,x22,x22,ror#5
  257. add x21,x21,x16 // h+=Sigma1(e)
  258. and x28,x28,x19 // (b^c)&=(a^b)
  259. add x25,x25,x21 // d+=h
  260. eor x28,x28,x23 // Maj(a,b,c)
  261. eor x17,x12,x17,ror#34 // Sigma0(a)
  262. add x21,x21,x28 // h+=Maj(a,b,c)
  263. ldr x28,[x30],#8 // *K++, x19 in next round
  264. //add x21,x21,x17 // h+=Sigma0(a)
  265. #ifndef __AARCH64EB__
  266. rev x10,x10 // 7
  267. #endif
  268. ldp x11,x12,[x1],#2*8
  269. add x21,x21,x17 // h+=Sigma0(a)
  270. ror x16,x25,#14
  271. add x20,x20,x28 // h+=K[i]
  272. eor x13,x25,x25,ror#23
  273. and x17,x26,x25
  274. bic x28,x27,x25
  275. add x20,x20,x10 // h+=X[i]
  276. orr x17,x17,x28 // Ch(e,f,g)
  277. eor x28,x21,x22 // a^b, b^c in next round
  278. eor x16,x16,x13,ror#18 // Sigma1(e)
  279. ror x13,x21,#28
  280. add x20,x20,x17 // h+=Ch(e,f,g)
  281. eor x17,x21,x21,ror#5
  282. add x20,x20,x16 // h+=Sigma1(e)
  283. and x19,x19,x28 // (b^c)&=(a^b)
  284. add x24,x24,x20 // d+=h
  285. eor x19,x19,x22 // Maj(a,b,c)
  286. eor x17,x13,x17,ror#34 // Sigma0(a)
  287. add x20,x20,x19 // h+=Maj(a,b,c)
  288. ldr x19,[x30],#8 // *K++, x28 in next round
  289. //add x20,x20,x17 // h+=Sigma0(a)
  290. #ifndef __AARCH64EB__
  291. rev x11,x11 // 8
  292. #endif
  293. add x20,x20,x17 // h+=Sigma0(a)
  294. ror x16,x24,#14
  295. add x27,x27,x19 // h+=K[i]
  296. eor x14,x24,x24,ror#23
  297. and x17,x25,x24
  298. bic x19,x26,x24
  299. add x27,x27,x11 // h+=X[i]
  300. orr x17,x17,x19 // Ch(e,f,g)
  301. eor x19,x20,x21 // a^b, b^c in next round
  302. eor x16,x16,x14,ror#18 // Sigma1(e)
  303. ror x14,x20,#28
  304. add x27,x27,x17 // h+=Ch(e,f,g)
  305. eor x17,x20,x20,ror#5
  306. add x27,x27,x16 // h+=Sigma1(e)
  307. and x28,x28,x19 // (b^c)&=(a^b)
  308. add x23,x23,x27 // d+=h
  309. eor x28,x28,x21 // Maj(a,b,c)
  310. eor x17,x14,x17,ror#34 // Sigma0(a)
  311. add x27,x27,x28 // h+=Maj(a,b,c)
  312. ldr x28,[x30],#8 // *K++, x19 in next round
  313. //add x27,x27,x17 // h+=Sigma0(a)
  314. #ifndef __AARCH64EB__
  315. rev x12,x12 // 9
  316. #endif
  317. ldp x13,x14,[x1],#2*8
  318. add x27,x27,x17 // h+=Sigma0(a)
  319. ror x16,x23,#14
  320. add x26,x26,x28 // h+=K[i]
  321. eor x15,x23,x23,ror#23
  322. and x17,x24,x23
  323. bic x28,x25,x23
  324. add x26,x26,x12 // h+=X[i]
  325. orr x17,x17,x28 // Ch(e,f,g)
  326. eor x28,x27,x20 // a^b, b^c in next round
  327. eor x16,x16,x15,ror#18 // Sigma1(e)
  328. ror x15,x27,#28
  329. add x26,x26,x17 // h+=Ch(e,f,g)
  330. eor x17,x27,x27,ror#5
  331. add x26,x26,x16 // h+=Sigma1(e)
  332. and x19,x19,x28 // (b^c)&=(a^b)
  333. add x22,x22,x26 // d+=h
  334. eor x19,x19,x20 // Maj(a,b,c)
  335. eor x17,x15,x17,ror#34 // Sigma0(a)
  336. add x26,x26,x19 // h+=Maj(a,b,c)
  337. ldr x19,[x30],#8 // *K++, x28 in next round
  338. //add x26,x26,x17 // h+=Sigma0(a)
  339. #ifndef __AARCH64EB__
  340. rev x13,x13 // 10
  341. #endif
  342. add x26,x26,x17 // h+=Sigma0(a)
  343. ror x16,x22,#14
  344. add x25,x25,x19 // h+=K[i]
  345. eor x0,x22,x22,ror#23
  346. and x17,x23,x22
  347. bic x19,x24,x22
  348. add x25,x25,x13 // h+=X[i]
  349. orr x17,x17,x19 // Ch(e,f,g)
  350. eor x19,x26,x27 // a^b, b^c in next round
  351. eor x16,x16,x0,ror#18 // Sigma1(e)
  352. ror x0,x26,#28
  353. add x25,x25,x17 // h+=Ch(e,f,g)
  354. eor x17,x26,x26,ror#5
  355. add x25,x25,x16 // h+=Sigma1(e)
  356. and x28,x28,x19 // (b^c)&=(a^b)
  357. add x21,x21,x25 // d+=h
  358. eor x28,x28,x27 // Maj(a,b,c)
  359. eor x17,x0,x17,ror#34 // Sigma0(a)
  360. add x25,x25,x28 // h+=Maj(a,b,c)
  361. ldr x28,[x30],#8 // *K++, x19 in next round
  362. //add x25,x25,x17 // h+=Sigma0(a)
  363. #ifndef __AARCH64EB__
  364. rev x14,x14 // 11
  365. #endif
  366. ldp x15,x0,[x1],#2*8
  367. add x25,x25,x17 // h+=Sigma0(a)
  368. str x6,[sp,#24]
  369. ror x16,x21,#14
  370. add x24,x24,x28 // h+=K[i]
  371. eor x6,x21,x21,ror#23
  372. and x17,x22,x21
  373. bic x28,x23,x21
  374. add x24,x24,x14 // h+=X[i]
  375. orr x17,x17,x28 // Ch(e,f,g)
  376. eor x28,x25,x26 // a^b, b^c in next round
  377. eor x16,x16,x6,ror#18 // Sigma1(e)
  378. ror x6,x25,#28
  379. add x24,x24,x17 // h+=Ch(e,f,g)
  380. eor x17,x25,x25,ror#5
  381. add x24,x24,x16 // h+=Sigma1(e)
  382. and x19,x19,x28 // (b^c)&=(a^b)
  383. add x20,x20,x24 // d+=h
  384. eor x19,x19,x26 // Maj(a,b,c)
  385. eor x17,x6,x17,ror#34 // Sigma0(a)
  386. add x24,x24,x19 // h+=Maj(a,b,c)
  387. ldr x19,[x30],#8 // *K++, x28 in next round
  388. //add x24,x24,x17 // h+=Sigma0(a)
  389. #ifndef __AARCH64EB__
  390. rev x15,x15 // 12
  391. #endif
  392. add x24,x24,x17 // h+=Sigma0(a)
  393. str x7,[sp,#0]
  394. ror x16,x20,#14
  395. add x23,x23,x19 // h+=K[i]
  396. eor x7,x20,x20,ror#23
  397. and x17,x21,x20
  398. bic x19,x22,x20
  399. add x23,x23,x15 // h+=X[i]
  400. orr x17,x17,x19 // Ch(e,f,g)
  401. eor x19,x24,x25 // a^b, b^c in next round
  402. eor x16,x16,x7,ror#18 // Sigma1(e)
  403. ror x7,x24,#28
  404. add x23,x23,x17 // h+=Ch(e,f,g)
  405. eor x17,x24,x24,ror#5
  406. add x23,x23,x16 // h+=Sigma1(e)
  407. and x28,x28,x19 // (b^c)&=(a^b)
  408. add x27,x27,x23 // d+=h
  409. eor x28,x28,x25 // Maj(a,b,c)
  410. eor x17,x7,x17,ror#34 // Sigma0(a)
  411. add x23,x23,x28 // h+=Maj(a,b,c)
  412. ldr x28,[x30],#8 // *K++, x19 in next round
  413. //add x23,x23,x17 // h+=Sigma0(a)
  414. #ifndef __AARCH64EB__
  415. rev x0,x0 // 13
  416. #endif
  417. ldp x1,x2,[x1]
  418. add x23,x23,x17 // h+=Sigma0(a)
  419. str x8,[sp,#8]
  420. ror x16,x27,#14
  421. add x22,x22,x28 // h+=K[i]
  422. eor x8,x27,x27,ror#23
  423. and x17,x20,x27
  424. bic x28,x21,x27
  425. add x22,x22,x0 // h+=X[i]
  426. orr x17,x17,x28 // Ch(e,f,g)
  427. eor x28,x23,x24 // a^b, b^c in next round
  428. eor x16,x16,x8,ror#18 // Sigma1(e)
  429. ror x8,x23,#28
  430. add x22,x22,x17 // h+=Ch(e,f,g)
  431. eor x17,x23,x23,ror#5
  432. add x22,x22,x16 // h+=Sigma1(e)
  433. and x19,x19,x28 // (b^c)&=(a^b)
  434. add x26,x26,x22 // d+=h
  435. eor x19,x19,x24 // Maj(a,b,c)
  436. eor x17,x8,x17,ror#34 // Sigma0(a)
  437. add x22,x22,x19 // h+=Maj(a,b,c)
  438. ldr x19,[x30],#8 // *K++, x28 in next round
  439. //add x22,x22,x17 // h+=Sigma0(a)
  440. #ifndef __AARCH64EB__
  441. rev x1,x1 // 14
  442. #endif
  443. ldr x6,[sp,#24]
  444. add x22,x22,x17 // h+=Sigma0(a)
  445. str x9,[sp,#16]
  446. ror x16,x26,#14
  447. add x21,x21,x19 // h+=K[i]
  448. eor x9,x26,x26,ror#23
  449. and x17,x27,x26
  450. bic x19,x20,x26
  451. add x21,x21,x1 // h+=X[i]
  452. orr x17,x17,x19 // Ch(e,f,g)
  453. eor x19,x22,x23 // a^b, b^c in next round
  454. eor x16,x16,x9,ror#18 // Sigma1(e)
  455. ror x9,x22,#28
  456. add x21,x21,x17 // h+=Ch(e,f,g)
  457. eor x17,x22,x22,ror#5
  458. add x21,x21,x16 // h+=Sigma1(e)
  459. and x28,x28,x19 // (b^c)&=(a^b)
  460. add x25,x25,x21 // d+=h
  461. eor x28,x28,x23 // Maj(a,b,c)
  462. eor x17,x9,x17,ror#34 // Sigma0(a)
  463. add x21,x21,x28 // h+=Maj(a,b,c)
  464. ldr x28,[x30],#8 // *K++, x19 in next round
  465. //add x21,x21,x17 // h+=Sigma0(a)
  466. #ifndef __AARCH64EB__
  467. rev x2,x2 // 15
  468. #endif
  469. ldr x7,[sp,#0]
  470. add x21,x21,x17 // h+=Sigma0(a)
  471. str x10,[sp,#24]
  472. ror x16,x25,#14
  473. add x20,x20,x28 // h+=K[i]
  474. ror x9,x4,#1
  475. and x17,x26,x25
  476. ror x8,x1,#19
  477. bic x28,x27,x25
  478. ror x10,x21,#28
  479. add x20,x20,x2 // h+=X[i]
  480. eor x16,x16,x25,ror#18
  481. eor x9,x9,x4,ror#8
  482. orr x17,x17,x28 // Ch(e,f,g)
  483. eor x28,x21,x22 // a^b, b^c in next round
  484. eor x16,x16,x25,ror#41 // Sigma1(e)
  485. eor x10,x10,x21,ror#34
  486. add x20,x20,x17 // h+=Ch(e,f,g)
  487. and x19,x19,x28 // (b^c)&=(a^b)
  488. eor x8,x8,x1,ror#61
  489. eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
  490. add x20,x20,x16 // h+=Sigma1(e)
  491. eor x19,x19,x22 // Maj(a,b,c)
  492. eor x17,x10,x21,ror#39 // Sigma0(a)
  493. eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
  494. add x3,x3,x12
  495. add x24,x24,x20 // d+=h
  496. add x20,x20,x19 // h+=Maj(a,b,c)
  497. ldr x19,[x30],#8 // *K++, x28 in next round
  498. add x3,x3,x9
  499. add x20,x20,x17 // h+=Sigma0(a)
  500. add x3,x3,x8
  501. .Loop_16_xx:
  502. ldr x8,[sp,#8]
  503. str x11,[sp,#0]
  504. ror x16,x24,#14
  505. add x27,x27,x19 // h+=K[i]
  506. ror x10,x5,#1
  507. and x17,x25,x24
  508. ror x9,x2,#19
  509. bic x19,x26,x24
  510. ror x11,x20,#28
  511. add x27,x27,x3 // h+=X[i]
  512. eor x16,x16,x24,ror#18
  513. eor x10,x10,x5,ror#8
  514. orr x17,x17,x19 // Ch(e,f,g)
  515. eor x19,x20,x21 // a^b, b^c in next round
  516. eor x16,x16,x24,ror#41 // Sigma1(e)
  517. eor x11,x11,x20,ror#34
  518. add x27,x27,x17 // h+=Ch(e,f,g)
  519. and x28,x28,x19 // (b^c)&=(a^b)
  520. eor x9,x9,x2,ror#61
  521. eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
  522. add x27,x27,x16 // h+=Sigma1(e)
  523. eor x28,x28,x21 // Maj(a,b,c)
  524. eor x17,x11,x20,ror#39 // Sigma0(a)
  525. eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
  526. add x4,x4,x13
  527. add x23,x23,x27 // d+=h
  528. add x27,x27,x28 // h+=Maj(a,b,c)
  529. ldr x28,[x30],#8 // *K++, x19 in next round
  530. add x4,x4,x10
  531. add x27,x27,x17 // h+=Sigma0(a)
  532. add x4,x4,x9
  533. ldr x9,[sp,#16]
  534. str x12,[sp,#8]
  535. ror x16,x23,#14
  536. add x26,x26,x28 // h+=K[i]
  537. ror x11,x6,#1
  538. and x17,x24,x23
  539. ror x10,x3,#19
  540. bic x28,x25,x23
  541. ror x12,x27,#28
  542. add x26,x26,x4 // h+=X[i]
  543. eor x16,x16,x23,ror#18
  544. eor x11,x11,x6,ror#8
  545. orr x17,x17,x28 // Ch(e,f,g)
  546. eor x28,x27,x20 // a^b, b^c in next round
  547. eor x16,x16,x23,ror#41 // Sigma1(e)
  548. eor x12,x12,x27,ror#34
  549. add x26,x26,x17 // h+=Ch(e,f,g)
  550. and x19,x19,x28 // (b^c)&=(a^b)
  551. eor x10,x10,x3,ror#61
  552. eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
  553. add x26,x26,x16 // h+=Sigma1(e)
  554. eor x19,x19,x20 // Maj(a,b,c)
  555. eor x17,x12,x27,ror#39 // Sigma0(a)
  556. eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
  557. add x5,x5,x14
  558. add x22,x22,x26 // d+=h
  559. add x26,x26,x19 // h+=Maj(a,b,c)
  560. ldr x19,[x30],#8 // *K++, x28 in next round
  561. add x5,x5,x11
  562. add x26,x26,x17 // h+=Sigma0(a)
  563. add x5,x5,x10
  564. ldr x10,[sp,#24]
  565. str x13,[sp,#16]
  566. ror x16,x22,#14
  567. add x25,x25,x19 // h+=K[i]
  568. ror x12,x7,#1
  569. and x17,x23,x22
  570. ror x11,x4,#19
  571. bic x19,x24,x22
  572. ror x13,x26,#28
  573. add x25,x25,x5 // h+=X[i]
  574. eor x16,x16,x22,ror#18
  575. eor x12,x12,x7,ror#8
  576. orr x17,x17,x19 // Ch(e,f,g)
  577. eor x19,x26,x27 // a^b, b^c in next round
  578. eor x16,x16,x22,ror#41 // Sigma1(e)
  579. eor x13,x13,x26,ror#34
  580. add x25,x25,x17 // h+=Ch(e,f,g)
  581. and x28,x28,x19 // (b^c)&=(a^b)
  582. eor x11,x11,x4,ror#61
  583. eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
  584. add x25,x25,x16 // h+=Sigma1(e)
  585. eor x28,x28,x27 // Maj(a,b,c)
  586. eor x17,x13,x26,ror#39 // Sigma0(a)
  587. eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
  588. add x6,x6,x15
  589. add x21,x21,x25 // d+=h
  590. add x25,x25,x28 // h+=Maj(a,b,c)
  591. ldr x28,[x30],#8 // *K++, x19 in next round
  592. add x6,x6,x12
  593. add x25,x25,x17 // h+=Sigma0(a)
  594. add x6,x6,x11
  595. ldr x11,[sp,#0]
  596. str x14,[sp,#24]
  597. ror x16,x21,#14
  598. add x24,x24,x28 // h+=K[i]
  599. ror x13,x8,#1
  600. and x17,x22,x21
  601. ror x12,x5,#19
  602. bic x28,x23,x21
  603. ror x14,x25,#28
  604. add x24,x24,x6 // h+=X[i]
  605. eor x16,x16,x21,ror#18
  606. eor x13,x13,x8,ror#8
  607. orr x17,x17,x28 // Ch(e,f,g)
  608. eor x28,x25,x26 // a^b, b^c in next round
  609. eor x16,x16,x21,ror#41 // Sigma1(e)
  610. eor x14,x14,x25,ror#34
  611. add x24,x24,x17 // h+=Ch(e,f,g)
  612. and x19,x19,x28 // (b^c)&=(a^b)
  613. eor x12,x12,x5,ror#61
  614. eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
  615. add x24,x24,x16 // h+=Sigma1(e)
  616. eor x19,x19,x26 // Maj(a,b,c)
  617. eor x17,x14,x25,ror#39 // Sigma0(a)
  618. eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
  619. add x7,x7,x0
  620. add x20,x20,x24 // d+=h
  621. add x24,x24,x19 // h+=Maj(a,b,c)
  622. ldr x19,[x30],#8 // *K++, x28 in next round
  623. add x7,x7,x13
  624. add x24,x24,x17 // h+=Sigma0(a)
  625. add x7,x7,x12
  626. ldr x12,[sp,#8]
  627. str x15,[sp,#0]
  628. ror x16,x20,#14
  629. add x23,x23,x19 // h+=K[i]
  630. ror x14,x9,#1
  631. and x17,x21,x20
  632. ror x13,x6,#19
  633. bic x19,x22,x20
  634. ror x15,x24,#28
  635. add x23,x23,x7 // h+=X[i]
  636. eor x16,x16,x20,ror#18
  637. eor x14,x14,x9,ror#8
  638. orr x17,x17,x19 // Ch(e,f,g)
  639. eor x19,x24,x25 // a^b, b^c in next round
  640. eor x16,x16,x20,ror#41 // Sigma1(e)
  641. eor x15,x15,x24,ror#34
  642. add x23,x23,x17 // h+=Ch(e,f,g)
  643. and x28,x28,x19 // (b^c)&=(a^b)
  644. eor x13,x13,x6,ror#61
  645. eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
  646. add x23,x23,x16 // h+=Sigma1(e)
  647. eor x28,x28,x25 // Maj(a,b,c)
  648. eor x17,x15,x24,ror#39 // Sigma0(a)
  649. eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
  650. add x8,x8,x1
  651. add x27,x27,x23 // d+=h
  652. add x23,x23,x28 // h+=Maj(a,b,c)
  653. ldr x28,[x30],#8 // *K++, x19 in next round
  654. add x8,x8,x14
  655. add x23,x23,x17 // h+=Sigma0(a)
  656. add x8,x8,x13
  657. ldr x13,[sp,#16]
  658. str x0,[sp,#8]
  659. ror x16,x27,#14
  660. add x22,x22,x28 // h+=K[i]
  661. ror x15,x10,#1
  662. and x17,x20,x27
  663. ror x14,x7,#19
  664. bic x28,x21,x27
  665. ror x0,x23,#28
  666. add x22,x22,x8 // h+=X[i]
  667. eor x16,x16,x27,ror#18
  668. eor x15,x15,x10,ror#8
  669. orr x17,x17,x28 // Ch(e,f,g)
  670. eor x28,x23,x24 // a^b, b^c in next round
  671. eor x16,x16,x27,ror#41 // Sigma1(e)
  672. eor x0,x0,x23,ror#34
  673. add x22,x22,x17 // h+=Ch(e,f,g)
  674. and x19,x19,x28 // (b^c)&=(a^b)
  675. eor x14,x14,x7,ror#61
  676. eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
  677. add x22,x22,x16 // h+=Sigma1(e)
  678. eor x19,x19,x24 // Maj(a,b,c)
  679. eor x17,x0,x23,ror#39 // Sigma0(a)
  680. eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
  681. add x9,x9,x2
  682. add x26,x26,x22 // d+=h
  683. add x22,x22,x19 // h+=Maj(a,b,c)
  684. ldr x19,[x30],#8 // *K++, x28 in next round
  685. add x9,x9,x15
  686. add x22,x22,x17 // h+=Sigma0(a)
  687. add x9,x9,x14
  688. ldr x14,[sp,#24]
  689. str x1,[sp,#16]
  690. ror x16,x26,#14
  691. add x21,x21,x19 // h+=K[i]
  692. ror x0,x11,#1
  693. and x17,x27,x26
  694. ror x15,x8,#19
  695. bic x19,x20,x26
  696. ror x1,x22,#28
  697. add x21,x21,x9 // h+=X[i]
  698. eor x16,x16,x26,ror#18
  699. eor x0,x0,x11,ror#8
  700. orr x17,x17,x19 // Ch(e,f,g)
  701. eor x19,x22,x23 // a^b, b^c in next round
  702. eor x16,x16,x26,ror#41 // Sigma1(e)
  703. eor x1,x1,x22,ror#34
  704. add x21,x21,x17 // h+=Ch(e,f,g)
  705. and x28,x28,x19 // (b^c)&=(a^b)
  706. eor x15,x15,x8,ror#61
  707. eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
  708. add x21,x21,x16 // h+=Sigma1(e)
  709. eor x28,x28,x23 // Maj(a,b,c)
  710. eor x17,x1,x22,ror#39 // Sigma0(a)
  711. eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
  712. add x10,x10,x3
  713. add x25,x25,x21 // d+=h
  714. add x21,x21,x28 // h+=Maj(a,b,c)
  715. ldr x28,[x30],#8 // *K++, x19 in next round
  716. add x10,x10,x0
  717. add x21,x21,x17 // h+=Sigma0(a)
  718. add x10,x10,x15
  719. ldr x15,[sp,#0]
  720. str x2,[sp,#24]
  721. ror x16,x25,#14
  722. add x20,x20,x28 // h+=K[i]
  723. ror x1,x12,#1
  724. and x17,x26,x25
  725. ror x0,x9,#19
  726. bic x28,x27,x25
  727. ror x2,x21,#28
  728. add x20,x20,x10 // h+=X[i]
  729. eor x16,x16,x25,ror#18
  730. eor x1,x1,x12,ror#8
  731. orr x17,x17,x28 // Ch(e,f,g)
  732. eor x28,x21,x22 // a^b, b^c in next round
  733. eor x16,x16,x25,ror#41 // Sigma1(e)
  734. eor x2,x2,x21,ror#34
  735. add x20,x20,x17 // h+=Ch(e,f,g)
  736. and x19,x19,x28 // (b^c)&=(a^b)
  737. eor x0,x0,x9,ror#61
  738. eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
  739. add x20,x20,x16 // h+=Sigma1(e)
  740. eor x19,x19,x22 // Maj(a,b,c)
  741. eor x17,x2,x21,ror#39 // Sigma0(a)
  742. eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
  743. add x11,x11,x4
  744. add x24,x24,x20 // d+=h
  745. add x20,x20,x19 // h+=Maj(a,b,c)
  746. ldr x19,[x30],#8 // *K++, x28 in next round
  747. add x11,x11,x1
  748. add x20,x20,x17 // h+=Sigma0(a)
  749. add x11,x11,x0
  750. ldr x0,[sp,#8]
  751. str x3,[sp,#0]
  752. ror x16,x24,#14
  753. add x27,x27,x19 // h+=K[i]
  754. ror x2,x13,#1
  755. and x17,x25,x24
  756. ror x1,x10,#19
  757. bic x19,x26,x24
  758. ror x3,x20,#28
  759. add x27,x27,x11 // h+=X[i]
  760. eor x16,x16,x24,ror#18
  761. eor x2,x2,x13,ror#8
  762. orr x17,x17,x19 // Ch(e,f,g)
  763. eor x19,x20,x21 // a^b, b^c in next round
  764. eor x16,x16,x24,ror#41 // Sigma1(e)
  765. eor x3,x3,x20,ror#34
  766. add x27,x27,x17 // h+=Ch(e,f,g)
  767. and x28,x28,x19 // (b^c)&=(a^b)
  768. eor x1,x1,x10,ror#61
  769. eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
  770. add x27,x27,x16 // h+=Sigma1(e)
  771. eor x28,x28,x21 // Maj(a,b,c)
  772. eor x17,x3,x20,ror#39 // Sigma0(a)
  773. eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
  774. add x12,x12,x5
  775. add x23,x23,x27 // d+=h
  776. add x27,x27,x28 // h+=Maj(a,b,c)
  777. ldr x28,[x30],#8 // *K++, x19 in next round
  778. add x12,x12,x2
  779. add x27,x27,x17 // h+=Sigma0(a)
  780. add x12,x12,x1
  781. ldr x1,[sp,#16]
  782. str x4,[sp,#8]
  783. ror x16,x23,#14
  784. add x26,x26,x28 // h+=K[i]
  785. ror x3,x14,#1
  786. and x17,x24,x23
  787. ror x2,x11,#19
  788. bic x28,x25,x23
  789. ror x4,x27,#28
  790. add x26,x26,x12 // h+=X[i]
  791. eor x16,x16,x23,ror#18
  792. eor x3,x3,x14,ror#8
  793. orr x17,x17,x28 // Ch(e,f,g)
  794. eor x28,x27,x20 // a^b, b^c in next round
  795. eor x16,x16,x23,ror#41 // Sigma1(e)
  796. eor x4,x4,x27,ror#34
  797. add x26,x26,x17 // h+=Ch(e,f,g)
  798. and x19,x19,x28 // (b^c)&=(a^b)
  799. eor x2,x2,x11,ror#61
  800. eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
  801. add x26,x26,x16 // h+=Sigma1(e)
  802. eor x19,x19,x20 // Maj(a,b,c)
  803. eor x17,x4,x27,ror#39 // Sigma0(a)
  804. eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
  805. add x13,x13,x6
  806. add x22,x22,x26 // d+=h
  807. add x26,x26,x19 // h+=Maj(a,b,c)
  808. ldr x19,[x30],#8 // *K++, x28 in next round
  809. add x13,x13,x3
  810. add x26,x26,x17 // h+=Sigma0(a)
  811. add x13,x13,x2
  812. ldr x2,[sp,#24]
  813. str x5,[sp,#16]
  814. ror x16,x22,#14
  815. add x25,x25,x19 // h+=K[i]
  816. ror x4,x15,#1
  817. and x17,x23,x22
  818. ror x3,x12,#19
  819. bic x19,x24,x22
  820. ror x5,x26,#28
  821. add x25,x25,x13 // h+=X[i]
  822. eor x16,x16,x22,ror#18
  823. eor x4,x4,x15,ror#8
  824. orr x17,x17,x19 // Ch(e,f,g)
  825. eor x19,x26,x27 // a^b, b^c in next round
  826. eor x16,x16,x22,ror#41 // Sigma1(e)
  827. eor x5,x5,x26,ror#34
  828. add x25,x25,x17 // h+=Ch(e,f,g)
  829. and x28,x28,x19 // (b^c)&=(a^b)
  830. eor x3,x3,x12,ror#61
  831. eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
  832. add x25,x25,x16 // h+=Sigma1(e)
  833. eor x28,x28,x27 // Maj(a,b,c)
  834. eor x17,x5,x26,ror#39 // Sigma0(a)
  835. eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
  836. add x14,x14,x7
  837. add x21,x21,x25 // d+=h
  838. add x25,x25,x28 // h+=Maj(a,b,c)
  839. ldr x28,[x30],#8 // *K++, x19 in next round
  840. add x14,x14,x4
  841. add x25,x25,x17 // h+=Sigma0(a)
  842. add x14,x14,x3
  843. ldr x3,[sp,#0]
  844. str x6,[sp,#24]
  845. ror x16,x21,#14
  846. add x24,x24,x28 // h+=K[i]
  847. ror x5,x0,#1
  848. and x17,x22,x21
  849. ror x4,x13,#19
  850. bic x28,x23,x21
  851. ror x6,x25,#28
  852. add x24,x24,x14 // h+=X[i]
  853. eor x16,x16,x21,ror#18
  854. eor x5,x5,x0,ror#8
  855. orr x17,x17,x28 // Ch(e,f,g)
  856. eor x28,x25,x26 // a^b, b^c in next round
  857. eor x16,x16,x21,ror#41 // Sigma1(e)
  858. eor x6,x6,x25,ror#34
  859. add x24,x24,x17 // h+=Ch(e,f,g)
  860. and x19,x19,x28 // (b^c)&=(a^b)
  861. eor x4,x4,x13,ror#61
  862. eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
  863. add x24,x24,x16 // h+=Sigma1(e)
  864. eor x19,x19,x26 // Maj(a,b,c)
  865. eor x17,x6,x25,ror#39 // Sigma0(a)
  866. eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
  867. add x15,x15,x8
  868. add x20,x20,x24 // d+=h
  869. add x24,x24,x19 // h+=Maj(a,b,c)
  870. ldr x19,[x30],#8 // *K++, x28 in next round
  871. add x15,x15,x5
  872. add x24,x24,x17 // h+=Sigma0(a)
  873. add x15,x15,x4
  874. ldr x4,[sp,#8]
  875. str x7,[sp,#0]
  876. ror x16,x20,#14
  877. add x23,x23,x19 // h+=K[i]
  878. ror x6,x1,#1
  879. and x17,x21,x20
  880. ror x5,x14,#19
  881. bic x19,x22,x20
  882. ror x7,x24,#28
  883. add x23,x23,x15 // h+=X[i]
  884. eor x16,x16,x20,ror#18
  885. eor x6,x6,x1,ror#8
  886. orr x17,x17,x19 // Ch(e,f,g)
  887. eor x19,x24,x25 // a^b, b^c in next round
  888. eor x16,x16,x20,ror#41 // Sigma1(e)
  889. eor x7,x7,x24,ror#34
  890. add x23,x23,x17 // h+=Ch(e,f,g)
  891. and x28,x28,x19 // (b^c)&=(a^b)
  892. eor x5,x5,x14,ror#61
  893. eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
  894. add x23,x23,x16 // h+=Sigma1(e)
  895. eor x28,x28,x25 // Maj(a,b,c)
  896. eor x17,x7,x24,ror#39 // Sigma0(a)
  897. eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
  898. add x0,x0,x9
  899. add x27,x27,x23 // d+=h
  900. add x23,x23,x28 // h+=Maj(a,b,c)
  901. ldr x28,[x30],#8 // *K++, x19 in next round
  902. add x0,x0,x6
  903. add x23,x23,x17 // h+=Sigma0(a)
  904. add x0,x0,x5
  905. ldr x5,[sp,#16]
  906. str x8,[sp,#8]
  907. ror x16,x27,#14
  908. add x22,x22,x28 // h+=K[i]
  909. ror x7,x2,#1
  910. and x17,x20,x27
  911. ror x6,x15,#19
  912. bic x28,x21,x27
  913. ror x8,x23,#28
  914. add x22,x22,x0 // h+=X[i]
  915. eor x16,x16,x27,ror#18
  916. eor x7,x7,x2,ror#8
  917. orr x17,x17,x28 // Ch(e,f,g)
  918. eor x28,x23,x24 // a^b, b^c in next round
  919. eor x16,x16,x27,ror#41 // Sigma1(e)
  920. eor x8,x8,x23,ror#34
  921. add x22,x22,x17 // h+=Ch(e,f,g)
  922. and x19,x19,x28 // (b^c)&=(a^b)
  923. eor x6,x6,x15,ror#61
  924. eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
  925. add x22,x22,x16 // h+=Sigma1(e)
  926. eor x19,x19,x24 // Maj(a,b,c)
  927. eor x17,x8,x23,ror#39 // Sigma0(a)
  928. eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
  929. add x1,x1,x10
  930. add x26,x26,x22 // d+=h
  931. add x22,x22,x19 // h+=Maj(a,b,c)
  932. ldr x19,[x30],#8 // *K++, x28 in next round
  933. add x1,x1,x7
  934. add x22,x22,x17 // h+=Sigma0(a)
  935. add x1,x1,x6
  936. ldr x6,[sp,#24]
  937. str x9,[sp,#16]
  938. ror x16,x26,#14
  939. add x21,x21,x19 // h+=K[i]
  940. ror x8,x3,#1
  941. and x17,x27,x26
  942. ror x7,x0,#19
  943. bic x19,x20,x26
  944. ror x9,x22,#28
  945. add x21,x21,x1 // h+=X[i]
  946. eor x16,x16,x26,ror#18
  947. eor x8,x8,x3,ror#8
  948. orr x17,x17,x19 // Ch(e,f,g)
  949. eor x19,x22,x23 // a^b, b^c in next round
  950. eor x16,x16,x26,ror#41 // Sigma1(e)
  951. eor x9,x9,x22,ror#34
  952. add x21,x21,x17 // h+=Ch(e,f,g)
  953. and x28,x28,x19 // (b^c)&=(a^b)
  954. eor x7,x7,x0,ror#61
  955. eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
  956. add x21,x21,x16 // h+=Sigma1(e)
  957. eor x28,x28,x23 // Maj(a,b,c)
  958. eor x17,x9,x22,ror#39 // Sigma0(a)
  959. eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
  960. add x2,x2,x11
  961. add x25,x25,x21 // d+=h
  962. add x21,x21,x28 // h+=Maj(a,b,c)
  963. ldr x28,[x30],#8 // *K++, x19 in next round
  964. add x2,x2,x8
  965. add x21,x21,x17 // h+=Sigma0(a)
  966. add x2,x2,x7
  967. ldr x7,[sp,#0]
  968. str x10,[sp,#24]
  969. ror x16,x25,#14
  970. add x20,x20,x28 // h+=K[i]
  971. ror x9,x4,#1
  972. and x17,x26,x25
  973. ror x8,x1,#19
  974. bic x28,x27,x25
  975. ror x10,x21,#28
  976. add x20,x20,x2 // h+=X[i]
  977. eor x16,x16,x25,ror#18
  978. eor x9,x9,x4,ror#8
  979. orr x17,x17,x28 // Ch(e,f,g)
  980. eor x28,x21,x22 // a^b, b^c in next round
  981. eor x16,x16,x25,ror#41 // Sigma1(e)
  982. eor x10,x10,x21,ror#34
  983. add x20,x20,x17 // h+=Ch(e,f,g)
  984. and x19,x19,x28 // (b^c)&=(a^b)
  985. eor x8,x8,x1,ror#61
  986. eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
  987. add x20,x20,x16 // h+=Sigma1(e)
  988. eor x19,x19,x22 // Maj(a,b,c)
  989. eor x17,x10,x21,ror#39 // Sigma0(a)
  990. eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
  991. add x3,x3,x12
  992. add x24,x24,x20 // d+=h
  993. add x20,x20,x19 // h+=Maj(a,b,c)
  994. ldr x19,[x30],#8 // *K++, x28 in next round
  995. add x3,x3,x9
  996. add x20,x20,x17 // h+=Sigma0(a)
  997. add x3,x3,x8
  998. cbnz x19,.Loop_16_xx
  999. ldp x0,x2,[x29,#96]
  1000. ldr x1,[x29,#112]
  1001. sub x30,x30,#648 // rewind
  1002. ldp x3,x4,[x0]
  1003. ldp x5,x6,[x0,#2*8]
  1004. add x1,x1,#14*8 // advance input pointer
  1005. ldp x7,x8,[x0,#4*8]
  1006. add x20,x20,x3
  1007. ldp x9,x10,[x0,#6*8]
  1008. add x21,x21,x4
  1009. add x22,x22,x5
  1010. add x23,x23,x6
  1011. stp x20,x21,[x0]
  1012. add x24,x24,x7
  1013. add x25,x25,x8
  1014. stp x22,x23,[x0,#2*8]
  1015. add x26,x26,x9
  1016. add x27,x27,x10
  1017. cmp x1,x2
  1018. stp x24,x25,[x0,#4*8]
  1019. stp x26,x27,[x0,#6*8]
  1020. b.ne .Loop
  1021. ldp x19,x20,[x29,#16]
  1022. add sp,sp,#4*8
  1023. ldp x21,x22,[x29,#32]
  1024. ldp x23,x24,[x29,#48]
  1025. ldp x25,x26,[x29,#64]
  1026. ldp x27,x28,[x29,#80]
  1027. ldp x29,x30,[sp],#128
  1028. .inst 0xd50323bf // autiasp
  1029. ret
  1030. .size sha512_block_data_order,.-sha512_block_data_order
  // .LK512 -- table of the 80 64-bit SHA-512 round constants K[0..79],
  // two per line, followed by one zero quadword. The table is aligned to a
  // 64-byte boundary (.align 6). Code in this file walks it with
  // post-incremented loads and rewinds by a fixed byte count (648 = 81*8 in
  // the scalar routine, 80*8 in sha512_block_armv8), so the number and order
  // of the entries must not change.
  1031. .align 6
  1032. .type .LK512,%object
  1033. .LK512:
  1034. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  1035. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  1036. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  1037. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  1038. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  1039. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  1040. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  1041. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  1042. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  1043. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  1044. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  1045. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  1046. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  1047. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  1048. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  1049. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  1050. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  1051. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  1052. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  1053. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  1054. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  1055. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  1056. .quad 0xd192e819d6ef5218,0xd69906245565a910
  1057. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  1058. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  1059. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  1060. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  1061. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  1062. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  1063. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  1064. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  1065. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  1066. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  1067. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  1068. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  1069. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  1070. .quad 0x28db77f523047d84,0x32caab7b40c72493
  1071. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  1072. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  1073. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  1074. .quad 0 // terminator: the scalar round loop ends on a zero K (cbnz)
  1075. .size .LK512,.-.LK512
  // .LOPENSSL_armcap_P -- distance from this location to the external symbol
  // OPENSSL_armcap_P, stored as data so the symbol's address can be formed
  // PC-relatively (cell address + stored value). The cell is 8-byte aligned
  // and omitted entirely when __KERNEL__ is defined.
  // NOTE(review): presumably read by the capability dispatch at the top of
  // sha512_block_data_order, which lies outside this excerpt -- confirm.
  1076. #ifndef __KERNEL__
  1077. .align 3
  1078. .LOPENSSL_armcap_P:
  1079. # ifdef __ILP32__
  1080. .long OPENSSL_armcap_P-. // 32-bit offset for the ILP32 ABI
  1081. # else
  1082. .quad OPENSSL_armcap_P-. // 64-bit offset otherwise
  1083. # endif
  1084. #endif
  // NUL-terminated identification string, spelled as raw ASCII codes:
  // "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  1085. .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  1086. .align 2 // back to 4-byte alignment after the odd-length string
  1087. .align 2 // repeated directive; no further effect
  // ---------------------------------------------------------------------
  // sha512_block_armv8 -- SHA-512 block function built on the ARMv8 SHA512
  // instructions (sha512h, sha512h2, sha512su0, sha512su1). They are emitted
  // as raw .inst words; the trailing comment on each gives the mnemonic.
  // Only assembled when __KERNEL__ is not defined.
  //
  // In:    x0 = hash state, 64 bytes; loaded on entry, stored back on exit
  //        x1 = input data, consumed 128 bytes per block
  //        x2 = number of 128-byte blocks
  // Regs:  x3        = cursor into .LK512 (rewound by 80*8 every block)
  //        x4        = address of the block currently held in v16-v23
  //        v0-v3     = working hash state, v4 = fifth register of the rotation
  //        v5-v7     = per-step temporaries built with ext
  //        v16-v23   = 16-word message schedule (byte-swapped input)
  //        v24, v25  = alternating "K + W" operands
  //        v26-v29   = copy of the state at the start of the block
  // Clobb: x1-x4, v0-v7, v16-v29, condition flags
  // Stack: 16 bytes (x29/x30 pair); only x29 is reloaded on exit because
  //        x30 is never written inside this function.
  //
  // NOTE(review): the loop is bottom-tested and the first block is loaded
  // unconditionally, so x2 == 0 is not handled here -- callers presumably
  // guarantee at least one block; confirm.
  // ---------------------------------------------------------------------
  1088. #ifndef __KERNEL__
  1089. .type sha512_block_armv8,%function
  1090. .align 6
  1091. sha512_block_armv8:
  1092. .Lv8_entry:
  1093. stp x29,x30,[sp,#-16]! // push frame record
  1094. add x29,sp,#0 // x29 = sp (frame pointer)
  1095. ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
  1096. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  1097. ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
  1098. adr x3,.LK512 // x3 = start of the K512 table
  1099. rev64 v16.16b,v16.16b // reverse the bytes of every 64-bit message word
  1100. rev64 v17.16b,v17.16b
  1101. rev64 v18.16b,v18.16b
  1102. rev64 v19.16b,v19.16b
  1103. rev64 v20.16b,v20.16b
  1104. rev64 v21.16b,v21.16b
  1105. rev64 v22.16b,v22.16b
  1106. rev64 v23.16b,v23.16b
  1107. b .Loop_hw // jump over the alignment padding below
  1108. .align 4
  // One iteration of .Loop_hw processes one 128-byte block in 40 steps.
  // Every step consumes two K constants (16 table bytes), so a full block
  // advances x3 by 640 = 80*8 bytes, matching the rewind further down.
  1109. .Loop_hw:
  1110. ld1 {v24.2d},[x3],#16 // K pair for step 1
  1111. subs x2,x2,#1 // one block fewer; flags are consumed by the csel below
  1112. sub x4,x1,#128 // x4 = address of the block now in v16-v23
  1113. orr v26.16b,v0.16b,v0.16b // offload
  1114. orr v27.16b,v1.16b,v1.16b
  1115. orr v28.16b,v2.16b,v2.16b
  1116. orr v29.16b,v3.16b,v3.16b
  1117. csel x1,x1,x4,ne // conditional rewind: on the last block x1 = x4, so the "load next input" loads re-read the current block instead of running past it
  // Steps 1-32: besides the state update, each step refreshes one
  // message-schedule register with sha512su0/sha512su1. The state registers
  // rotate through v0-v4 with a period of five steps, the message registers
  // through v16-v23 with a period of eight, and v24/v25 alternate.
  1118. add v24.2d,v24.2d,v16.2d // K + W for this step
  1119. ld1 {v25.2d},[x3],#16 // prefetch the K pair for the next step
  1120. ext v24.16b,v24.16b,v24.16b,#8 // swap the two 64-bit halves
  1121. ext v5.16b,v2.16b,v3.16b,#8 // v5 = { v2.d[1], v3.d[0] }
  1122. ext v6.16b,v1.16b,v2.16b,#8 // v6 = { v1.d[1], v2.d[0] }
  1123. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1124. .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
  1125. ext v7.16b,v20.16b,v21.16b,#8 // v7 = { v20.d[1], v21.d[0] }, operand for sha512su1
  1126. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1127. .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1128. add v4.2d,v1.2d,v3.2d // "D + T1"
  1129. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1130. add v25.2d,v25.2d,v17.2d
  1131. ld1 {v24.2d},[x3],#16
  1132. ext v25.16b,v25.16b,v25.16b,#8
  1133. ext v5.16b,v4.16b,v2.16b,#8
  1134. ext v6.16b,v0.16b,v4.16b,#8
  1135. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1136. .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
  1137. ext v7.16b,v21.16b,v22.16b,#8
  1138. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1139. .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1140. add v1.2d,v0.2d,v2.2d // "D + T1"
  1141. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1142. add v24.2d,v24.2d,v18.2d
  1143. ld1 {v25.2d},[x3],#16
  1144. ext v24.16b,v24.16b,v24.16b,#8
  1145. ext v5.16b,v1.16b,v4.16b,#8
  1146. ext v6.16b,v3.16b,v1.16b,#8
  1147. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1148. .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
  1149. ext v7.16b,v22.16b,v23.16b,#8
  1150. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1151. .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1152. add v0.2d,v3.2d,v4.2d // "D + T1"
  1153. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1154. add v25.2d,v25.2d,v19.2d
  1155. ld1 {v24.2d},[x3],#16
  1156. ext v25.16b,v25.16b,v25.16b,#8
  1157. ext v5.16b,v0.16b,v1.16b,#8
  1158. ext v6.16b,v2.16b,v0.16b,#8
  1159. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1160. .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
  1161. ext v7.16b,v23.16b,v16.16b,#8
  1162. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1163. .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1164. add v3.2d,v2.2d,v1.2d // "D + T1"
  1165. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1166. add v24.2d,v24.2d,v20.2d
  1167. ld1 {v25.2d},[x3],#16
  1168. ext v24.16b,v24.16b,v24.16b,#8
  1169. ext v5.16b,v3.16b,v0.16b,#8
  1170. ext v6.16b,v4.16b,v3.16b,#8
  1171. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1172. .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1173. ext v7.16b,v16.16b,v17.16b,#8
  1174. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1175. .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1176. add v2.2d,v4.2d,v0.2d // "D + T1"
  1177. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1178. add v25.2d,v25.2d,v21.2d
  1179. ld1 {v24.2d},[x3],#16
  1180. ext v25.16b,v25.16b,v25.16b,#8
  1181. ext v5.16b,v2.16b,v3.16b,#8
  1182. ext v6.16b,v1.16b,v2.16b,#8
  1183. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1184. .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1185. ext v7.16b,v17.16b,v18.16b,#8
  1186. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1187. .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1188. add v4.2d,v1.2d,v3.2d // "D + T1"
  1189. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1190. add v24.2d,v24.2d,v22.2d
  1191. ld1 {v25.2d},[x3],#16
  1192. ext v24.16b,v24.16b,v24.16b,#8
  1193. ext v5.16b,v4.16b,v2.16b,#8
  1194. ext v6.16b,v0.16b,v4.16b,#8
  1195. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1196. .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1197. ext v7.16b,v18.16b,v19.16b,#8
  1198. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1199. .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1200. add v1.2d,v0.2d,v2.2d // "D + T1"
  1201. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1202. add v25.2d,v25.2d,v23.2d
  1203. ld1 {v24.2d},[x3],#16
  1204. ext v25.16b,v25.16b,v25.16b,#8
  1205. ext v5.16b,v1.16b,v4.16b,#8
  1206. ext v6.16b,v3.16b,v1.16b,#8
  1207. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1208. .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
  1209. ext v7.16b,v19.16b,v20.16b,#8
  1210. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1211. .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1212. add v0.2d,v3.2d,v4.2d // "D + T1"
  1213. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1214. add v24.2d,v24.2d,v16.2d
  1215. ld1 {v25.2d},[x3],#16
  1216. ext v24.16b,v24.16b,v24.16b,#8
  1217. ext v5.16b,v0.16b,v1.16b,#8
  1218. ext v6.16b,v2.16b,v0.16b,#8
  1219. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1220. .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
  1221. ext v7.16b,v20.16b,v21.16b,#8
  1222. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1223. .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1224. add v3.2d,v2.2d,v1.2d // "D + T1"
  1225. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1226. add v25.2d,v25.2d,v17.2d
  1227. ld1 {v24.2d},[x3],#16
  1228. ext v25.16b,v25.16b,v25.16b,#8
  1229. ext v5.16b,v3.16b,v0.16b,#8
  1230. ext v6.16b,v4.16b,v3.16b,#8
  1231. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1232. .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
  1233. ext v7.16b,v21.16b,v22.16b,#8
  1234. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1235. .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1236. add v2.2d,v4.2d,v0.2d // "D + T1"
  1237. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1238. add v24.2d,v24.2d,v18.2d
  1239. ld1 {v25.2d},[x3],#16
  1240. ext v24.16b,v24.16b,v24.16b,#8
  1241. ext v5.16b,v2.16b,v3.16b,#8
  1242. ext v6.16b,v1.16b,v2.16b,#8
  1243. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1244. .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
  1245. ext v7.16b,v22.16b,v23.16b,#8
  1246. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1247. .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1248. add v4.2d,v1.2d,v3.2d // "D + T1"
  1249. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1250. add v25.2d,v25.2d,v19.2d
  1251. ld1 {v24.2d},[x3],#16
  1252. ext v25.16b,v25.16b,v25.16b,#8
  1253. ext v5.16b,v4.16b,v2.16b,#8
  1254. ext v6.16b,v0.16b,v4.16b,#8
  1255. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1256. .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
  1257. ext v7.16b,v23.16b,v16.16b,#8
  1258. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1259. .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1260. add v1.2d,v0.2d,v2.2d // "D + T1"
  1261. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1262. add v24.2d,v24.2d,v20.2d
  1263. ld1 {v25.2d},[x3],#16
  1264. ext v24.16b,v24.16b,v24.16b,#8
  1265. ext v5.16b,v1.16b,v4.16b,#8
  1266. ext v6.16b,v3.16b,v1.16b,#8
  1267. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1268. .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1269. ext v7.16b,v16.16b,v17.16b,#8
  1270. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1271. .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1272. add v0.2d,v3.2d,v4.2d // "D + T1"
  1273. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1274. add v25.2d,v25.2d,v21.2d
  1275. ld1 {v24.2d},[x3],#16
  1276. ext v25.16b,v25.16b,v25.16b,#8
  1277. ext v5.16b,v0.16b,v1.16b,#8
  1278. ext v6.16b,v2.16b,v0.16b,#8
  1279. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1280. .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1281. ext v7.16b,v17.16b,v18.16b,#8
  1282. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1283. .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1284. add v3.2d,v2.2d,v1.2d // "D + T1"
  1285. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1286. add v24.2d,v24.2d,v22.2d
  1287. ld1 {v25.2d},[x3],#16
  1288. ext v24.16b,v24.16b,v24.16b,#8
  1289. ext v5.16b,v3.16b,v0.16b,#8
  1290. ext v6.16b,v4.16b,v3.16b,#8
  1291. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1292. .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1293. ext v7.16b,v18.16b,v19.16b,#8
  1294. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1295. .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1296. add v2.2d,v4.2d,v0.2d // "D + T1"
  1297. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1298. add v25.2d,v25.2d,v23.2d
  1299. ld1 {v24.2d},[x3],#16
  1300. ext v25.16b,v25.16b,v25.16b,#8
  1301. ext v5.16b,v2.16b,v3.16b,#8
  1302. ext v6.16b,v1.16b,v2.16b,#8
  1303. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1304. .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
  1305. ext v7.16b,v19.16b,v20.16b,#8
  1306. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1307. .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1308. add v4.2d,v1.2d,v3.2d // "D + T1"
  1309. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1310. add v24.2d,v24.2d,v16.2d
  1311. ld1 {v25.2d},[x3],#16
  1312. ext v24.16b,v24.16b,v24.16b,#8
  1313. ext v5.16b,v4.16b,v2.16b,#8
  1314. ext v6.16b,v0.16b,v4.16b,#8
  1315. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1316. .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
  1317. ext v7.16b,v20.16b,v21.16b,#8
  1318. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1319. .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1320. add v1.2d,v0.2d,v2.2d // "D + T1"
  1321. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1322. add v25.2d,v25.2d,v17.2d
  1323. ld1 {v24.2d},[x3],#16
  1324. ext v25.16b,v25.16b,v25.16b,#8
  1325. ext v5.16b,v1.16b,v4.16b,#8
  1326. ext v6.16b,v3.16b,v1.16b,#8
  1327. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1328. .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
  1329. ext v7.16b,v21.16b,v22.16b,#8
  1330. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1331. .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1332. add v0.2d,v3.2d,v4.2d // "D + T1"
  1333. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1334. add v24.2d,v24.2d,v18.2d
  1335. ld1 {v25.2d},[x3],#16
  1336. ext v24.16b,v24.16b,v24.16b,#8
  1337. ext v5.16b,v0.16b,v1.16b,#8
  1338. ext v6.16b,v2.16b,v0.16b,#8
  1339. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1340. .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
  1341. ext v7.16b,v22.16b,v23.16b,#8
  1342. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1343. .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1344. add v3.2d,v2.2d,v1.2d // "D + T1"
  1345. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1346. add v25.2d,v25.2d,v19.2d
  1347. ld1 {v24.2d},[x3],#16
  1348. ext v25.16b,v25.16b,v25.16b,#8
  1349. ext v5.16b,v3.16b,v0.16b,#8
  1350. ext v6.16b,v4.16b,v3.16b,#8
  1351. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1352. .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
  1353. ext v7.16b,v23.16b,v16.16b,#8
  1354. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1355. .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1356. add v2.2d,v4.2d,v0.2d // "D + T1"
  1357. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1358. add v24.2d,v24.2d,v20.2d
  1359. ld1 {v25.2d},[x3],#16
  1360. ext v24.16b,v24.16b,v24.16b,#8
  1361. ext v5.16b,v2.16b,v3.16b,#8
  1362. ext v6.16b,v1.16b,v2.16b,#8
  1363. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1364. .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1365. ext v7.16b,v16.16b,v17.16b,#8
  1366. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1367. .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1368. add v4.2d,v1.2d,v3.2d // "D + T1"
  1369. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1370. add v25.2d,v25.2d,v21.2d
  1371. ld1 {v24.2d},[x3],#16
  1372. ext v25.16b,v25.16b,v25.16b,#8
  1373. ext v5.16b,v4.16b,v2.16b,#8
  1374. ext v6.16b,v0.16b,v4.16b,#8
  1375. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1376. .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1377. ext v7.16b,v17.16b,v18.16b,#8
  1378. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1379. .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1380. add v1.2d,v0.2d,v2.2d // "D + T1"
  1381. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1382. add v24.2d,v24.2d,v22.2d
  1383. ld1 {v25.2d},[x3],#16
  1384. ext v24.16b,v24.16b,v24.16b,#8
  1385. ext v5.16b,v1.16b,v4.16b,#8
  1386. ext v6.16b,v3.16b,v1.16b,#8
  1387. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1388. .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1389. ext v7.16b,v18.16b,v19.16b,#8
  1390. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1391. .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1392. add v0.2d,v3.2d,v4.2d // "D + T1"
  1393. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1394. add v25.2d,v25.2d,v23.2d
  1395. ld1 {v24.2d},[x3],#16
  1396. ext v25.16b,v25.16b,v25.16b,#8
  1397. ext v5.16b,v0.16b,v1.16b,#8
  1398. ext v6.16b,v2.16b,v0.16b,#8
  1399. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1400. .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
  1401. ext v7.16b,v19.16b,v20.16b,#8
  1402. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1403. .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1404. add v3.2d,v2.2d,v1.2d // "D + T1"
  1405. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1406. add v24.2d,v24.2d,v16.2d
  1407. ld1 {v25.2d},[x3],#16
  1408. ext v24.16b,v24.16b,v24.16b,#8
  1409. ext v5.16b,v3.16b,v0.16b,#8
  1410. ext v6.16b,v4.16b,v3.16b,#8
  1411. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1412. .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
  1413. ext v7.16b,v20.16b,v21.16b,#8
  1414. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1415. .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
  1416. add v2.2d,v4.2d,v0.2d // "D + T1"
  1417. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1418. add v25.2d,v25.2d,v17.2d
  1419. ld1 {v24.2d},[x3],#16
  1420. ext v25.16b,v25.16b,v25.16b,#8
  1421. ext v5.16b,v2.16b,v3.16b,#8
  1422. ext v6.16b,v1.16b,v2.16b,#8
  1423. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1424. .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
  1425. ext v7.16b,v21.16b,v22.16b,#8
  1426. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1427. .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
  1428. add v4.2d,v1.2d,v3.2d // "D + T1"
  1429. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1430. add v24.2d,v24.2d,v18.2d
  1431. ld1 {v25.2d},[x3],#16
  1432. ext v24.16b,v24.16b,v24.16b,#8
  1433. ext v5.16b,v4.16b,v2.16b,#8
  1434. ext v6.16b,v0.16b,v4.16b,#8
  1435. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1436. .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
  1437. ext v7.16b,v22.16b,v23.16b,#8
  1438. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1439. .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
  1440. add v1.2d,v0.2d,v2.2d // "D + T1"
  1441. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1442. add v25.2d,v25.2d,v19.2d
  1443. ld1 {v24.2d},[x3],#16
  1444. ext v25.16b,v25.16b,v25.16b,#8
  1445. ext v5.16b,v1.16b,v4.16b,#8
  1446. ext v6.16b,v3.16b,v1.16b,#8
  1447. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1448. .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
  1449. ext v7.16b,v23.16b,v16.16b,#8
  1450. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1451. .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
  1452. add v0.2d,v3.2d,v4.2d // "D + T1"
  1453. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1454. add v24.2d,v24.2d,v20.2d
  1455. ld1 {v25.2d},[x3],#16
  1456. ext v24.16b,v24.16b,v24.16b,#8
  1457. ext v5.16b,v0.16b,v1.16b,#8
  1458. ext v6.16b,v2.16b,v0.16b,#8
  1459. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1460. .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
  1461. ext v7.16b,v16.16b,v17.16b,#8
  1462. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1463. .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
  1464. add v3.2d,v2.2d,v1.2d // "D + T1"
  1465. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1466. add v25.2d,v25.2d,v21.2d
  1467. ld1 {v24.2d},[x3],#16
  1468. ext v25.16b,v25.16b,v25.16b,#8
  1469. ext v5.16b,v3.16b,v0.16b,#8
  1470. ext v6.16b,v4.16b,v3.16b,#8
  1471. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1472. .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
  1473. ext v7.16b,v17.16b,v18.16b,#8
  1474. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1475. .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
  1476. add v2.2d,v4.2d,v0.2d // "D + T1"
  1477. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1478. add v24.2d,v24.2d,v22.2d
  1479. ld1 {v25.2d},[x3],#16
  1480. ext v24.16b,v24.16b,v24.16b,#8
  1481. ext v5.16b,v2.16b,v3.16b,#8
  1482. ext v6.16b,v1.16b,v2.16b,#8
  1483. add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
  1484. .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
  1485. ext v7.16b,v18.16b,v19.16b,#8
  1486. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1487. .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
  1488. add v4.2d,v1.2d,v3.2d // "D + T1"
  1489. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1490. add v25.2d,v25.2d,v23.2d
  1491. ld1 {v24.2d},[x3],#16
  1492. ext v25.16b,v25.16b,v25.16b,#8
  1493. ext v5.16b,v4.16b,v2.16b,#8
  1494. ext v6.16b,v0.16b,v4.16b,#8
  1495. add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
  1496. .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
  1497. ext v7.16b,v19.16b,v20.16b,#8
  1498. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1499. .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
  1500. add v1.2d,v0.2d,v2.2d // "D + T1"
  1501. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  // Steps 33-40: no more schedule updates. Once a message register has been
  // added into v24/v25 it is reloaded from x1 and byte-swapped, ready for the
  // next loop iteration.
  1502. ld1 {v25.2d},[x3],#16
  1503. add v24.2d,v24.2d,v16.2d
  1504. ld1 {v16.16b},[x1],#16 // load next input
  1505. ext v24.16b,v24.16b,v24.16b,#8
  1506. ext v5.16b,v1.16b,v4.16b,#8
  1507. ext v6.16b,v3.16b,v1.16b,#8
  1508. add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
  1509. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1510. rev64 v16.16b,v16.16b
  1511. add v0.2d,v3.2d,v4.2d // "D + T1"
  1512. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1513. ld1 {v24.2d},[x3],#16
  1514. add v25.2d,v25.2d,v17.2d
  1515. ld1 {v17.16b},[x1],#16 // load next input
  1516. ext v25.16b,v25.16b,v25.16b,#8
  1517. ext v5.16b,v0.16b,v1.16b,#8
  1518. ext v6.16b,v2.16b,v0.16b,#8
  1519. add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
  1520. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1521. rev64 v17.16b,v17.16b
  1522. add v3.2d,v2.2d,v1.2d // "D + T1"
  1523. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1524. ld1 {v25.2d},[x3],#16
  1525. add v24.2d,v24.2d,v18.2d
  1526. ld1 {v18.16b},[x1],#16 // load next input
  1527. ext v24.16b,v24.16b,v24.16b,#8
  1528. ext v5.16b,v3.16b,v0.16b,#8
  1529. ext v6.16b,v4.16b,v3.16b,#8
  1530. add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
  1531. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1532. rev64 v18.16b,v18.16b
  1533. add v2.2d,v4.2d,v0.2d // "D + T1"
  1534. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1535. ld1 {v24.2d},[x3],#16
  1536. add v25.2d,v25.2d,v19.2d
  1537. ld1 {v19.16b},[x1],#16 // load next input
  1538. ext v25.16b,v25.16b,v25.16b,#8
  1539. ext v5.16b,v2.16b,v3.16b,#8
  1540. ext v6.16b,v1.16b,v2.16b,#8
  1541. add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
  1542. .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
  1543. rev64 v19.16b,v19.16b
  1544. add v4.2d,v1.2d,v3.2d // "D + T1"
  1545. .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
  1546. ld1 {v25.2d},[x3],#16
  1547. add v24.2d,v24.2d,v20.2d
  1548. ld1 {v20.16b},[x1],#16 // load next input
  1549. ext v24.16b,v24.16b,v24.16b,#8
  1550. ext v5.16b,v4.16b,v2.16b,#8
  1551. ext v6.16b,v0.16b,v4.16b,#8
  1552. add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
  1553. .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
  1554. rev64 v20.16b,v20.16b
  1555. add v1.2d,v0.2d,v2.2d // "D + T1"
  1556. .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
  1557. ld1 {v24.2d},[x3],#16
  1558. add v25.2d,v25.2d,v21.2d
  1559. ld1 {v21.16b},[x1],#16 // load next input
  1560. ext v25.16b,v25.16b,v25.16b,#8
  1561. ext v5.16b,v1.16b,v4.16b,#8
  1562. ext v6.16b,v3.16b,v1.16b,#8
  1563. add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
  1564. .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
  1565. rev64 v21.16b,v21.16b
  1566. add v0.2d,v3.2d,v4.2d // "D + T1"
  1567. .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
  1568. ld1 {v25.2d},[x3],#16
  1569. add v24.2d,v24.2d,v22.2d
  1570. ld1 {v22.16b},[x1],#16 // load next input
  1571. ext v24.16b,v24.16b,v24.16b,#8
  1572. ext v5.16b,v0.16b,v1.16b,#8
  1573. ext v6.16b,v2.16b,v0.16b,#8
  1574. add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
  1575. .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
  1576. rev64 v22.16b,v22.16b
  1577. add v3.2d,v2.2d,v1.2d // "D + T1"
  1578. .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
  1579. sub x3,x3,#80*8 // rewind
  1580. add v25.2d,v25.2d,v23.2d
  1581. ld1 {v23.16b},[x1],#16 // load next input
  1582. ext v25.16b,v25.16b,v25.16b,#8
  1583. ext v5.16b,v3.16b,v0.16b,#8
  1584. ext v6.16b,v4.16b,v3.16b,#8
  1585. add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
  1586. .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
  1587. rev64 v23.16b,v23.16b
  1588. add v2.2d,v4.2d,v0.2d // "D + T1"
  1589. .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
  1590. add v0.2d,v0.2d,v26.2d // accumulate
  1591. add v1.2d,v1.2d,v27.2d
  1592. add v2.2d,v2.2d,v28.2d
  1593. add v3.2d,v3.2d,v29.2d
  1594. cbnz x2,.Loop_hw // more blocks remain
  1595. st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
  1596. ldr x29,[sp],#16 // pop the frame; x30 still holds the return address
  1597. ret
  1598. .size sha512_block_armv8,.-sha512_block_armv8
  1599. #endif