chacha-armv8.S 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977
  1. #include "arm_arch.h"
  2. .text
  // Constant data placed in the .text section, ahead of the code that reads it
  // with PC-relative `adr`.
  // _OPENSSL_armcap_P is not defined in this part of the file; presumably it is
  // the capability word provided by another object -- confirm.
  3. .private_extern _OPENSSL_armcap_P
  4. .align 5
  // Lsigma: first 16 bytes of the cipher state. Read as 32-bit words (low half
  // of each quadword first) the values are 0x61707865, 0x3320646e, 0x79622d32,
  // 0x6b206574, i.e. the ASCII text "expand 32-byte k" in little-endian order.
  5. Lsigma:
  6. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  // Lone: the 4x32-bit vector {1,0,0,0}. It sits directly after Lsigma, which
  // lets the NEON code reach it by post-incrementing its Lsigma pointer by 16.
  7. Lone:
  8. .long 1,0,0,0
  // LOPENSSL_armcap_P: distance from this location to _OPENSSL_armcap_P
  // (32-bit under __ILP32__, 64-bit otherwise). The entry point adds this value
  // to the address of this label to locate the capability word.
  9. LOPENSSL_armcap_P:
  10. #ifdef __ILP32__
  11. .long _OPENSSL_armcap_P-.
  12. #else
  13. .quad _OPENSSL_armcap_P-.
  14. #endif
  // NUL-terminated ASCII identification string:
  // "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  15. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  16. .align 2
  // ---------------------------------------------------------------------------
  // _ChaCha20_ctr32 -- exported entry point.
  //
  // Registers on entry, as this code uses them:
  //   x0 = output pointer          x1 = input pointer
  //   x2 = length in bytes         x3 = pointer to the 32-byte key
  //   x4 = pointer to the 16-byte counter block
  //
  // Returns immediately when x2 == 0. When x2 >= 192 and the ARMV7_NEON bit is
  // set in the capability word, control is transferred to ChaCha20_neon.
  // Otherwise the scalar code below produces one 64-byte keystream block per
  // Loop_outer iteration and XORs it with the input.
  //
  // Scalar register map inside the loops:
  //   x22-x28,x30 : the 16 state words of the key block, packed two per register
  //   w5-w8   : working row a (words 0-3)     w9-w12 : working row b (words 4-7)
  //   w13-w16 : working row c (words 8-11)    w17,w19,w20,w21 : row d (words 12-15)
  //   x4      : round-loop counter (the counter pointer is no longer needed)
  // ---------------------------------------------------------------------------
  17. .globl _ChaCha20_ctr32
  18. .align 5
  19. _ChaCha20_ctr32:
  20. cbz x2,Labort
  21. adr x5,LOPENSSL_armcap_P
  22. cmp x2,#192
  23. b.lo Lshort
  // x6 = offset stored at LOPENSSL_armcap_P; x5+x6 is therefore the address of
  // _OPENSSL_armcap_P, whose 32-bit value is tested for the NEON flag.
  24. #ifdef __ILP32__
  25. ldrsw x6,[x5]
  26. #else
  27. ldr x6,[x5]
  28. #endif
  29. ldr w17,[x6,x5]
  30. tst w17,#ARMV7_NEON
  31. b.ne ChaCha20_neon
  // Scalar path. Frame layout: 96 bytes holding x29/x30 and x19-x28 (addressed
  // through x29), plus 64 bytes of scratch at sp used only by the tail code.
  32. Lshort:
  33. .long 0xd503233f // paciasp
  34. stp x29,x30,[sp,#-96]!
  35. add x29,sp,#0
  36. adr x5,Lsigma
  37. stp x19,x20,[sp,#16]
  38. stp x21,x22,[sp,#32]
  39. stp x23,x24,[sp,#48]
  40. stp x25,x26,[sp,#64]
  41. stp x27,x28,[sp,#80]
  42. sub sp,sp,#64
  43. ldp x22,x23,[x5] // load sigma
  44. ldp x24,x25,[x3] // load key
  45. ldp x26,x27,[x3,#16]
  46. ldp x28,x30,[x4] // load counter
  // Big-endian builds: swap the two 32-bit halves of each loaded pair so that
  // the first word in memory ends up in the low half, as the unpack code expects.
  47. #ifdef __ARMEB__
  48. ror x24,x24,#32
  49. ror x25,x25,#32
  50. ror x26,x26,#32
  51. ror x27,x27,#32
  52. ror x28,x28,#32
  53. ror x30,x30,#32
  54. #endif
  // One iteration per 64-byte block: copy the packed key block into the sixteen
  // 32-bit working registers (low half via `mov w`, high half via `lsr #32`).
  55. Loop_outer:
  56. mov w5,w22 // unpack key block
  57. lsr x6,x22,#32
  58. mov w7,w23
  59. lsr x8,x23,#32
  60. mov w9,w24
  61. lsr x10,x24,#32
  62. mov w11,w25
  63. lsr x12,x25,#32
  64. mov w13,w26
  65. lsr x14,x26,#32
  66. mov w15,w27
  67. lsr x16,x27,#32
  68. mov w17,w28
  69. lsr x19,x28,#32
  70. mov w20,w30
  71. lsr x21,x30,#32
  // 10 iterations of Loop, each performing two rounds (20 rounds in total).
  72. mov x4,#10
  // The flags set by this subtraction are consumed much later by `b.lo Ltail`
  // and `b.hi Loop_outer`; nothing in between writes the condition flags.
  73. subs x2,x2,#64
  74. Loop:
  75. sub x4,x4,#1
  // --- First round: four quarter-rounds on the columns (a[i],b[i],c[i],d[i]).
  // Rotations are written as right-rotates: ror #16/#20/#24/#25 on a 32-bit
  // register equal left-rotates by 16/12/8/7.
  // a += b; d ^= a; d <<<= 16
  76. add w5,w5,w9
  77. add w6,w6,w10
  78. add w7,w7,w11
  79. add w8,w8,w12
  80. eor w17,w17,w5
  81. eor w19,w19,w6
  82. eor w20,w20,w7
  83. eor w21,w21,w8
  84. ror w17,w17,#16
  85. ror w19,w19,#16
  86. ror w20,w20,#16
  87. ror w21,w21,#16
  // c += d; b ^= c; b <<<= 12
  88. add w13,w13,w17
  89. add w14,w14,w19
  90. add w15,w15,w20
  91. add w16,w16,w21
  92. eor w9,w9,w13
  93. eor w10,w10,w14
  94. eor w11,w11,w15
  95. eor w12,w12,w16
  96. ror w9,w9,#20
  97. ror w10,w10,#20
  98. ror w11,w11,#20
  99. ror w12,w12,#20
  // a += b; d ^= a; d <<<= 8
  100. add w5,w5,w9
  101. add w6,w6,w10
  102. add w7,w7,w11
  103. add w8,w8,w12
  104. eor w17,w17,w5
  105. eor w19,w19,w6
  106. eor w20,w20,w7
  107. eor w21,w21,w8
  108. ror w17,w17,#24
  109. ror w19,w19,#24
  110. ror w20,w20,#24
  111. ror w21,w21,#24
  // c += d; b ^= c; b <<<= 7
  112. add w13,w13,w17
  113. add w14,w14,w19
  114. add w15,w15,w20
  115. add w16,w16,w21
  116. eor w9,w9,w13
  117. eor w10,w10,w14
  118. eor w11,w11,w15
  119. eor w12,w12,w16
  120. ror w9,w9,#25
  121. ror w10,w10,#25
  122. ror w11,w11,#25
  123. ror w12,w12,#25
  // --- Second round: the same quarter-round applied to the diagonals
  // (a[i], b[i+1], c[i+2], d[i+3]), selected purely by register choice.
  // a += b; d ^= a; d <<<= 16
  124. add w5,w5,w10
  125. add w6,w6,w11
  126. add w7,w7,w12
  127. add w8,w8,w9
  128. eor w21,w21,w5
  129. eor w17,w17,w6
  130. eor w19,w19,w7
  131. eor w20,w20,w8
  132. ror w21,w21,#16
  133. ror w17,w17,#16
  134. ror w19,w19,#16
  135. ror w20,w20,#16
  // c += d; b ^= c; b <<<= 12
  136. add w15,w15,w21
  137. add w16,w16,w17
  138. add w13,w13,w19
  139. add w14,w14,w20
  140. eor w10,w10,w15
  141. eor w11,w11,w16
  142. eor w12,w12,w13
  143. eor w9,w9,w14
  144. ror w10,w10,#20
  145. ror w11,w11,#20
  146. ror w12,w12,#20
  147. ror w9,w9,#20
  // a += b; d ^= a; d <<<= 8
  148. add w5,w5,w10
  149. add w6,w6,w11
  150. add w7,w7,w12
  151. add w8,w8,w9
  152. eor w21,w21,w5
  153. eor w17,w17,w6
  154. eor w19,w19,w7
  155. eor w20,w20,w8
  156. ror w21,w21,#24
  157. ror w17,w17,#24
  158. ror w19,w19,#24
  159. ror w20,w20,#24
  // c += d; b ^= c; b <<<= 7
  160. add w15,w15,w21
  161. add w16,w16,w17
  162. add w13,w13,w19
  163. add w14,w14,w20
  164. eor w10,w10,w15
  165. eor w11,w11,w16
  166. eor w12,w12,w13
  167. eor w9,w9,w14
  168. ror w10,w10,#25
  169. ror w11,w11,#25
  170. ror w12,w12,#25
  171. ror w9,w9,#25
  172. cbnz x4,Loop
  // Add the original key block back into the working state. Odd words use a
  // 64-bit add of the packed register's high half; a carry out of bit 31 is
  // dropped later when the word is shifted left by 32 during packing.
  173. add w5,w5,w22 // accumulate key block
  174. add x6,x6,x22,lsr#32
  175. add w7,w7,w23
  176. add x8,x8,x23,lsr#32
  177. add w9,w9,w24
  178. add x10,x10,x24,lsr#32
  179. add w11,w11,w25
  180. add x12,x12,x25,lsr#32
  181. add w13,w13,w26
  182. add x14,x14,x26,lsr#32
  183. add w15,w15,w27
  184. add x16,x16,x27,lsr#32
  185. add w17,w17,w28
  186. add x19,x19,x28,lsr#32
  187. add w20,w20,w30
  188. add x21,x21,x30,lsr#32
  // Flags still come from `subs x2,x2,#64`: fewer than 64 bytes were left, so
  // finish with the byte-wise tail.
  189. b.lo Ltail
  // Full block: pack word pairs into 64-bit registers while loading 64 bytes of
  // input into the registers that have just been consumed.
  190. add x5,x5,x6,lsl#32 // pack
  191. add x7,x7,x8,lsl#32
  192. ldp x6,x8,[x1,#0] // load input
  193. add x9,x9,x10,lsl#32
  194. add x11,x11,x12,lsl#32
  195. ldp x10,x12,[x1,#16]
  196. add x13,x13,x14,lsl#32
  197. add x15,x15,x16,lsl#32
  198. ldp x14,x16,[x1,#32]
  199. add x17,x17,x19,lsl#32
  200. add x20,x20,x21,lsl#32
  201. ldp x19,x21,[x1,#48]
  202. add x1,x1,#64
  // Big-endian builds: byte-reverse the packed keystream before the XOR.
  203. #ifdef __ARMEB__
  204. rev x5,x5
  205. rev x7,x7
  206. rev x9,x9
  207. rev x11,x11
  208. rev x13,x13
  209. rev x15,x15
  210. rev x17,x17
  211. rev x20,x20
  212. #endif
  // output = keystream XOR input
  213. eor x5,x5,x6
  214. eor x7,x7,x8
  215. eor x9,x9,x10
  216. eor x11,x11,x12
  217. eor x13,x13,x14
  218. eor x15,x15,x16
  219. eor x17,x17,x19
  220. eor x20,x20,x21
  221. stp x5,x7,[x0,#0] // store output
  // NOTE(review): this is a 64-bit add on the packed pair, so a wrap of the low
  // 32-bit word would carry into the word held in the upper half of x28;
  // presumably callers never let the 32-bit counter wrap -- confirm.
  222. add x28,x28,#1 // increment counter
  223. stp x9,x11,[x0,#16]
  224. stp x13,x15,[x0,#32]
  225. stp x17,x20,[x0,#48]
  226. add x0,x0,#64
  // Flags still from `subs x2,x2,#64`: loop again while bytes remain.
  227. b.hi Loop_outer
  // Epilogue: restore callee-saved registers through x29, release the 64-byte
  // scratch area and the 96-byte frame.
  228. ldp x19,x20,[x29,#16]
  229. add sp,sp,#64
  230. ldp x21,x22,[x29,#32]
  231. ldp x23,x24,[x29,#48]
  232. ldp x25,x26,[x29,#64]
  233. ldp x27,x28,[x29,#80]
  234. ldp x29,x30,[sp],#96
  235. .long 0xd50323bf // autiasp
  // Labort is also the target of the zero-length check at entry, which is taken
  // before any frame has been created.
  236. Labort:
  237. ret
  238. .align 4
  // Partial final block. x2 went negative in `subs`; adding 64 restores the
  // number of bytes still to process (1..63).
  239. Ltail:
  240. add x2,x2,#64
  // Less_than_64 is also entered from Ltail_neon, which uses the same frame
  // layout. Set up end-relative addressing: x1/x4 point one past the end of the
  // input / stack keystream, x0 is biased by -1 because the index is incremented
  // before the store, and x2 becomes -length so it counts up to zero.
  241. Less_than_64:
  242. sub x0,x0,#1
  243. add x1,x1,x2
  244. add x0,x0,x2
  245. add x4,sp,x2
  246. neg x2,x2
  247. add x5,x5,x6,lsl#32 // pack
  248. add x7,x7,x8,lsl#32
  249. add x9,x9,x10,lsl#32
  250. add x11,x11,x12,lsl#32
  251. add x13,x13,x14,lsl#32
  252. add x15,x15,x16,lsl#32
  253. add x17,x17,x19,lsl#32
  254. add x20,x20,x21,lsl#32
  255. #ifdef __ARMEB__
  256. rev x5,x5
  257. rev x7,x7
  258. rev x9,x9
  259. rev x11,x11
  260. rev x13,x13
  261. rev x15,x15
  262. rev x17,x17
  263. rev x20,x20
  264. #endif
  // Stage the whole 64-byte keystream block in the stack scratch area.
  265. stp x5,x7,[sp,#0]
  266. stp x9,x11,[sp,#16]
  267. stp x13,x15,[sp,#32]
  268. stp x17,x20,[sp,#48]
  // Byte loop: out[i] = in[i] ^ keystream[i] for the remaining bytes.
  269. Loop_tail:
  270. ldrb w10,[x1,x2]
  271. ldrb w11,[x4,x2]
  272. add x2,x2,#1
  273. eor w10,w10,w11
  274. strb w10,[x0,x2]
  275. cbnz x2,Loop_tail
  // Overwrite the staged keystream with zeros before the scratch area is freed.
  276. stp xzr,xzr,[sp,#0]
  277. stp xzr,xzr,[sp,#16]
  278. stp xzr,xzr,[sp,#32]
  279. stp xzr,xzr,[sp,#48]
  // Epilogue (same as the full-block exit above).
  280. ldp x19,x20,[x29,#16]
  281. add sp,sp,#64
  282. ldp x21,x22,[x29,#32]
  283. ldp x23,x24,[x29,#48]
  284. ldp x25,x26,[x29,#64]
  285. ldp x27,x28,[x29,#80]
  286. ldp x29,x30,[sp],#96
  287. .long 0xd50323bf // autiasp
  288. ret
  // ---------------------------------------------------------------------------
  // ChaCha20_neon -- reached by a branch from _ChaCha20_ctr32 (x2 >= 192 and the
  // NEON capability bit set), with x0-x4 unchanged: x0 = output, x1 = input,
  // x2 = length in bytes, x3 = key pointer, x4 = counter-block pointer.
  //
  // Lengths of 512 bytes or more continue at L512_or_more_neon, a label that
  // sits just after the identical prologue of ChaCha20_512_neon.
  //
  // Each Loop_outer_neon iteration produces 256 bytes of keystream: one block
  // in the scalar registers (same register map as the scalar routine) and three
  // blocks in NEON registers, with the two instruction streams interleaved.
  //
  // NEON register map:
  //   v24 = sigma row, v25/v26 = key rows, v31 = counter step
  //   v27/v28/v29 = counter rows for NEON blocks 1/2/3 (scalar counter +1/+2/+3)
  //   block 1 rows a,b,c,d = v0,v1,v2,v3      block 2 = v4,v5,v6,v7
  //   block 3 = v16,v17,v18,v19               v20-v23 = temporaries / input
  // Only v0-v7 and v16-v31 are written on this path, so d8-d15 are not saved.
  // ---------------------------------------------------------------------------
  289. .align 5
  290. ChaCha20_neon:
  291. .long 0xd503233f // paciasp
  292. stp x29,x30,[sp,#-96]!
  293. add x29,sp,#0
  294. adr x5,Lsigma
  295. stp x19,x20,[sp,#16]
  296. stp x21,x22,[sp,#32]
  297. stp x23,x24,[sp,#48]
  298. stp x25,x26,[sp,#64]
  299. stp x27,x28,[sp,#80]
  300. cmp x2,#512
  301. b.hs L512_or_more_neon
  // 64 bytes of stack scratch, used only to stage a partial final block.
  302. sub sp,sp,#64
  // Load every part of the key block twice: packed into x22-x28,x30 for the
  // scalar lane and into v24-v27 for the vector lanes. The post-increment
  // leaves x5 pointing at Lone, which is then loaded into v31.
  303. ldp x22,x23,[x5] // load sigma
  304. ld1 {v24.4s},[x5],#16
  305. ldp x24,x25,[x3] // load key
  306. ldp x26,x27,[x3,#16]
  307. ld1 {v25.4s,v26.4s},[x3]
  308. ldp x28,x30,[x4] // load counter
  309. ld1 {v27.4s},[x4]
  310. ld1 {v31.4s},[x5]
  311. #ifdef __ARMEB__
  312. rev64 v24.4s,v24.4s
  313. ror x24,x24,#32
  314. ror x25,x25,#32
  315. ror x26,x26,#32
  316. ror x27,x27,#32
  317. ror x28,x28,#32
  318. ror x30,x30,#32
  319. #endif
  // v27/v28/v29 = counter row with lane 0 advanced by 1/2/3; the scalar lane
  // keeps the unmodified counter in x28.
  320. add v27.4s,v27.4s,v31.4s // += 1
  321. add v28.4s,v27.4s,v31.4s
  322. add v29.4s,v28.4s,v31.4s
  323. shl v31.4s,v31.4s,#2 // 1 -> 4
  // Start of a 256-byte group: initialise the scalar working words and the
  // three vector working states from the saved key block.
  324. Loop_outer_neon:
  325. mov w5,w22 // unpack key block
  326. lsr x6,x22,#32
  327. mov v0.16b,v24.16b
  328. mov w7,w23
  329. lsr x8,x23,#32
  330. mov v4.16b,v24.16b
  331. mov w9,w24
  332. lsr x10,x24,#32
  333. mov v16.16b,v24.16b
  334. mov w11,w25
  335. mov v1.16b,v25.16b
  336. lsr x12,x25,#32
  337. mov v5.16b,v25.16b
  338. mov w13,w26
  339. mov v17.16b,v25.16b
  340. lsr x14,x26,#32
  341. mov v3.16b,v27.16b
  342. mov w15,w27
  343. mov v7.16b,v28.16b
  344. lsr x16,x27,#32
  345. mov v19.16b,v29.16b
  346. mov w17,w28
  347. mov v2.16b,v26.16b
  348. lsr x19,x28,#32
  349. mov v6.16b,v26.16b
  350. mov w20,w30
  351. mov v18.16b,v26.16b
  352. lsr x21,x30,#32
  // 10 iterations of Loop_neon, each performing two rounds on all four blocks.
  353. mov x4,#10
  // Flags from this subtraction are consumed by `b.lo Ltail_neon` and
  // `b.hi Loop_outer_neon`; nothing in between writes the condition flags.
  354. subs x2,x2,#256
  // --- First round (columns). Vector side: each row register holds four words,
  // so one lane-wise add/eor/rotate performs four quarter-rounds at once.
  //   rev32 on .8h elements  = rotate each 32-bit lane by 16
  //   ushr #20 + sli #12     = rotate left by 12
  //   ushr #24 + sli #8      = rotate left by 8
  //   ushr #25 + sli #7      = rotate left by 7
  // Scalar side: identical to the Loop body of the scalar routine
  // (ror #16/#20/#24/#25 = left-rotate by 16/12/8/7).
  355. Loop_neon:
  356. sub x4,x4,#1
  357. add v0.4s,v0.4s,v1.4s
  358. add w5,w5,w9
  359. add v4.4s,v4.4s,v5.4s
  360. add w6,w6,w10
  361. add v16.4s,v16.4s,v17.4s
  362. add w7,w7,w11
  363. eor v3.16b,v3.16b,v0.16b
  364. add w8,w8,w12
  365. eor v7.16b,v7.16b,v4.16b
  366. eor w17,w17,w5
  367. eor v19.16b,v19.16b,v16.16b
  368. eor w19,w19,w6
  369. rev32 v3.8h,v3.8h
  370. eor w20,w20,w7
  371. rev32 v7.8h,v7.8h
  372. eor w21,w21,w8
  373. rev32 v19.8h,v19.8h
  374. ror w17,w17,#16
  375. add v2.4s,v2.4s,v3.4s
  376. ror w19,w19,#16
  377. add v6.4s,v6.4s,v7.4s
  378. ror w20,w20,#16
  379. add v18.4s,v18.4s,v19.4s
  380. ror w21,w21,#16
  381. eor v20.16b,v1.16b,v2.16b
  382. add w13,w13,w17
  383. eor v21.16b,v5.16b,v6.16b
  384. add w14,w14,w19
  385. eor v22.16b,v17.16b,v18.16b
  386. add w15,w15,w20
  387. ushr v1.4s,v20.4s,#20
  388. add w16,w16,w21
  389. ushr v5.4s,v21.4s,#20
  390. eor w9,w9,w13
  391. ushr v17.4s,v22.4s,#20
  392. eor w10,w10,w14
  393. sli v1.4s,v20.4s,#12
  394. eor w11,w11,w15
  395. sli v5.4s,v21.4s,#12
  396. eor w12,w12,w16
  397. sli v17.4s,v22.4s,#12
  398. ror w9,w9,#20
  399. add v0.4s,v0.4s,v1.4s
  400. ror w10,w10,#20
  401. add v4.4s,v4.4s,v5.4s
  402. ror w11,w11,#20
  403. add v16.4s,v16.4s,v17.4s
  404. ror w12,w12,#20
  405. eor v20.16b,v3.16b,v0.16b
  406. add w5,w5,w9
  407. eor v21.16b,v7.16b,v4.16b
  408. add w6,w6,w10
  409. eor v22.16b,v19.16b,v16.16b
  410. add w7,w7,w11
  411. ushr v3.4s,v20.4s,#24
  412. add w8,w8,w12
  413. ushr v7.4s,v21.4s,#24
  414. eor w17,w17,w5
  415. ushr v19.4s,v22.4s,#24
  416. eor w19,w19,w6
  417. sli v3.4s,v20.4s,#8
  418. eor w20,w20,w7
  419. sli v7.4s,v21.4s,#8
  420. eor w21,w21,w8
  421. sli v19.4s,v22.4s,#8
  422. ror w17,w17,#24
  423. add v2.4s,v2.4s,v3.4s
  424. ror w19,w19,#24
  425. add v6.4s,v6.4s,v7.4s
  426. ror w20,w20,#24
  427. add v18.4s,v18.4s,v19.4s
  428. ror w21,w21,#24
  429. eor v20.16b,v1.16b,v2.16b
  430. add w13,w13,w17
  431. eor v21.16b,v5.16b,v6.16b
  432. add w14,w14,w19
  433. eor v22.16b,v17.16b,v18.16b
  434. add w15,w15,w20
  435. ushr v1.4s,v20.4s,#25
  436. add w16,w16,w21
  437. ushr v5.4s,v21.4s,#25
  438. eor w9,w9,w13
  439. ushr v17.4s,v22.4s,#25
  440. eor w10,w10,w14
  441. sli v1.4s,v20.4s,#7
  442. eor w11,w11,w15
  443. sli v5.4s,v21.4s,#7
  444. eor w12,w12,w16
  445. sli v17.4s,v22.4s,#7
  446. ror w9,w9,#25
  // Rotate the vector rows so that the next lane-wise round acts on diagonals:
  // `ext` with both sources equal rotates the register by the given byte count,
  // moving row c by two lanes (#8), row d by three (#12) and row b by one (#4).
  447. ext v2.16b,v2.16b,v2.16b,#8
  448. ror w10,w10,#25
  449. ext v6.16b,v6.16b,v6.16b,#8
  450. ror w11,w11,#25
  451. ext v18.16b,v18.16b,v18.16b,#8
  452. ror w12,w12,#25
  453. ext v3.16b,v3.16b,v3.16b,#12
  454. ext v7.16b,v7.16b,v7.16b,#12
  455. ext v19.16b,v19.16b,v19.16b,#12
  456. ext v1.16b,v1.16b,v1.16b,#4
  457. ext v5.16b,v5.16b,v5.16b,#4
  458. ext v17.16b,v17.16b,v17.16b,#4
  // --- Second round (diagonals). Vector side repeats the same lane-wise
  // sequence on the rotated rows; scalar side selects the diagonals by register
  // choice (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
  459. add v0.4s,v0.4s,v1.4s
  460. add w5,w5,w10
  461. add v4.4s,v4.4s,v5.4s
  462. add w6,w6,w11
  463. add v16.4s,v16.4s,v17.4s
  464. add w7,w7,w12
  465. eor v3.16b,v3.16b,v0.16b
  466. add w8,w8,w9
  467. eor v7.16b,v7.16b,v4.16b
  468. eor w21,w21,w5
  469. eor v19.16b,v19.16b,v16.16b
  470. eor w17,w17,w6
  471. rev32 v3.8h,v3.8h
  472. eor w19,w19,w7
  473. rev32 v7.8h,v7.8h
  474. eor w20,w20,w8
  475. rev32 v19.8h,v19.8h
  476. ror w21,w21,#16
  477. add v2.4s,v2.4s,v3.4s
  478. ror w17,w17,#16
  479. add v6.4s,v6.4s,v7.4s
  480. ror w19,w19,#16
  481. add v18.4s,v18.4s,v19.4s
  482. ror w20,w20,#16
  483. eor v20.16b,v1.16b,v2.16b
  484. add w15,w15,w21
  485. eor v21.16b,v5.16b,v6.16b
  486. add w16,w16,w17
  487. eor v22.16b,v17.16b,v18.16b
  488. add w13,w13,w19
  489. ushr v1.4s,v20.4s,#20
  490. add w14,w14,w20
  491. ushr v5.4s,v21.4s,#20
  492. eor w10,w10,w15
  493. ushr v17.4s,v22.4s,#20
  494. eor w11,w11,w16
  495. sli v1.4s,v20.4s,#12
  496. eor w12,w12,w13
  497. sli v5.4s,v21.4s,#12
  498. eor w9,w9,w14
  499. sli v17.4s,v22.4s,#12
  500. ror w10,w10,#20
  501. add v0.4s,v0.4s,v1.4s
  502. ror w11,w11,#20
  503. add v4.4s,v4.4s,v5.4s
  504. ror w12,w12,#20
  505. add v16.4s,v16.4s,v17.4s
  506. ror w9,w9,#20
  507. eor v20.16b,v3.16b,v0.16b
  508. add w5,w5,w10
  509. eor v21.16b,v7.16b,v4.16b
  510. add w6,w6,w11
  511. eor v22.16b,v19.16b,v16.16b
  512. add w7,w7,w12
  513. ushr v3.4s,v20.4s,#24
  514. add w8,w8,w9
  515. ushr v7.4s,v21.4s,#24
  516. eor w21,w21,w5
  517. ushr v19.4s,v22.4s,#24
  518. eor w17,w17,w6
  519. sli v3.4s,v20.4s,#8
  520. eor w19,w19,w7
  521. sli v7.4s,v21.4s,#8
  522. eor w20,w20,w8
  523. sli v19.4s,v22.4s,#8
  524. ror w21,w21,#24
  525. add v2.4s,v2.4s,v3.4s
  526. ror w17,w17,#24
  527. add v6.4s,v6.4s,v7.4s
  528. ror w19,w19,#24
  529. add v18.4s,v18.4s,v19.4s
  530. ror w20,w20,#24
  531. eor v20.16b,v1.16b,v2.16b
  532. add w15,w15,w21
  533. eor v21.16b,v5.16b,v6.16b
  534. add w16,w16,w17
  535. eor v22.16b,v17.16b,v18.16b
  536. add w13,w13,w19
  537. ushr v1.4s,v20.4s,#25
  538. add w14,w14,w20
  539. ushr v5.4s,v21.4s,#25
  540. eor w10,w10,w15
  541. ushr v17.4s,v22.4s,#25
  542. eor w11,w11,w16
  543. sli v1.4s,v20.4s,#7
  544. eor w12,w12,w13
  545. sli v5.4s,v21.4s,#7
  546. eor w9,w9,w14
  547. sli v17.4s,v22.4s,#7
  548. ror w10,w10,#25
  // Undo the row rotation (row d by one lane, row b by three, row c by two) so
  // the rows are back in column order for the next iteration.
  549. ext v2.16b,v2.16b,v2.16b,#8
  550. ror w11,w11,#25
  551. ext v6.16b,v6.16b,v6.16b,#8
  552. ror w12,w12,#25
  553. ext v18.16b,v18.16b,v18.16b,#8
  554. ror w9,w9,#25
  555. ext v3.16b,v3.16b,v3.16b,#4
  556. ext v7.16b,v7.16b,v7.16b,#4
  557. ext v19.16b,v19.16b,v19.16b,#4
  558. ext v1.16b,v1.16b,v1.16b,#12
  559. ext v5.16b,v5.16b,v5.16b,#12
  560. ext v17.16b,v17.16b,v17.16b,#12
  561. cbnz x4,Loop_neon
  // Add the original key block to all four working states. Each vector block
  // gets its own counter row (v27/v28/v29); the scalar block uses x28/x30.
  562. add w5,w5,w22 // accumulate key block
  563. add v0.4s,v0.4s,v24.4s
  564. add x6,x6,x22,lsr#32
  565. add v4.4s,v4.4s,v24.4s
  566. add w7,w7,w23
  567. add v16.4s,v16.4s,v24.4s
  568. add x8,x8,x23,lsr#32
  569. add v2.4s,v2.4s,v26.4s
  570. add w9,w9,w24
  571. add v6.4s,v6.4s,v26.4s
  572. add x10,x10,x24,lsr#32
  573. add v18.4s,v18.4s,v26.4s
  574. add w11,w11,w25
  575. add v3.4s,v3.4s,v27.4s
  576. add x12,x12,x25,lsr#32
  577. add w13,w13,w26
  578. add v7.4s,v7.4s,v28.4s
  579. add x14,x14,x26,lsr#32
  580. add w15,w15,w27
  581. add v19.4s,v19.4s,v29.4s
  582. add x16,x16,x27,lsr#32
  583. add w17,w17,w28
  584. add v1.4s,v1.4s,v25.4s
  585. add x19,x19,x28,lsr#32
  586. add w20,w20,w30
  587. add v5.4s,v5.4s,v25.4s
  588. add x21,x21,x30,lsr#32
  589. add v17.4s,v17.4s,v25.4s
  // Flags still from `subs x2,x2,#256`: fewer than 256 bytes were left.
  590. b.lo Ltail_neon
  // Full 256-byte group. Output order: scalar block first, then vector blocks
  // 1, 2 and 3. Pack the scalar block while loading its 64 input bytes.
  591. add x5,x5,x6,lsl#32 // pack
  592. add x7,x7,x8,lsl#32
  593. ldp x6,x8,[x1,#0] // load input
  594. add x9,x9,x10,lsl#32
  595. add x11,x11,x12,lsl#32
  596. ldp x10,x12,[x1,#16]
  597. add x13,x13,x14,lsl#32
  598. add x15,x15,x16,lsl#32
  599. ldp x14,x16,[x1,#32]
  600. add x17,x17,x19,lsl#32
  601. add x20,x20,x21,lsl#32
  602. ldp x19,x21,[x1,#48]
  603. add x1,x1,#64
  604. #ifdef __ARMEB__
  605. rev x5,x5
  606. rev x7,x7
  607. rev x9,x9
  608. rev x11,x11
  609. rev x13,x13
  610. rev x15,x15
  611. rev x17,x17
  612. rev x20,x20
  613. #endif
  // Input for vector block 1 goes to v20-v23; XORs for the scalar block and
  // vector block 1 are interleaved, then v20-v23 are reloaded for block 2.
  614. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  615. eor x5,x5,x6
  616. eor x7,x7,x8
  617. eor x9,x9,x10
  618. eor x11,x11,x12
  619. eor x13,x13,x14
  620. eor v0.16b,v0.16b,v20.16b
  621. eor x15,x15,x16
  622. eor v1.16b,v1.16b,v21.16b
  623. eor x17,x17,x19
  624. eor v2.16b,v2.16b,v22.16b
  625. eor x20,x20,x21
  626. eor v3.16b,v3.16b,v23.16b
  627. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  // Store the scalar block and advance all four counters by 4 for the next group.
  628. stp x5,x7,[x0,#0] // store output
  629. add x28,x28,#4 // increment counter
  630. stp x9,x11,[x0,#16]
  631. add v27.4s,v27.4s,v31.4s // += 4
  632. stp x13,x15,[x0,#32]
  633. add v28.4s,v28.4s,v31.4s
  634. stp x17,x20,[x0,#48]
  635. add v29.4s,v29.4s,v31.4s
  636. add x0,x0,#64
  // v0-v3 are free once stored, so they are reused to hold block 3's input.
  637. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  638. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  639. eor v4.16b,v4.16b,v20.16b
  640. eor v5.16b,v5.16b,v21.16b
  641. eor v6.16b,v6.16b,v22.16b
  642. eor v7.16b,v7.16b,v23.16b
  643. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  644. eor v16.16b,v16.16b,v0.16b
  645. eor v17.16b,v17.16b,v1.16b
  646. eor v18.16b,v18.16b,v2.16b
  647. eor v19.16b,v19.16b,v3.16b
  648. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  // Flags still from `subs x2,x2,#256`: loop again while bytes remain.
  649. b.hi Loop_outer_neon
  // Epilogue for an input that was an exact multiple of 256 bytes.
  650. ldp x19,x20,[x29,#16]
  651. add sp,sp,#64
  652. ldp x21,x22,[x29,#32]
  653. ldp x23,x24,[x29,#48]
  654. ldp x25,x26,[x29,#64]
  655. ldp x27,x28,[x29,#80]
  656. ldp x29,x30,[sp],#96
  657. .long 0xd50323bf // autiasp
  658. ret
  // Final group of fewer than 256 bytes. Restore the remaining byte count, then
  // hand out the four already-computed blocks one at a time. With fewer than 64
  // bytes left, the scalar block is finished by the scalar routine's
  // Less_than_64 tail, which relies on the identical frame layout.
  659. Ltail_neon:
  660. add x2,x2,#256
  661. cmp x2,#64
  662. b.lo Less_than_64
  // At least 64 bytes: emit the scalar block in full.
  663. add x5,x5,x6,lsl#32 // pack
  664. add x7,x7,x8,lsl#32
  665. ldp x6,x8,[x1,#0] // load input
  666. add x9,x9,x10,lsl#32
  667. add x11,x11,x12,lsl#32
  668. ldp x10,x12,[x1,#16]
  669. add x13,x13,x14,lsl#32
  670. add x15,x15,x16,lsl#32
  671. ldp x14,x16,[x1,#32]
  672. add x17,x17,x19,lsl#32
  673. add x20,x20,x21,lsl#32
  674. ldp x19,x21,[x1,#48]
  675. add x1,x1,#64
  676. #ifdef __ARMEB__
  677. rev x5,x5
  678. rev x7,x7
  679. rev x9,x9
  680. rev x11,x11
  681. rev x13,x13
  682. rev x15,x15
  683. rev x17,x17
  684. rev x20,x20
  685. #endif
  686. eor x5,x5,x6
  687. eor x7,x7,x8
  688. eor x9,x9,x10
  689. eor x11,x11,x12
  690. eor x13,x13,x14
  691. eor x15,x15,x16
  692. eor x17,x17,x19
  693. eor x20,x20,x21
  694. stp x5,x7,[x0,#0] // store output
  695. add x28,x28,#4 // increment counter
  696. stp x9,x11,[x0,#16]
  697. stp x13,x15,[x0,#32]
  698. stp x17,x20,[x0,#48]
  699. add x0,x0,#64
  // Flags are still those of the preceding `cmp x2,#64` (nothing since then
  // writes them): equal means exactly 64 bytes remained and the work is done.
  700. b.eq Ldone_neon
  701. sub x2,x2,#64
  702. cmp x2,#64
  703. b.lo Less_than_128
  // At least 64 more bytes: emit vector block 1 in full.
  704. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  705. eor v0.16b,v0.16b,v20.16b
  706. eor v1.16b,v1.16b,v21.16b
  707. eor v2.16b,v2.16b,v22.16b
  708. eor v3.16b,v3.16b,v23.16b
  709. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  710. b.eq Ldone_neon
  711. sub x2,x2,#64
  712. cmp x2,#64
  713. b.lo Less_than_192
  // At least 64 more bytes: emit vector block 2 in full.
  714. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  715. eor v4.16b,v4.16b,v20.16b
  716. eor v5.16b,v5.16b,v21.16b
  717. eor v6.16b,v6.16b,v22.16b
  718. eor v7.16b,v7.16b,v23.16b
  719. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  720. b.eq Ldone_neon
  721. sub x2,x2,#64
  // 1..63 bytes remain: stage vector block 3 on the stack for the byte loop.
  722. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
  723. b Last_neon
  // 1..63 bytes remain after the scalar block: stage vector block 1.
  724. Less_than_128:
  725. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  726. b Last_neon
  // 1..63 bytes remain after vector block 1: stage vector block 2.
  727. Less_than_192:
  728. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  729. b Last_neon
  730. .align 4
  // Byte-wise tail over the staged keystream block. x1/x4 point one past the end
  // of the input / staged keystream, x0 is biased by -1 because the index is
  // incremented before the store, and x2 = -length counts up to zero.
  731. Last_neon:
  732. sub x0,x0,#1
  733. add x1,x1,x2
  734. add x0,x0,x2
  735. add x4,sp,x2
  736. neg x2,x2
  737. Loop_tail_neon:
  738. ldrb w10,[x1,x2]
  739. ldrb w11,[x4,x2]
  740. add x2,x2,#1
  741. eor w10,w10,w11
  742. strb w10,[x0,x2]
  743. cbnz x2,Loop_tail_neon
  // Overwrite the staged keystream with zeros before the scratch area is freed.
  744. stp xzr,xzr,[sp,#0]
  745. stp xzr,xzr,[sp,#16]
  746. stp xzr,xzr,[sp,#32]
  747. stp xzr,xzr,[sp,#48]
  // Common exit. The `b.eq Ldone_neon` paths arrive here without wiping the
  // scratch area; none of them stored keystream to the stack.
  748. Ldone_neon:
  749. ldp x19,x20,[x29,#16]
  750. add sp,sp,#64
  751. ldp x21,x22,[x29,#32]
  752. ldp x23,x24,[x29,#48]
  753. ldp x25,x26,[x29,#64]
  754. ldp x27,x28,[x29,#80]
  755. ldp x29,x30,[sp],#96
  756. .long 0xd50323bf // autiasp
  757. ret
  758. .align 5
  759. ChaCha20_512_neon:
  760. .long 0xd503233f // paciasp
  761. stp x29,x30,[sp,#-96]!
  762. add x29,sp,#0
  763. adr x5,Lsigma
  764. stp x19,x20,[sp,#16]
  765. stp x21,x22,[sp,#32]
  766. stp x23,x24,[sp,#48]
  767. stp x25,x26,[sp,#64]
  768. stp x27,x28,[sp,#80]
  769. L512_or_more_neon:
  770. sub sp,sp,#128+64
  771. ldp x22,x23,[x5] // load sigma
  772. ld1 {v24.4s},[x5],#16
  773. ldp x24,x25,[x3] // load key
  774. ldp x26,x27,[x3,#16]
  775. ld1 {v25.4s,v26.4s},[x3]
  776. ldp x28,x30,[x4] // load counter
  777. ld1 {v27.4s},[x4]
  778. ld1 {v31.4s},[x5]
  779. #ifdef __ARMEB__
  780. rev64 v24.4s,v24.4s
  781. ror x24,x24,#32
  782. ror x25,x25,#32
  783. ror x26,x26,#32
  784. ror x27,x27,#32
  785. ror x28,x28,#32
  786. ror x30,x30,#32
  787. #endif
  788. add v27.4s,v27.4s,v31.4s // += 1
  789. stp q24,q25,[sp,#0] // off-load key block, invariant part
  790. add v27.4s,v27.4s,v31.4s // not typo
  791. str q26,[sp,#32]
  792. add v28.4s,v27.4s,v31.4s
  793. add v29.4s,v28.4s,v31.4s
  794. add v30.4s,v29.4s,v31.4s
  795. shl v31.4s,v31.4s,#2 // 1 -> 4
  796. stp d8,d9,[sp,#128+0] // meet ABI requirements
  797. stp d10,d11,[sp,#128+16]
  798. stp d12,d13,[sp,#128+32]
  799. stp d14,d15,[sp,#128+48]
  800. sub x2,x2,#512 // not typo
  801. Loop_outer_512_neon:
  802. mov v0.16b,v24.16b
  803. mov v4.16b,v24.16b
  804. mov v8.16b,v24.16b
  805. mov v12.16b,v24.16b
  806. mov v16.16b,v24.16b
  807. mov v20.16b,v24.16b
  808. mov v1.16b,v25.16b
  809. mov w5,w22 // unpack key block
  810. mov v5.16b,v25.16b
  811. lsr x6,x22,#32
  812. mov v9.16b,v25.16b
  813. mov w7,w23
  814. mov v13.16b,v25.16b
  815. lsr x8,x23,#32
  816. mov v17.16b,v25.16b
  817. mov w9,w24
  818. mov v21.16b,v25.16b
  819. lsr x10,x24,#32
  820. mov v3.16b,v27.16b
  821. mov w11,w25
  822. mov v7.16b,v28.16b
  823. lsr x12,x25,#32
  824. mov v11.16b,v29.16b
  825. mov w13,w26
  826. mov v15.16b,v30.16b
  827. lsr x14,x26,#32
  828. mov v2.16b,v26.16b
  829. mov w15,w27
  830. mov v6.16b,v26.16b
  831. lsr x16,x27,#32
  832. add v19.4s,v3.4s,v31.4s // +4
  833. mov w17,w28
  834. add v23.4s,v7.4s,v31.4s // +4
  835. lsr x19,x28,#32
  836. mov v10.16b,v26.16b
  837. mov w20,w30
  838. mov v14.16b,v26.16b
  839. lsr x21,x30,#32
  840. mov v18.16b,v26.16b
  841. stp q27,q28,[sp,#48] // off-load key block, variable part
  842. mov v22.16b,v26.16b
  843. str q29,[sp,#80]
  844. mov x4,#5
  845. subs x2,x2,#512
  846. Loop_upper_neon:
  847. sub x4,x4,#1
  848. add v0.4s,v0.4s,v1.4s
  849. add w5,w5,w9
  850. add v4.4s,v4.4s,v5.4s
  851. add w6,w6,w10
  852. add v8.4s,v8.4s,v9.4s
  853. add w7,w7,w11
  854. add v12.4s,v12.4s,v13.4s
  855. add w8,w8,w12
  856. add v16.4s,v16.4s,v17.4s
  857. eor w17,w17,w5
  858. add v20.4s,v20.4s,v21.4s
  859. eor w19,w19,w6
  860. eor v3.16b,v3.16b,v0.16b
  861. eor w20,w20,w7
  862. eor v7.16b,v7.16b,v4.16b
  863. eor w21,w21,w8
  864. eor v11.16b,v11.16b,v8.16b
  865. ror w17,w17,#16
  866. eor v15.16b,v15.16b,v12.16b
  867. ror w19,w19,#16
  868. eor v19.16b,v19.16b,v16.16b
  869. ror w20,w20,#16
  870. eor v23.16b,v23.16b,v20.16b
  871. ror w21,w21,#16
  872. rev32 v3.8h,v3.8h
  873. add w13,w13,w17
  874. rev32 v7.8h,v7.8h
  875. add w14,w14,w19
  876. rev32 v11.8h,v11.8h
  877. add w15,w15,w20
  878. rev32 v15.8h,v15.8h
  879. add w16,w16,w21
  880. rev32 v19.8h,v19.8h
  881. eor w9,w9,w13
  882. rev32 v23.8h,v23.8h
  883. eor w10,w10,w14
  884. add v2.4s,v2.4s,v3.4s
  885. eor w11,w11,w15
  886. add v6.4s,v6.4s,v7.4s
  887. eor w12,w12,w16
  888. add v10.4s,v10.4s,v11.4s
  889. ror w9,w9,#20
  890. add v14.4s,v14.4s,v15.4s
  891. ror w10,w10,#20
  892. add v18.4s,v18.4s,v19.4s
  893. ror w11,w11,#20
  894. add v22.4s,v22.4s,v23.4s
  895. ror w12,w12,#20
  896. eor v24.16b,v1.16b,v2.16b
  897. add w5,w5,w9
  898. eor v25.16b,v5.16b,v6.16b
  899. add w6,w6,w10
  900. eor v26.16b,v9.16b,v10.16b
  901. add w7,w7,w11
  902. eor v27.16b,v13.16b,v14.16b
  903. add w8,w8,w12
  904. eor v28.16b,v17.16b,v18.16b
  905. eor w17,w17,w5
  906. eor v29.16b,v21.16b,v22.16b
  907. eor w19,w19,w6
  908. ushr v1.4s,v24.4s,#20
  909. eor w20,w20,w7
  910. ushr v5.4s,v25.4s,#20
  911. eor w21,w21,w8
  912. ushr v9.4s,v26.4s,#20
  913. ror w17,w17,#24
  914. ushr v13.4s,v27.4s,#20
  915. ror w19,w19,#24
  916. ushr v17.4s,v28.4s,#20
  917. ror w20,w20,#24
  918. ushr v21.4s,v29.4s,#20
  919. ror w21,w21,#24
  920. sli v1.4s,v24.4s,#12
  921. add w13,w13,w17
  922. sli v5.4s,v25.4s,#12
  923. add w14,w14,w19
  924. sli v9.4s,v26.4s,#12
  925. add w15,w15,w20
  926. sli v13.4s,v27.4s,#12
  927. add w16,w16,w21
  928. sli v17.4s,v28.4s,#12
  929. eor w9,w9,w13
  930. sli v21.4s,v29.4s,#12
  931. eor w10,w10,w14
  932. add v0.4s,v0.4s,v1.4s
  933. eor w11,w11,w15
  934. add v4.4s,v4.4s,v5.4s
  935. eor w12,w12,w16
  936. add v8.4s,v8.4s,v9.4s
  937. ror w9,w9,#25
  938. add v12.4s,v12.4s,v13.4s
  939. ror w10,w10,#25
  940. add v16.4s,v16.4s,v17.4s
  941. ror w11,w11,#25
  942. add v20.4s,v20.4s,v21.4s
  943. ror w12,w12,#25
  944. eor v24.16b,v3.16b,v0.16b
  945. add w5,w5,w10
  946. eor v25.16b,v7.16b,v4.16b
  947. add w6,w6,w11
  948. eor v26.16b,v11.16b,v8.16b
  949. add w7,w7,w12
  950. eor v27.16b,v15.16b,v12.16b
  951. add w8,w8,w9
  952. eor v28.16b,v19.16b,v16.16b
  953. eor w21,w21,w5
  954. eor v29.16b,v23.16b,v20.16b
  955. eor w17,w17,w6
  956. ushr v3.4s,v24.4s,#24
  957. eor w19,w19,w7
  958. ushr v7.4s,v25.4s,#24
  959. eor w20,w20,w8
  960. ushr v11.4s,v26.4s,#24
  961. ror w21,w21,#16
  962. ushr v15.4s,v27.4s,#24
  963. ror w17,w17,#16
  964. ushr v19.4s,v28.4s,#24
  965. ror w19,w19,#16
  966. ushr v23.4s,v29.4s,#24
  967. ror w20,w20,#16
  968. sli v3.4s,v24.4s,#8
  969. add w15,w15,w21
  970. sli v7.4s,v25.4s,#8
  971. add w16,w16,w17
  972. sli v11.4s,v26.4s,#8
  973. add w13,w13,w19
  974. sli v15.4s,v27.4s,#8
  975. add w14,w14,w20
  976. sli v19.4s,v28.4s,#8
  977. eor w10,w10,w15
  978. sli v23.4s,v29.4s,#8
  979. eor w11,w11,w16
  980. add v2.4s,v2.4s,v3.4s
  981. eor w12,w12,w13
  982. add v6.4s,v6.4s,v7.4s
  983. eor w9,w9,w14
  984. add v10.4s,v10.4s,v11.4s
  985. ror w10,w10,#20
  986. add v14.4s,v14.4s,v15.4s
  987. ror w11,w11,#20
  988. add v18.4s,v18.4s,v19.4s
  989. ror w12,w12,#20
  990. add v22.4s,v22.4s,v23.4s
  991. ror w9,w9,#20
  992. eor v24.16b,v1.16b,v2.16b
  993. add w5,w5,w10
  994. eor v25.16b,v5.16b,v6.16b
  995. add w6,w6,w11
  996. eor v26.16b,v9.16b,v10.16b
  997. add w7,w7,w12
  998. eor v27.16b,v13.16b,v14.16b
  999. add w8,w8,w9
  1000. eor v28.16b,v17.16b,v18.16b
  1001. eor w21,w21,w5
  1002. eor v29.16b,v21.16b,v22.16b
  1003. eor w17,w17,w6
  1004. ushr v1.4s,v24.4s,#25
  1005. eor w19,w19,w7
  1006. ushr v5.4s,v25.4s,#25
  1007. eor w20,w20,w8
  1008. ushr v9.4s,v26.4s,#25
  1009. ror w21,w21,#24
  1010. ushr v13.4s,v27.4s,#25
  1011. ror w17,w17,#24
  1012. ushr v17.4s,v28.4s,#25
  1013. ror w19,w19,#24
  1014. ushr v21.4s,v29.4s,#25
  1015. ror w20,w20,#24
  1016. sli v1.4s,v24.4s,#7
  1017. add w15,w15,w21
  1018. sli v5.4s,v25.4s,#7
  1019. add w16,w16,w17
  1020. sli v9.4s,v26.4s,#7
  1021. add w13,w13,w19
  1022. sli v13.4s,v27.4s,#7
  1023. add w14,w14,w20
  1024. sli v17.4s,v28.4s,#7
  1025. eor w10,w10,w15
  1026. sli v21.4s,v29.4s,#7
  1027. eor w11,w11,w16
  1028. ext v2.16b,v2.16b,v2.16b,#8
  1029. eor w12,w12,w13
  1030. ext v6.16b,v6.16b,v6.16b,#8
  1031. eor w9,w9,w14
  1032. ext v10.16b,v10.16b,v10.16b,#8
  1033. ror w10,w10,#25
  1034. ext v14.16b,v14.16b,v14.16b,#8
  1035. ror w11,w11,#25
  1036. ext v18.16b,v18.16b,v18.16b,#8
  1037. ror w12,w12,#25
  1038. ext v22.16b,v22.16b,v22.16b,#8
  1039. ror w9,w9,#25
  1040. ext v3.16b,v3.16b,v3.16b,#12
  1041. ext v7.16b,v7.16b,v7.16b,#12
  1042. ext v11.16b,v11.16b,v11.16b,#12
  1043. ext v15.16b,v15.16b,v15.16b,#12
  1044. ext v19.16b,v19.16b,v19.16b,#12
  1045. ext v23.16b,v23.16b,v23.16b,#12
  1046. ext v1.16b,v1.16b,v1.16b,#4
  1047. ext v5.16b,v5.16b,v5.16b,#4
  1048. ext v9.16b,v9.16b,v9.16b,#4
  1049. ext v13.16b,v13.16b,v13.16b,#4
  1050. ext v17.16b,v17.16b,v17.16b,#4
  1051. ext v21.16b,v21.16b,v21.16b,#4
  1052. add v0.4s,v0.4s,v1.4s
  1053. add w5,w5,w9
  1054. add v4.4s,v4.4s,v5.4s
  1055. add w6,w6,w10
  1056. add v8.4s,v8.4s,v9.4s
  1057. add w7,w7,w11
  1058. add v12.4s,v12.4s,v13.4s
  1059. add w8,w8,w12
  1060. add v16.4s,v16.4s,v17.4s
  1061. eor w17,w17,w5
  1062. add v20.4s,v20.4s,v21.4s
  1063. eor w19,w19,w6
  1064. eor v3.16b,v3.16b,v0.16b
  1065. eor w20,w20,w7
  1066. eor v7.16b,v7.16b,v4.16b
  1067. eor w21,w21,w8
  1068. eor v11.16b,v11.16b,v8.16b
  1069. ror w17,w17,#16
  1070. eor v15.16b,v15.16b,v12.16b
  1071. ror w19,w19,#16
  1072. eor v19.16b,v19.16b,v16.16b
  1073. ror w20,w20,#16
  1074. eor v23.16b,v23.16b,v20.16b
  1075. ror w21,w21,#16
  1076. rev32 v3.8h,v3.8h
  1077. add w13,w13,w17
  1078. rev32 v7.8h,v7.8h
  1079. add w14,w14,w19
  1080. rev32 v11.8h,v11.8h
  1081. add w15,w15,w20
  1082. rev32 v15.8h,v15.8h
  1083. add w16,w16,w21
  1084. rev32 v19.8h,v19.8h
  1085. eor w9,w9,w13
  1086. rev32 v23.8h,v23.8h
  1087. eor w10,w10,w14
  1088. add v2.4s,v2.4s,v3.4s
  1089. eor w11,w11,w15
  1090. add v6.4s,v6.4s,v7.4s
  1091. eor w12,w12,w16
  1092. add v10.4s,v10.4s,v11.4s
  1093. ror w9,w9,#20
  1094. add v14.4s,v14.4s,v15.4s
  1095. ror w10,w10,#20
  1096. add v18.4s,v18.4s,v19.4s
  1097. ror w11,w11,#20
  1098. add v22.4s,v22.4s,v23.4s
  1099. ror w12,w12,#20
  1100. eor v24.16b,v1.16b,v2.16b
  1101. add w5,w5,w9
  1102. eor v25.16b,v5.16b,v6.16b
  1103. add w6,w6,w10
  1104. eor v26.16b,v9.16b,v10.16b
  1105. add w7,w7,w11
  1106. eor v27.16b,v13.16b,v14.16b
  1107. add w8,w8,w12
  1108. eor v28.16b,v17.16b,v18.16b
  1109. eor w17,w17,w5
  1110. eor v29.16b,v21.16b,v22.16b
  1111. eor w19,w19,w6
  1112. ushr v1.4s,v24.4s,#20
  1113. eor w20,w20,w7
  1114. ushr v5.4s,v25.4s,#20
  1115. eor w21,w21,w8
  1116. ushr v9.4s,v26.4s,#20
  1117. ror w17,w17,#24
  1118. ushr v13.4s,v27.4s,#20
  1119. ror w19,w19,#24
  1120. ushr v17.4s,v28.4s,#20
  1121. ror w20,w20,#24
  1122. ushr v21.4s,v29.4s,#20
  1123. ror w21,w21,#24
  1124. sli v1.4s,v24.4s,#12
  1125. add w13,w13,w17
  1126. sli v5.4s,v25.4s,#12
  1127. add w14,w14,w19
  1128. sli v9.4s,v26.4s,#12
  1129. add w15,w15,w20
  1130. sli v13.4s,v27.4s,#12
  1131. add w16,w16,w21
  1132. sli v17.4s,v28.4s,#12
  1133. eor w9,w9,w13
  1134. sli v21.4s,v29.4s,#12
  1135. eor w10,w10,w14
  1136. add v0.4s,v0.4s,v1.4s
  1137. eor w11,w11,w15
  1138. add v4.4s,v4.4s,v5.4s
  1139. eor w12,w12,w16
  1140. add v8.4s,v8.4s,v9.4s
  1141. ror w9,w9,#25
  1142. add v12.4s,v12.4s,v13.4s
  1143. ror w10,w10,#25
  1144. add v16.4s,v16.4s,v17.4s
  1145. ror w11,w11,#25
  1146. add v20.4s,v20.4s,v21.4s
  1147. ror w12,w12,#25
  1148. eor v24.16b,v3.16b,v0.16b
  1149. add w5,w5,w10
  1150. eor v25.16b,v7.16b,v4.16b
  1151. add w6,w6,w11
  1152. eor v26.16b,v11.16b,v8.16b
  1153. add w7,w7,w12
  1154. eor v27.16b,v15.16b,v12.16b
  1155. add w8,w8,w9
  1156. eor v28.16b,v19.16b,v16.16b
  1157. eor w21,w21,w5
  1158. eor v29.16b,v23.16b,v20.16b
  1159. eor w17,w17,w6
  1160. ushr v3.4s,v24.4s,#24
  1161. eor w19,w19,w7
  1162. ushr v7.4s,v25.4s,#24
  1163. eor w20,w20,w8
  1164. ushr v11.4s,v26.4s,#24
  1165. ror w21,w21,#16
  1166. ushr v15.4s,v27.4s,#24
  1167. ror w17,w17,#16
  1168. ushr v19.4s,v28.4s,#24
  1169. ror w19,w19,#16
  1170. ushr v23.4s,v29.4s,#24
  1171. ror w20,w20,#16
  1172. sli v3.4s,v24.4s,#8
  1173. add w15,w15,w21
  1174. sli v7.4s,v25.4s,#8
  1175. add w16,w16,w17
  1176. sli v11.4s,v26.4s,#8
  1177. add w13,w13,w19
  1178. sli v15.4s,v27.4s,#8
  1179. add w14,w14,w20
  1180. sli v19.4s,v28.4s,#8
  1181. eor w10,w10,w15
  1182. sli v23.4s,v29.4s,#8
  1183. eor w11,w11,w16
  1184. add v2.4s,v2.4s,v3.4s
  1185. eor w12,w12,w13
  1186. add v6.4s,v6.4s,v7.4s
  1187. eor w9,w9,w14
  1188. add v10.4s,v10.4s,v11.4s
  1189. ror w10,w10,#20
  1190. add v14.4s,v14.4s,v15.4s
  1191. ror w11,w11,#20
  1192. add v18.4s,v18.4s,v19.4s
  1193. ror w12,w12,#20
  1194. add v22.4s,v22.4s,v23.4s
  1195. ror w9,w9,#20
  1196. eor v24.16b,v1.16b,v2.16b
  1197. add w5,w5,w10
  1198. eor v25.16b,v5.16b,v6.16b
  1199. add w6,w6,w11
  1200. eor v26.16b,v9.16b,v10.16b
  1201. add w7,w7,w12
  1202. eor v27.16b,v13.16b,v14.16b
  1203. add w8,w8,w9
  1204. eor v28.16b,v17.16b,v18.16b
  1205. eor w21,w21,w5
  1206. eor v29.16b,v21.16b,v22.16b
  1207. eor w17,w17,w6
  1208. ushr v1.4s,v24.4s,#25
  1209. eor w19,w19,w7
  1210. ushr v5.4s,v25.4s,#25
  1211. eor w20,w20,w8
  1212. ushr v9.4s,v26.4s,#25
  1213. ror w21,w21,#24
  1214. ushr v13.4s,v27.4s,#25
  1215. ror w17,w17,#24
  1216. ushr v17.4s,v28.4s,#25
  1217. ror w19,w19,#24
  1218. ushr v21.4s,v29.4s,#25
  1219. ror w20,w20,#24
  1220. sli v1.4s,v24.4s,#7
  1221. add w15,w15,w21
  1222. sli v5.4s,v25.4s,#7
  1223. add w16,w16,w17
  1224. sli v9.4s,v26.4s,#7
  1225. add w13,w13,w19
  1226. sli v13.4s,v27.4s,#7
  1227. add w14,w14,w20
  1228. sli v17.4s,v28.4s,#7
  1229. eor w10,w10,w15
  1230. sli v21.4s,v29.4s,#7
  1231. eor w11,w11,w16
  1232. ext v2.16b,v2.16b,v2.16b,#8
  1233. eor w12,w12,w13
  1234. ext v6.16b,v6.16b,v6.16b,#8
  1235. eor w9,w9,w14
  1236. ext v10.16b,v10.16b,v10.16b,#8
  1237. ror w10,w10,#25
  1238. ext v14.16b,v14.16b,v14.16b,#8
  1239. ror w11,w11,#25
  1240. ext v18.16b,v18.16b,v18.16b,#8
  1241. ror w12,w12,#25
  1242. ext v22.16b,v22.16b,v22.16b,#8
  1243. ror w9,w9,#25
  1244. ext v3.16b,v3.16b,v3.16b,#4
  1245. ext v7.16b,v7.16b,v7.16b,#4
  1246. ext v11.16b,v11.16b,v11.16b,#4
  1247. ext v15.16b,v15.16b,v15.16b,#4
  1248. ext v19.16b,v19.16b,v19.16b,#4
  1249. ext v23.16b,v23.16b,v23.16b,#4
  1250. ext v1.16b,v1.16b,v1.16b,#12
  1251. ext v5.16b,v5.16b,v5.16b,#12
  1252. ext v9.16b,v9.16b,v9.16b,#12
  1253. ext v13.16b,v13.16b,v13.16b,#12
  1254. ext v17.16b,v17.16b,v17.16b,#12
  1255. ext v21.16b,v21.16b,v21.16b,#12
  1256. cbnz x4,Loop_upper_neon
  // Fall-through from `cbnz x4,Loop_upper_neon` (x4 == 0): finish the block
  // held in the scalar registers w5-w17,w19-w21, XOR it with 64 bytes of
  // input and store it, then reload the registers for the next scalar block.
  //
  // x22-x28 and x30 each carry two 32-bit key-block words.  The low word is
  // added with a 32-bit `add wN` (which also clears the upper half of xN);
  // the high word is added through `lsr#32` into the partner register, of
  // which only the low 32 bits survive the `lsl#32` in the pack step below.
  1257. add w5,w5,w22 // accumulate key block
  1258. add x6,x6,x22,lsr#32
  1259. add w7,w7,w23
  1260. add x8,x8,x23,lsr#32
  1261. add w9,w9,w24
  1262. add x10,x10,x24,lsr#32
  1263. add w11,w11,w25
  1264. add x12,x12,x25,lsr#32
  1265. add w13,w13,w26
  1266. add x14,x14,x26,lsr#32
  1267. add w15,w15,w27
  1268. add x16,x16,x27,lsr#32
  1269. add w17,w17,w28
  1270. add x19,x19,x28,lsr#32
  1271. add w20,w20,w30
  1272. add x21,x21,x30,lsr#32
  // Pack pairs of 32-bit words into 64-bit registers (low word | high word<<32).
  // The input loads from [x1] are interleaved and reuse the odd-word
  // registers (x6,x8,...) as soon as each has been folded into its partner.
  1273. add x5,x5,x6,lsl#32 // pack
  1274. add x7,x7,x8,lsl#32
  1275. ldp x6,x8,[x1,#0] // load input
  1276. add x9,x9,x10,lsl#32
  1277. add x11,x11,x12,lsl#32
  1278. ldp x10,x12,[x1,#16]
  1279. add x13,x13,x14,lsl#32
  1280. add x15,x15,x16,lsl#32
  1281. ldp x14,x16,[x1,#32]
  1282. add x17,x17,x19,lsl#32
  1283. add x20,x20,x21,lsl#32
  1284. ldp x19,x21,[x1,#48]
  // 64 input bytes consumed: advance the input pointer.
  1285. add x1,x1,#64
  // Big-endian builds only: byte-reverse each packed 64-bit register.
  1286. #ifdef __ARMEB__
  1287. rev x5,x5
  1288. rev x7,x7
  1289. rev x9,x9
  1290. rev x11,x11
  1291. rev x13,x13
  1292. rev x15,x15
  1293. rev x17,x17
  1294. rev x20,x20
  1295. #endif
  // output = packed block XOR input
  1296. eor x5,x5,x6
  1297. eor x7,x7,x8
  1298. eor x9,x9,x10
  1299. eor x11,x11,x12
  1300. eor x13,x13,x14
  1301. eor x15,x15,x16
  1302. eor x17,x17,x19
  1303. eor x20,x20,x21
  // Stores to [x0] are interleaved with reloading w5-w21 from the key block
  // in x22-x30.  x28 is incremented before it is unpacked into w17/x19, so
  // the next scalar block starts from the bumped counter.
  1304. stp x5,x7,[x0,#0] // store output
  1305. add x28,x28,#1 // increment counter
  1306. mov w5,w22 // unpack key block
  1307. lsr x6,x22,#32
  1308. stp x9,x11,[x0,#16]
  1309. mov w7,w23
  1310. lsr x8,x23,#32
  1311. stp x13,x15,[x0,#32]
  1312. mov w9,w24
  1313. lsr x10,x24,#32
  1314. stp x17,x20,[x0,#48]
  // 64 output bytes written: advance the output pointer.
  1315. add x0,x0,#64
  1316. mov w11,w25
  1317. lsr x12,x25,#32
  1318. mov w13,w26
  1319. lsr x14,x26,#32
  1320. mov w15,w27
  1321. lsr x16,x27,#32
  1322. mov w17,w28
  1323. lsr x19,x28,#32
  1324. mov w20,w30
  1325. lsr x21,x30,#32
  // Iteration count for Loop_lower_neon.
  1326. mov x4,#5
  // Loop_lower_neon: runs until x4 (set to 5 just before the label) reaches 0.
  //
  // Two independent instruction streams are interleaved line by line:
  //   * NEON: six 4-register blocks (v0-v3, v4-v7, ..., v20-v23; rows a,b,c,d
  //     of each block in consecutive registers) advance by ONE double round
  //     per iteration.  v24-v29 are scratch for the shift/insert rotates.
  //   * Scalar: one block in w5-w8 (row a), w9-w12 (row b), w13-w16 (row c),
  //     w17,w19,w20,w21 (row d) advances by TWO double rounds per iteration.
  //
  // Rotations (left-rotate amounts 16, 12, 8, 7):
  //   scalar: ror #16, #20, #24, #25 on 32-bit registers
  //   NEON:   rev32 on .8h lanes (16); ushr #20 + sli #12 (12);
  //           ushr #24 + sli #8 (8);   ushr #25 + sli #7 (7)
  1327. Loop_lower_neon:
  1328. sub x4,x4,#1
  // ---- First half.  NEON: round on rows as they stand (a+=b, d^=a, ...).
  //      Scalar: column round (w5,w9,w13,w17), ... starts here.
  1329. add v0.4s,v0.4s,v1.4s
  1330. add w5,w5,w9
  1331. add v4.4s,v4.4s,v5.4s
  1332. add w6,w6,w10
  1333. add v8.4s,v8.4s,v9.4s
  1334. add w7,w7,w11
  1335. add v12.4s,v12.4s,v13.4s
  1336. add w8,w8,w12
  1337. add v16.4s,v16.4s,v17.4s
  1338. eor w17,w17,w5
  1339. add v20.4s,v20.4s,v21.4s
  1340. eor w19,w19,w6
  1341. eor v3.16b,v3.16b,v0.16b
  1342. eor w20,w20,w7
  1343. eor v7.16b,v7.16b,v4.16b
  1344. eor w21,w21,w8
  1345. eor v11.16b,v11.16b,v8.16b
  1346. ror w17,w17,#16
  1347. eor v15.16b,v15.16b,v12.16b
  1348. ror w19,w19,#16
  1349. eor v19.16b,v19.16b,v16.16b
  1350. ror w20,w20,#16
  1351. eor v23.16b,v23.16b,v20.16b
  1352. ror w21,w21,#16
  1353. rev32 v3.8h,v3.8h
  1354. add w13,w13,w17
  1355. rev32 v7.8h,v7.8h
  1356. add w14,w14,w19
  1357. rev32 v11.8h,v11.8h
  1358. add w15,w15,w20
  1359. rev32 v15.8h,v15.8h
  1360. add w16,w16,w21
  1361. rev32 v19.8h,v19.8h
  1362. eor w9,w9,w13
  1363. rev32 v23.8h,v23.8h
  1364. eor w10,w10,w14
  1365. add v2.4s,v2.4s,v3.4s
  1366. eor w11,w11,w15
  1367. add v6.4s,v6.4s,v7.4s
  1368. eor w12,w12,w16
  1369. add v10.4s,v10.4s,v11.4s
  1370. ror w9,w9,#20
  1371. add v14.4s,v14.4s,v15.4s
  1372. ror w10,w10,#20
  1373. add v18.4s,v18.4s,v19.4s
  1374. ror w11,w11,#20
  1375. add v22.4s,v22.4s,v23.4s
  1376. ror w12,w12,#20
  // NEON: t = b ^ c into scratch v24-v29, then b = rotl(t,12) via ushr+sli.
  1377. eor v24.16b,v1.16b,v2.16b
  1378. add w5,w5,w9
  1379. eor v25.16b,v5.16b,v6.16b
  1380. add w6,w6,w10
  1381. eor v26.16b,v9.16b,v10.16b
  1382. add w7,w7,w11
  1383. eor v27.16b,v13.16b,v14.16b
  1384. add w8,w8,w12
  1385. eor v28.16b,v17.16b,v18.16b
  1386. eor w17,w17,w5
  1387. eor v29.16b,v21.16b,v22.16b
  1388. eor w19,w19,w6
  1389. ushr v1.4s,v24.4s,#20
  1390. eor w20,w20,w7
  1391. ushr v5.4s,v25.4s,#20
  1392. eor w21,w21,w8
  1393. ushr v9.4s,v26.4s,#20
  1394. ror w17,w17,#24
  1395. ushr v13.4s,v27.4s,#20
  1396. ror w19,w19,#24
  1397. ushr v17.4s,v28.4s,#20
  1398. ror w20,w20,#24
  1399. ushr v21.4s,v29.4s,#20
  1400. ror w21,w21,#24
  1401. sli v1.4s,v24.4s,#12
  1402. add w13,w13,w17
  1403. sli v5.4s,v25.4s,#12
  1404. add w14,w14,w19
  1405. sli v9.4s,v26.4s,#12
  1406. add w15,w15,w20
  1407. sli v13.4s,v27.4s,#12
  1408. add w16,w16,w21
  1409. sli v17.4s,v28.4s,#12
  1410. eor w9,w9,w13
  1411. sli v21.4s,v29.4s,#12
  1412. eor w10,w10,w14
  1413. add v0.4s,v0.4s,v1.4s
  1414. eor w11,w11,w15
  1415. add v4.4s,v4.4s,v5.4s
  1416. eor w12,w12,w16
  1417. add v8.4s,v8.4s,v9.4s
  1418. ror w9,w9,#25
  1419. add v12.4s,v12.4s,v13.4s
  1420. ror w10,w10,#25
  1421. add v16.4s,v16.4s,v17.4s
  1422. ror w11,w11,#25
  1423. add v20.4s,v20.4s,v21.4s
  1424. ror w12,w12,#25
  // Scalar: diagonal round (w5,w10,w15,w21), (w6,w11,w16,w17),
  // (w7,w12,w13,w19), (w8,w9,w14,w20) starts here.
  // NEON: t = d ^ a, then d = rotl(t,8).
  1425. eor v24.16b,v3.16b,v0.16b
  1426. add w5,w5,w10
  1427. eor v25.16b,v7.16b,v4.16b
  1428. add w6,w6,w11
  1429. eor v26.16b,v11.16b,v8.16b
  1430. add w7,w7,w12
  1431. eor v27.16b,v15.16b,v12.16b
  1432. add w8,w8,w9
  1433. eor v28.16b,v19.16b,v16.16b
  1434. eor w21,w21,w5
  1435. eor v29.16b,v23.16b,v20.16b
  1436. eor w17,w17,w6
  1437. ushr v3.4s,v24.4s,#24
  1438. eor w19,w19,w7
  1439. ushr v7.4s,v25.4s,#24
  1440. eor w20,w20,w8
  1441. ushr v11.4s,v26.4s,#24
  1442. ror w21,w21,#16
  1443. ushr v15.4s,v27.4s,#24
  1444. ror w17,w17,#16
  1445. ushr v19.4s,v28.4s,#24
  1446. ror w19,w19,#16
  1447. ushr v23.4s,v29.4s,#24
  1448. ror w20,w20,#16
  1449. sli v3.4s,v24.4s,#8
  1450. add w15,w15,w21
  1451. sli v7.4s,v25.4s,#8
  1452. add w16,w16,w17
  1453. sli v11.4s,v26.4s,#8
  1454. add w13,w13,w19
  1455. sli v15.4s,v27.4s,#8
  1456. add w14,w14,w20
  1457. sli v19.4s,v28.4s,#8
  1458. eor w10,w10,w15
  1459. sli v23.4s,v29.4s,#8
  1460. eor w11,w11,w16
  1461. add v2.4s,v2.4s,v3.4s
  1462. eor w12,w12,w13
  1463. add v6.4s,v6.4s,v7.4s
  1464. eor w9,w9,w14
  1465. add v10.4s,v10.4s,v11.4s
  1466. ror w10,w10,#20
  1467. add v14.4s,v14.4s,v15.4s
  1468. ror w11,w11,#20
  1469. add v18.4s,v18.4s,v19.4s
  1470. ror w12,w12,#20
  1471. add v22.4s,v22.4s,v23.4s
  1472. ror w9,w9,#20
  // NEON: t = b ^ c, then b = rotl(t,7).
  1473. eor v24.16b,v1.16b,v2.16b
  1474. add w5,w5,w10
  1475. eor v25.16b,v5.16b,v6.16b
  1476. add w6,w6,w11
  1477. eor v26.16b,v9.16b,v10.16b
  1478. add w7,w7,w12
  1479. eor v27.16b,v13.16b,v14.16b
  1480. add w8,w8,w9
  1481. eor v28.16b,v17.16b,v18.16b
  1482. eor w21,w21,w5
  1483. eor v29.16b,v21.16b,v22.16b
  1484. eor w17,w17,w6
  1485. ushr v1.4s,v24.4s,#25
  1486. eor w19,w19,w7
  1487. ushr v5.4s,v25.4s,#25
  1488. eor w20,w20,w8
  1489. ushr v9.4s,v26.4s,#25
  1490. ror w21,w21,#24
  1491. ushr v13.4s,v27.4s,#25
  1492. ror w17,w17,#24
  1493. ushr v17.4s,v28.4s,#25
  1494. ror w19,w19,#24
  1495. ushr v21.4s,v29.4s,#25
  1496. ror w20,w20,#24
  1497. sli v1.4s,v24.4s,#7
  1498. add w15,w15,w21
  1499. sli v5.4s,v25.4s,#7
  1500. add w16,w16,w17
  1501. sli v9.4s,v26.4s,#7
  1502. add w13,w13,w19
  1503. sli v13.4s,v27.4s,#7
  1504. add w14,w14,w20
  1505. sli v17.4s,v28.4s,#7
  1506. eor w10,w10,w15
  1507. sli v21.4s,v29.4s,#7
  1508. eor w11,w11,w16
  // NEON: rotate row c of every block by 8 bytes (two 32-bit lanes);
  // scalar finishes its first double round alongside.
  1509. ext v2.16b,v2.16b,v2.16b,#8
  1510. eor w12,w12,w13
  1511. ext v6.16b,v6.16b,v6.16b,#8
  1512. eor w9,w9,w14
  1513. ext v10.16b,v10.16b,v10.16b,#8
  1514. ror w10,w10,#25
  1515. ext v14.16b,v14.16b,v14.16b,#8
  1516. ror w11,w11,#25
  1517. ext v18.16b,v18.16b,v18.16b,#8
  1518. ror w12,w12,#25
  1519. ext v22.16b,v22.16b,v22.16b,#8
  1520. ror w9,w9,#25
  // NEON: rotate row d by 12 bytes and row b by 4 bytes, so the same
  // lane-wise code in the second half operates on the diagonals.
  1521. ext v3.16b,v3.16b,v3.16b,#12
  1522. ext v7.16b,v7.16b,v7.16b,#12
  1523. ext v11.16b,v11.16b,v11.16b,#12
  1524. ext v15.16b,v15.16b,v15.16b,#12
  1525. ext v19.16b,v19.16b,v19.16b,#12
  1526. ext v23.16b,v23.16b,v23.16b,#12
  1527. ext v1.16b,v1.16b,v1.16b,#4
  1528. ext v5.16b,v5.16b,v5.16b,#4
  1529. ext v9.16b,v9.16b,v9.16b,#4
  1530. ext v13.16b,v13.16b,v13.16b,#4
  1531. ext v17.16b,v17.16b,v17.16b,#4
  1532. ext v21.16b,v21.16b,v21.16b,#4
  // ---- Second half.  NEON: same round on the rotated rows (diagonal round).
  //      Scalar: second double round (column round first, then diagonal).
  1533. add v0.4s,v0.4s,v1.4s
  1534. add w5,w5,w9
  1535. add v4.4s,v4.4s,v5.4s
  1536. add w6,w6,w10
  1537. add v8.4s,v8.4s,v9.4s
  1538. add w7,w7,w11
  1539. add v12.4s,v12.4s,v13.4s
  1540. add w8,w8,w12
  1541. add v16.4s,v16.4s,v17.4s
  1542. eor w17,w17,w5
  1543. add v20.4s,v20.4s,v21.4s
  1544. eor w19,w19,w6
  1545. eor v3.16b,v3.16b,v0.16b
  1546. eor w20,w20,w7
  1547. eor v7.16b,v7.16b,v4.16b
  1548. eor w21,w21,w8
  1549. eor v11.16b,v11.16b,v8.16b
  1550. ror w17,w17,#16
  1551. eor v15.16b,v15.16b,v12.16b
  1552. ror w19,w19,#16
  1553. eor v19.16b,v19.16b,v16.16b
  1554. ror w20,w20,#16
  1555. eor v23.16b,v23.16b,v20.16b
  1556. ror w21,w21,#16
  1557. rev32 v3.8h,v3.8h
  1558. add w13,w13,w17
  1559. rev32 v7.8h,v7.8h
  1560. add w14,w14,w19
  1561. rev32 v11.8h,v11.8h
  1562. add w15,w15,w20
  1563. rev32 v15.8h,v15.8h
  1564. add w16,w16,w21
  1565. rev32 v19.8h,v19.8h
  1566. eor w9,w9,w13
  1567. rev32 v23.8h,v23.8h
  1568. eor w10,w10,w14
  1569. add v2.4s,v2.4s,v3.4s
  1570. eor w11,w11,w15
  1571. add v6.4s,v6.4s,v7.4s
  1572. eor w12,w12,w16
  1573. add v10.4s,v10.4s,v11.4s
  1574. ror w9,w9,#20
  1575. add v14.4s,v14.4s,v15.4s
  1576. ror w10,w10,#20
  1577. add v18.4s,v18.4s,v19.4s
  1578. ror w11,w11,#20
  1579. add v22.4s,v22.4s,v23.4s
  1580. ror w12,w12,#20
  1581. eor v24.16b,v1.16b,v2.16b
  1582. add w5,w5,w9
  1583. eor v25.16b,v5.16b,v6.16b
  1584. add w6,w6,w10
  1585. eor v26.16b,v9.16b,v10.16b
  1586. add w7,w7,w11
  1587. eor v27.16b,v13.16b,v14.16b
  1588. add w8,w8,w12
  1589. eor v28.16b,v17.16b,v18.16b
  1590. eor w17,w17,w5
  1591. eor v29.16b,v21.16b,v22.16b
  1592. eor w19,w19,w6
  1593. ushr v1.4s,v24.4s,#20
  1594. eor w20,w20,w7
  1595. ushr v5.4s,v25.4s,#20
  1596. eor w21,w21,w8
  1597. ushr v9.4s,v26.4s,#20
  1598. ror w17,w17,#24
  1599. ushr v13.4s,v27.4s,#20
  1600. ror w19,w19,#24
  1601. ushr v17.4s,v28.4s,#20
  1602. ror w20,w20,#24
  1603. ushr v21.4s,v29.4s,#20
  1604. ror w21,w21,#24
  1605. sli v1.4s,v24.4s,#12
  1606. add w13,w13,w17
  1607. sli v5.4s,v25.4s,#12
  1608. add w14,w14,w19
  1609. sli v9.4s,v26.4s,#12
  1610. add w15,w15,w20
  1611. sli v13.4s,v27.4s,#12
  1612. add w16,w16,w21
  1613. sli v17.4s,v28.4s,#12
  1614. eor w9,w9,w13
  1615. sli v21.4s,v29.4s,#12
  1616. eor w10,w10,w14
  1617. add v0.4s,v0.4s,v1.4s
  1618. eor w11,w11,w15
  1619. add v4.4s,v4.4s,v5.4s
  1620. eor w12,w12,w16
  1621. add v8.4s,v8.4s,v9.4s
  1622. ror w9,w9,#25
  1623. add v12.4s,v12.4s,v13.4s
  1624. ror w10,w10,#25
  1625. add v16.4s,v16.4s,v17.4s
  1626. ror w11,w11,#25
  1627. add v20.4s,v20.4s,v21.4s
  1628. ror w12,w12,#25
  // Scalar: diagonal round of the second double round starts here.
  1629. eor v24.16b,v3.16b,v0.16b
  1630. add w5,w5,w10
  1631. eor v25.16b,v7.16b,v4.16b
  1632. add w6,w6,w11
  1633. eor v26.16b,v11.16b,v8.16b
  1634. add w7,w7,w12
  1635. eor v27.16b,v15.16b,v12.16b
  1636. add w8,w8,w9
  1637. eor v28.16b,v19.16b,v16.16b
  1638. eor w21,w21,w5
  1639. eor v29.16b,v23.16b,v20.16b
  1640. eor w17,w17,w6
  1641. ushr v3.4s,v24.4s,#24
  1642. eor w19,w19,w7
  1643. ushr v7.4s,v25.4s,#24
  1644. eor w20,w20,w8
  1645. ushr v11.4s,v26.4s,#24
  1646. ror w21,w21,#16
  1647. ushr v15.4s,v27.4s,#24
  1648. ror w17,w17,#16
  1649. ushr v19.4s,v28.4s,#24
  1650. ror w19,w19,#16
  1651. ushr v23.4s,v29.4s,#24
  1652. ror w20,w20,#16
  1653. sli v3.4s,v24.4s,#8
  1654. add w15,w15,w21
  1655. sli v7.4s,v25.4s,#8
  1656. add w16,w16,w17
  1657. sli v11.4s,v26.4s,#8
  1658. add w13,w13,w19
  1659. sli v15.4s,v27.4s,#8
  1660. add w14,w14,w20
  1661. sli v19.4s,v28.4s,#8
  1662. eor w10,w10,w15
  1663. sli v23.4s,v29.4s,#8
  1664. eor w11,w11,w16
  1665. add v2.4s,v2.4s,v3.4s
  1666. eor w12,w12,w13
  1667. add v6.4s,v6.4s,v7.4s
  1668. eor w9,w9,w14
  1669. add v10.4s,v10.4s,v11.4s
  1670. ror w10,w10,#20
  1671. add v14.4s,v14.4s,v15.4s
  1672. ror w11,w11,#20
  1673. add v18.4s,v18.4s,v19.4s
  1674. ror w12,w12,#20
  1675. add v22.4s,v22.4s,v23.4s
  1676. ror w9,w9,#20
  1677. eor v24.16b,v1.16b,v2.16b
  1678. add w5,w5,w10
  1679. eor v25.16b,v5.16b,v6.16b
  1680. add w6,w6,w11
  1681. eor v26.16b,v9.16b,v10.16b
  1682. add w7,w7,w12
  1683. eor v27.16b,v13.16b,v14.16b
  1684. add w8,w8,w9
  1685. eor v28.16b,v17.16b,v18.16b
  1686. eor w21,w21,w5
  1687. eor v29.16b,v21.16b,v22.16b
  1688. eor w17,w17,w6
  1689. ushr v1.4s,v24.4s,#25
  1690. eor w19,w19,w7
  1691. ushr v5.4s,v25.4s,#25
  1692. eor w20,w20,w8
  1693. ushr v9.4s,v26.4s,#25
  1694. ror w21,w21,#24
  1695. ushr v13.4s,v27.4s,#25
  1696. ror w17,w17,#24
  1697. ushr v17.4s,v28.4s,#25
  1698. ror w19,w19,#24
  1699. ushr v21.4s,v29.4s,#25
  1700. ror w20,w20,#24
  1701. sli v1.4s,v24.4s,#7
  1702. add w15,w15,w21
  1703. sli v5.4s,v25.4s,#7
  1704. add w16,w16,w17
  1705. sli v9.4s,v26.4s,#7
  1706. add w13,w13,w19
  1707. sli v13.4s,v27.4s,#7
  1708. add w14,w14,w20
  1709. sli v17.4s,v28.4s,#7
  1710. eor w10,w10,w15
  1711. sli v21.4s,v29.4s,#7
  1712. eor w11,w11,w16
  // NEON: rotate row c by 8 bytes again (two such rotations restore it).
  1713. ext v2.16b,v2.16b,v2.16b,#8
  1714. eor w12,w12,w13
  1715. ext v6.16b,v6.16b,v6.16b,#8
  1716. eor w9,w9,w14
  1717. ext v10.16b,v10.16b,v10.16b,#8
  1718. ror w10,w10,#25
  1719. ext v14.16b,v14.16b,v14.16b,#8
  1720. ror w11,w11,#25
  1721. ext v18.16b,v18.16b,v18.16b,#8
  1722. ror w12,w12,#25
  1723. ext v22.16b,v22.16b,v22.16b,#8
  1724. ror w9,w9,#25
  // NEON: undo the first-half rotation (row d by 4 bytes, row b by 12 bytes;
  // 12+4 = 16 bytes in total per row), restoring the original row layout.
  1725. ext v3.16b,v3.16b,v3.16b,#4
  1726. ext v7.16b,v7.16b,v7.16b,#4
  1727. ext v11.16b,v11.16b,v11.16b,#4
  1728. ext v15.16b,v15.16b,v15.16b,#4
  1729. ext v19.16b,v19.16b,v19.16b,#4
  1730. ext v23.16b,v23.16b,v23.16b,#4
  1731. ext v1.16b,v1.16b,v1.16b,#12
  1732. ext v5.16b,v5.16b,v5.16b,#12
  1733. ext v9.16b,v9.16b,v9.16b,#12
  1734. ext v13.16b,v13.16b,v13.16b,#12
  1735. ext v17.16b,v17.16b,v17.16b,#12
  1736. ext v21.16b,v21.16b,v21.16b,#12
  // Repeat while the iteration counter is non-zero.
  1737. cbnz x4,Loop_lower_neon
  // Fall-through from Loop_lower_neon (x4 == 0): finish the second scalar
  // block and the six NEON blocks, XOR everything with input and store it,
  // advance the counters, then loop or dispatch the tail.
  //
  // Scalar stream: same accumulate / pack / XOR / store sequence as after
  // Loop_upper_neon.  NEON stream (interleaved): v24-v29 were scratch inside
  // the round loops, so they are first reloaded from [sp,#0..#95] -- the
  // "off-load area" wiped further down (presumably filled before the round
  // loops; the stores are outside this excerpt).
  1738. add w5,w5,w22 // accumulate key block
  1739. ldp q24,q25,[sp,#0]
  1740. add x6,x6,x22,lsr#32
  1741. ldp q26,q27,[sp,#32]
  1742. add w7,w7,w23
  1743. ldp q28,q29,[sp,#64]
  1744. add x8,x8,x23,lsr#32
  // v24 is added to row a of all six NEON blocks.
  1745. add v0.4s,v0.4s,v24.4s
  1746. add w9,w9,w24
  1747. add v4.4s,v4.4s,v24.4s
  1748. add x10,x10,x24,lsr#32
  1749. add v8.4s,v8.4s,v24.4s
  1750. add w11,w11,w25
  1751. add v12.4s,v12.4s,v24.4s
  1752. add x12,x12,x25,lsr#32
  1753. add v16.4s,v16.4s,v24.4s
  1754. add w13,w13,w26
  1755. add v20.4s,v20.4s,v24.4s
  1756. add x14,x14,x26,lsr#32
  // v26 is added to row c of all six NEON blocks.
  1757. add v2.4s,v2.4s,v26.4s
  1758. add w15,w15,w27
  1759. add v6.4s,v6.4s,v26.4s
  1760. add x16,x16,x27,lsr#32
  1761. add v10.4s,v10.4s,v26.4s
  1762. add w17,w17,w28
  1763. add v14.4s,v14.4s,v26.4s
  1764. add x19,x19,x28,lsr#32
  1765. add v18.4s,v18.4s,v26.4s
  1766. add w20,w20,w30
  1767. add v22.4s,v22.4s,v26.4s
  1768. add x21,x21,x30,lsr#32
  // Row d differs per block: blocks 1-4 get v27,v28,v29,v30; blocks 5 and 6
  // get v27 and v28 again plus the constant in v31 (the "+4" below).
  1769. add v19.4s,v19.4s,v31.4s // +4
  1770. add x5,x5,x6,lsl#32 // pack
  1771. add v23.4s,v23.4s,v31.4s // +4
  1772. add x7,x7,x8,lsl#32
  1773. add v3.4s,v3.4s,v27.4s
  1774. ldp x6,x8,[x1,#0] // load input
  1775. add v7.4s,v7.4s,v28.4s
  1776. add x9,x9,x10,lsl#32
  1777. add v11.4s,v11.4s,v29.4s
  1778. add x11,x11,x12,lsl#32
  1779. add v15.4s,v15.4s,v30.4s
  1780. ldp x10,x12,[x1,#16]
  1781. add v19.4s,v19.4s,v27.4s
  1782. add x13,x13,x14,lsl#32
  1783. add v23.4s,v23.4s,v28.4s
  1784. add x15,x15,x16,lsl#32
  // v25 is added to row b of all six NEON blocks.
  1785. add v1.4s,v1.4s,v25.4s
  1786. ldp x14,x16,[x1,#32]
  1787. add v5.4s,v5.4s,v25.4s
  1788. add x17,x17,x19,lsl#32
  1789. add v9.4s,v9.4s,v25.4s
  1790. add x20,x20,x21,lsl#32
  1791. add v13.4s,v13.4s,v25.4s
  1792. ldp x19,x21,[x1,#48]
  1793. add v17.4s,v17.4s,v25.4s
  1794. add x1,x1,#64
  1795. add v21.4s,v21.4s,v25.4s
  // Big-endian builds only: byte-reverse the packed scalar registers.
  1796. #ifdef __ARMEB__
  1797. rev x5,x5
  1798. rev x7,x7
  1799. rev x9,x9
  1800. rev x11,x11
  1801. rev x13,x13
  1802. rev x15,x15
  1803. rev x17,x17
  1804. rev x20,x20
  1805. #endif
  // From here on v24-v27 (and later v0-v15, once each group has been stored)
  // are reused as 64-byte input buffers; every ld1/st1 post-increments its
  // pointer by 64.
  1806. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1807. eor x5,x5,x6
  1808. eor x7,x7,x8
  1809. eor x9,x9,x10
  1810. eor x11,x11,x12
  1811. eor x13,x13,x14
  1812. eor v0.16b,v0.16b,v24.16b
  1813. eor x15,x15,x16
  1814. eor v1.16b,v1.16b,v25.16b
  1815. eor x17,x17,x19
  1816. eor v2.16b,v2.16b,v26.16b
  1817. eor x20,x20,x21
  1818. eor v3.16b,v3.16b,v27.16b
  1819. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  // Scalar block out.  x28 += 7 here; with the += 1 after Loop_upper_neon the
  // scalar counter advances by 8 per pass, matching the vector "+= 8" below.
  1820. stp x5,x7,[x0,#0] // store output
  1821. add x28,x28,#7 // increment counter
  1822. stp x9,x11,[x0,#16]
  1823. stp x13,x15,[x0,#32]
  1824. stp x17,x20,[x0,#48]
  1825. add x0,x0,#64
  // NEON block 1 out; v0-v3 become the input buffer for block 3.
  1826. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  1827. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  1828. eor v4.16b,v4.16b,v24.16b
  1829. eor v5.16b,v5.16b,v25.16b
  1830. eor v6.16b,v6.16b,v26.16b
  1831. eor v7.16b,v7.16b,v27.16b
  // NEON block 2 out; v4-v7 become the input buffer for block 4.
  1832. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  1833. ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
  // v24-v27 are no longer needed as input buffers: reload them from the
  // off-load area (the two ld1 above overwrote them).
  1834. eor v8.16b,v8.16b,v0.16b
  1835. ldp q24,q25,[sp,#0]
  1836. eor v9.16b,v9.16b,v1.16b
  1837. ldp q26,q27,[sp,#32]
  1838. eor v10.16b,v10.16b,v2.16b
  1839. eor v11.16b,v11.16b,v3.16b
  // NEON block 3 out; v8-v11 become the input buffer for block 5.
  1840. st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
  1841. ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
  1842. eor v12.16b,v12.16b,v4.16b
  1843. eor v13.16b,v13.16b,v5.16b
  1844. eor v14.16b,v14.16b,v6.16b
  1845. eor v15.16b,v15.16b,v7.16b
  // NEON block 4 out; v12-v15 become the input buffer for block 6.
  1846. st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
  1847. ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
  1848. eor v16.16b,v16.16b,v8.16b
  1849. eor v17.16b,v17.16b,v9.16b
  1850. eor v18.16b,v18.16b,v10.16b
  1851. eor v19.16b,v19.16b,v11.16b
  // NEON block 5 out.  v0 is free now and receives v31 << 1.
  1852. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  1853. shl v0.4s,v31.4s,#1 // 4 -> 8
  1854. eor v20.16b,v20.16b,v12.16b
  1855. eor v21.16b,v21.16b,v13.16b
  1856. eor v22.16b,v22.16b,v14.16b
  1857. eor v23.16b,v23.16b,v15.16b
  // NEON block 6 out, then advance the four row-d vectors for the next pass.
  1858. st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
  1859. add v27.4s,v27.4s,v0.4s // += 8
  1860. add v28.4s,v28.4s,v0.4s
  1861. add v29.4s,v29.4s,v0.4s
  1862. add v30.4s,v30.4s,v0.4s
  // No instruction in the visible part of the loop body writes NZCV, so this
  // tests flags set before this excerpt -- presumably a length `subs` at
  // Loop_outer_512_neon; confirm against the loop head.
  1863. b.hs Loop_outer_512_neon
  // Tail handling.  Add 512 back to x2; Z is set if nothing is left.  The
  // flags stay live down to the b.eq (the instructions between do not touch
  // them).
  1864. adds x2,x2,#512
  1865. ushr v0.4s,v31.4s,#2 // 4 -> 1
  // Restore d8-d15 (low halves of v8-v15, callee-saved under AAPCS64) from
  // the 64 bytes above the off-load area.
  1866. ldp d8,d9,[sp,#128+0] // meet ABI requirements
  1867. ldp d10,d11,[sp,#128+16]
  1868. ldp d12,d13,[sp,#128+32]
  1869. ldp d14,d15,[sp,#128+48]
  // Overwrite [sp,#0..#95] with the current contents of v24 and v31.
  1870. stp q24,q31,[sp,#0] // wipe off-load area
  1871. stp q24,q31,[sp,#32]
  1872. stp q24,q31,[sp,#64]
  // x2 == 0 after the adds: everything processed.
  1873. b.eq Ldone_512_neon
  // Otherwise step v27-v29 back by one (v0 = v31 >> 2), release the 128-byte
  // area, and continue at Loop_outer_neon when x2 >= 192 (unsigned).
  1874. cmp x2,#192
  1875. sub v27.4s,v27.4s,v0.4s // -= 1
  1876. sub v28.4s,v28.4s,v0.4s
  1877. sub v29.4s,v29.4s,v0.4s
  1878. add sp,sp,#128
  1879. b.hs Loop_outer_neon
  // x2 < 192: zero v25-v30 and continue at Loop_outer.
  1880. eor v25.16b,v25.16b,v25.16b
  1881. eor v26.16b,v26.16b,v26.16b
  1882. eor v27.16b,v27.16b,v27.16b
  1883. eor v28.16b,v28.16b,v28.16b
  1884. eor v29.16b,v29.16b,v29.16b
  1885. eor v30.16b,v30.16b,v30.16b
  1886. b Loop_outer
  // Ldone_512_neon: function exit, reached by `b.eq` when x2 is zero after
  // `adds x2,x2,#512`.  d8-d15 have already been restored and the off-load
  // area overwritten before the branch.
  1887. Ldone_512_neon:
  // Reload x19-x28 from offsets 16..95 of the frame addressed by x29.
  1888. ldp x19,x20,[x29,#16]
  // Release the 128-byte off-load area plus the 64 bytes d8-d15 were read from.
  1889. add sp,sp,#128+64
  1890. ldp x21,x22,[x29,#32]
  1891. ldp x23,x24,[x29,#48]
  1892. ldp x25,x26,[x29,#64]
  1893. ldp x27,x28,[x29,#80]
  // Restore frame pointer and link register, popping 96 bytes (post-index).
  1894. ldp x29,x30,[sp],#96
  // AUTIASP emitted as a raw word: authenticates x30 against SP.  It sits in
  // the HINT space, so it executes as a NOP without pointer authentication.
  // NOTE(review): presumably paired with a PACIASP at function entry, which
  // is outside this excerpt -- confirm.
  1895. .long 0xd50323bf // autiasp
  1896. ret