chacha-armv8.S 39 KB


  1. #include "arm_arch.h"
  // Constants used by the ChaCha20 routines in this file. They are placed in
  // .text and addressed PC-relatively (adr) by the code that follows.
  2. .text
  3. .hidden OPENSSL_armcap_P
  4. .align 5
  // First four words of the ChaCha state. Viewed as little-endian bytes the
  // two quads spell the ASCII text "expand 32-byte k".
  5. .Lsigma:
  6. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  // Vector {1,0,0,0}. It sits directly after the 16 bytes of .Lsigma, so a
  // post-incremented load of .Lsigma leaves the pointer here. The NEON code
  // adds it to the counter row to step the first counter word.
  7. .Lone:
  8. .long 1,0,0,0
  // Distance from this slot to OPENSSL_armcap_P (32-bit under ILP32, 64-bit
  // otherwise). ChaCha20_ctr32 adds the slot address to this value to reach
  // the capability word without needing an absolute relocation.
  9. .LOPENSSL_armcap_P:
  10. #ifdef __ILP32__
  11. .long OPENSSL_armcap_P-.
  12. #else
  13. .quad OPENSSL_armcap_P-.
  14. #endif
  // NUL-terminated ASCII identification string:
  // ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
  15. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  16. .align 2
  // ---------------------------------------------------------------------
  // ChaCha20_ctr32: XOR a buffer with the ChaCha20 key stream.
  //
  // Register arguments, as consumed by the code below:
  //   x0 = output pointer
  //   x1 = input pointer
  //   x2 = length in bytes (zero returns immediately)
  //   x3 = pointer to the 32-byte key
  //   x4 = pointer to the 16-byte counter block (state words 12..15)
  // NOTE(review): presumably matches a C prototype of the form
  //   (out, in, len, key, counter) - confirm against the declaring header.
  //
  // Dispatch: lengths of 192 bytes or more are handed to ChaCha20_neon when
  // the ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else runs the
  // scalar code at .Lshort, one 64-byte block per outer iteration.
  //
  // Register roles in the scalar path:
  //   x22,x23         sigma constants (state words 0..3), two words each
  //   x24..x27        key (state words 4..11)
  //   x28,x30         counter block (state words 12..15)
  //   w5..w17,w19..w21  the sixteen working state words (x18 is not used)
  //   x4              round-loop counter (x4 as an argument is dead by then)
  // Stack: 96-byte frame holding x29/x30 and x19..x28, plus 64 bytes of
  // scratch at sp that the tail code uses for one key-stream block.
  // ---------------------------------------------------------------------
  17. .globl ChaCha20_ctr32
  18. .type ChaCha20_ctr32,%function
  19. .align 5
  20. ChaCha20_ctr32:
  21. cbz x2,.Labort
  22. adr x5,.LOPENSSL_armcap_P
  23. cmp x2,#192
  24. b.lo .Lshort
  // x6 = OPENSSL_armcap_P - slot, x5 = slot address, so [x6,x5] is the
  // capability word itself.
  25. #ifdef __ILP32__
  26. ldrsw x6,[x5]
  27. #else
  28. ldr x6,[x5]
  29. #endif
  30. ldr w17,[x6,x5]
  31. tst w17,#ARMV7_NEON
  32. b.ne ChaCha20_neon
  // Scalar path: build the frame and save the callee-saved registers.
  33. .Lshort:
  34. .inst 0xd503233f // paciasp
  35. stp x29,x30,[sp,#-96]!
  36. add x29,sp,#0
  37. adr x5,.Lsigma
  38. stp x19,x20,[sp,#16]
  39. stp x21,x22,[sp,#32]
  40. stp x23,x24,[sp,#48]
  41. stp x25,x26,[sp,#64]
  42. stp x27,x28,[sp,#80]
  // 64 bytes of scratch for the partial-block tail.
  43. sub sp,sp,#64
  44. ldp x22,x23,[x5] // load sigma
  45. ldp x24,x25,[x3] // load key
  46. ldp x26,x27,[x3,#16]
  47. ldp x28,x30,[x4] // load counter
  // Big-endian builds: swap the 32-bit halves of each loaded pair so the
  // lower-addressed word ends up in the low half, as the unpack code expects.
  48. #ifdef __ARMEB__
  49. ror x24,x24,#32
  50. ror x25,x25,#32
  51. ror x26,x26,#32
  52. ror x27,x27,#32
  53. ror x28,x28,#32
  54. ror x30,x30,#32
  55. #endif
  // Per-block loop. Split each 64-bit pair into two 32-bit working words:
  // mov wN copies the low word, lsr #32 extracts the high word.
  56. .Loop_outer:
  57. mov w5,w22 // unpack key block
  58. lsr x6,x22,#32
  59. mov w7,w23
  60. lsr x8,x23,#32
  61. mov w9,w24
  62. lsr x10,x24,#32
  63. mov w11,w25
  64. lsr x12,x25,#32
  65. mov w13,w26
  66. lsr x14,x26,#32
  67. mov w15,w27
  68. lsr x16,x27,#32
  69. mov w17,w28
  70. lsr x19,x28,#32
  71. mov w20,w30
  72. lsr x21,x30,#32
  // 10 loop iterations, each a column round plus a diagonal round.
  73. mov x4,#10
  // The flags set here are consumed much later by b.lo .Ltail and
  // b.hi .Loop_outer; no instruction in between writes the flags.
  74. subs x2,x2,#64
  // Naming the rows a = w5..w8, b = w9..w12, c = w13..w16 and
  // d = w17,w19,w20,w21, the column round performs on all four columns:
  //   a += b; d ^= a; d = rotl(d,16)
  //   c += d; b ^= c; b = rotl(b,12)
  //   a += b; d ^= a; d = rotl(d,8)
  //   c += d; b ^= c; b = rotl(b,7)
  // The rotate-left amounts 16/12/8/7 are written as ror by 16/20/24/25.
  75. .Loop:
  76. sub x4,x4,#1
  77. add w5,w5,w9
  78. add w6,w6,w10
  79. add w7,w7,w11
  80. add w8,w8,w12
  81. eor w17,w17,w5
  82. eor w19,w19,w6
  83. eor w20,w20,w7
  84. eor w21,w21,w8
  85. ror w17,w17,#16
  86. ror w19,w19,#16
  87. ror w20,w20,#16
  88. ror w21,w21,#16
  89. add w13,w13,w17
  90. add w14,w14,w19
  91. add w15,w15,w20
  92. add w16,w16,w21
  93. eor w9,w9,w13
  94. eor w10,w10,w14
  95. eor w11,w11,w15
  96. eor w12,w12,w16
  97. ror w9,w9,#20
  98. ror w10,w10,#20
  99. ror w11,w11,#20
  100. ror w12,w12,#20
  101. add w5,w5,w9
  102. add w6,w6,w10
  103. add w7,w7,w11
  104. add w8,w8,w12
  105. eor w17,w17,w5
  106. eor w19,w19,w6
  107. eor w20,w20,w7
  108. eor w21,w21,w8
  109. ror w17,w17,#24
  110. ror w19,w19,#24
  111. ror w20,w20,#24
  112. ror w21,w21,#24
  113. add w13,w13,w17
  114. add w14,w14,w19
  115. add w15,w15,w20
  116. add w16,w16,w21
  117. eor w9,w9,w13
  118. eor w10,w10,w14
  119. eor w11,w11,w15
  120. eor w12,w12,w16
  121. ror w9,w9,#25
  122. ror w10,w10,#25
  123. ror w11,w11,#25
  124. ror w12,w12,#25
  // Diagonal round: the same four steps, but a[i] is combined with b[i+1],
  // c[i+2] and d[i+3] (indices modulo 4), selected purely by operand choice.
  125. add w5,w5,w10
  126. add w6,w6,w11
  127. add w7,w7,w12
  128. add w8,w8,w9
  129. eor w21,w21,w5
  130. eor w17,w17,w6
  131. eor w19,w19,w7
  132. eor w20,w20,w8
  133. ror w21,w21,#16
  134. ror w17,w17,#16
  135. ror w19,w19,#16
  136. ror w20,w20,#16
  137. add w15,w15,w21
  138. add w16,w16,w17
  139. add w13,w13,w19
  140. add w14,w14,w20
  141. eor w10,w10,w15
  142. eor w11,w11,w16
  143. eor w12,w12,w13
  144. eor w9,w9,w14
  145. ror w10,w10,#20
  146. ror w11,w11,#20
  147. ror w12,w12,#20
  148. ror w9,w9,#20
  149. add w5,w5,w10
  150. add w6,w6,w11
  151. add w7,w7,w12
  152. add w8,w8,w9
  153. eor w21,w21,w5
  154. eor w17,w17,w6
  155. eor w19,w19,w7
  156. eor w20,w20,w8
  157. ror w21,w21,#24
  158. ror w17,w17,#24
  159. ror w19,w19,#24
  160. ror w20,w20,#24
  161. add w15,w15,w21
  162. add w16,w16,w17
  163. add w13,w13,w19
  164. add w14,w14,w20
  165. eor w10,w10,w15
  166. eor w11,w11,w16
  167. eor w12,w12,w13
  168. eor w9,w9,w14
  169. ror w10,w10,#25
  170. ror w11,w11,#25
  171. ror w12,w12,#25
  172. ror w9,w9,#25
  173. cbnz x4,.Loop
  // Add the original input state to the permuted state. Even words use a
  // 32-bit add. Odd words use a 64-bit add of the shifted-down high half;
  // any carry into bit 32 is shifted out by the lsl #32 when packing.
  174. add w5,w5,w22 // accumulate key block
  175. add x6,x6,x22,lsr#32
  176. add w7,w7,w23
  177. add x8,x8,x23,lsr#32
  178. add w9,w9,w24
  179. add x10,x10,x24,lsr#32
  180. add w11,w11,w25
  181. add x12,x12,x25,lsr#32
  182. add w13,w13,w26
  183. add x14,x14,x26,lsr#32
  184. add w15,w15,w27
  185. add x16,x16,x27,lsr#32
  186. add w17,w17,w28
  187. add x19,x19,x28,lsr#32
  188. add w20,w20,w30
  189. add x21,x21,x30,lsr#32
  // Flags from subs x2,x2,#64: lo means fewer than 64 bytes were left.
  190. b.lo .Ltail
  // Full block: merge word pairs back into 64-bit registers while loading
  // 64 bytes of input into the registers that have just been freed.
  191. add x5,x5,x6,lsl#32 // pack
  192. add x7,x7,x8,lsl#32
  193. ldp x6,x8,[x1,#0] // load input
  194. add x9,x9,x10,lsl#32
  195. add x11,x11,x12,lsl#32
  196. ldp x10,x12,[x1,#16]
  197. add x13,x13,x14,lsl#32
  198. add x15,x15,x16,lsl#32
  199. ldp x14,x16,[x1,#32]
  200. add x17,x17,x19,lsl#32
  201. add x20,x20,x21,lsl#32
  202. ldp x19,x21,[x1,#48]
  203. add x1,x1,#64
  // Big-endian builds: byte-reverse the key stream before it is XORed with
  // the natively loaded input.
  204. #ifdef __ARMEB__
  205. rev x5,x5
  206. rev x7,x7
  207. rev x9,x9
  208. rev x11,x11
  209. rev x13,x13
  210. rev x15,x15
  211. rev x17,x17
  212. rev x20,x20
  213. #endif
  214. eor x5,x5,x6
  215. eor x7,x7,x8
  216. eor x9,x9,x10
  217. eor x11,x11,x12
  218. eor x13,x13,x14
  219. eor x15,x15,x16
  220. eor x17,x17,x19
  221. eor x20,x20,x21
  222. stp x5,x7,[x0,#0] // store output
  // This is a 64-bit add on the pair holding state words 12 and 13, so a
  // wrap of the low 32-bit counter word carries into the upper word.
  223. add x28,x28,#1 // increment counter
  224. stp x9,x11,[x0,#16]
  225. stp x13,x15,[x0,#32]
  226. stp x17,x20,[x0,#48]
  227. add x0,x0,#64
  // hi (from the same subs) means input remains; equal means exactly done.
  228. b.hi .Loop_outer
  // Epilogue: release the scratch area and restore the saved registers,
  // addressing the frame through x29.
  229. ldp x19,x20,[x29,#16]
  230. add sp,sp,#64
  231. ldp x21,x22,[x29,#32]
  232. ldp x23,x24,[x29,#48]
  233. ldp x25,x26,[x29,#64]
  234. ldp x27,x28,[x29,#80]
  235. ldp x29,x30,[sp],#96
  236. .inst 0xd50323bf // autiasp
  237. .Labort:
  238. ret
  239. .align 4
  // Partial final block. Undo the subs so that x2 is the remaining byte
  // count (1..63).
  240. .Ltail:
  241. add x2,x2,#64
  // Also entered from ChaCha20_neon (.Ltail_neon) with x2 already equal to
  // the remaining count and the scalar block accumulated but not yet packed.
  // Pointers are biased so that a single negative index running up to zero
  // drives the byte loop: x1 = in+len, x4 = sp+len, x0 = out+len-1, x2 = -len.
  242. .Less_than_64:
  243. sub x0,x0,#1
  244. add x1,x1,x2
  245. add x0,x0,x2
  246. add x4,sp,x2
  247. neg x2,x2
  248. add x5,x5,x6,lsl#32 // pack
  249. add x7,x7,x8,lsl#32
  250. add x9,x9,x10,lsl#32
  251. add x11,x11,x12,lsl#32
  252. add x13,x13,x14,lsl#32
  253. add x15,x15,x16,lsl#32
  254. add x17,x17,x19,lsl#32
  255. add x20,x20,x21,lsl#32
  256. #ifdef __ARMEB__
  257. rev x5,x5
  258. rev x7,x7
  259. rev x9,x9
  260. rev x11,x11
  261. rev x13,x13
  262. rev x15,x15
  263. rev x17,x17
  264. rev x20,x20
  265. #endif
  // Spill the whole key-stream block to the stack scratch area.
  266. stp x5,x7,[sp,#0]
  267. stp x9,x11,[sp,#16]
  268. stp x13,x15,[sp,#32]
  269. stp x17,x20,[sp,#48]
  // Byte-wise XOR. x2 is incremented before the store, which is why x0 was
  // biased by one extra byte above. The loop ends when x2 reaches zero.
  270. .Loop_tail:
  271. ldrb w10,[x1,x2]
  272. ldrb w11,[x4,x2]
  273. add x2,x2,#1
  274. eor w10,w10,w11
  275. strb w10,[x0,x2]
  276. cbnz x2,.Loop_tail
  // Overwrite the spilled key stream with zeros before leaving.
  277. stp xzr,xzr,[sp,#0]
  278. stp xzr,xzr,[sp,#16]
  279. stp xzr,xzr,[sp,#32]
  280. stp xzr,xzr,[sp,#48]
  281. ldp x19,x20,[x29,#16]
  282. add sp,sp,#64
  283. ldp x21,x22,[x29,#32]
  284. ldp x23,x24,[x29,#48]
  285. ldp x25,x26,[x29,#64]
  286. ldp x27,x28,[x29,#80]
  287. ldp x29,x30,[sp],#96
  288. .inst 0xd50323bf // autiasp
  289. ret
  290. .size ChaCha20_ctr32,.-ChaCha20_ctr32
  // ---------------------------------------------------------------------
  // ChaCha20_neon: file-local NEON path, reached by a branch (not a call)
  // from ChaCha20_ctr32 with the argument registers untouched:
  //   x0 = output, x1 = input, x2 = length in bytes,
  //   x3 = pointer to the 32-byte key, x4 = pointer to the 16-byte counter.
  //
  // Lengths of 512 bytes or more continue at .L512_or_more_neon inside
  // ChaCha20_512_neon. Otherwise each outer iteration produces 256 bytes of
  // key stream as four blocks computed side by side:
  //   block 0: general-purpose registers (same layout as the scalar path)
  //   block 1: v0..v3     block 2: v4..v7     block 3: v16..v19
  // where each vector register holds one four-word row of the state.
  //
  // Long-lived registers:
  //   x22..x28,x30    scalar copy of the input state (counter in x28)
  //   v24             sigma row          v25,v26   key rows
  //   v27,v28,v29     counter rows for blocks 1, 2 and 3
  //   v31             counter increment ({1,0,0,0}, later {4,0,0,0})
  //   v20..v23        temporaries
  // Stack: same 96-byte frame and 64-byte scratch as the scalar path, which
  // lets the tail code branch into .Less_than_64 of ChaCha20_ctr32.
  // ---------------------------------------------------------------------
  291. .type ChaCha20_neon,%function
  292. .align 5
  293. ChaCha20_neon:
  294. .inst 0xd503233f // paciasp
  295. stp x29,x30,[sp,#-96]!
  296. add x29,sp,#0
  297. adr x5,.Lsigma
  298. stp x19,x20,[sp,#16]
  299. stp x21,x22,[sp,#32]
  300. stp x23,x24,[sp,#48]
  301. stp x25,x26,[sp,#64]
  302. stp x27,x28,[sp,#80]
  // The frame is already built; the 512-byte variant is joined after its
  // own identical prologue.
  303. cmp x2,#512
  304. b.hs .L512_or_more_neon
  305. sub sp,sp,#64
  // Every part of the input state is loaded twice: once into general
  // registers for block 0 and once into a vector register for blocks 1..3.
  306. ldp x22,x23,[x5] // load sigma
  // The post-increment leaves x5 pointing at .Lone.
  307. ld1 {v24.4s},[x5],#16
  308. ldp x24,x25,[x3] // load key
  309. ldp x26,x27,[x3,#16]
  310. ld1 {v25.4s,v26.4s},[x3]
  311. ldp x28,x30,[x4] // load counter
  312. ld1 {v27.4s},[x4]
  313. ld1 {v31.4s},[x5]
  314. #ifdef __ARMEB__
  315. rev64 v24.4s,v24.4s
  316. ror x24,x24,#32
  317. ror x25,x25,#32
  318. ror x26,x26,#32
  319. ror x27,x27,#32
  320. ror x28,x28,#32
  321. ror x30,x30,#32
  322. #endif
  // Block 0 (scalar) keeps the caller counter; the vector blocks use
  // counter+1, counter+2 and counter+3.
  323. add v27.4s,v27.4s,v31.4s // += 1
  324. add v28.4s,v27.4s,v31.4s
  325. add v29.4s,v28.4s,v31.4s
  // From here on v31 advances all counters by four blocks per iteration.
  326. shl v31.4s,v31.4s,#2 // 1 -> 4
  // Outer loop: set up the working copies of all four blocks. Scalar and
  // vector instructions are interleaved throughout this function.
  327. .Loop_outer_neon:
  328. mov w5,w22 // unpack key block
  329. lsr x6,x22,#32
  330. mov v0.16b,v24.16b
  331. mov w7,w23
  332. lsr x8,x23,#32
  333. mov v4.16b,v24.16b
  334. mov w9,w24
  335. lsr x10,x24,#32
  336. mov v16.16b,v24.16b
  337. mov w11,w25
  338. mov v1.16b,v25.16b
  339. lsr x12,x25,#32
  340. mov v5.16b,v25.16b
  341. mov w13,w26
  342. mov v17.16b,v25.16b
  343. lsr x14,x26,#32
  344. mov v3.16b,v27.16b
  345. mov w15,w27
  346. mov v7.16b,v28.16b
  347. lsr x16,x27,#32
  348. mov v19.16b,v29.16b
  349. mov w17,w28
  350. mov v2.16b,v26.16b
  351. lsr x19,x28,#32
  352. mov v6.16b,v26.16b
  353. mov w20,w30
  354. mov v18.16b,v26.16b
  355. lsr x21,x30,#32
  356. mov x4,#10
  // The flags set here are consumed after the round loop by b.lo .Ltail_neon
  // and b.hi .Loop_outer_neon; nothing in between writes the flags.
  357. subs x2,x2,#256
  // Round loop, 10 iterations of column round plus diagonal round. The
  // scalar instructions follow the same sequence as .Loop in ChaCha20_ctr32.
  // On the vector side the rotations are implemented as:
  //   rotl 16: rev32 on halfword lanes
  //   rotl 12: ushr #20 then sli #12      rotl 8: ushr #24 then sli #8
  //   rotl 7:  ushr #25 then sli #7
  // with v20..v22 holding the XOR result that is being rotated.
  358. .Loop_neon:
  359. sub x4,x4,#1
  360. add v0.4s,v0.4s,v1.4s
  361. add w5,w5,w9
  362. add v4.4s,v4.4s,v5.4s
  363. add w6,w6,w10
  364. add v16.4s,v16.4s,v17.4s
  365. add w7,w7,w11
  366. eor v3.16b,v3.16b,v0.16b
  367. add w8,w8,w12
  368. eor v7.16b,v7.16b,v4.16b
  369. eor w17,w17,w5
  370. eor v19.16b,v19.16b,v16.16b
  371. eor w19,w19,w6
  372. rev32 v3.8h,v3.8h
  373. eor w20,w20,w7
  374. rev32 v7.8h,v7.8h
  375. eor w21,w21,w8
  376. rev32 v19.8h,v19.8h
  377. ror w17,w17,#16
  378. add v2.4s,v2.4s,v3.4s
  379. ror w19,w19,#16
  380. add v6.4s,v6.4s,v7.4s
  381. ror w20,w20,#16
  382. add v18.4s,v18.4s,v19.4s
  383. ror w21,w21,#16
  384. eor v20.16b,v1.16b,v2.16b
  385. add w13,w13,w17
  386. eor v21.16b,v5.16b,v6.16b
  387. add w14,w14,w19
  388. eor v22.16b,v17.16b,v18.16b
  389. add w15,w15,w20
  390. ushr v1.4s,v20.4s,#20
  391. add w16,w16,w21
  392. ushr v5.4s,v21.4s,#20
  393. eor w9,w9,w13
  394. ushr v17.4s,v22.4s,#20
  395. eor w10,w10,w14
  396. sli v1.4s,v20.4s,#12
  397. eor w11,w11,w15
  398. sli v5.4s,v21.4s,#12
  399. eor w12,w12,w16
  400. sli v17.4s,v22.4s,#12
  401. ror w9,w9,#20
  402. add v0.4s,v0.4s,v1.4s
  403. ror w10,w10,#20
  404. add v4.4s,v4.4s,v5.4s
  405. ror w11,w11,#20
  406. add v16.4s,v16.4s,v17.4s
  407. ror w12,w12,#20
  408. eor v20.16b,v3.16b,v0.16b
  409. add w5,w5,w9
  410. eor v21.16b,v7.16b,v4.16b
  411. add w6,w6,w10
  412. eor v22.16b,v19.16b,v16.16b
  413. add w7,w7,w11
  414. ushr v3.4s,v20.4s,#24
  415. add w8,w8,w12
  416. ushr v7.4s,v21.4s,#24
  417. eor w17,w17,w5
  418. ushr v19.4s,v22.4s,#24
  419. eor w19,w19,w6
  420. sli v3.4s,v20.4s,#8
  421. eor w20,w20,w7
  422. sli v7.4s,v21.4s,#8
  423. eor w21,w21,w8
  424. sli v19.4s,v22.4s,#8
  425. ror w17,w17,#24
  426. add v2.4s,v2.4s,v3.4s
  427. ror w19,w19,#24
  428. add v6.4s,v6.4s,v7.4s
  429. ror w20,w20,#24
  430. add v18.4s,v18.4s,v19.4s
  431. ror w21,w21,#24
  432. eor v20.16b,v1.16b,v2.16b
  433. add w13,w13,w17
  434. eor v21.16b,v5.16b,v6.16b
  435. add w14,w14,w19
  436. eor v22.16b,v17.16b,v18.16b
  437. add w15,w15,w20
  438. ushr v1.4s,v20.4s,#25
  439. add w16,w16,w21
  440. ushr v5.4s,v21.4s,#25
  441. eor w9,w9,w13
  442. ushr v17.4s,v22.4s,#25
  443. eor w10,w10,w14
  444. sli v1.4s,v20.4s,#7
  445. eor w11,w11,w15
  446. sli v5.4s,v21.4s,#7
  447. eor w12,w12,w16
  448. sli v17.4s,v22.4s,#7
  449. ror w9,w9,#25
  // Rotate the lanes of rows c, d and b by 8, 12 and 4 bytes so that the
  // next lane-wise round operates on the diagonals.
  450. ext v2.16b,v2.16b,v2.16b,#8
  451. ror w10,w10,#25
  452. ext v6.16b,v6.16b,v6.16b,#8
  453. ror w11,w11,#25
  454. ext v18.16b,v18.16b,v18.16b,#8
  455. ror w12,w12,#25
  456. ext v3.16b,v3.16b,v3.16b,#12
  457. ext v7.16b,v7.16b,v7.16b,#12
  458. ext v19.16b,v19.16b,v19.16b,#12
  459. ext v1.16b,v1.16b,v1.16b,#4
  460. ext v5.16b,v5.16b,v5.16b,#4
  461. ext v17.16b,v17.16b,v17.16b,#4
  // Diagonal round. The vector code is identical to the column round thanks
  // to the lane rotation; the scalar code selects diagonals by operand choice.
  462. add v0.4s,v0.4s,v1.4s
  463. add w5,w5,w10
  464. add v4.4s,v4.4s,v5.4s
  465. add w6,w6,w11
  466. add v16.4s,v16.4s,v17.4s
  467. add w7,w7,w12
  468. eor v3.16b,v3.16b,v0.16b
  469. add w8,w8,w9
  470. eor v7.16b,v7.16b,v4.16b
  471. eor w21,w21,w5
  472. eor v19.16b,v19.16b,v16.16b
  473. eor w17,w17,w6
  474. rev32 v3.8h,v3.8h
  475. eor w19,w19,w7
  476. rev32 v7.8h,v7.8h
  477. eor w20,w20,w8
  478. rev32 v19.8h,v19.8h
  479. ror w21,w21,#16
  480. add v2.4s,v2.4s,v3.4s
  481. ror w17,w17,#16
  482. add v6.4s,v6.4s,v7.4s
  483. ror w19,w19,#16
  484. add v18.4s,v18.4s,v19.4s
  485. ror w20,w20,#16
  486. eor v20.16b,v1.16b,v2.16b
  487. add w15,w15,w21
  488. eor v21.16b,v5.16b,v6.16b
  489. add w16,w16,w17
  490. eor v22.16b,v17.16b,v18.16b
  491. add w13,w13,w19
  492. ushr v1.4s,v20.4s,#20
  493. add w14,w14,w20
  494. ushr v5.4s,v21.4s,#20
  495. eor w10,w10,w15
  496. ushr v17.4s,v22.4s,#20
  497. eor w11,w11,w16
  498. sli v1.4s,v20.4s,#12
  499. eor w12,w12,w13
  500. sli v5.4s,v21.4s,#12
  501. eor w9,w9,w14
  502. sli v17.4s,v22.4s,#12
  503. ror w10,w10,#20
  504. add v0.4s,v0.4s,v1.4s
  505. ror w11,w11,#20
  506. add v4.4s,v4.4s,v5.4s
  507. ror w12,w12,#20
  508. add v16.4s,v16.4s,v17.4s
  509. ror w9,w9,#20
  510. eor v20.16b,v3.16b,v0.16b
  511. add w5,w5,w10
  512. eor v21.16b,v7.16b,v4.16b
  513. add w6,w6,w11
  514. eor v22.16b,v19.16b,v16.16b
  515. add w7,w7,w12
  516. ushr v3.4s,v20.4s,#24
  517. add w8,w8,w9
  518. ushr v7.4s,v21.4s,#24
  519. eor w21,w21,w5
  520. ushr v19.4s,v22.4s,#24
  521. eor w17,w17,w6
  522. sli v3.4s,v20.4s,#8
  523. eor w19,w19,w7
  524. sli v7.4s,v21.4s,#8
  525. eor w20,w20,w8
  526. sli v19.4s,v22.4s,#8
  527. ror w21,w21,#24
  528. add v2.4s,v2.4s,v3.4s
  529. ror w17,w17,#24
  530. add v6.4s,v6.4s,v7.4s
  531. ror w19,w19,#24
  532. add v18.4s,v18.4s,v19.4s
  533. ror w20,w20,#24
  534. eor v20.16b,v1.16b,v2.16b
  535. add w15,w15,w21
  536. eor v21.16b,v5.16b,v6.16b
  537. add w16,w16,w17
  538. eor v22.16b,v17.16b,v18.16b
  539. add w13,w13,w19
  540. ushr v1.4s,v20.4s,#25
  541. add w14,w14,w20
  542. ushr v5.4s,v21.4s,#25
  543. eor w10,w10,w15
  544. ushr v17.4s,v22.4s,#25
  545. eor w11,w11,w16
  546. sli v1.4s,v20.4s,#7
  547. eor w12,w12,w13
  548. sli v5.4s,v21.4s,#7
  549. eor w9,w9,w14
  550. sli v17.4s,v22.4s,#7
  551. ror w10,w10,#25
  // Undo the lane rotation (8, 4 and 12 bytes for rows c, d and b) so the
  // rows are back in column order for the next iteration.
  552. ext v2.16b,v2.16b,v2.16b,#8
  553. ror w11,w11,#25
  554. ext v6.16b,v6.16b,v6.16b,#8
  555. ror w12,w12,#25
  556. ext v18.16b,v18.16b,v18.16b,#8
  557. ror w9,w9,#25
  558. ext v3.16b,v3.16b,v3.16b,#4
  559. ext v7.16b,v7.16b,v7.16b,#4
  560. ext v19.16b,v19.16b,v19.16b,#4
  561. ext v1.16b,v1.16b,v1.16b,#12
  562. ext v5.16b,v5.16b,v5.16b,#12
  563. ext v17.16b,v17.16b,v17.16b,#12
  564. cbnz x4,.Loop_neon
  // Add the input state to all four blocks. Each vector block adds its own
  // counter row (v27, v28 or v29); the other rows are shared.
  565. add w5,w5,w22 // accumulate key block
  566. add v0.4s,v0.4s,v24.4s
  567. add x6,x6,x22,lsr#32
  568. add v4.4s,v4.4s,v24.4s
  569. add w7,w7,w23
  570. add v16.4s,v16.4s,v24.4s
  571. add x8,x8,x23,lsr#32
  572. add v2.4s,v2.4s,v26.4s
  573. add w9,w9,w24
  574. add v6.4s,v6.4s,v26.4s
  575. add x10,x10,x24,lsr#32
  576. add v18.4s,v18.4s,v26.4s
  577. add w11,w11,w25
  578. add v3.4s,v3.4s,v27.4s
  579. add x12,x12,x25,lsr#32
  580. add w13,w13,w26
  581. add v7.4s,v7.4s,v28.4s
  582. add x14,x14,x26,lsr#32
  583. add w15,w15,w27
  584. add v19.4s,v19.4s,v29.4s
  585. add x16,x16,x27,lsr#32
  586. add w17,w17,w28
  587. add v1.4s,v1.4s,v25.4s
  588. add x19,x19,x28,lsr#32
  589. add w20,w20,w30
  590. add v5.4s,v5.4s,v25.4s
  591. add x21,x21,x30,lsr#32
  592. add v17.4s,v17.4s,v25.4s
  // Flags from subs x2,x2,#256: lo means fewer than 256 bytes were left.
  593. b.lo .Ltail_neon
  // Full 256 bytes. Output order is block 0 (scalar), then v0..v3, v4..v7
  // and v16..v19. Input loads are issued ahead of the XORs that use them.
  594. add x5,x5,x6,lsl#32 // pack
  595. add x7,x7,x8,lsl#32
  596. ldp x6,x8,[x1,#0] // load input
  597. add x9,x9,x10,lsl#32
  598. add x11,x11,x12,lsl#32
  599. ldp x10,x12,[x1,#16]
  600. add x13,x13,x14,lsl#32
  601. add x15,x15,x16,lsl#32
  602. ldp x14,x16,[x1,#32]
  603. add x17,x17,x19,lsl#32
  604. add x20,x20,x21,lsl#32
  605. ldp x19,x21,[x1,#48]
  606. add x1,x1,#64
  607. #ifdef __ARMEB__
  608. rev x5,x5
  609. rev x7,x7
  610. rev x9,x9
  611. rev x11,x11
  612. rev x13,x13
  613. rev x15,x15
  614. rev x17,x17
  615. rev x20,x20
  616. #endif
  617. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  618. eor x5,x5,x6
  619. eor x7,x7,x8
  620. eor x9,x9,x10
  621. eor x11,x11,x12
  622. eor x13,x13,x14
  623. eor v0.16b,v0.16b,v20.16b
  624. eor x15,x15,x16
  625. eor v1.16b,v1.16b,v21.16b
  626. eor x17,x17,x19
  627. eor v2.16b,v2.16b,v22.16b
  628. eor x20,x20,x21
  629. eor v3.16b,v3.16b,v23.16b
  630. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  631. stp x5,x7,[x0,#0] // store output
  // Four blocks were consumed, so the scalar counter and all three vector
  // counters advance by four.
  632. add x28,x28,#4 // increment counter
  633. stp x9,x11,[x0,#16]
  634. add v27.4s,v27.4s,v31.4s // += 4
  635. stp x13,x15,[x0,#32]
  636. add v28.4s,v28.4s,v31.4s
  637. stp x17,x20,[x0,#48]
  638. add v29.4s,v29.4s,v31.4s
  639. add x0,x0,#64
  640. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  // v0..v3 are free again and are reused as the input buffer for block 3.
  641. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  642. eor v4.16b,v4.16b,v20.16b
  643. eor v5.16b,v5.16b,v21.16b
  644. eor v6.16b,v6.16b,v22.16b
  645. eor v7.16b,v7.16b,v23.16b
  646. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  647. eor v16.16b,v16.16b,v0.16b
  648. eor v17.16b,v17.16b,v1.16b
  649. eor v18.16b,v18.16b,v2.16b
  650. eor v19.16b,v19.16b,v3.16b
  651. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  // hi (from subs x2,x2,#256) means input remains; equal means exactly done.
  652. b.hi .Loop_outer_neon
  653. ldp x19,x20,[x29,#16]
  654. add sp,sp,#64
  655. ldp x21,x22,[x29,#32]
  656. ldp x23,x24,[x29,#48]
  657. ldp x25,x26,[x29,#64]
  658. ldp x27,x28,[x29,#80]
  659. ldp x29,x30,[sp],#96
  660. .inst 0xd50323bf // autiasp
  661. ret
  // Fewer than 256 bytes remain. Restore x2 to the remaining count (1..255)
  // and emit whole 64-byte blocks in order until a partial block is left.
  662. .Ltail_neon:
  663. add x2,x2,#256
  664. cmp x2,#64
  // Under 64 bytes: the shared tail in ChaCha20_ctr32 finishes the job using
  // the scalar block, which is still unpacked at this point.
  665. b.lo .Less_than_64
  666. add x5,x5,x6,lsl#32 // pack
  667. add x7,x7,x8,lsl#32
  668. ldp x6,x8,[x1,#0] // load input
  669. add x9,x9,x10,lsl#32
  670. add x11,x11,x12,lsl#32
  671. ldp x10,x12,[x1,#16]
  672. add x13,x13,x14,lsl#32
  673. add x15,x15,x16,lsl#32
  674. ldp x14,x16,[x1,#32]
  675. add x17,x17,x19,lsl#32
  676. add x20,x20,x21,lsl#32
  677. ldp x19,x21,[x1,#48]
  678. add x1,x1,#64
  679. #ifdef __ARMEB__
  680. rev x5,x5
  681. rev x7,x7
  682. rev x9,x9
  683. rev x11,x11
  684. rev x13,x13
  685. rev x15,x15
  686. rev x17,x17
  687. rev x20,x20
  688. #endif
  689. eor x5,x5,x6
  690. eor x7,x7,x8
  691. eor x9,x9,x10
  692. eor x11,x11,x12
  693. eor x13,x13,x14
  694. eor x15,x15,x16
  695. eor x17,x17,x19
  696. eor x20,x20,x21
  697. stp x5,x7,[x0,#0] // store output
  698. add x28,x28,#4 // increment counter
  699. stp x9,x11,[x0,#16]
  700. stp x13,x15,[x0,#32]
  701. stp x17,x20,[x0,#48]
  702. add x0,x0,#64
  // Flags still come from cmp x2,#64 above: equal means exactly 64 bytes
  // were left and everything has been written.
  703. b.eq .Ldone_neon
  704. sub x2,x2,#64
  705. cmp x2,#64
  // Under 64 bytes left: block 1 (v0..v3) supplies the partial key stream.
  706. b.lo .Less_than_128
  707. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  708. eor v0.16b,v0.16b,v20.16b
  709. eor v1.16b,v1.16b,v21.16b
  710. eor v2.16b,v2.16b,v22.16b
  711. eor v3.16b,v3.16b,v23.16b
  712. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  713. b.eq .Ldone_neon
  714. sub x2,x2,#64
  715. cmp x2,#64
  // Under 64 bytes left: block 2 (v4..v7) supplies the partial key stream.
  716. b.lo .Less_than_192
  717. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  718. eor v4.16b,v4.16b,v20.16b
  719. eor v5.16b,v5.16b,v21.16b
  720. eor v6.16b,v6.16b,v22.16b
  721. eor v7.16b,v7.16b,v23.16b
  722. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  723. b.eq .Ldone_neon
  724. sub x2,x2,#64
  // Remaining 1..63 bytes use block 3; spill it to the stack scratch area.
  725. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
  726. b .Last_neon
  727. .Less_than_128:
  728. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  729. b .Last_neon
  730. .Less_than_192:
  731. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  732. b .Last_neon
  733. .align 4
  // Byte-wise tail, same indexing scheme as .Less_than_64/.Loop_tail:
  // x1 = in+len, x4 = sp+len, x0 = out+len-1, x2 = -len counting up to zero.
  734. .Last_neon:
  735. sub x0,x0,#1
  736. add x1,x1,x2
  737. add x0,x0,x2
  738. add x4,sp,x2
  739. neg x2,x2
  740. .Loop_tail_neon:
  741. ldrb w10,[x1,x2]
  742. ldrb w11,[x4,x2]
  743. add x2,x2,#1
  744. eor w10,w10,w11
  745. strb w10,[x0,x2]
  746. cbnz x2,.Loop_tail_neon
  // Overwrite the spilled key stream with zeros before leaving.
  747. stp xzr,xzr,[sp,#0]
  748. stp xzr,xzr,[sp,#16]
  749. stp xzr,xzr,[sp,#32]
  750. stp xzr,xzr,[sp,#48]
  // Common exit: release the scratch area and restore the saved registers.
  751. .Ldone_neon:
  752. ldp x19,x20,[x29,#16]
  753. add sp,sp,#64
  754. ldp x21,x22,[x29,#32]
  755. ldp x23,x24,[x29,#48]
  756. ldp x25,x26,[x29,#64]
  757. ldp x27,x28,[x29,#80]
  758. ldp x29,x30,[sp],#96
  759. .inst 0xd50323bf // autiasp
  760. ret
  761. .size ChaCha20_neon,.-ChaCha20_neon
  762. .type ChaCha20_512_neon,%function
  763. .align 5
  764. ChaCha20_512_neon:
  765. .inst 0xd503233f // paciasp
  766. stp x29,x30,[sp,#-96]!
  767. add x29,sp,#0
  768. adr x5,.Lsigma
  769. stp x19,x20,[sp,#16]
  770. stp x21,x22,[sp,#32]
  771. stp x23,x24,[sp,#48]
  772. stp x25,x26,[sp,#64]
  773. stp x27,x28,[sp,#80]
  774. .L512_or_more_neon:
  775. sub sp,sp,#128+64
  776. ldp x22,x23,[x5] // load sigma
  777. ld1 {v24.4s},[x5],#16
  778. ldp x24,x25,[x3] // load key
  779. ldp x26,x27,[x3,#16]
  780. ld1 {v25.4s,v26.4s},[x3]
  781. ldp x28,x30,[x4] // load counter
  782. ld1 {v27.4s},[x4]
  783. ld1 {v31.4s},[x5]
  784. #ifdef __ARMEB__
  785. rev64 v24.4s,v24.4s
  786. ror x24,x24,#32
  787. ror x25,x25,#32
  788. ror x26,x26,#32
  789. ror x27,x27,#32
  790. ror x28,x28,#32
  791. ror x30,x30,#32
  792. #endif
  793. add v27.4s,v27.4s,v31.4s // += 1
  794. stp q24,q25,[sp,#0] // off-load key block, invariant part
  795. add v27.4s,v27.4s,v31.4s // not typo
  796. str q26,[sp,#32]
  797. add v28.4s,v27.4s,v31.4s
  798. add v29.4s,v28.4s,v31.4s
  799. add v30.4s,v29.4s,v31.4s
  800. shl v31.4s,v31.4s,#2 // 1 -> 4
  801. stp d8,d9,[sp,#128+0] // meet ABI requirements
  802. stp d10,d11,[sp,#128+16]
  803. stp d12,d13,[sp,#128+32]
  804. stp d14,d15,[sp,#128+48]
  805. sub x2,x2,#512 // not typo
  806. .Loop_outer_512_neon:
  807. mov v0.16b,v24.16b
  808. mov v4.16b,v24.16b
  809. mov v8.16b,v24.16b
  810. mov v12.16b,v24.16b
  811. mov v16.16b,v24.16b
  812. mov v20.16b,v24.16b
  813. mov v1.16b,v25.16b
  814. mov w5,w22 // unpack key block
  815. mov v5.16b,v25.16b
  816. lsr x6,x22,#32
  817. mov v9.16b,v25.16b
  818. mov w7,w23
  819. mov v13.16b,v25.16b
  820. lsr x8,x23,#32
  821. mov v17.16b,v25.16b
  822. mov w9,w24
  823. mov v21.16b,v25.16b
  824. lsr x10,x24,#32
  825. mov v3.16b,v27.16b
  826. mov w11,w25
  827. mov v7.16b,v28.16b
  828. lsr x12,x25,#32
  829. mov v11.16b,v29.16b
  830. mov w13,w26
  831. mov v15.16b,v30.16b
  832. lsr x14,x26,#32
  833. mov v2.16b,v26.16b
  834. mov w15,w27
  835. mov v6.16b,v26.16b
  836. lsr x16,x27,#32
  837. add v19.4s,v3.4s,v31.4s // +4
  838. mov w17,w28
  839. add v23.4s,v7.4s,v31.4s // +4
  840. lsr x19,x28,#32
  841. mov v10.16b,v26.16b
  842. mov w20,w30
  843. mov v14.16b,v26.16b
  844. lsr x21,x30,#32
  845. mov v18.16b,v26.16b
  846. stp q27,q28,[sp,#48] // off-load key block, variable part
  847. mov v22.16b,v26.16b
  848. str q29,[sp,#80]
  849. mov x4,#5
  850. subs x2,x2,#512
  851. .Loop_upper_neon:
  852. sub x4,x4,#1
  853. add v0.4s,v0.4s,v1.4s
  854. add w5,w5,w9
  855. add v4.4s,v4.4s,v5.4s
  856. add w6,w6,w10
  857. add v8.4s,v8.4s,v9.4s
  858. add w7,w7,w11
  859. add v12.4s,v12.4s,v13.4s
  860. add w8,w8,w12
  861. add v16.4s,v16.4s,v17.4s
  862. eor w17,w17,w5
  863. add v20.4s,v20.4s,v21.4s
  864. eor w19,w19,w6
  865. eor v3.16b,v3.16b,v0.16b
  866. eor w20,w20,w7
  867. eor v7.16b,v7.16b,v4.16b
  868. eor w21,w21,w8
  869. eor v11.16b,v11.16b,v8.16b
  870. ror w17,w17,#16
  871. eor v15.16b,v15.16b,v12.16b
  872. ror w19,w19,#16
  873. eor v19.16b,v19.16b,v16.16b
  874. ror w20,w20,#16
  875. eor v23.16b,v23.16b,v20.16b
  876. ror w21,w21,#16
  877. rev32 v3.8h,v3.8h
  878. add w13,w13,w17
  879. rev32 v7.8h,v7.8h
  880. add w14,w14,w19
  881. rev32 v11.8h,v11.8h
  882. add w15,w15,w20
  883. rev32 v15.8h,v15.8h
  884. add w16,w16,w21
  885. rev32 v19.8h,v19.8h
  886. eor w9,w9,w13
  887. rev32 v23.8h,v23.8h
  888. eor w10,w10,w14
  889. add v2.4s,v2.4s,v3.4s
  890. eor w11,w11,w15
  891. add v6.4s,v6.4s,v7.4s
  892. eor w12,w12,w16
  893. add v10.4s,v10.4s,v11.4s
  894. ror w9,w9,#20
  895. add v14.4s,v14.4s,v15.4s
  896. ror w10,w10,#20
  897. add v18.4s,v18.4s,v19.4s
  898. ror w11,w11,#20
  899. add v22.4s,v22.4s,v23.4s
  900. ror w12,w12,#20
  901. eor v24.16b,v1.16b,v2.16b
  902. add w5,w5,w9
  903. eor v25.16b,v5.16b,v6.16b
  904. add w6,w6,w10
  905. eor v26.16b,v9.16b,v10.16b
  906. add w7,w7,w11
  907. eor v27.16b,v13.16b,v14.16b
  908. add w8,w8,w12
  909. eor v28.16b,v17.16b,v18.16b
  910. eor w17,w17,w5
  911. eor v29.16b,v21.16b,v22.16b
  912. eor w19,w19,w6
  913. ushr v1.4s,v24.4s,#20
  914. eor w20,w20,w7
  915. ushr v5.4s,v25.4s,#20
  916. eor w21,w21,w8
  917. ushr v9.4s,v26.4s,#20
  918. ror w17,w17,#24
  919. ushr v13.4s,v27.4s,#20
  920. ror w19,w19,#24
  921. ushr v17.4s,v28.4s,#20
  922. ror w20,w20,#24
  923. ushr v21.4s,v29.4s,#20
  924. ror w21,w21,#24
  925. sli v1.4s,v24.4s,#12
  926. add w13,w13,w17
  927. sli v5.4s,v25.4s,#12
  928. add w14,w14,w19
  929. sli v9.4s,v26.4s,#12
  930. add w15,w15,w20
  931. sli v13.4s,v27.4s,#12
  932. add w16,w16,w21
  933. sli v17.4s,v28.4s,#12
  934. eor w9,w9,w13
  935. sli v21.4s,v29.4s,#12
  936. eor w10,w10,w14
  937. add v0.4s,v0.4s,v1.4s
  938. eor w11,w11,w15
  939. add v4.4s,v4.4s,v5.4s
  940. eor w12,w12,w16
  941. add v8.4s,v8.4s,v9.4s
  942. ror w9,w9,#25
  943. add v12.4s,v12.4s,v13.4s
  944. ror w10,w10,#25
  945. add v16.4s,v16.4s,v17.4s
  946. ror w11,w11,#25
  947. add v20.4s,v20.4s,v21.4s
  948. ror w12,w12,#25
  949. eor v24.16b,v3.16b,v0.16b
  950. add w5,w5,w10
  951. eor v25.16b,v7.16b,v4.16b
  952. add w6,w6,w11
  953. eor v26.16b,v11.16b,v8.16b
  954. add w7,w7,w12
  955. eor v27.16b,v15.16b,v12.16b
  956. add w8,w8,w9
  957. eor v28.16b,v19.16b,v16.16b
  958. eor w21,w21,w5
  959. eor v29.16b,v23.16b,v20.16b
  960. eor w17,w17,w6
  961. ushr v3.4s,v24.4s,#24
  962. eor w19,w19,w7
  963. ushr v7.4s,v25.4s,#24
  964. eor w20,w20,w8
  965. ushr v11.4s,v26.4s,#24
  966. ror w21,w21,#16
  967. ushr v15.4s,v27.4s,#24
  968. ror w17,w17,#16
  969. ushr v19.4s,v28.4s,#24
  970. ror w19,w19,#16
  971. ushr v23.4s,v29.4s,#24
  972. ror w20,w20,#16
  973. sli v3.4s,v24.4s,#8
  974. add w15,w15,w21
  975. sli v7.4s,v25.4s,#8
  976. add w16,w16,w17
  977. sli v11.4s,v26.4s,#8
  978. add w13,w13,w19
  979. sli v15.4s,v27.4s,#8
  980. add w14,w14,w20
  981. sli v19.4s,v28.4s,#8
  982. eor w10,w10,w15
  983. sli v23.4s,v29.4s,#8
  984. eor w11,w11,w16
  985. add v2.4s,v2.4s,v3.4s
  986. eor w12,w12,w13
  987. add v6.4s,v6.4s,v7.4s
  988. eor w9,w9,w14
  989. add v10.4s,v10.4s,v11.4s
  990. ror w10,w10,#20
  991. add v14.4s,v14.4s,v15.4s
  992. ror w11,w11,#20
  993. add v18.4s,v18.4s,v19.4s
  994. ror w12,w12,#20
  995. add v22.4s,v22.4s,v23.4s
  996. ror w9,w9,#20
  997. eor v24.16b,v1.16b,v2.16b
  998. add w5,w5,w10
  999. eor v25.16b,v5.16b,v6.16b
  1000. add w6,w6,w11
  1001. eor v26.16b,v9.16b,v10.16b
  1002. add w7,w7,w12
  1003. eor v27.16b,v13.16b,v14.16b
  1004. add w8,w8,w9
  1005. eor v28.16b,v17.16b,v18.16b
  1006. eor w21,w21,w5
  1007. eor v29.16b,v21.16b,v22.16b
  1008. eor w17,w17,w6
  1009. ushr v1.4s,v24.4s,#25
  1010. eor w19,w19,w7
  1011. ushr v5.4s,v25.4s,#25
  1012. eor w20,w20,w8
  1013. ushr v9.4s,v26.4s,#25
  1014. ror w21,w21,#24
  1015. ushr v13.4s,v27.4s,#25
  1016. ror w17,w17,#24
  1017. ushr v17.4s,v28.4s,#25
  1018. ror w19,w19,#24
  1019. ushr v21.4s,v29.4s,#25
  1020. ror w20,w20,#24
  1021. sli v1.4s,v24.4s,#7
  1022. add w15,w15,w21
  1023. sli v5.4s,v25.4s,#7
  1024. add w16,w16,w17
  1025. sli v9.4s,v26.4s,#7
  1026. add w13,w13,w19
  1027. sli v13.4s,v27.4s,#7
  1028. add w14,w14,w20
  1029. sli v17.4s,v28.4s,#7
  1030. eor w10,w10,w15
  1031. sli v21.4s,v29.4s,#7
  1032. eor w11,w11,w16
  1033. ext v2.16b,v2.16b,v2.16b,#8
  1034. eor w12,w12,w13
  1035. ext v6.16b,v6.16b,v6.16b,#8
  1036. eor w9,w9,w14
  1037. ext v10.16b,v10.16b,v10.16b,#8
  1038. ror w10,w10,#25
  1039. ext v14.16b,v14.16b,v14.16b,#8
  1040. ror w11,w11,#25
  1041. ext v18.16b,v18.16b,v18.16b,#8
  1042. ror w12,w12,#25
  1043. ext v22.16b,v22.16b,v22.16b,#8
  1044. ror w9,w9,#25
  1045. ext v3.16b,v3.16b,v3.16b,#12
  1046. ext v7.16b,v7.16b,v7.16b,#12
  1047. ext v11.16b,v11.16b,v11.16b,#12
  1048. ext v15.16b,v15.16b,v15.16b,#12
  1049. ext v19.16b,v19.16b,v19.16b,#12
  1050. ext v23.16b,v23.16b,v23.16b,#12
  1051. ext v1.16b,v1.16b,v1.16b,#4
  1052. ext v5.16b,v5.16b,v5.16b,#4
  1053. ext v9.16b,v9.16b,v9.16b,#4
  1054. ext v13.16b,v13.16b,v13.16b,#4
  1055. ext v17.16b,v17.16b,v17.16b,#4
  1056. ext v21.16b,v21.16b,v21.16b,#4
  1057. add v0.4s,v0.4s,v1.4s
  1058. add w5,w5,w9
  1059. add v4.4s,v4.4s,v5.4s
  1060. add w6,w6,w10
  1061. add v8.4s,v8.4s,v9.4s
  1062. add w7,w7,w11
  1063. add v12.4s,v12.4s,v13.4s
  1064. add w8,w8,w12
  1065. add v16.4s,v16.4s,v17.4s
  1066. eor w17,w17,w5
  1067. add v20.4s,v20.4s,v21.4s
  1068. eor w19,w19,w6
  1069. eor v3.16b,v3.16b,v0.16b
  1070. eor w20,w20,w7
  1071. eor v7.16b,v7.16b,v4.16b
  1072. eor w21,w21,w8
  1073. eor v11.16b,v11.16b,v8.16b
  1074. ror w17,w17,#16
  1075. eor v15.16b,v15.16b,v12.16b
  1076. ror w19,w19,#16
  1077. eor v19.16b,v19.16b,v16.16b
  1078. ror w20,w20,#16
  1079. eor v23.16b,v23.16b,v20.16b
  1080. ror w21,w21,#16
  1081. rev32 v3.8h,v3.8h
  1082. add w13,w13,w17
  1083. rev32 v7.8h,v7.8h
  1084. add w14,w14,w19
  1085. rev32 v11.8h,v11.8h
  1086. add w15,w15,w20
  1087. rev32 v15.8h,v15.8h
  1088. add w16,w16,w21
  1089. rev32 v19.8h,v19.8h
  1090. eor w9,w9,w13
  1091. rev32 v23.8h,v23.8h
  1092. eor w10,w10,w14
  1093. add v2.4s,v2.4s,v3.4s
  1094. eor w11,w11,w15
  1095. add v6.4s,v6.4s,v7.4s
  1096. eor w12,w12,w16
  1097. add v10.4s,v10.4s,v11.4s
  1098. ror w9,w9,#20
  1099. add v14.4s,v14.4s,v15.4s
  1100. ror w10,w10,#20
  1101. add v18.4s,v18.4s,v19.4s
  1102. ror w11,w11,#20
  1103. add v22.4s,v22.4s,v23.4s
  1104. ror w12,w12,#20
  1105. eor v24.16b,v1.16b,v2.16b
  1106. add w5,w5,w9
  1107. eor v25.16b,v5.16b,v6.16b
  1108. add w6,w6,w10
  1109. eor v26.16b,v9.16b,v10.16b
  1110. add w7,w7,w11
  1111. eor v27.16b,v13.16b,v14.16b
  1112. add w8,w8,w12
  1113. eor v28.16b,v17.16b,v18.16b
  1114. eor w17,w17,w5
  1115. eor v29.16b,v21.16b,v22.16b
  1116. eor w19,w19,w6
  1117. ushr v1.4s,v24.4s,#20
  1118. eor w20,w20,w7
  1119. ushr v5.4s,v25.4s,#20
  1120. eor w21,w21,w8
  1121. ushr v9.4s,v26.4s,#20
  1122. ror w17,w17,#24
  1123. ushr v13.4s,v27.4s,#20
  1124. ror w19,w19,#24
  1125. ushr v17.4s,v28.4s,#20
  1126. ror w20,w20,#24
  1127. ushr v21.4s,v29.4s,#20
  1128. ror w21,w21,#24
  1129. sli v1.4s,v24.4s,#12
  1130. add w13,w13,w17
  1131. sli v5.4s,v25.4s,#12
  1132. add w14,w14,w19
  1133. sli v9.4s,v26.4s,#12
  1134. add w15,w15,w20
  1135. sli v13.4s,v27.4s,#12
  1136. add w16,w16,w21
  1137. sli v17.4s,v28.4s,#12
  1138. eor w9,w9,w13
  1139. sli v21.4s,v29.4s,#12
  1140. eor w10,w10,w14
  1141. add v0.4s,v0.4s,v1.4s
  1142. eor w11,w11,w15
  1143. add v4.4s,v4.4s,v5.4s
  1144. eor w12,w12,w16
  1145. add v8.4s,v8.4s,v9.4s
  1146. ror w9,w9,#25
  1147. add v12.4s,v12.4s,v13.4s
  1148. ror w10,w10,#25
  1149. add v16.4s,v16.4s,v17.4s
  1150. ror w11,w11,#25
  1151. add v20.4s,v20.4s,v21.4s
  1152. ror w12,w12,#25
  1153. eor v24.16b,v3.16b,v0.16b
  1154. add w5,w5,w10
  1155. eor v25.16b,v7.16b,v4.16b
  1156. add w6,w6,w11
  1157. eor v26.16b,v11.16b,v8.16b
  1158. add w7,w7,w12
  1159. eor v27.16b,v15.16b,v12.16b
  1160. add w8,w8,w9
  1161. eor v28.16b,v19.16b,v16.16b
  1162. eor w21,w21,w5
  1163. eor v29.16b,v23.16b,v20.16b
  1164. eor w17,w17,w6
  1165. ushr v3.4s,v24.4s,#24
  1166. eor w19,w19,w7
  1167. ushr v7.4s,v25.4s,#24
  1168. eor w20,w20,w8
  1169. ushr v11.4s,v26.4s,#24
  1170. ror w21,w21,#16
  1171. ushr v15.4s,v27.4s,#24
  1172. ror w17,w17,#16
  1173. ushr v19.4s,v28.4s,#24
  1174. ror w19,w19,#16
  1175. ushr v23.4s,v29.4s,#24
  1176. ror w20,w20,#16
  1177. sli v3.4s,v24.4s,#8
  1178. add w15,w15,w21
  1179. sli v7.4s,v25.4s,#8
  1180. add w16,w16,w17
  1181. sli v11.4s,v26.4s,#8
  1182. add w13,w13,w19
  1183. sli v15.4s,v27.4s,#8
  1184. add w14,w14,w20
  1185. sli v19.4s,v28.4s,#8
  1186. eor w10,w10,w15
  1187. sli v23.4s,v29.4s,#8
  1188. eor w11,w11,w16
  1189. add v2.4s,v2.4s,v3.4s
  1190. eor w12,w12,w13
  1191. add v6.4s,v6.4s,v7.4s
  1192. eor w9,w9,w14
  1193. add v10.4s,v10.4s,v11.4s
  1194. ror w10,w10,#20
  1195. add v14.4s,v14.4s,v15.4s
  1196. ror w11,w11,#20
  1197. add v18.4s,v18.4s,v19.4s
  1198. ror w12,w12,#20
  1199. add v22.4s,v22.4s,v23.4s
  1200. ror w9,w9,#20
  1201. eor v24.16b,v1.16b,v2.16b
  1202. add w5,w5,w10
  1203. eor v25.16b,v5.16b,v6.16b
  1204. add w6,w6,w11
  1205. eor v26.16b,v9.16b,v10.16b
  1206. add w7,w7,w12
  1207. eor v27.16b,v13.16b,v14.16b
  1208. add w8,w8,w9
  1209. eor v28.16b,v17.16b,v18.16b
  1210. eor w21,w21,w5
  1211. eor v29.16b,v21.16b,v22.16b
  1212. eor w17,w17,w6
  1213. ushr v1.4s,v24.4s,#25
  1214. eor w19,w19,w7
  1215. ushr v5.4s,v25.4s,#25
  1216. eor w20,w20,w8
  1217. ushr v9.4s,v26.4s,#25
  1218. ror w21,w21,#24
  1219. ushr v13.4s,v27.4s,#25
  1220. ror w17,w17,#24
  1221. ushr v17.4s,v28.4s,#25
  1222. ror w19,w19,#24
  1223. ushr v21.4s,v29.4s,#25
  1224. ror w20,w20,#24
  1225. sli v1.4s,v24.4s,#7
  1226. add w15,w15,w21
  1227. sli v5.4s,v25.4s,#7
  1228. add w16,w16,w17
  1229. sli v9.4s,v26.4s,#7
  1230. add w13,w13,w19
  1231. sli v13.4s,v27.4s,#7
  1232. add w14,w14,w20
  1233. sli v17.4s,v28.4s,#7
  1234. eor w10,w10,w15
  1235. sli v21.4s,v29.4s,#7
  1236. eor w11,w11,w16
  1237. ext v2.16b,v2.16b,v2.16b,#8
  1238. eor w12,w12,w13
  1239. ext v6.16b,v6.16b,v6.16b,#8
  1240. eor w9,w9,w14
  1241. ext v10.16b,v10.16b,v10.16b,#8
  1242. ror w10,w10,#25
  1243. ext v14.16b,v14.16b,v14.16b,#8
  1244. ror w11,w11,#25
  1245. ext v18.16b,v18.16b,v18.16b,#8
  1246. ror w12,w12,#25
  1247. ext v22.16b,v22.16b,v22.16b,#8
  1248. ror w9,w9,#25
  1249. ext v3.16b,v3.16b,v3.16b,#4
  1250. ext v7.16b,v7.16b,v7.16b,#4
  1251. ext v11.16b,v11.16b,v11.16b,#4
  1252. ext v15.16b,v15.16b,v15.16b,#4
  1253. ext v19.16b,v19.16b,v19.16b,#4
  1254. ext v23.16b,v23.16b,v23.16b,#4
  1255. ext v1.16b,v1.16b,v1.16b,#12
  1256. ext v5.16b,v5.16b,v5.16b,#12
  1257. ext v9.16b,v9.16b,v9.16b,#12
  1258. ext v13.16b,v13.16b,v13.16b,#12
  1259. ext v17.16b,v17.16b,v17.16b,#12
  1260. ext v21.16b,v21.16b,v21.16b,#12
  1261. cbnz x4,.Loop_upper_neon
  1262. add w5,w5,w22 // accumulate key block
  1263. add x6,x6,x22,lsr#32
  1264. add w7,w7,w23
  1265. add x8,x8,x23,lsr#32
  1266. add w9,w9,w24
  1267. add x10,x10,x24,lsr#32
  1268. add w11,w11,w25
  1269. add x12,x12,x25,lsr#32
  1270. add w13,w13,w26
  1271. add x14,x14,x26,lsr#32
  1272. add w15,w15,w27
  1273. add x16,x16,x27,lsr#32
  1274. add w17,w17,w28
  1275. add x19,x19,x28,lsr#32
  1276. add w20,w20,w30
  1277. add x21,x21,x30,lsr#32
  1278. add x5,x5,x6,lsl#32 // pack
  1279. add x7,x7,x8,lsl#32
  1280. ldp x6,x8,[x1,#0] // load input
  1281. add x9,x9,x10,lsl#32
  1282. add x11,x11,x12,lsl#32
  1283. ldp x10,x12,[x1,#16]
  1284. add x13,x13,x14,lsl#32
  1285. add x15,x15,x16,lsl#32
  1286. ldp x14,x16,[x1,#32]
  1287. add x17,x17,x19,lsl#32
  1288. add x20,x20,x21,lsl#32
  1289. ldp x19,x21,[x1,#48]
  1290. add x1,x1,#64
  1291. #ifdef __ARMEB__
  1292. rev x5,x5
  1293. rev x7,x7
  1294. rev x9,x9
  1295. rev x11,x11
  1296. rev x13,x13
  1297. rev x15,x15
  1298. rev x17,x17
  1299. rev x20,x20
  1300. #endif
  1301. eor x5,x5,x6
  1302. eor x7,x7,x8
  1303. eor x9,x9,x10
  1304. eor x11,x11,x12
  1305. eor x13,x13,x14
  1306. eor x15,x15,x16
  1307. eor x17,x17,x19
  1308. eor x20,x20,x21
  1309. stp x5,x7,[x0,#0] // store output
  1310. add x28,x28,#1 // increment counter
  1311. mov w5,w22 // unpack key block
  1312. lsr x6,x22,#32
  1313. stp x9,x11,[x0,#16]
  1314. mov w7,w23
  1315. lsr x8,x23,#32
  1316. stp x13,x15,[x0,#32]
  1317. mov w9,w24
  1318. lsr x10,x24,#32
  1319. stp x17,x20,[x0,#48]
  1320. add x0,x0,#64
  1321. mov w11,w25
  1322. lsr x12,x25,#32
  1323. mov w13,w26
  1324. lsr x14,x26,#32
  1325. mov w15,w27
  1326. lsr x16,x27,#32
  1327. mov w17,w28
  1328. lsr x19,x28,#32
  1329. mov w20,w30
  1330. lsr x21,x30,#32
  1331. mov x4,#5
  1332. .Loop_lower_neon:
  1333. sub x4,x4,#1
  1334. add v0.4s,v0.4s,v1.4s
  1335. add w5,w5,w9
  1336. add v4.4s,v4.4s,v5.4s
  1337. add w6,w6,w10
  1338. add v8.4s,v8.4s,v9.4s
  1339. add w7,w7,w11
  1340. add v12.4s,v12.4s,v13.4s
  1341. add w8,w8,w12
  1342. add v16.4s,v16.4s,v17.4s
  1343. eor w17,w17,w5
  1344. add v20.4s,v20.4s,v21.4s
  1345. eor w19,w19,w6
  1346. eor v3.16b,v3.16b,v0.16b
  1347. eor w20,w20,w7
  1348. eor v7.16b,v7.16b,v4.16b
  1349. eor w21,w21,w8
  1350. eor v11.16b,v11.16b,v8.16b
  1351. ror w17,w17,#16
  1352. eor v15.16b,v15.16b,v12.16b
  1353. ror w19,w19,#16
  1354. eor v19.16b,v19.16b,v16.16b
  1355. ror w20,w20,#16
  1356. eor v23.16b,v23.16b,v20.16b
  1357. ror w21,w21,#16
  1358. rev32 v3.8h,v3.8h
  1359. add w13,w13,w17
  1360. rev32 v7.8h,v7.8h
  1361. add w14,w14,w19
  1362. rev32 v11.8h,v11.8h
  1363. add w15,w15,w20
  1364. rev32 v15.8h,v15.8h
  1365. add w16,w16,w21
  1366. rev32 v19.8h,v19.8h
  1367. eor w9,w9,w13
  1368. rev32 v23.8h,v23.8h
  1369. eor w10,w10,w14
  1370. add v2.4s,v2.4s,v3.4s
  1371. eor w11,w11,w15
  1372. add v6.4s,v6.4s,v7.4s
  1373. eor w12,w12,w16
  1374. add v10.4s,v10.4s,v11.4s
  1375. ror w9,w9,#20
  1376. add v14.4s,v14.4s,v15.4s
  1377. ror w10,w10,#20
  1378. add v18.4s,v18.4s,v19.4s
  1379. ror w11,w11,#20
  1380. add v22.4s,v22.4s,v23.4s
  1381. ror w12,w12,#20
  1382. eor v24.16b,v1.16b,v2.16b
  1383. add w5,w5,w9
  1384. eor v25.16b,v5.16b,v6.16b
  1385. add w6,w6,w10
  1386. eor v26.16b,v9.16b,v10.16b
  1387. add w7,w7,w11
  1388. eor v27.16b,v13.16b,v14.16b
  1389. add w8,w8,w12
  1390. eor v28.16b,v17.16b,v18.16b
  1391. eor w17,w17,w5
  1392. eor v29.16b,v21.16b,v22.16b
  1393. eor w19,w19,w6
  1394. ushr v1.4s,v24.4s,#20
  1395. eor w20,w20,w7
  1396. ushr v5.4s,v25.4s,#20
  1397. eor w21,w21,w8
  1398. ushr v9.4s,v26.4s,#20
  1399. ror w17,w17,#24
  1400. ushr v13.4s,v27.4s,#20
  1401. ror w19,w19,#24
  1402. ushr v17.4s,v28.4s,#20
  1403. ror w20,w20,#24
  1404. ushr v21.4s,v29.4s,#20
  1405. ror w21,w21,#24
  1406. sli v1.4s,v24.4s,#12
  1407. add w13,w13,w17
  1408. sli v5.4s,v25.4s,#12
  1409. add w14,w14,w19
  1410. sli v9.4s,v26.4s,#12
  1411. add w15,w15,w20
  1412. sli v13.4s,v27.4s,#12
  1413. add w16,w16,w21
  1414. sli v17.4s,v28.4s,#12
  1415. eor w9,w9,w13
  1416. sli v21.4s,v29.4s,#12
  1417. eor w10,w10,w14
  1418. add v0.4s,v0.4s,v1.4s
  1419. eor w11,w11,w15
  1420. add v4.4s,v4.4s,v5.4s
  1421. eor w12,w12,w16
  1422. add v8.4s,v8.4s,v9.4s
  1423. ror w9,w9,#25
  1424. add v12.4s,v12.4s,v13.4s
  1425. ror w10,w10,#25
  1426. add v16.4s,v16.4s,v17.4s
  1427. ror w11,w11,#25
  1428. add v20.4s,v20.4s,v21.4s
  1429. ror w12,w12,#25
  1430. eor v24.16b,v3.16b,v0.16b
  1431. add w5,w5,w10
  1432. eor v25.16b,v7.16b,v4.16b
  1433. add w6,w6,w11
  1434. eor v26.16b,v11.16b,v8.16b
  1435. add w7,w7,w12
  1436. eor v27.16b,v15.16b,v12.16b
  1437. add w8,w8,w9
  1438. eor v28.16b,v19.16b,v16.16b
  1439. eor w21,w21,w5
  1440. eor v29.16b,v23.16b,v20.16b
  1441. eor w17,w17,w6
  1442. ushr v3.4s,v24.4s,#24
  1443. eor w19,w19,w7
  1444. ushr v7.4s,v25.4s,#24
  1445. eor w20,w20,w8
  1446. ushr v11.4s,v26.4s,#24
  1447. ror w21,w21,#16
  1448. ushr v15.4s,v27.4s,#24
  1449. ror w17,w17,#16
  1450. ushr v19.4s,v28.4s,#24
  1451. ror w19,w19,#16
  1452. ushr v23.4s,v29.4s,#24
  1453. ror w20,w20,#16
  1454. sli v3.4s,v24.4s,#8
  1455. add w15,w15,w21
  1456. sli v7.4s,v25.4s,#8
  1457. add w16,w16,w17
  1458. sli v11.4s,v26.4s,#8
  1459. add w13,w13,w19
  1460. sli v15.4s,v27.4s,#8
  1461. add w14,w14,w20
  1462. sli v19.4s,v28.4s,#8
  1463. eor w10,w10,w15
  1464. sli v23.4s,v29.4s,#8
  1465. eor w11,w11,w16
  1466. add v2.4s,v2.4s,v3.4s
  1467. eor w12,w12,w13
  1468. add v6.4s,v6.4s,v7.4s
  1469. eor w9,w9,w14
  1470. add v10.4s,v10.4s,v11.4s
  1471. ror w10,w10,#20
  1472. add v14.4s,v14.4s,v15.4s
  1473. ror w11,w11,#20
  1474. add v18.4s,v18.4s,v19.4s
  1475. ror w12,w12,#20
  1476. add v22.4s,v22.4s,v23.4s
  1477. ror w9,w9,#20
  1478. eor v24.16b,v1.16b,v2.16b
  1479. add w5,w5,w10
  1480. eor v25.16b,v5.16b,v6.16b
  1481. add w6,w6,w11
  1482. eor v26.16b,v9.16b,v10.16b
  1483. add w7,w7,w12
  1484. eor v27.16b,v13.16b,v14.16b
  1485. add w8,w8,w9
  1486. eor v28.16b,v17.16b,v18.16b
  1487. eor w21,w21,w5
  1488. eor v29.16b,v21.16b,v22.16b
  1489. eor w17,w17,w6
  1490. ushr v1.4s,v24.4s,#25
  1491. eor w19,w19,w7
  1492. ushr v5.4s,v25.4s,#25
  1493. eor w20,w20,w8
  1494. ushr v9.4s,v26.4s,#25
  1495. ror w21,w21,#24
  1496. ushr v13.4s,v27.4s,#25
  1497. ror w17,w17,#24
  1498. ushr v17.4s,v28.4s,#25
  1499. ror w19,w19,#24
  1500. ushr v21.4s,v29.4s,#25
  1501. ror w20,w20,#24
  1502. sli v1.4s,v24.4s,#7
  1503. add w15,w15,w21
  1504. sli v5.4s,v25.4s,#7
  1505. add w16,w16,w17
  1506. sli v9.4s,v26.4s,#7
  1507. add w13,w13,w19
  1508. sli v13.4s,v27.4s,#7
  1509. add w14,w14,w20
  1510. sli v17.4s,v28.4s,#7
  1511. eor w10,w10,w15
  1512. sli v21.4s,v29.4s,#7
  1513. eor w11,w11,w16
  1514. ext v2.16b,v2.16b,v2.16b,#8
  1515. eor w12,w12,w13
  1516. ext v6.16b,v6.16b,v6.16b,#8
  1517. eor w9,w9,w14
  1518. ext v10.16b,v10.16b,v10.16b,#8
  1519. ror w10,w10,#25
  1520. ext v14.16b,v14.16b,v14.16b,#8
  1521. ror w11,w11,#25
  1522. ext v18.16b,v18.16b,v18.16b,#8
  1523. ror w12,w12,#25
  1524. ext v22.16b,v22.16b,v22.16b,#8
  1525. ror w9,w9,#25
  1526. ext v3.16b,v3.16b,v3.16b,#12
  1527. ext v7.16b,v7.16b,v7.16b,#12
  1528. ext v11.16b,v11.16b,v11.16b,#12
  1529. ext v15.16b,v15.16b,v15.16b,#12
  1530. ext v19.16b,v19.16b,v19.16b,#12
  1531. ext v23.16b,v23.16b,v23.16b,#12
  1532. ext v1.16b,v1.16b,v1.16b,#4
  1533. ext v5.16b,v5.16b,v5.16b,#4
  1534. ext v9.16b,v9.16b,v9.16b,#4
  1535. ext v13.16b,v13.16b,v13.16b,#4
  1536. ext v17.16b,v17.16b,v17.16b,#4
  1537. ext v21.16b,v21.16b,v21.16b,#4
  1538. add v0.4s,v0.4s,v1.4s
  1539. add w5,w5,w9
  1540. add v4.4s,v4.4s,v5.4s
  1541. add w6,w6,w10
  1542. add v8.4s,v8.4s,v9.4s
  1543. add w7,w7,w11
  1544. add v12.4s,v12.4s,v13.4s
  1545. add w8,w8,w12
  1546. add v16.4s,v16.4s,v17.4s
  1547. eor w17,w17,w5
  1548. add v20.4s,v20.4s,v21.4s
  1549. eor w19,w19,w6
  1550. eor v3.16b,v3.16b,v0.16b
  1551. eor w20,w20,w7
  1552. eor v7.16b,v7.16b,v4.16b
  1553. eor w21,w21,w8
  1554. eor v11.16b,v11.16b,v8.16b
  1555. ror w17,w17,#16
  1556. eor v15.16b,v15.16b,v12.16b
  1557. ror w19,w19,#16
  1558. eor v19.16b,v19.16b,v16.16b
  1559. ror w20,w20,#16
  1560. eor v23.16b,v23.16b,v20.16b
  1561. ror w21,w21,#16
  1562. rev32 v3.8h,v3.8h
  1563. add w13,w13,w17
  1564. rev32 v7.8h,v7.8h
  1565. add w14,w14,w19
  1566. rev32 v11.8h,v11.8h
  1567. add w15,w15,w20
  1568. rev32 v15.8h,v15.8h
  1569. add w16,w16,w21
  1570. rev32 v19.8h,v19.8h
  1571. eor w9,w9,w13
  1572. rev32 v23.8h,v23.8h
  1573. eor w10,w10,w14
  1574. add v2.4s,v2.4s,v3.4s
  1575. eor w11,w11,w15
  1576. add v6.4s,v6.4s,v7.4s
  1577. eor w12,w12,w16
  1578. add v10.4s,v10.4s,v11.4s
  1579. ror w9,w9,#20
  1580. add v14.4s,v14.4s,v15.4s
  1581. ror w10,w10,#20
  1582. add v18.4s,v18.4s,v19.4s
  1583. ror w11,w11,#20
  1584. add v22.4s,v22.4s,v23.4s
  1585. ror w12,w12,#20
  1586. eor v24.16b,v1.16b,v2.16b
  1587. add w5,w5,w9
  1588. eor v25.16b,v5.16b,v6.16b
  1589. add w6,w6,w10
  1590. eor v26.16b,v9.16b,v10.16b
  1591. add w7,w7,w11
  1592. eor v27.16b,v13.16b,v14.16b
  1593. add w8,w8,w12
  1594. eor v28.16b,v17.16b,v18.16b
  1595. eor w17,w17,w5
  1596. eor v29.16b,v21.16b,v22.16b
  1597. eor w19,w19,w6
  1598. ushr v1.4s,v24.4s,#20
  1599. eor w20,w20,w7
  1600. ushr v5.4s,v25.4s,#20
  1601. eor w21,w21,w8
  1602. ushr v9.4s,v26.4s,#20
  1603. ror w17,w17,#24
  1604. ushr v13.4s,v27.4s,#20
  1605. ror w19,w19,#24
  1606. ushr v17.4s,v28.4s,#20
  1607. ror w20,w20,#24
  1608. ushr v21.4s,v29.4s,#20
  1609. ror w21,w21,#24
  1610. sli v1.4s,v24.4s,#12
  1611. add w13,w13,w17
  1612. sli v5.4s,v25.4s,#12
  1613. add w14,w14,w19
  1614. sli v9.4s,v26.4s,#12
  1615. add w15,w15,w20
  1616. sli v13.4s,v27.4s,#12
  1617. add w16,w16,w21
  1618. sli v17.4s,v28.4s,#12
  1619. eor w9,w9,w13
  1620. sli v21.4s,v29.4s,#12
  1621. eor w10,w10,w14
  1622. add v0.4s,v0.4s,v1.4s
  1623. eor w11,w11,w15
  1624. add v4.4s,v4.4s,v5.4s
  1625. eor w12,w12,w16
  1626. add v8.4s,v8.4s,v9.4s
  1627. ror w9,w9,#25
  1628. add v12.4s,v12.4s,v13.4s
  1629. ror w10,w10,#25
  1630. add v16.4s,v16.4s,v17.4s
  1631. ror w11,w11,#25
  1632. add v20.4s,v20.4s,v21.4s
  1633. ror w12,w12,#25
  1634. eor v24.16b,v3.16b,v0.16b
  1635. add w5,w5,w10
  1636. eor v25.16b,v7.16b,v4.16b
  1637. add w6,w6,w11
  1638. eor v26.16b,v11.16b,v8.16b
  1639. add w7,w7,w12
  1640. eor v27.16b,v15.16b,v12.16b
  1641. add w8,w8,w9
  1642. eor v28.16b,v19.16b,v16.16b
  1643. eor w21,w21,w5
  1644. eor v29.16b,v23.16b,v20.16b
  1645. eor w17,w17,w6
  1646. ushr v3.4s,v24.4s,#24
  1647. eor w19,w19,w7
  1648. ushr v7.4s,v25.4s,#24
  1649. eor w20,w20,w8
  1650. ushr v11.4s,v26.4s,#24
  1651. ror w21,w21,#16
  1652. ushr v15.4s,v27.4s,#24
  1653. ror w17,w17,#16
  1654. ushr v19.4s,v28.4s,#24
  1655. ror w19,w19,#16
  1656. ushr v23.4s,v29.4s,#24
  1657. ror w20,w20,#16
  1658. sli v3.4s,v24.4s,#8
  1659. add w15,w15,w21
  1660. sli v7.4s,v25.4s,#8
  1661. add w16,w16,w17
  1662. sli v11.4s,v26.4s,#8
  1663. add w13,w13,w19
  1664. sli v15.4s,v27.4s,#8
  1665. add w14,w14,w20
  1666. sli v19.4s,v28.4s,#8
  1667. eor w10,w10,w15
  1668. sli v23.4s,v29.4s,#8
  1669. eor w11,w11,w16
  1670. add v2.4s,v2.4s,v3.4s
  1671. eor w12,w12,w13
  1672. add v6.4s,v6.4s,v7.4s
  1673. eor w9,w9,w14
  1674. add v10.4s,v10.4s,v11.4s
  1675. ror w10,w10,#20
  1676. add v14.4s,v14.4s,v15.4s
  1677. ror w11,w11,#20
  1678. add v18.4s,v18.4s,v19.4s
  1679. ror w12,w12,#20
  1680. add v22.4s,v22.4s,v23.4s
  1681. ror w9,w9,#20
  1682. eor v24.16b,v1.16b,v2.16b
  1683. add w5,w5,w10
  1684. eor v25.16b,v5.16b,v6.16b
  1685. add w6,w6,w11
  1686. eor v26.16b,v9.16b,v10.16b
  1687. add w7,w7,w12
  1688. eor v27.16b,v13.16b,v14.16b
  1689. add w8,w8,w9
  1690. eor v28.16b,v17.16b,v18.16b
  1691. eor w21,w21,w5
  1692. eor v29.16b,v21.16b,v22.16b
  1693. eor w17,w17,w6
  1694. ushr v1.4s,v24.4s,#25
  1695. eor w19,w19,w7
  1696. ushr v5.4s,v25.4s,#25
  1697. eor w20,w20,w8
  1698. ushr v9.4s,v26.4s,#25
  1699. ror w21,w21,#24
  1700. ushr v13.4s,v27.4s,#25
  1701. ror w17,w17,#24
  1702. ushr v17.4s,v28.4s,#25
  1703. ror w19,w19,#24
  1704. ushr v21.4s,v29.4s,#25
  1705. ror w20,w20,#24
  1706. sli v1.4s,v24.4s,#7
  1707. add w15,w15,w21
  1708. sli v5.4s,v25.4s,#7
  1709. add w16,w16,w17
  1710. sli v9.4s,v26.4s,#7
  1711. add w13,w13,w19
  1712. sli v13.4s,v27.4s,#7
  1713. add w14,w14,w20
  1714. sli v17.4s,v28.4s,#7
  1715. eor w10,w10,w15
  1716. sli v21.4s,v29.4s,#7
  1717. eor w11,w11,w16
  1718. ext v2.16b,v2.16b,v2.16b,#8
  1719. eor w12,w12,w13
  1720. ext v6.16b,v6.16b,v6.16b,#8
  1721. eor w9,w9,w14
  1722. ext v10.16b,v10.16b,v10.16b,#8
  1723. ror w10,w10,#25
  1724. ext v14.16b,v14.16b,v14.16b,#8
  1725. ror w11,w11,#25
  1726. ext v18.16b,v18.16b,v18.16b,#8
  1727. ror w12,w12,#25
  1728. ext v22.16b,v22.16b,v22.16b,#8
  1729. ror w9,w9,#25
  1730. ext v3.16b,v3.16b,v3.16b,#4
  1731. ext v7.16b,v7.16b,v7.16b,#4
  1732. ext v11.16b,v11.16b,v11.16b,#4
  1733. ext v15.16b,v15.16b,v15.16b,#4
  1734. ext v19.16b,v19.16b,v19.16b,#4
  1735. ext v23.16b,v23.16b,v23.16b,#4
  1736. ext v1.16b,v1.16b,v1.16b,#12
  1737. ext v5.16b,v5.16b,v5.16b,#12
  1738. ext v9.16b,v9.16b,v9.16b,#12
  1739. ext v13.16b,v13.16b,v13.16b,#12
  1740. ext v17.16b,v17.16b,v17.16b,#12
  1741. ext v21.16b,v21.16b,v21.16b,#12
  1742. cbnz x4,.Loop_lower_neon
  1743. add w5,w5,w22 // accumulate key block
  1744. ldp q24,q25,[sp,#0]
  1745. add x6,x6,x22,lsr#32
  1746. ldp q26,q27,[sp,#32]
  1747. add w7,w7,w23
  1748. ldp q28,q29,[sp,#64]
  1749. add x8,x8,x23,lsr#32
  1750. add v0.4s,v0.4s,v24.4s
  1751. add w9,w9,w24
  1752. add v4.4s,v4.4s,v24.4s
  1753. add x10,x10,x24,lsr#32
  1754. add v8.4s,v8.4s,v24.4s
  1755. add w11,w11,w25
  1756. add v12.4s,v12.4s,v24.4s
  1757. add x12,x12,x25,lsr#32
  1758. add v16.4s,v16.4s,v24.4s
  1759. add w13,w13,w26
  1760. add v20.4s,v20.4s,v24.4s
  1761. add x14,x14,x26,lsr#32
  1762. add v2.4s,v2.4s,v26.4s
  1763. add w15,w15,w27
  1764. add v6.4s,v6.4s,v26.4s
  1765. add x16,x16,x27,lsr#32
  1766. add v10.4s,v10.4s,v26.4s
  1767. add w17,w17,w28
  1768. add v14.4s,v14.4s,v26.4s
  1769. add x19,x19,x28,lsr#32
  1770. add v18.4s,v18.4s,v26.4s
  1771. add w20,w20,w30
  1772. add v22.4s,v22.4s,v26.4s
  1773. add x21,x21,x30,lsr#32
  1774. add v19.4s,v19.4s,v31.4s // +4
  1775. add x5,x5,x6,lsl#32 // pack
  1776. add v23.4s,v23.4s,v31.4s // +4
  1777. add x7,x7,x8,lsl#32
  1778. add v3.4s,v3.4s,v27.4s
  1779. ldp x6,x8,[x1,#0] // load input
  1780. add v7.4s,v7.4s,v28.4s
  1781. add x9,x9,x10,lsl#32
  1782. add v11.4s,v11.4s,v29.4s
  1783. add x11,x11,x12,lsl#32
  1784. add v15.4s,v15.4s,v30.4s
  1785. ldp x10,x12,[x1,#16]
  1786. add v19.4s,v19.4s,v27.4s
  1787. add x13,x13,x14,lsl#32
  1788. add v23.4s,v23.4s,v28.4s
  1789. add x15,x15,x16,lsl#32
  1790. add v1.4s,v1.4s,v25.4s
  1791. ldp x14,x16,[x1,#32]
  1792. add v5.4s,v5.4s,v25.4s
  1793. add x17,x17,x19,lsl#32
  1794. add v9.4s,v9.4s,v25.4s
  1795. add x20,x20,x21,lsl#32
  1796. add v13.4s,v13.4s,v25.4s
  1797. ldp x19,x21,[x1,#48]
  1798. add v17.4s,v17.4s,v25.4s
  1799. add x1,x1,#64
  1800. add v21.4s,v21.4s,v25.4s
  1801. #ifdef __ARMEB__
  1802. rev x5,x5
  1803. rev x7,x7
  1804. rev x9,x9
  1805. rev x11,x11
  1806. rev x13,x13
  1807. rev x15,x15
  1808. rev x17,x17
  1809. rev x20,x20
  1810. #endif
  1811. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1812. eor x5,x5,x6
  1813. eor x7,x7,x8
  1814. eor x9,x9,x10
  1815. eor x11,x11,x12
  1816. eor x13,x13,x14
  1817. eor v0.16b,v0.16b,v24.16b
  1818. eor x15,x15,x16
  1819. eor v1.16b,v1.16b,v25.16b
  1820. eor x17,x17,x19
  1821. eor v2.16b,v2.16b,v26.16b
  1822. eor x20,x20,x21
  1823. eor v3.16b,v3.16b,v27.16b
  1824. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1825. stp x5,x7,[x0,#0] // store output
  1826. add x28,x28,#7 // increment counter
  1827. stp x9,x11,[x0,#16]
  1828. stp x13,x15,[x0,#32]
  1829. stp x17,x20,[x0,#48]
  1830. add x0,x0,#64
  1831. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  1832. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  1833. eor v4.16b,v4.16b,v24.16b
  1834. eor v5.16b,v5.16b,v25.16b
  1835. eor v6.16b,v6.16b,v26.16b
  1836. eor v7.16b,v7.16b,v27.16b
  1837. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  1838. ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
  1839. eor v8.16b,v8.16b,v0.16b
  1840. ldp q24,q25,[sp,#0]
  1841. eor v9.16b,v9.16b,v1.16b
  1842. ldp q26,q27,[sp,#32]
  1843. eor v10.16b,v10.16b,v2.16b
  1844. eor v11.16b,v11.16b,v3.16b
  1845. st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
  1846. ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
  1847. eor v12.16b,v12.16b,v4.16b
  1848. eor v13.16b,v13.16b,v5.16b
  1849. eor v14.16b,v14.16b,v6.16b
  1850. eor v15.16b,v15.16b,v7.16b
  1851. st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
  1852. ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
  1853. eor v16.16b,v16.16b,v8.16b
  1854. eor v17.16b,v17.16b,v9.16b
  1855. eor v18.16b,v18.16b,v10.16b
  1856. eor v19.16b,v19.16b,v11.16b
  1857. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  1858. shl v0.4s,v31.4s,#1 // 4 -> 8
  1859. eor v20.16b,v20.16b,v12.16b
  1860. eor v21.16b,v21.16b,v13.16b
  1861. eor v22.16b,v22.16b,v14.16b
  1862. eor v23.16b,v23.16b,v15.16b
  1863. st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
  1864. add v27.4s,v27.4s,v0.4s // += 8
  1865. add v28.4s,v28.4s,v0.4s
  1866. add v29.4s,v29.4s,v0.4s
  1867. add v30.4s,v30.4s,v0.4s
  1868. b.hs .Loop_outer_512_neon
  1869. adds x2,x2,#512
  1870. ushr v0.4s,v31.4s,#2 // 4 -> 1
  1871. ldp d8,d9,[sp,#128+0] // meet ABI requirements
  1872. ldp d10,d11,[sp,#128+16]
  1873. ldp d12,d13,[sp,#128+32]
  1874. ldp d14,d15,[sp,#128+48]
  1875. stp q24,q31,[sp,#0] // wipe off-load area
  1876. stp q24,q31,[sp,#32]
  1877. stp q24,q31,[sp,#64]
  1878. b.eq .Ldone_512_neon
  1879. cmp x2,#192
  1880. sub v27.4s,v27.4s,v0.4s // -= 1
  1881. sub v28.4s,v28.4s,v0.4s
  1882. sub v29.4s,v29.4s,v0.4s
  1883. add sp,sp,#128
  1884. b.hs .Loop_outer_neon
  1885. eor v25.16b,v25.16b,v25.16b
  1886. eor v26.16b,v26.16b,v26.16b
  1887. eor v27.16b,v27.16b,v27.16b
  1888. eor v28.16b,v28.16b,v28.16b
  1889. eor v29.16b,v29.16b,v29.16b
  1890. eor v30.16b,v30.16b,v30.16b
  1891. b .Loop_outer
  1892. .Ldone_512_neon:
  1893. ldp x19,x20,[x29,#16]
  1894. add sp,sp,#128+64
  1895. ldp x21,x22,[x29,#32]
  1896. ldp x23,x24,[x29,#48]
  1897. ldp x25,x26,[x29,#64]
  1898. ldp x27,x28,[x29,#80]
  1899. ldp x29,x30,[sp],#96
  1900. .inst 0xd50323bf // autiasp
  1901. ret
  1902. .size ChaCha20_512_neon,.-ChaCha20_512_neon