// keccak1600-armv8.S -- Keccak-1600 absorb and squeeze for ARMv8 (AArch64).

  // iotas: the 24 Keccak-f[1600] round constants, one 64-bit word per round.
  //
  // Layout matters.  The section is aligned to 2^8 = 256 bytes and 64 bytes of
  // zero padding precede the table, so the table occupies offsets 64..255 and
  // its end address has all eight low bits clear.  KeccakF1600_int walks the
  // table with a post-incremented pointer and stops when (pointer & 255) == 0,
  // so the address itself doubles as the round counter.
        .text
        .align  8
        .quad   0,0,0,0,0,0,0,0         // 64 bytes of padding (see above)
        .type   iotas,%object
iotas:
        .quad   0x0000000000000001      // round  0
        .quad   0x0000000000008082
        .quad   0x800000000000808a
        .quad   0x8000000080008000
        .quad   0x000000000000808b      // round  4
        .quad   0x0000000080000001
        .quad   0x8000000080008081
        .quad   0x8000000000008009
        .quad   0x000000000000008a      // round  8
        .quad   0x0000000000000088
        .quad   0x0000000080008009
        .quad   0x000000008000000a
        .quad   0x000000008000808b      // round 12
        .quad   0x800000000000008b
        .quad   0x8000000000008089
        .quad   0x8000000000008003
        .quad   0x8000000000008002      // round 16
        .quad   0x8000000000000080
        .quad   0x000000000000800a
        .quad   0x800000008000000a
        .quad   0x8000000080008081      // round 20
        .quad   0x8000000000008080
        .quad   0x0000000080000001
        .quad   0x8000000080008008      // round 23
        .size   iotas,.-iotas
  // KeccakF1600_int: run the Keccak-f[1600] rounds on a state that lives
  // entirely in general-purpose registers.
  //
  // This is an internal helper and does not follow the standard calling
  // convention: callee-saved registers carry state and scratch values and are
  // not preserved here.
  //
  // In/out : the 25 lanes, in flat (row-major) order:
  //            row 0: x0  x1  x2  x3  x4
  //            row 1: x5  x6  x7  x8  x9
  //            row 2: x10 x11 x12 x13 x14
  //            row 3: x15 x16 x17 x25 x19      (x18 is not used)
  //            row 4: x20 x21 x22 x23 x24
  // Scratch: x26, x27, x28, x30, condition flags
  // Stack  : the caller must own 32 bytes at [sp]:
  //            [sp,#0]  / [sp,#8]   temporary home for x4/x9 during Theta
  //            [sp,#16]             running pointer into iotas
  //            [sp,#24]             saved link register
        .type   KeccakF1600_int,%function
        .align  5
KeccakF1600_int:
        adr     x28,iotas
        .inst   0xd503233f              // paciasp
        stp     x28,x30,[sp,#16]        // 32 bytes on top are mine
        b       .Loop
        .align  4
.Loop:
        // ---------------------------------------------------------- Theta
        // Column parities accumulate in x26,x27,x28,x30 and x4.
        eor     x26,x0,x5
        stp     x4,x9,[sp,#0]           // park x4/x9: both registers get reused below
        eor     x27,x1,x6
        eor     x28,x2,x7
        eor     x30,x3,x8
        eor     x4,x4,x9
        eor     x26,x26,x10
        eor     x27,x27,x11
        eor     x28,x28,x12
        eor     x30,x30,x13
        eor     x4,x4,x14
        eor     x26,x26,x15
        eor     x27,x27,x16
        eor     x28,x28,x17
        eor     x30,x30,x25
        eor     x4,x4,x19
        eor     x26,x26,x20
        eor     x28,x28,x22
        eor     x27,x27,x21
        eor     x30,x30,x23
        eor     x4,x4,x24

        // "ror #63" is a rotate-left by one bit.
        eor     x9,x26,x28,ror#63       // x9 = mask for column 1
        eor     x1,x1,x9
        eor     x6,x6,x9
        eor     x11,x11,x9
        eor     x16,x16,x9
        eor     x21,x21,x9

        eor     x9,x27,x30,ror#63       // x9  = mask for column 2
        eor     x28,x28,x4,ror#63       // x28 = mask for column 3
        eor     x30,x30,x26,ror#63      // x30 = mask for column 4
        eor     x4,x4,x27,ror#63        // x4  = mask for column 0

        eor     x27,x2,x9               // row-0 lane of column 2 goes to x27 (x2 is overwritten in Rho+Pi)
        eor     x7,x7,x9
        eor     x12,x12,x9
        eor     x17,x17,x9
        eor     x22,x22,x9

        eor     x0,x0,x4
        eor     x5,x5,x4
        eor     x10,x10,x4
        eor     x15,x15,x4
        eor     x20,x20,x4
        ldp     x4,x9,[sp,#0]           // bring back the parked x4/x9 lanes

        eor     x26,x3,x28              // row-0 lane of column 3 goes to x26
        eor     x8,x8,x28
        eor     x13,x13,x28
        eor     x25,x25,x28
        eor     x23,x23,x28

        eor     x28,x4,x30              // row-0 lane of column 4 goes to x28
        eor     x9,x9,x30
        eor     x14,x14,x30
        eor     x19,x19,x30
        eor     x24,x24,x30

        // ---------------------------------------------------------- Rho+Pi
        // Each ror both rotates a lane and moves it to its new register.
        // The order is a chain: every destination was already consumed as a
        // source by an earlier line, so no further temporaries are needed
        // beyond x26,x27,x28,x30.
        mov     x30,x1
        ror     x1,x6,#64-44
        ror     x2,x12,#64-43
        ror     x3,x25,#64-21
        ror     x4,x24,#64-14

        ror     x6,x9,#64-20
        ror     x12,x13,#64-25
        ror     x25,x17,#64-15
        ror     x24,x21,#64-2

        ror     x9,x22,#64-61
        ror     x13,x19,#64-8
        ror     x17,x11,#64-10
        ror     x21,x8,#64-55

        ror     x22,x14,#64-39
        ror     x19,x23,#64-56
        ror     x11,x7,#64-6
        ror     x8,x16,#64-45

        ror     x14,x20,#64-18
        ror     x23,x15,#64-41
        ror     x7,x10,#64-3
        ror     x16,x5,#64-36

        ror     x5,x26,#64-28           // sources are the copies made during Theta
        ror     x10,x30,#64-1
        ror     x15,x28,#64-27
        ror     x20,x27,#64-62

        // ---------------------------------------------------------- Chi+Iota
        // Per row: lane[i] ^= lane[i+2] & ~lane[i+1]  (bic d,n,m = n & ~m).
        // The round-constant fetch and the end-of-table test are interleaved
        // with rows 0 and 1.
        bic     x26,x2,x1
        bic     x27,x3,x2
        bic     x28,x0,x4
        bic     x30,x1,x0
        eor     x0,x0,x26
        bic     x26,x4,x3
        eor     x1,x1,x27
        ldr     x27,[sp,#16]            // x27 = pointer to this round's constant
        eor     x3,x3,x28
        eor     x4,x4,x30
        eor     x2,x2,x26
        ldr     x30,[x27],#8            // Iota[i++]

        bic     x26,x7,x6
        tst     x27,#255                // are we done?  (flags stay live until the bne below)
        str     x27,[sp,#16]
        bic     x27,x8,x7
        bic     x28,x5,x9
        eor     x0,x0,x30               // A[0][0] ^= Iota
        bic     x30,x6,x5
        eor     x5,x5,x26
        bic     x26,x9,x8
        eor     x6,x6,x27
        eor     x8,x8,x28
        eor     x9,x9,x30
        eor     x7,x7,x26

        bic     x26,x12,x11
        bic     x27,x13,x12
        bic     x28,x10,x14
        bic     x30,x11,x10
        eor     x10,x10,x26
        bic     x26,x14,x13
        eor     x11,x11,x27
        eor     x13,x13,x28
        eor     x14,x14,x30
        eor     x12,x12,x26

        bic     x26,x17,x16
        bic     x27,x25,x17
        bic     x28,x15,x19
        bic     x30,x16,x15
        eor     x15,x15,x26
        bic     x26,x19,x25
        eor     x16,x16,x27
        eor     x25,x25,x28
        eor     x19,x19,x30
        eor     x17,x17,x26

        bic     x26,x22,x21
        bic     x27,x23,x22
        bic     x28,x20,x24
        bic     x30,x21,x20
        eor     x20,x20,x26
        bic     x26,x24,x23
        eor     x21,x21,x27
        eor     x23,x23,x28
        eor     x24,x24,x30
        eor     x22,x22,x26

        bne     .Loop                   // none of the bic/eor/str above touch the flags

        ldr     x30,[sp,#24]            // x30 was used as scratch; reload the return address
        .inst   0xd50323bf              // autiasp
        ret
        .size   KeccakF1600_int,.-KeccakF1600_int
  // KeccakF1600: wrapper around KeccakF1600_int for a state held in memory.
  //
  // In : x0 = pointer to the state, 25 consecutive 64-bit lanes (200 bytes)
  // Out: the state is permuted in place
  //
  // Frame: 128 bytes hold x29/x30 and x19-x28; a further 48 bytes below that
  // provide the 32-byte scratch area KeccakF1600_int requires at [sp] plus a
  // slot at [sp,#32] for the state pointer.
        .type   KeccakF1600,%function
        .align  5
KeccakF1600:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        sub     sp,sp,#48

        str     x0,[sp,#32]             // offload argument: x0 becomes lane 0
        mov     x26,x0
        ldp     x0,x1,[x0,#16*0]        // load the 25 lanes into their registers
        ldp     x2,x3,[x26,#16*1]
        ldp     x4,x5,[x26,#16*2]
        ldp     x6,x7,[x26,#16*3]
        ldp     x8,x9,[x26,#16*4]
        ldp     x10,x11,[x26,#16*5]
        ldp     x12,x13,[x26,#16*6]
        ldp     x14,x15,[x26,#16*7]
        ldp     x16,x17,[x26,#16*8]
        ldp     x25,x19,[x26,#16*9]     // lane 18 lives in x25, not x18
        ldp     x20,x21,[x26,#16*10]
        ldp     x22,x23,[x26,#16*11]
        ldr     x24,[x26,#16*12]

        bl      KeccakF1600_int

        ldr     x26,[sp,#32]            // state pointer back from the stack
        stp     x0,x1,[x26,#16*0]
        stp     x2,x3,[x26,#16*1]
        stp     x4,x5,[x26,#16*2]
        stp     x6,x7,[x26,#16*3]
        stp     x8,x9,[x26,#16*4]
        stp     x10,x11,[x26,#16*5]
        stp     x12,x13,[x26,#16*6]
        stp     x14,x15,[x26,#16*7]
        stp     x16,x17,[x26,#16*8]
        stp     x25,x19,[x26,#16*9]
        stp     x20,x21,[x26,#16*10]
        stp     x22,x23,[x26,#16*11]
        str     x24,[x26,#16*12]

        ldp     x19,x20,[x29,#16]       // restores are addressed through x29
        add     sp,sp,#48
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#128
        .inst   0xd50323bf              // autiasp
        ret
        .size   KeccakF1600,.-KeccakF1600
  // SHA3_absorb: XOR whole input blocks into the state, permuting after each.
  //
  // In : x0 = uint64_t A[5][5]   state
  //      x1 = const void *inp    input
  //      x2 = size_t len         bytes available
  //      x3 = size_t bsz         block size in bytes
  //           (the unrolled sequence below consumes 8 bytes per lane and
  //            stops on an exact match, so bsz is presumably a multiple of 8
  //            no larger than 200 -- confirm against callers)
  // Out: x0 = number of trailing bytes (< bsz) that were not absorbed
  //
  // Stack below the 128-byte register save area (64 bytes):
  //      [sp,#0..31]  scratch owned by KeccakF1600_int
  //      [sp,#32]     A          [sp,#40]  inp (running)
  //      [sp,#48]     len (running, already reduced by bsz)
  //      [sp,#56]     bsz

  // Absorb one 64-bit word: \lane ^= little-endian *inp++ (x27 = inp, x26 = temp).
.macro  sha3_absorb_lane lane
        ldr     x26,[x27],#8            // *inp++
#ifdef  __AARCH64EB__
        rev     x26,x26
#endif
        eor     \lane,\lane,x26
.endm

  // Absorb lanes \idx and \idx+1, leaving for .Lprocess_block as soon as bsz
  // bytes have been consumed.  One cmp serves both exits: "lower" means the
  // block ended with lane \idx, "equal" means it ended with lane \idx+1
  // (the ldr/rev/eor in between leave the flags alone).
.macro  sha3_absorb_pair lane_a, lane_b, idx
        sha3_absorb_lane \lane_a
        cmp     x30,#8*(\idx+2)
        blo     .Lprocess_block
        sha3_absorb_lane \lane_b
        beq     .Lprocess_block
.endm

        .globl  SHA3_absorb
        .type   SHA3_absorb,%function
        .align  5
SHA3_absorb:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        sub     sp,sp,#64

        stp     x0,x1,[sp,#32]          // offload arguments
        stp     x2,x3,[sp,#48]

        mov     x26,x0                  // uint64_t A[5][5]
        mov     x27,x1                  // const void *inp
        mov     x28,x2                  // size_t len
        mov     x30,x3                  // size_t bsz
        ldp     x0,x1,[x26,#16*0]       // state -> registers (same map as KeccakF1600_int)
        ldp     x2,x3,[x26,#16*1]
        ldp     x4,x5,[x26,#16*2]
        ldp     x6,x7,[x26,#16*3]
        ldp     x8,x9,[x26,#16*4]
        ldp     x10,x11,[x26,#16*5]
        ldp     x12,x13,[x26,#16*6]
        ldp     x14,x15,[x26,#16*7]
        ldp     x16,x17,[x26,#16*8]
        ldp     x25,x19,[x26,#16*9]
        ldp     x20,x21,[x26,#16*10]
        ldp     x22,x23,[x26,#16*11]
        ldr     x24,[x26,#16*12]
        b       .Loop_absorb

        .align  4
.Loop_absorb:
        subs    x26,x28,x30             // len - bsz
        blo     .Labsorbed              // less than one full block left
        str     x26,[sp,#48]            // save len - bsz

        sha3_absorb_pair x0,  x1,  0
        sha3_absorb_pair x2,  x3,  2
        sha3_absorb_pair x4,  x5,  4
        sha3_absorb_pair x6,  x7,  6
        sha3_absorb_pair x8,  x9,  8
        sha3_absorb_pair x10, x11, 10
        sha3_absorb_pair x12, x13, 12
        sha3_absorb_pair x14, x15, 14
        sha3_absorb_pair x16, x17, 16
        sha3_absorb_pair x25, x19, 18
        sha3_absorb_pair x20, x21, 20
        sha3_absorb_pair x22, x23, 22
        sha3_absorb_lane x24

.Lprocess_block:
        str     x27,[sp,#40]            // save inp: x27 is scratch in KeccakF1600_int
        bl      KeccakF1600_int
        ldr     x27,[sp,#40]            // restore arguments
        ldp     x28,x30,[sp,#48]        // x28 = reduced len, x30 = bsz
        b       .Loop_absorb

        .align  4
.Labsorbed:
        ldr     x27,[sp,#32]            // x27 = A
        stp     x0,x1,[x27,#16*0]
        stp     x2,x3,[x27,#16*1]
        stp     x4,x5,[x27,#16*2]
        stp     x6,x7,[x27,#16*3]
        stp     x8,x9,[x27,#16*4]
        stp     x10,x11,[x27,#16*5]
        stp     x12,x13,[x27,#16*6]
        stp     x14,x15,[x27,#16*7]
        stp     x16,x17,[x27,#16*8]
        stp     x25,x19,[x27,#16*9]
        stp     x20,x21,[x27,#16*10]
        stp     x22,x23,[x27,#16*11]
        str     x24,[x27,#16*12]

        mov     x0,x28                  // return value: bytes left over
        ldp     x19,x20,[x29,#16]
        add     sp,sp,#64
        ldp     x21,x22,[x29,#32]
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldp     x29,x30,[sp],#128
        .inst   0xd50323bf              // autiasp
        ret
        .size   SHA3_absorb,.-SHA3_absorb
  // SHA3_squeeze: copy output bytes out of the state, running KeccakF1600
  // every time bsz bytes have been emitted.
  //
  // In : x0 = state, 25 consecutive 64-bit lanes
  //      x1 = output buffer
  //      x2 = number of bytes to produce
  //      x3 = bsz, bytes taken from the state between permutations
  //           (counted down in steps of 8, so presumably a non-zero multiple
  //            of 8 -- confirm against callers)
  // Output bytes are the lanes in little-endian byte order.
  //
  // A request for zero bytes returns immediately.  Without that check the
  // code would enter the byte tail with a count of 0, store a byte, and the
  // count-down would wrap past zero, writing 7 bytes beyond the buffer.
        .globl  SHA3_squeeze
        .type   SHA3_squeeze,%function
        .align  5
SHA3_squeeze:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-48]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]

        mov     x19,x0                  // put aside arguments (KeccakF1600 is called below)
        mov     x20,x1                  // x20 = out
        mov     x21,x2                  // x21 = bytes still to write
        mov     x22,x3                  // x22 = bsz
        cbz     x21,.Lsqueeze_done      // nothing requested: write nothing

.Loop_squeeze:
        ldr     x4,[x0],#8              // next lane of the state
        cmp     x21,#8
        blo     .Lsqueeze_tail          // fewer than 8 bytes wanted: finish bytewise
#ifdef  __AARCH64EB__
        rev     x4,x4
#endif
        str     x4,[x20],#8
        subs    x21,x21,#8
        beq     .Lsqueeze_done

        subs    x3,x3,#8                // x3 = bytes left in the current block
        bhi     .Loop_squeeze

        mov     x0,x19                  // block exhausted: permute and start over
        bl      KeccakF1600
        mov     x0,x19
        mov     x3,x22
        b       .Loop_squeeze

        .align  4
.Lsqueeze_tail:
        // 1..7 bytes remain; emit x4 least-significant byte first.
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1
        lsr     x4,x4,#8
        subs    x21,x21,#1
        beq     .Lsqueeze_done
        strb    w4,[x20],#1             // seventh byte: the count cannot exceed 7 here

.Lsqueeze_done:
        ldp     x19,x20,[sp,#16]
        ldp     x21,x22,[sp,#32]
        ldp     x29,x30,[sp],#48
        .inst   0xd50323bf              // autiasp
        ret
        .size   SHA3_squeeze,.-SHA3_squeeze
  // KeccakF1600_ce: Keccak-f[1600] rounds on a state held in vector registers.
  //
  // In/out : lanes 0..24 in v0..v24
  // Scratch: v25-v31, x9 (iteration counter), x10 (pointer into iotas),
  //          x11 (current round constant), condition flags
  //
  // The loop body contains two rounds and runs 12 times.  Going by the
  // mnemonic comments, the first round leaves several lanes in v25-v31 and the
  // second round consumes them from there and returns every lane to v0..v24.
  //
  // eor3 / rax1 / xar / bcax are emitted as raw words with .inst; the text
  // after each word is the intended instruction (presumably so the file
  // assembles with toolchains that lack these mnemonics).
        .type   KeccakF1600_ce,%function
        .align  5
KeccakF1600_ce:
        mov     x9,#12
        adr     x10,iotas
        b       .Loop_ce
        .align  4
.Loop_ce:
        // ================================================ first round
        // ------------------------------------------------ Theta
        .inst   0xce052819              // eor3 v25.16b,v0.16b,v5.16b,v10.16b
        .inst   0xce062c3a              // eor3 v26.16b,v1.16b,v6.16b,v11.16b
        .inst   0xce07305b              // eor3 v27.16b,v2.16b,v7.16b,v12.16b
        .inst   0xce08347c              // eor3 v28.16b,v3.16b,v8.16b,v13.16b
        .inst   0xce09389d              // eor3 v29.16b,v4.16b,v9.16b,v14.16b
        .inst   0xce0f5339              // eor3 v25.16b,v25.16b, v15.16b,v20.16b
        .inst   0xce10575a              // eor3 v26.16b,v26.16b, v16.16b,v21.16b
        .inst   0xce115b7b              // eor3 v27.16b,v27.16b, v17.16b,v22.16b
        .inst   0xce125f9c              // eor3 v28.16b,v28.16b, v18.16b,v23.16b
        .inst   0xce1363bd              // eor3 v29.16b,v29.16b, v19.16b,v24.16b

        .inst   0xce7b8f3e              // rax1 v30.16b,v25.16b,v27.16b // D[1]
        .inst   0xce7c8f5f              // rax1 v31.16b,v26.16b,v28.16b // D[2]
        .inst   0xce7d8f7b              // rax1 v27.16b,v27.16b,v29.16b // D[3]
        .inst   0xce798f9c              // rax1 v28.16b,v28.16b,v25.16b // D[4]
        .inst   0xce7a8fbd              // rax1 v29.16b,v29.16b,v26.16b // D[0]

        // ------------------------------------------------ Theta+Rho+Pi
        .inst   0xce9e50d9              // xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
        .inst   0xce9cb126              // xar v6.16b,v9.16b,v28.16b,#64-20
        .inst   0xce9f0ec9              // xar v9.16b,v22.16b,v31.16b,#64-61
        .inst   0xce9c65d6              // xar v22.16b,v14.16b,v28.16b,#64-39
        .inst   0xce9dba8e              // xar v14.16b,v20.16b,v29.16b,#64-18
        .inst   0xce9f0854              // xar v20.16b,v2.16b,v31.16b,#64-62

        .inst   0xce9f5582              // xar v2.16b,v12.16b,v31.16b,#64-43
        .inst   0xce9b9dac              // xar v12.16b,v13.16b,v27.16b,#64-25
        .inst   0xce9ce26d              // xar v13.16b,v19.16b,v28.16b,#64-8
        .inst   0xce9b22f3              // xar v19.16b,v23.16b,v27.16b,#64-56
        .inst   0xce9d5df7              // xar v23.16b,v15.16b,v29.16b,#64-41
        .inst   0xce9c948f              // xar v15.16b,v4.16b,v28.16b,#64-27

        eor     v0.16b,v0.16b,v29.16b
        ldr     x11,[x10],#8            // round constant for this round

        .inst   0xce9bae5a              // xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
        .inst   0xce9fc632              // xar v18.16b,v17.16b,v31.16b,#64-15
        .inst   0xce9ed971              // xar v17.16b,v11.16b,v30.16b,#64-10
        .inst   0xce9fe8eb              // xar v11.16b,v7.16b,v31.16b,#64-6
        .inst   0xce9df547              // xar v7.16b,v10.16b,v29.16b,#64-3
        .inst   0xce9efc2a              // xar v10.16b,v1.16b,v30.16b,#64-1 // *

        .inst   0xce9ccb04              // xar v4.16b,v24.16b,v28.16b,#64-14
        .inst   0xce9efab8              // xar v24.16b,v21.16b,v30.16b,#64-2
        .inst   0xce9b2515              // xar v21.16b,v8.16b,v27.16b,#64-55
        .inst   0xce9e4e08              // xar v8.16b,v16.16b,v30.16b,#64-45
        .inst   0xce9d70b0              // xar v16.16b,v5.16b,v29.16b,#64-36
        .inst   0xce9b907b              // xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]

        // ------------------------------------------------ Chi+Iota
        dup     v31.2d,x11              // borrow C[6]
        .inst   0xce22641c              // bcax v28.16b, v0.16b,v2.16b,v25.16b // *
        .inst   0xce3a0b21              // bcax v1.16b,v25.16b, v26.16b, v2.16b // *
        .inst   0xce246842              // bcax v2.16b,v2.16b,v4.16b,v26.16b
        .inst   0xce201343              // bcax v3.16b,v26.16b, v0.16b,v4.16b
        .inst   0xce390084              // bcax v4.16b,v4.16b,v25.16b, v0.16b

        .inst   0xce271b65              // bcax v5.16b,v27.16b, v7.16b,v6.16b // *
        .inst   0xce281cd9              // bcax v25.16b, v6.16b,v8.16b,v7.16b // *
        .inst   0xce2920e7              // bcax v7.16b,v7.16b,v9.16b,v8.16b
        .inst   0xce3b2508              // bcax v8.16b,v8.16b,v27.16b, v9.16b
        .inst   0xce266d29              // bcax v9.16b,v9.16b,v6.16b,v27.16b

        eor     v0.16b,v28.16b,v31.16b  // Iota

        .inst   0xce2c2d5a              // bcax v26.16b, v10.16b,v12.16b,v11.16b // *
        .inst   0xce2d317b              // bcax v27.16b, v11.16b,v13.16b,v12.16b // *
        .inst   0xce2e358c              // bcax v12.16b,v12.16b,v14.16b,v13.16b
        .inst   0xce2a39ad              // bcax v13.16b,v13.16b,v10.16b,v14.16b
        .inst   0xce2b29ce              // bcax v14.16b,v14.16b,v11.16b,v10.16b

        .inst   0xce3141fc              // bcax v28.16b, v15.16b,v17.16b,v16.16b // *
        .inst   0xce32461d              // bcax v29.16b, v16.16b,v18.16b,v17.16b // *
        .inst   0xce334a31              // bcax v17.16b,v17.16b,v19.16b,v18.16b
        .inst   0xce2f4e52              // bcax v18.16b,v18.16b,v15.16b,v19.16b
        .inst   0xce303e73              // bcax v19.16b,v19.16b,v16.16b,v15.16b

        .inst   0xce36569e              // bcax v30.16b, v20.16b,v22.16b,v21.16b // *
        .inst   0xce375abf              // bcax v31.16b, v21.16b,v23.16b,v22.16b // *
        .inst   0xce385ed6              // bcax v22.16b,v22.16b,v24.16b,v23.16b
        .inst   0xce3462f7              // bcax v23.16b,v23.16b,v20.16b,v24.16b
        .inst   0xce355318              // bcax v24.16b,v24.16b,v21.16b,v20.16b

        // ================================================ second round
        // ------------------------------------------------ Theta
        .inst   0xce056806              // eor3 v6.16b,v0.16b,v5.16b,v26.16b
        .inst   0xce196c2a              // eor3 v10.16b,v1.16b,v25.16b,v27.16b
        .inst   0xce07304b              // eor3 v11.16b,v2.16b,v7.16b,v12.16b
        .inst   0xce08346f              // eor3 v15.16b,v3.16b,v8.16b,v13.16b
        .inst   0xce093890              // eor3 v16.16b,v4.16b,v9.16b,v14.16b
        .inst   0xce1c78c6              // eor3 v6.16b,v6.16b, v28.16b,v30.16b
        .inst   0xce1d7d4a              // eor3 v10.16b,v10.16b, v29.16b,v31.16b
        .inst   0xce11596b              // eor3 v11.16b,v11.16b, v17.16b,v22.16b
        .inst   0xce125def              // eor3 v15.16b,v15.16b, v18.16b,v23.16b
        .inst   0xce136210              // eor3 v16.16b,v16.16b, v19.16b,v24.16b

        .inst   0xce6b8cd4              // rax1 v20.16b,v6.16b,v11.16b // D[1]
        .inst   0xce6f8d55              // rax1 v21.16b,v10.16b,v15.16b // D[2]
        .inst   0xce708d6b              // rax1 v11.16b,v11.16b,v16.16b // D[3]
        .inst   0xce668def              // rax1 v15.16b,v15.16b,v6.16b // D[4]
        .inst   0xce6a8e10              // rax1 v16.16b,v16.16b,v10.16b // D[0]

        // ------------------------------------------------ Theta+Rho+Pi
        .inst   0xce945326              // xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
        .inst   0xce8fb139              // xar v25.16b,v9.16b,v15.16b,#64-20
        .inst   0xce950ec9              // xar v9.16b,v22.16b,v21.16b,#64-61
        .inst   0xce8f65d6              // xar v22.16b,v14.16b,v15.16b,#64-39
        .inst   0xce90bbce              // xar v14.16b,v30.16b,v16.16b,#64-18
        .inst   0xce95085e              // xar v30.16b,v2.16b,v21.16b,#64-62

        .inst   0xce955582              // xar v2.16b,v12.16b,v21.16b,#64-43
        .inst   0xce8b9dac              // xar v12.16b,v13.16b,v11.16b,#64-25
        .inst   0xce8fe26d              // xar v13.16b,v19.16b,v15.16b,#64-8
        .inst   0xce8b22f3              // xar v19.16b,v23.16b,v11.16b,#64-56
        .inst   0xce905f97              // xar v23.16b,v28.16b,v16.16b,#64-41
        .inst   0xce8f949c              // xar v28.16b,v4.16b,v15.16b,#64-27

        eor     v0.16b,v0.16b,v16.16b
        ldr     x11,[x10],#8            // round constant for this round

        .inst   0xce8bae4a              // xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
        .inst   0xce95c632              // xar v18.16b,v17.16b,v21.16b,#64-15
        .inst   0xce94db71              // xar v17.16b,v27.16b,v20.16b,#64-10
        .inst   0xce95e8fb              // xar v27.16b,v7.16b,v21.16b,#64-6
        .inst   0xce90f747              // xar v7.16b,v26.16b,v16.16b,#64-3
        .inst   0xce94fc3a              // xar v26.16b,v1.16b,v20.16b,#64-1 // *

        .inst   0xce8fcb04              // xar v4.16b,v24.16b,v15.16b,#64-14
        .inst   0xce94fbf8              // xar v24.16b,v31.16b,v20.16b,#64-2
        .inst   0xce8b251f              // xar v31.16b,v8.16b,v11.16b,#64-55
        .inst   0xce944fa8              // xar v8.16b,v29.16b,v20.16b,#64-45
        .inst   0xce9070bd              // xar v29.16b,v5.16b,v16.16b,#64-36
        .inst   0xce8b906b              // xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]

        // ------------------------------------------------ Chi+Iota
        dup     v21.2d,x11              // borrow C[6]
        .inst   0xce22180f              // bcax v15.16b, v0.16b,v2.16b,v6.16b // *
        .inst   0xce2a08c1              // bcax v1.16b,v6.16b, v10.16b, v2.16b // *
        .inst   0xce242842              // bcax v2.16b,v2.16b,v4.16b,v10.16b
        .inst   0xce201143              // bcax v3.16b,v10.16b, v0.16b,v4.16b
        .inst   0xce260084              // bcax v4.16b,v4.16b,v6.16b, v0.16b

        .inst   0xce276565              // bcax v5.16b,v11.16b, v7.16b,v25.16b // *
        .inst   0xce281f26              // bcax v6.16b, v25.16b,v8.16b,v7.16b // *
        .inst   0xce2920e7              // bcax v7.16b,v7.16b,v9.16b,v8.16b
        .inst   0xce2b2508              // bcax v8.16b,v8.16b,v11.16b, v9.16b
        .inst   0xce392d29              // bcax v9.16b,v9.16b,v25.16b,v11.16b

        eor     v0.16b,v15.16b,v21.16b  // Iota

        .inst   0xce2c6f4a              // bcax v10.16b, v26.16b,v12.16b,v27.16b // *
        .inst   0xce2d336b              // bcax v11.16b, v27.16b,v13.16b,v12.16b // *
        .inst   0xce2e358c              // bcax v12.16b,v12.16b,v14.16b,v13.16b
        .inst   0xce3a39ad              // bcax v13.16b,v13.16b,v26.16b,v14.16b
        .inst   0xce3b69ce              // bcax v14.16b,v14.16b,v27.16b,v26.16b

        .inst   0xce31778f              // bcax v15.16b, v28.16b,v17.16b,v29.16b // *
        .inst   0xce3247b0              // bcax v16.16b, v29.16b,v18.16b,v17.16b // *
        .inst   0xce334a31              // bcax v17.16b,v17.16b,v19.16b,v18.16b
        .inst   0xce3c4e52              // bcax v18.16b,v18.16b,v28.16b,v19.16b
        .inst   0xce3d7273              // bcax v19.16b,v19.16b,v29.16b,v28.16b

        .inst   0xce367fd4              // bcax v20.16b, v30.16b,v22.16b,v31.16b // *
        .inst   0xce375bf5              // bcax v21.16b, v31.16b,v23.16b,v22.16b // *
        .inst   0xce385ed6              // bcax v22.16b,v22.16b,v24.16b,v23.16b
        .inst   0xce3e62f7              // bcax v23.16b,v23.16b,v30.16b,v24.16b
        .inst   0xce3f7b18              // bcax v24.16b,v24.16b,v31.16b,v30.16b

        subs    x9,x9,#1
        bne     .Loop_ce
        ret
        .size   KeccakF1600_ce,.-KeccakF1600_ce
  // KeccakF1600_cext: wrapper around KeccakF1600_ce for a state held in memory.
  //
  // In : x0 = pointer to the state, 25 consecutive 64-bit lanes
  // Out: the state is permuted in place; x0 is left unchanged
  //
  // d8-d15 are saved and restored around the call because the lanes are
  // loaded into d0-d24.
        .type   KeccakF1600_cext,%function
        .align  5
KeccakF1600_cext:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0
        stp     d8,d9,[sp,#16]          // per ABI requirement
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        ldp     d0,d1,[x0,#8*0]         // lane i -> d(i)
        ldp     d2,d3,[x0,#8*2]
        ldp     d4,d5,[x0,#8*4]
        ldp     d6,d7,[x0,#8*6]
        ldp     d8,d9,[x0,#8*8]
        ldp     d10,d11,[x0,#8*10]
        ldp     d12,d13,[x0,#8*12]
        ldp     d14,d15,[x0,#8*14]
        ldp     d16,d17,[x0,#8*16]
        ldp     d18,d19,[x0,#8*18]
        ldp     d20,d21,[x0,#8*20]
        ldp     d22,d23,[x0,#8*22]
        ldr     d24,[x0,#8*24]

        bl      KeccakF1600_ce
        ldr     x30,[sp,#8]             // bl overwrote the link register

        stp     d0,d1,[x0,#8*0]
        stp     d2,d3,[x0,#8*2]
        stp     d4,d5,[x0,#8*4]
        stp     d6,d7,[x0,#8*6]
        stp     d8,d9,[x0,#8*8]
        stp     d10,d11,[x0,#8*10]
        stp     d12,d13,[x0,#8*12]
        stp     d14,d15,[x0,#8*14]
        stp     d16,d17,[x0,#8*16]
        stp     d18,d19,[x0,#8*18]
        stp     d20,d21,[x0,#8*20]
        stp     d22,d23,[x0,#8*22]
        str     d24,[x0,#8*24]

        ldp     d8,d9,[sp,#16]
        ldp     d10,d11,[sp,#32]
        ldp     d12,d13,[sp,#48]
        ldp     d14,d15,[sp,#64]
        ldr     x29,[sp],#80            // x30 was already reloaded above
        .inst   0xd50323bf              // autiasp
        ret
        .size   KeccakF1600_cext,.-KeccakF1600_cext
  // SHA3_absorb_cext: same job as SHA3_absorb, with the state kept in vector
  // registers and permuted by KeccakF1600_ce.
  //
  // In : x0 = state (25 consecutive 64-bit lanes)
  //      x1 = input pointer
  //      x2 = bytes available
  //      x3 = block size in bytes
  //           (consumed 8 bytes per lane with an exact-match exit, so
  //            presumably a multiple of 8 no larger than 200 -- confirm
  //            against callers)
  // Out: x0 = number of trailing bytes (< block size) that were not absorbed
  //
  // x0-x3 stay live across the calls: KeccakF1600_ce only uses x9-x11 and
  // vector registers.

  // Absorb one 64-bit word: \vec ^= little-endian *inp++ (x1 = inp, v31 = temp).
  // \vec is a full operand such as v0.16b.
.macro  sha3_absorb_lane_ce vec
        ldr     d31,[x1],#8             // *inp++
#ifdef  __AARCH64EB__
        rev64   v31.16b,v31.16b
#endif
        eor     \vec,\vec,v31.16b
.endm

  // Absorb lanes \idx and \idx+1, leaving for .Lprocess_block_ce as soon as
  // the block size has been consumed ("lower" after the first lane, "equal"
  // after the second; the ldr/rev64/eor in between leave the flags alone).
.macro  sha3_absorb_pair_ce vec_a, vec_b, idx
        sha3_absorb_lane_ce \vec_a
        cmp     x3,#8*(\idx+2)
        blo     .Lprocess_block_ce
        sha3_absorb_lane_ce \vec_b
        beq     .Lprocess_block_ce
.endm

        .globl  SHA3_absorb_cext
        .type   SHA3_absorb_cext,%function
        .align  5
SHA3_absorb_cext:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-80]!
        add     x29,sp,#0
        stp     d8,d9,[sp,#16]          // per ABI requirement
        stp     d10,d11,[sp,#32]
        stp     d12,d13,[sp,#48]
        stp     d14,d15,[sp,#64]

        ldp     d0,d1,[x0,#8*0]         // lane i -> d(i)
        ldp     d2,d3,[x0,#8*2]
        ldp     d4,d5,[x0,#8*4]
        ldp     d6,d7,[x0,#8*6]
        ldp     d8,d9,[x0,#8*8]
        ldp     d10,d11,[x0,#8*10]
        ldp     d12,d13,[x0,#8*12]
        ldp     d14,d15,[x0,#8*14]
        ldp     d16,d17,[x0,#8*16]
        ldp     d18,d19,[x0,#8*18]
        ldp     d20,d21,[x0,#8*20]
        ldp     d22,d23,[x0,#8*22]
        ldr     d24,[x0,#8*24]
        b       .Loop_absorb_ce

        .align  4
.Loop_absorb_ce:
        subs    x2,x2,x3                // len - bsz
        blo     .Labsorbed_ce           // less than one full block left

        sha3_absorb_pair_ce v0.16b,  v1.16b,  0
        sha3_absorb_pair_ce v2.16b,  v3.16b,  2
        sha3_absorb_pair_ce v4.16b,  v5.16b,  4
        sha3_absorb_pair_ce v6.16b,  v7.16b,  6
        sha3_absorb_pair_ce v8.16b,  v9.16b,  8
        sha3_absorb_pair_ce v10.16b, v11.16b, 10
        sha3_absorb_pair_ce v12.16b, v13.16b, 12
        sha3_absorb_pair_ce v14.16b, v15.16b, 14
        sha3_absorb_pair_ce v16.16b, v17.16b, 16
        sha3_absorb_pair_ce v18.16b, v19.16b, 18
        sha3_absorb_pair_ce v20.16b, v21.16b, 20
        sha3_absorb_pair_ce v22.16b, v23.16b, 22
        sha3_absorb_lane_ce v24.16b

.Lprocess_block_ce:
        bl      KeccakF1600_ce
        b       .Loop_absorb_ce

        .align  4
.Labsorbed_ce:
        stp     d0,d1,[x0,#8*0]
        stp     d2,d3,[x0,#8*2]
        stp     d4,d5,[x0,#8*4]
        stp     d6,d7,[x0,#8*6]
        stp     d8,d9,[x0,#8*8]
        stp     d10,d11,[x0,#8*10]
        stp     d12,d13,[x0,#8*12]
        stp     d14,d15,[x0,#8*14]
        stp     d16,d17,[x0,#8*16]
        stp     d18,d19,[x0,#8*18]
        stp     d20,d21,[x0,#8*20]
        stp     d22,d23,[x0,#8*22]
        str     d24,[x0,#8*24]

        add     x0,x2,x3                // return value: undo the final, failed subtraction
        ldp     d8,d9,[sp,#16]
        ldp     d10,d11,[sp,#32]
        ldp     d12,d13,[sp,#48]
        ldp     d14,d15,[sp,#64]
        ldp     x29,x30,[sp],#80        // also restores x30, clobbered by the bl above
        .inst   0xd50323bf              // autiasp
        ret
        .size   SHA3_absorb_cext,.-SHA3_absorb_cext
  // SHA3_squeeze_cext: same job as SHA3_squeeze, permuting with
  // KeccakF1600_cext.
  //
  // In : x0 = state, 25 consecutive 64-bit lanes
  //      x1 = output buffer
  //      x2 = number of bytes to produce
  //      x3 = bsz, bytes taken from the state between permutations
  //           (counted down in steps of 8, so presumably a non-zero multiple
  //            of 8 -- confirm against callers)
  // Output bytes are the lanes in little-endian byte order.
  //
  // The arguments stay in x0-x3 across the call; the visible body of
  // KeccakF1600_cext leaves those registers alone.
  //
  // A request for zero bytes returns immediately.  Without that check the
  // code would enter the byte tail with a count of 0, store a byte, and the
  // count-down would wrap past zero, writing 7 bytes beyond the buffer.
        .globl  SHA3_squeeze_cext
        .type   SHA3_squeeze_cext,%function
        .align  5
SHA3_squeeze_cext:
        .inst   0xd503233f              // paciasp
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
        mov     x9,x0                   // x9  = read cursor into the state
        mov     x10,x3                  // x10 = bytes left in the current block
        cbz     x2,.Lsqueeze_done_ce    // nothing requested: write nothing

.Loop_squeeze_ce:
        ldr     x4,[x9],#8              // next lane of the state
        cmp     x2,#8
        blo     .Lsqueeze_tail_ce       // fewer than 8 bytes wanted: finish bytewise
#ifdef  __AARCH64EB__
        rev     x4,x4
#endif
        str     x4,[x1],#8
        beq     .Lsqueeze_done_ce       // flags still from the cmp: exactly 8 were wanted

        sub     x2,x2,#8
        subs    x10,x10,#8
        bhi     .Loop_squeeze_ce

        bl      KeccakF1600_cext        // block exhausted: permute and start over
        ldr     x30,[sp,#8]             // bl overwrote the link register
        mov     x9,x0
        mov     x10,x3
        b       .Loop_squeeze_ce

        .align  4
.Lsqueeze_tail_ce:
        // 1..7 bytes remain; emit x4 least-significant byte first.
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1
        lsr     x4,x4,#8
        subs    x2,x2,#1
        beq     .Lsqueeze_done_ce
        strb    w4,[x1],#1              // seventh byte: the count cannot exceed 7 here

.Lsqueeze_done_ce:
        ldr     x29,[sp],#16            // x30 is intact or was reloaded after the bl
        .inst   0xd50323bf              // autiasp
        ret
        .size   SHA3_squeeze_cext,.-SHA3_squeeze_cext
  // NUL-terminated identification string embedded in the object:
  // "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
        .byte   75,101,99,99,97,107,45,49,54,48,48,32           // "Keccak-1600 "
        .byte   97,98,115,111,114,98,32                         // "absorb "
        .byte   97,110,100,32                                   // "and "
        .byte   115,113,117,101,101,122,101,32                  // "squeeze "
        .byte   102,111,114,32                                  // "for "
        .byte   65,82,77,118,56,44,32                           // "ARMv8, "
        .byte   67,82,89,80,84,79,71,65,77,83,32                // "CRYPTOGAMS "
        .byte   98,121,32                                       // "by "
        .byte   60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62  // "<appro@openssl.org>"
        .byte   0
        .align  2