keccak1600-armv8.S 24 KB


  // Keccak-f[1600] round constants, placed in .text and addressed
  // PC-relatively with `adr` by KeccakF1600_int and KeccakF1600_ce.
  //
  // Layout: 8 zero quads (64 bytes of padding) followed by the 24 round
  // constants (192 bytes), 256 bytes in total. Provided `.align 8` yields
  // 256-byte alignment (on ARM/AArch64 gas the operand is a power of two),
  // the first address past the table has its low 8 bits clear. That is the
  // condition KeccakF1600_int checks with `tst x27,#255` to leave its loop,
  // so neither the padding nor the number of entries may change.
  1. .text
  2. .align 8 // strategic alignment and padding that allows to use
  3. // address value as loop termination condition...
  4. .quad 0,0,0,0,0,0,0,0
  // One 64-bit constant per round, consumed in order (round 0 first).
  5. iotas:
  6. .quad 0x0000000000000001
  7. .quad 0x0000000000008082
  8. .quad 0x800000000000808a
  9. .quad 0x8000000080008000
  10. .quad 0x000000000000808b
  11. .quad 0x0000000080000001
  12. .quad 0x8000000080008081
  13. .quad 0x8000000000008009
  14. .quad 0x000000000000008a
  15. .quad 0x0000000000000088
  16. .quad 0x0000000080008009
  17. .quad 0x000000008000000a
  18. .quad 0x000000008000808b
  19. .quad 0x800000000000008b
  20. .quad 0x8000000000008089
  21. .quad 0x8000000000008003
  22. .quad 0x8000000000008002
  23. .quad 0x8000000000000080
  24. .quad 0x000000000000800a
  25. .quad 0x800000008000000a
  26. .quad 0x8000000080008081
  27. .quad 0x8000000000008080
  28. .quad 0x0000000080000001
  29. .quad 0x8000000080008008
  30. .align 5
  // KeccakF1600_int: register-resident Keccak-f[1600] permutation (internal).
  //
  // In/out: the 25 state lanes live in general registers. Lane i is in x<i>,
  //         except lane 18 which is in x25 (x18 is never touched here), i.e.
  //         x0-x17, x25, x19-x24 in flat A[5][5] order as loaded by the callers.
  // Stack:  uses the 32 bytes at [sp,#0..#31] of the CALLER's frame:
  //           [sp,#0]  temporary home for x4,x9 during Theta
  //           [sp,#16] running pointer into iotas
  //           [sp,#24] return address (x30 is reused as scratch in the loop)
  // Clobbers: x26, x27, x28, x30 (x30 is reloaded before return), flags.
  // Loop count: one iteration per iotas entry; the loop ends when the
  //         post-incremented table pointer has its low 8 bits clear (see the
  //         alignment/padding of the iotas table).
  31. KeccakF1600_int:
  32. adr x28,iotas
  33. .long 0xd503233f // paciasp
  34. stp x28,x30,[sp,#16] // 32 bytes on top are mine
  35. b Loop
  36. .align 4
  37. Loop:
  38. ////////////////////////////////////////// Theta
  // Column parities C[0..4] are accumulated in x26, x27, x28, x30 and x4.
  // x4 and x9 still hold live lanes, so they are parked on the stack first
  // and x4/x9 are then reused as temporaries.
  39. eor x26,x0,x5
  40. stp x4,x9,[sp,#0] // offload pair...
  41. eor x27,x1,x6
  42. eor x28,x2,x7
  43. eor x30,x3,x8
  44. eor x4,x4,x9
  45. eor x26,x26,x10
  46. eor x27,x27,x11
  47. eor x28,x28,x12
  48. eor x30,x30,x13
  49. eor x4,x4,x14
  50. eor x26,x26,x15
  51. eor x27,x27,x16
  52. eor x28,x28,x17
  53. eor x30,x30,x25
  54. eor x4,x4,x19
  55. eor x26,x26,x20
  56. eor x28,x28,x22
  57. eor x27,x27,x21
  58. eor x30,x30,x23
  59. eor x4,x4,x24
  // ror#63 is a rotate-left by 1, so x9 = C[0] ^ rol(C[2],1); it is XORed
  // into the five lanes of column 1.
  60. eor x9,x26,x28,ror#63
  61. eor x1,x1,x9
  62. eor x6,x6,x9
  63. eor x11,x11,x9
  64. eor x16,x16,x9
  65. eor x21,x21,x9
  // Remaining four column masks, formed in place: x9 (for column 2),
  // x28 (column 3), x30 (column 4), x4 (column 0).
  66. eor x9,x27,x30,ror#63
  67. eor x28,x28,x4,ror#63
  68. eor x30,x30,x26,ror#63
  69. eor x4,x4,x27,ror#63
  // Lanes 2, 3 and 4 are written to x27, x26 and x28 instead of in place;
  // this folds in the register moves that Rho+Pi would otherwise need.
  70. eor x27, x2,x9 // mov x27,x2
  71. eor x7,x7,x9
  72. eor x12,x12,x9
  73. eor x17,x17,x9
  74. eor x22,x22,x9
  75. eor x0,x0,x4
  76. eor x5,x5,x4
  77. eor x10,x10,x4
  78. eor x15,x15,x4
  79. eor x20,x20,x4
  80. ldp x4,x9,[sp,#0] // re-load offloaded data
  81. eor x26, x3,x28 // mov x26,x3
  82. eor x8,x8,x28
  83. eor x13,x13,x28
  84. eor x25,x25,x28
  85. eor x23,x23,x28
  86. eor x28, x4,x30 // mov x28,x4
  87. eor x9,x9,x30
  88. eor x14,x14,x30
  89. eor x19,x19,x30
  90. eor x24,x24,x30
  91. ////////////////////////////////////////// Rho+Pi
  // Each `ror dst,src,#64-n` is a rotate-left by n combined with the lane
  // move src -> dst. The order matters: every destination is a register
  // whose old value has already been consumed (or saved in x30/x26/x27/x28).
  92. mov x30,x1
  93. ror x1,x6,#64-44
  94. //mov x27,x2
  95. ror x2,x12,#64-43
  96. //mov x26,x3
  97. ror x3,x25,#64-21
  98. //mov x28,x4
  99. ror x4,x24,#64-14
  100. ror x6,x9,#64-20
  101. ror x12,x13,#64-25
  102. ror x25,x17,#64-15
  103. ror x24,x21,#64-2
  104. ror x9,x22,#64-61
  105. ror x13,x19,#64-8
  106. ror x17,x11,#64-10
  107. ror x21,x8,#64-55
  108. ror x22,x14,#64-39
  109. ror x19,x23,#64-56
  110. ror x11,x7,#64-6
  111. ror x8,x16,#64-45
  112. ror x14,x20,#64-18
  113. ror x23,x15,#64-41
  114. ror x7,x10,#64-3
  115. ror x16,x5,#64-36
  116. ror x5,x26,#64-28
  117. ror x10,x30,#64-1
  118. ror x15,x28,#64-27
  119. ror x20,x27,#64-62
  120. ////////////////////////////////////////// Chi+Iota
  // Per row of five lanes: a[i] ^= ~a[i+1] & a[i+2], using `bic` (and-not)
  // into the scratch registers before any lane of the row is overwritten.
  // The iota load, pointer update and termination test are interleaved with
  // the first two rows.
  121. bic x26,x2,x1
  122. bic x27,x3,x2
  123. bic x28,x0,x4
  124. bic x30,x1,x0
  125. eor x0,x0,x26
  126. bic x26,x4,x3
  127. eor x1,x1,x27
  128. ldr x27,[sp,#16]
  129. eor x3,x3,x28
  130. eor x4,x4,x30
  131. eor x2,x2,x26
  132. ldr x30,[x27],#8 // Iota[i++]
  133. bic x26,x7,x6
  // The flags set here are consumed by the `bne Loop` at the bottom; none of
  // the eor/bic/str instructions in between modifies them.
  134. tst x27,#255 // are we done?
  135. str x27,[sp,#16]
  136. bic x27,x8,x7
  137. bic x28,x5,x9
  138. eor x0,x0,x30 // A[0][0] ^= Iota
  139. bic x30,x6,x5
  140. eor x5,x5,x26
  141. bic x26,x9,x8
  142. eor x6,x6,x27
  143. eor x8,x8,x28
  144. eor x9,x9,x30
  145. eor x7,x7,x26
  146. bic x26,x12,x11
  147. bic x27,x13,x12
  148. bic x28,x10,x14
  149. bic x30,x11,x10
  150. eor x10,x10,x26
  151. bic x26,x14,x13
  152. eor x11,x11,x27
  153. eor x13,x13,x28
  154. eor x14,x14,x30
  155. eor x12,x12,x26
  156. bic x26,x17,x16
  157. bic x27,x25,x17
  158. bic x28,x15,x19
  159. bic x30,x16,x15
  160. eor x15,x15,x26
  161. bic x26,x19,x25
  162. eor x16,x16,x27
  163. eor x25,x25,x28
  164. eor x19,x19,x30
  165. eor x17,x17,x26
  166. bic x26,x22,x21
  167. bic x27,x23,x22
  168. bic x28,x20,x24
  169. bic x30,x21,x20
  170. eor x20,x20,x26
  171. bic x26,x24,x23
  172. eor x21,x21,x27
  173. eor x23,x23,x28
  174. eor x24,x24,x30
  175. eor x22,x22,x26
  176. bne Loop
  // Recover the return address saved at entry, then authenticate it.
  177. ldr x30,[sp,#24]
  178. .long 0xd50323bf // autiasp
  179. ret
  .align 5
  // KeccakF1600: apply the Keccak-f[1600] permutation to a state in memory.
  //
  // In:    x0 = pointer to 25 consecutive 64-bit lanes.
  // Out:   the 25 lanes at that address are replaced by the permuted state.
  // Frame: 128-byte save area (x29, x30, x19-x28) addressed through x29,
  //        plus 48 bytes below it:
  //          [sp,#0..#31]  scratch owned by KeccakF1600_int
  //          [sp,#32]      saved state pointer
  // Register map expected by KeccakF1600_int: lane i in x<i>, except lane 18,
  // which travels in x25 instead of x18.
  KeccakF1600:
          .long   0xd503233f              // paciasp
          stp     x29,x30,[sp,#-128]!
          mov     x29,sp                  // frame pointer = base of save area
          stp     x27,x28,[sp,#80]
          stp     x25,x26,[sp,#64]
          stp     x23,x24,[sp,#48]
          stp     x21,x22,[sp,#32]
          stp     x19,x20,[sp,#16]
          sub     sp,sp,#48

          mov     x26,x0                  // x26 = state pointer while loading
          str     x26,[sp,#32]            // keep it across the permutation

          // Pull the whole state into registers (highest lanes first).
          ldr     x24,[x26,#16*12]
          ldp     x22,x23,[x26,#16*11]
          ldp     x20,x21,[x26,#16*10]
          ldp     x25,x19,[x26,#16*9]     // lane 18 -> x25, lane 19 -> x19
          ldp     x16,x17,[x26,#16*8]
          ldp     x14,x15,[x26,#16*7]
          ldp     x12,x13,[x26,#16*6]
          ldp     x10,x11,[x26,#16*5]
          ldp     x8,x9,[x26,#16*4]
          ldp     x6,x7,[x26,#16*3]
          ldp     x4,x5,[x26,#16*2]
          ldp     x2,x3,[x26,#16*1]
          ldp     x0,x1,[x26,#16*0]

          bl      KeccakF1600_int

          // x26 was used as scratch by the permutation; fetch the pointer back.
          ldr     x26,[sp,#32]
          str     x24,[x26,#16*12]
          stp     x22,x23,[x26,#16*11]
          stp     x20,x21,[x26,#16*10]
          stp     x25,x19,[x26,#16*9]
          stp     x16,x17,[x26,#16*8]
          stp     x14,x15,[x26,#16*7]
          stp     x12,x13,[x26,#16*6]
          stp     x10,x11,[x26,#16*5]
          stp     x8,x9,[x26,#16*4]
          stp     x6,x7,[x26,#16*3]
          stp     x4,x5,[x26,#16*2]
          stp     x2,x3,[x26,#16*1]
          stp     x0,x1,[x26,#16*0]

          // Tear down: drop the locals, then restore callee-saved registers.
          add     sp,sp,#48
          ldp     x27,x28,[x29,#80]
          ldp     x25,x26,[x29,#64]
          ldp     x23,x24,[x29,#48]
          ldp     x21,x22,[x29,#32]
          ldp     x19,x20,[x29,#16]
          ldp     x29,x30,[sp],#128
          .long   0xd50323bf              // autiasp
          ret
  230. .globl _SHA3_absorb
  231. .align 5
  // SHA3_absorb: XOR whole bsz-byte blocks of input into the state, running
  // KeccakF1600_int after each block, until fewer than bsz bytes remain.
  //
  // In (per the comments on the moves below):
  //   x0 = uint64_t A[5][5]   x1 = const void *inp
  //   x2 = size_t len         x3 = size_t bsz
  // Out: x0 = number of input bytes left unprocessed (the first remaining
  //      length found to be below bsz).
  // Frame: 128-byte save area (x29, x30, x19-x28) addressed through x29,
  //        plus 64 bytes below it:
  //          [sp,#0..#31]  scratch owned by KeccakF1600_int
  //          [sp,#32] A    [sp,#40] inp    [sp,#48] len    [sp,#56] bsz
  // The state stays in x0-x17, x25, x19-x24 for the whole call (lane 18 is in
  // x25); x26, x27, x28 and x30 are working registers that KeccakF1600_int
  // overwrites, hence the spills around the call.
  // NOTE(review): presumably bsz is a non-zero multiple of 8 no larger than
  // 200; nothing here checks it - confirm against the callers.
  232. _SHA3_absorb:
  233. .long 0xd503233f // paciasp
  234. stp x29,x30,[sp,#-128]!
  235. add x29,sp,#0
  236. stp x19,x20,[sp,#16]
  237. stp x21,x22,[sp,#32]
  238. stp x23,x24,[sp,#48]
  239. stp x25,x26,[sp,#64]
  240. stp x27,x28,[sp,#80]
  241. sub sp,sp,#64
  242. stp x0,x1,[sp,#32] // offload arguments
  243. stp x2,x3,[sp,#48]
  244. mov x26,x0 // uint64_t A[5][5]
  245. mov x27,x1 // const void *inp
  246. mov x28,x2 // size_t len
  247. mov x30,x3 // size_t bsz
  248. ldp x0,x1,[x26,#16*0]
  249. ldp x2,x3,[x26,#16*1]
  250. ldp x4,x5,[x26,#16*2]
  251. ldp x6,x7,[x26,#16*3]
  252. ldp x8,x9,[x26,#16*4]
  253. ldp x10,x11,[x26,#16*5]
  254. ldp x12,x13,[x26,#16*6]
  255. ldp x14,x15,[x26,#16*7]
  256. ldp x16,x17,[x26,#16*8]
  257. ldp x25,x19,[x26,#16*9]
  258. ldp x20,x21,[x26,#16*10]
  259. ldp x22,x23,[x26,#16*11]
  260. ldr x24,[x26,#16*12]
  261. b Loop_absorb
  262. .align 4
  // One iteration per input block. Lanes are absorbed two at a time:
  // a single `cmp x30,#8*(k+2)` serves both the `blo` right after lane k
  // (taken when bsz < 8*(k+2)) and the `beq` after lane k+1 (taken when
  // bsz == 8*(k+2)); the ldr/rev/eor in between leave the flags alone.
  // On big-endian builds each word is byte-swapped so that input bytes map
  // to lanes in little-endian order.
  263. Loop_absorb:
  264. subs x26,x28,x30 // len - bsz
  265. blo Labsorbed
  266. str x26,[sp,#48] // save len - bsz
  267. ldr x26,[x27],#8 // *inp++
  268. #ifdef __AARCH64EB__
  269. rev x26,x26
  270. #endif
  271. eor x0,x0,x26
  272. cmp x30,#8*(0+2)
  273. blo Lprocess_block
  274. ldr x26,[x27],#8 // *inp++
  275. #ifdef __AARCH64EB__
  276. rev x26,x26
  277. #endif
  278. eor x1,x1,x26
  279. beq Lprocess_block
  280. ldr x26,[x27],#8 // *inp++
  281. #ifdef __AARCH64EB__
  282. rev x26,x26
  283. #endif
  284. eor x2,x2,x26
  285. cmp x30,#8*(2+2)
  286. blo Lprocess_block
  287. ldr x26,[x27],#8 // *inp++
  288. #ifdef __AARCH64EB__
  289. rev x26,x26
  290. #endif
  291. eor x3,x3,x26
  292. beq Lprocess_block
  293. ldr x26,[x27],#8 // *inp++
  294. #ifdef __AARCH64EB__
  295. rev x26,x26
  296. #endif
  297. eor x4,x4,x26
  298. cmp x30,#8*(4+2)
  299. blo Lprocess_block
  300. ldr x26,[x27],#8 // *inp++
  301. #ifdef __AARCH64EB__
  302. rev x26,x26
  303. #endif
  304. eor x5,x5,x26
  305. beq Lprocess_block
  306. ldr x26,[x27],#8 // *inp++
  307. #ifdef __AARCH64EB__
  308. rev x26,x26
  309. #endif
  310. eor x6,x6,x26
  311. cmp x30,#8*(6+2)
  312. blo Lprocess_block
  313. ldr x26,[x27],#8 // *inp++
  314. #ifdef __AARCH64EB__
  315. rev x26,x26
  316. #endif
  317. eor x7,x7,x26
  318. beq Lprocess_block
  319. ldr x26,[x27],#8 // *inp++
  320. #ifdef __AARCH64EB__
  321. rev x26,x26
  322. #endif
  323. eor x8,x8,x26
  324. cmp x30,#8*(8+2)
  325. blo Lprocess_block
  326. ldr x26,[x27],#8 // *inp++
  327. #ifdef __AARCH64EB__
  328. rev x26,x26
  329. #endif
  330. eor x9,x9,x26
  331. beq Lprocess_block
  332. ldr x26,[x27],#8 // *inp++
  333. #ifdef __AARCH64EB__
  334. rev x26,x26
  335. #endif
  336. eor x10,x10,x26
  337. cmp x30,#8*(10+2)
  338. blo Lprocess_block
  339. ldr x26,[x27],#8 // *inp++
  340. #ifdef __AARCH64EB__
  341. rev x26,x26
  342. #endif
  343. eor x11,x11,x26
  344. beq Lprocess_block
  345. ldr x26,[x27],#8 // *inp++
  346. #ifdef __AARCH64EB__
  347. rev x26,x26
  348. #endif
  349. eor x12,x12,x26
  350. cmp x30,#8*(12+2)
  351. blo Lprocess_block
  352. ldr x26,[x27],#8 // *inp++
  353. #ifdef __AARCH64EB__
  354. rev x26,x26
  355. #endif
  356. eor x13,x13,x26
  357. beq Lprocess_block
  358. ldr x26,[x27],#8 // *inp++
  359. #ifdef __AARCH64EB__
  360. rev x26,x26
  361. #endif
  362. eor x14,x14,x26
  363. cmp x30,#8*(14+2)
  364. blo Lprocess_block
  365. ldr x26,[x27],#8 // *inp++
  366. #ifdef __AARCH64EB__
  367. rev x26,x26
  368. #endif
  369. eor x15,x15,x26
  370. beq Lprocess_block
  371. ldr x26,[x27],#8 // *inp++
  372. #ifdef __AARCH64EB__
  373. rev x26,x26
  374. #endif
  375. eor x16,x16,x26
  376. cmp x30,#8*(16+2)
  377. blo Lprocess_block
  378. ldr x26,[x27],#8 // *inp++
  379. #ifdef __AARCH64EB__
  380. rev x26,x26
  381. #endif
  382. eor x17,x17,x26
  383. beq Lprocess_block
  384. ldr x26,[x27],#8 // *inp++
  385. #ifdef __AARCH64EB__
  386. rev x26,x26
  387. #endif
  388. eor x25,x25,x26
  389. cmp x30,#8*(18+2)
  390. blo Lprocess_block
  391. ldr x26,[x27],#8 // *inp++
  392. #ifdef __AARCH64EB__
  393. rev x26,x26
  394. #endif
  395. eor x19,x19,x26
  396. beq Lprocess_block
  397. ldr x26,[x27],#8 // *inp++
  398. #ifdef __AARCH64EB__
  399. rev x26,x26
  400. #endif
  401. eor x20,x20,x26
  402. cmp x30,#8*(20+2)
  403. blo Lprocess_block
  404. ldr x26,[x27],#8 // *inp++
  405. #ifdef __AARCH64EB__
  406. rev x26,x26
  407. #endif
  408. eor x21,x21,x26
  409. beq Lprocess_block
  410. ldr x26,[x27],#8 // *inp++
  411. #ifdef __AARCH64EB__
  412. rev x26,x26
  413. #endif
  414. eor x22,x22,x26
  415. cmp x30,#8*(22+2)
  416. blo Lprocess_block
  417. ldr x26,[x27],#8 // *inp++
  418. #ifdef __AARCH64EB__
  419. rev x26,x26
  420. #endif
  421. eor x23,x23,x26
  422. beq Lprocess_block
  423. ldr x26,[x27],#8 // *inp++
  424. #ifdef __AARCH64EB__
  425. rev x26,x26
  426. #endif
  427. eor x24,x24,x26
  // Permute, then reload everything KeccakF1600_int clobbered: inp from
  // [sp,#40], and the pair (updated len, bsz) from [sp,#48]/[sp,#56].
  428. Lprocess_block:
  429. str x27,[sp,#40] // save inp
  430. bl KeccakF1600_int
  431. ldr x27,[sp,#40] // restore arguments
  432. ldp x28,x30,[sp,#48]
  433. b Loop_absorb
  434. .align 4
  // Fewer than bsz bytes remain: write the state back through the saved A
  // pointer and return the remaining length held in x28.
  435. Labsorbed:
  436. ldr x27,[sp,#32]
  437. stp x0,x1,[x27,#16*0]
  438. stp x2,x3,[x27,#16*1]
  439. stp x4,x5,[x27,#16*2]
  440. stp x6,x7,[x27,#16*3]
  441. stp x8,x9,[x27,#16*4]
  442. stp x10,x11,[x27,#16*5]
  443. stp x12,x13,[x27,#16*6]
  444. stp x14,x15,[x27,#16*7]
  445. stp x16,x17,[x27,#16*8]
  446. stp x25,x19,[x27,#16*9]
  447. stp x20,x21,[x27,#16*10]
  448. stp x22,x23,[x27,#16*11]
  449. str x24,[x27,#16*12]
  450. mov x0,x28 // return value
  451. ldp x19,x20,[x29,#16]
  452. add sp,sp,#64
  453. ldp x21,x22,[x29,#32]
  454. ldp x23,x24,[x29,#48]
  455. ldp x25,x26,[x29,#64]
  456. ldp x27,x28,[x29,#80]
  457. ldp x29,x30,[sp],#128
  458. .long 0xd50323bf // autiasp
  459. ret
  .globl _SHA3_squeeze
  .align 5
  // SHA3_squeeze: copy output bytes out of the state, re-running the
  // permutation (KeccakF1600) each time a full block has been emitted.
  //
  // In:  x0 = pointer to the 25-lane state (read 8 bytes at a time)
  //      x1 = output pointer
  //      x2 = number of bytes to produce
  //      x3 = block size in bytes (counted down in steps of 8)
  // Out: nothing is returned in a register; x2 bytes are written at x1 and
  //      the state is updated by every permutation that was needed.
  // Callee-saved x19-x22 hold the four arguments across the calls to
  // KeccakF1600; x4 is the only scratch register.
  //
  // Fix over the previous revision: a request for 0 bytes used to fall into
  // the byte-wise tail, store a byte before the first counter test, wrap the
  // counter to 2^64-1 and end up writing 7 bytes past `out`. The routine now
  // returns immediately when x2 == 0.
  _SHA3_squeeze:
          .long 0xd503233f // paciasp
          stp x29,x30,[sp,#-48]!
          add x29,sp,#0
          stp x19,x20,[sp,#16]
          stp x21,x22,[sp,#32]
          mov x19,x0 // put aside arguments
          mov x20,x1
          mov x21,x2
          mov x22,x3
          cbz x21,Lsqueeze_done // zero-length request: write nothing
  Loop_squeeze:
          ldr x4,[x0],#8 // next state lane
          cmp x21,#8
          blo Lsqueeze_tail // fewer than 8 bytes wanted: emit bytewise
#ifdef __AARCH64EB__
          rev x4,x4 // output is little-endian lane by lane
#endif
          str x4,[x20],#8
          subs x21,x21,#8
          beq Lsqueeze_done
          subs x3,x3,#8 // bytes left in the current block
          bhi Loop_squeeze
          // Block exhausted: permute and restart from the first lane.
          mov x0,x19
          bl KeccakF1600
          mov x0,x19
          mov x3,x22
          b Loop_squeeze
  .align 4
  // 1..7 bytes remain (x21 != 0 here): peel them off x4 low byte first.
  Lsqueeze_tail:
          strb w4,[x20],#1
          lsr x4,x4,#8
          subs x21,x21,#1
          beq Lsqueeze_done
          strb w4,[x20],#1
          lsr x4,x4,#8
          subs x21,x21,#1
          beq Lsqueeze_done
          strb w4,[x20],#1
          lsr x4,x4,#8
          subs x21,x21,#1
          beq Lsqueeze_done
          strb w4,[x20],#1
          lsr x4,x4,#8
          subs x21,x21,#1
          beq Lsqueeze_done
          strb w4,[x20],#1
          lsr x4,x4,#8
          subs x21,x21,#1
          beq Lsqueeze_done
          strb w4,[x20],#1
          lsr x4,x4,#8
          subs x21,x21,#1
          beq Lsqueeze_done
          strb w4,[x20],#1 // 7th and last possible tail byte
  Lsqueeze_done:
          ldp x19,x20,[sp,#16]
          ldp x21,x22,[sp,#32]
          ldp x29,x30,[sp],#48
          .long 0xd50323bf // autiasp
          ret
  522. .align 5
  // KeccakF1600_ce: Keccak-f[1600] permutation on a state held in SIMD
  // registers (internal helper for the *_cext entry points).
  //
  // In/out: lane i of the state is in v<i>, for v0-v24.
  // Clobbers: v25-v31 (temporaries), x9 (loop counter), x10 (iotas pointer),
  //           x11 (current round constant), flags.
  // The loop body below contains two rounds - each has its own Theta,
  // Rho+Pi and Chi+Iota section and consumes one iotas entry - and runs
  // x9 = 12 times. The second round works on a permuted register assignment
  // left behind by the first one and writes its results back to v0-v24.
  // The eor3/rax1/xar/bcax instructions are emitted as raw `.long` words with
  // the intended mnemonic in the trailing comment (presumably so the file
  // assembles with toolchains that lack these mnemonics - confirm). Word and
  // comment must be kept in sync by hand.
  523. KeccakF1600_ce:
  524. mov x9,#12
  525. adr x10,iotas
  526. b Loop_ce
  527. .align 4
  528. Loop_ce:
  529. ////////////////////////////////////////////////// Theta
  530. .long 0xce052819 //eor3 v25.16b,v0.16b,v5.16b,v10.16b
  531. .long 0xce062c3a //eor3 v26.16b,v1.16b,v6.16b,v11.16b
  532. .long 0xce07305b //eor3 v27.16b,v2.16b,v7.16b,v12.16b
  533. .long 0xce08347c //eor3 v28.16b,v3.16b,v8.16b,v13.16b
  534. .long 0xce09389d //eor3 v29.16b,v4.16b,v9.16b,v14.16b
  535. .long 0xce0f5339 //eor3 v25.16b,v25.16b, v15.16b,v20.16b
  536. .long 0xce10575a //eor3 v26.16b,v26.16b, v16.16b,v21.16b
  537. .long 0xce115b7b //eor3 v27.16b,v27.16b, v17.16b,v22.16b
  538. .long 0xce125f9c //eor3 v28.16b,v28.16b, v18.16b,v23.16b
  539. .long 0xce1363bd //eor3 v29.16b,v29.16b, v19.16b,v24.16b
  540. .long 0xce7b8f3e //rax1 v30.16b,v25.16b,v27.16b // D[1]
  541. .long 0xce7c8f5f //rax1 v31.16b,v26.16b,v28.16b // D[2]
  542. .long 0xce7d8f7b //rax1 v27.16b,v27.16b,v29.16b // D[3]
  543. .long 0xce798f9c //rax1 v28.16b,v28.16b,v25.16b // D[4]
  544. .long 0xce7a8fbd //rax1 v29.16b,v29.16b,v26.16b // D[0]
  545. ////////////////////////////////////////////////// Theta+Rho+Pi
  // Each xar XORs a lane with its D[] value and rotates it, writing the
  // result to the lane's new position; the chain order ensures a register
  // is only overwritten after its old contents have been consumed.
  546. .long 0xce9e50d9 //xar v25.16b, v6.16b,v30.16b,#64-44 // C[0]=A[0][1]
  547. .long 0xce9cb126 //xar v6.16b,v9.16b,v28.16b,#64-20
  548. .long 0xce9f0ec9 //xar v9.16b,v22.16b,v31.16b,#64-61
  549. .long 0xce9c65d6 //xar v22.16b,v14.16b,v28.16b,#64-39
  550. .long 0xce9dba8e //xar v14.16b,v20.16b,v29.16b,#64-18
  551. .long 0xce9f0854 //xar v20.16b,v2.16b,v31.16b,#64-62
  552. .long 0xce9f5582 //xar v2.16b,v12.16b,v31.16b,#64-43
  553. .long 0xce9b9dac //xar v12.16b,v13.16b,v27.16b,#64-25
  554. .long 0xce9ce26d //xar v13.16b,v19.16b,v28.16b,#64-8
  555. .long 0xce9b22f3 //xar v19.16b,v23.16b,v27.16b,#64-56
  556. .long 0xce9d5df7 //xar v23.16b,v15.16b,v29.16b,#64-41
  557. .long 0xce9c948f //xar v15.16b,v4.16b,v28.16b,#64-27
  558. eor v0.16b,v0.16b,v29.16b
  // Fetch this round's constant and advance the table pointer.
  559. ldr x11,[x10],#8
  560. .long 0xce9bae5a //xar v26.16b, v18.16b,v27.16b,#64-21 // C[1]=A[0][3]
  561. .long 0xce9fc632 //xar v18.16b,v17.16b,v31.16b,#64-15
  562. .long 0xce9ed971 //xar v17.16b,v11.16b,v30.16b,#64-10
  563. .long 0xce9fe8eb //xar v11.16b,v7.16b,v31.16b,#64-6
  564. .long 0xce9df547 //xar v7.16b,v10.16b,v29.16b,#64-3
  565. .long 0xce9efc2a //xar v10.16b,v1.16b,v30.16b,#64-1 // *
  566. .long 0xce9ccb04 //xar v4.16b,v24.16b,v28.16b,#64-14
  567. .long 0xce9efab8 //xar v24.16b,v21.16b,v30.16b,#64-2
  568. .long 0xce9b2515 //xar v21.16b,v8.16b,v27.16b,#64-55
  569. .long 0xce9e4e08 //xar v8.16b,v16.16b,v30.16b,#64-45
  570. .long 0xce9d70b0 //xar v16.16b,v5.16b,v29.16b,#64-36
  571. .long 0xce9b907b //xar v27.16b, v3.16b,v27.16b,#64-28 // C[2]=A[1][0]
  572. ////////////////////////////////////////////////// Chi+Iota
  // The round constant is broadcast into a vector so it can be XORed into
  // lane 0 with a plain vector eor.
  573. dup v31.2d,x11 // borrow C[6]
  574. .long 0xce22641c //bcax v28.16b, v0.16b,v2.16b,v25.16b // *
  575. .long 0xce3a0b21 //bcax v1.16b,v25.16b, v26.16b, v2.16b // *
  576. .long 0xce246842 //bcax v2.16b,v2.16b,v4.16b,v26.16b
  577. .long 0xce201343 //bcax v3.16b,v26.16b, v0.16b,v4.16b
  578. .long 0xce390084 //bcax v4.16b,v4.16b,v25.16b, v0.16b
  579. .long 0xce271b65 //bcax v5.16b,v27.16b, v7.16b,v6.16b // *
  580. .long 0xce281cd9 //bcax v25.16b, v6.16b,v8.16b,v7.16b // *
  581. .long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
  582. .long 0xce3b2508 //bcax v8.16b,v8.16b,v27.16b, v9.16b
  583. .long 0xce266d29 //bcax v9.16b,v9.16b,v6.16b,v27.16b
  584. eor v0.16b,v28.16b,v31.16b // Iota
  585. .long 0xce2c2d5a //bcax v26.16b, v10.16b,v12.16b,v11.16b // *
  586. .long 0xce2d317b //bcax v27.16b, v11.16b,v13.16b,v12.16b // *
  587. .long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
  588. .long 0xce2a39ad //bcax v13.16b,v13.16b,v10.16b,v14.16b
  589. .long 0xce2b29ce //bcax v14.16b,v14.16b,v11.16b,v10.16b
  590. .long 0xce3141fc //bcax v28.16b, v15.16b,v17.16b,v16.16b // *
  591. .long 0xce32461d //bcax v29.16b, v16.16b,v18.16b,v17.16b // *
  592. .long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
  593. .long 0xce2f4e52 //bcax v18.16b,v18.16b,v15.16b,v19.16b
  594. .long 0xce303e73 //bcax v19.16b,v19.16b,v16.16b,v15.16b
  595. .long 0xce36569e //bcax v30.16b, v20.16b,v22.16b,v21.16b // *
  596. .long 0xce375abf //bcax v31.16b, v21.16b,v23.16b,v22.16b // *
  597. .long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
  598. .long 0xce3462f7 //bcax v23.16b,v23.16b,v20.16b,v24.16b
  599. .long 0xce355318 //bcax v24.16b,v24.16b,v21.16b,v20.16b
  // ---- second round of this iteration: same three steps, operating on the
  // register assignment produced above (several lanes currently live in
  // v25-v31) and using v6, v10, v11, v15, v16, v20, v21 as temporaries.
  600. ////////////////////////////////////////////////// Theta
  601. .long 0xce056806 //eor3 v6.16b,v0.16b,v5.16b,v26.16b
  602. .long 0xce196c2a //eor3 v10.16b,v1.16b,v25.16b,v27.16b
  603. .long 0xce07304b //eor3 v11.16b,v2.16b,v7.16b,v12.16b
  604. .long 0xce08346f //eor3 v15.16b,v3.16b,v8.16b,v13.16b
  605. .long 0xce093890 //eor3 v16.16b,v4.16b,v9.16b,v14.16b
  606. .long 0xce1c78c6 //eor3 v6.16b,v6.16b, v28.16b,v30.16b
  607. .long 0xce1d7d4a //eor3 v10.16b,v10.16b, v29.16b,v31.16b
  608. .long 0xce11596b //eor3 v11.16b,v11.16b, v17.16b,v22.16b
  609. .long 0xce125def //eor3 v15.16b,v15.16b, v18.16b,v23.16b
  610. .long 0xce136210 //eor3 v16.16b,v16.16b, v19.16b,v24.16b
  611. .long 0xce6b8cd4 //rax1 v20.16b,v6.16b,v11.16b // D[1]
  612. .long 0xce6f8d55 //rax1 v21.16b,v10.16b,v15.16b // D[2]
  613. .long 0xce708d6b //rax1 v11.16b,v11.16b,v16.16b // D[3]
  614. .long 0xce668def //rax1 v15.16b,v15.16b,v6.16b // D[4]
  615. .long 0xce6a8e10 //rax1 v16.16b,v16.16b,v10.16b // D[0]
  616. ////////////////////////////////////////////////// Theta+Rho+Pi
  617. .long 0xce945326 //xar v6.16b, v25.16b,v20.16b,#64-44 // C[0]=A[0][1]
  618. .long 0xce8fb139 //xar v25.16b,v9.16b,v15.16b,#64-20
  619. .long 0xce950ec9 //xar v9.16b,v22.16b,v21.16b,#64-61
  620. .long 0xce8f65d6 //xar v22.16b,v14.16b,v15.16b,#64-39
  621. .long 0xce90bbce //xar v14.16b,v30.16b,v16.16b,#64-18
  622. .long 0xce95085e //xar v30.16b,v2.16b,v21.16b,#64-62
  623. .long 0xce955582 //xar v2.16b,v12.16b,v21.16b,#64-43
  624. .long 0xce8b9dac //xar v12.16b,v13.16b,v11.16b,#64-25
  625. .long 0xce8fe26d //xar v13.16b,v19.16b,v15.16b,#64-8
  626. .long 0xce8b22f3 //xar v19.16b,v23.16b,v11.16b,#64-56
  627. .long 0xce905f97 //xar v23.16b,v28.16b,v16.16b,#64-41
  628. .long 0xce8f949c //xar v28.16b,v4.16b,v15.16b,#64-27
  629. eor v0.16b,v0.16b,v16.16b
  630. ldr x11,[x10],#8
  631. .long 0xce8bae4a //xar v10.16b, v18.16b,v11.16b,#64-21 // C[1]=A[0][3]
  632. .long 0xce95c632 //xar v18.16b,v17.16b,v21.16b,#64-15
  633. .long 0xce94db71 //xar v17.16b,v27.16b,v20.16b,#64-10
  634. .long 0xce95e8fb //xar v27.16b,v7.16b,v21.16b,#64-6
  635. .long 0xce90f747 //xar v7.16b,v26.16b,v16.16b,#64-3
  636. .long 0xce94fc3a //xar v26.16b,v1.16b,v20.16b,#64-1 // *
  637. .long 0xce8fcb04 //xar v4.16b,v24.16b,v15.16b,#64-14
  638. .long 0xce94fbf8 //xar v24.16b,v31.16b,v20.16b,#64-2
  639. .long 0xce8b251f //xar v31.16b,v8.16b,v11.16b,#64-55
  640. .long 0xce944fa8 //xar v8.16b,v29.16b,v20.16b,#64-45
  641. .long 0xce9070bd //xar v29.16b,v5.16b,v16.16b,#64-36
  642. .long 0xce8b906b //xar v11.16b, v3.16b,v11.16b,#64-28 // C[2]=A[1][0]
  643. ////////////////////////////////////////////////// Chi+Iota
  644. dup v21.2d,x11 // borrow C[6]
  645. .long 0xce22180f //bcax v15.16b, v0.16b,v2.16b,v6.16b // *
  646. .long 0xce2a08c1 //bcax v1.16b,v6.16b, v10.16b, v2.16b // *
  647. .long 0xce242842 //bcax v2.16b,v2.16b,v4.16b,v10.16b
  648. .long 0xce201143 //bcax v3.16b,v10.16b, v0.16b,v4.16b
  649. .long 0xce260084 //bcax v4.16b,v4.16b,v6.16b, v0.16b
  650. .long 0xce276565 //bcax v5.16b,v11.16b, v7.16b,v25.16b // *
  651. .long 0xce281f26 //bcax v6.16b, v25.16b,v8.16b,v7.16b // *
  652. .long 0xce2920e7 //bcax v7.16b,v7.16b,v9.16b,v8.16b
  653. .long 0xce2b2508 //bcax v8.16b,v8.16b,v11.16b, v9.16b
  654. .long 0xce392d29 //bcax v9.16b,v9.16b,v25.16b,v11.16b
  655. eor v0.16b,v15.16b,v21.16b // Iota
  656. .long 0xce2c6f4a //bcax v10.16b, v26.16b,v12.16b,v27.16b // *
  657. .long 0xce2d336b //bcax v11.16b, v27.16b,v13.16b,v12.16b // *
  658. .long 0xce2e358c //bcax v12.16b,v12.16b,v14.16b,v13.16b
  659. .long 0xce3a39ad //bcax v13.16b,v13.16b,v26.16b,v14.16b
  660. .long 0xce3b69ce //bcax v14.16b,v14.16b,v27.16b,v26.16b
  661. .long 0xce31778f //bcax v15.16b, v28.16b,v17.16b,v29.16b // *
  662. .long 0xce3247b0 //bcax v16.16b, v29.16b,v18.16b,v17.16b // *
  663. .long 0xce334a31 //bcax v17.16b,v17.16b,v19.16b,v18.16b
  664. .long 0xce3c4e52 //bcax v18.16b,v18.16b,v28.16b,v19.16b
  665. .long 0xce3d7273 //bcax v19.16b,v19.16b,v29.16b,v28.16b
  666. .long 0xce367fd4 //bcax v20.16b, v30.16b,v22.16b,v31.16b // *
  667. .long 0xce375bf5 //bcax v21.16b, v31.16b,v23.16b,v22.16b // *
  668. .long 0xce385ed6 //bcax v22.16b,v22.16b,v24.16b,v23.16b
  669. .long 0xce3e62f7 //bcax v23.16b,v23.16b,v30.16b,v24.16b
  670. .long 0xce3f7b18 //bcax v24.16b,v24.16b,v31.16b,v30.16b
  // Two rounds done; repeat until the counter set at entry reaches zero.
  671. subs x9,x9,#1
  672. bne Loop_ce
  673. ret
  .align 5
  // KeccakF1600_cext: apply the Keccak-f[1600] permutation to a state in
  // memory using the SIMD helper KeccakF1600_ce.
  //
  // In:    x0 = pointer to 25 consecutive 64-bit lanes (x0 is not modified).
  // Out:   the 25 lanes at x0 are replaced by the permuted state.
  // Frame: 80 bytes - x29/x30 at [sp,#0], d8-d15 at [sp,#16..#79].
  // d8-d15 are saved because the state occupies v0-v24 and the low halves
  // of v8-v15 must be preserved for the caller.
  KeccakF1600_cext:
          .long   0xd503233f              // paciasp
          stp     x29,x30,[sp,#-80]!
          mov     x29,sp
          stp     d14,d15,[sp,#64]        // per ABI requirement
          stp     d12,d13,[sp,#48]
          stp     d10,d11,[sp,#32]
          stp     d8,d9,[sp,#16]

          // Load lane i into d<i> (highest lanes first).
          ldr     d24,[x0,#8*24]
          ldp     d22,d23,[x0,#8*22]
          ldp     d20,d21,[x0,#8*20]
          ldp     d18,d19,[x0,#8*18]
          ldp     d16,d17,[x0,#8*16]
          ldp     d14,d15,[x0,#8*14]
          ldp     d12,d13,[x0,#8*12]
          ldp     d10,d11,[x0,#8*10]
          ldp     d8,d9,[x0,#8*8]
          ldp     d6,d7,[x0,#8*6]
          ldp     d4,d5,[x0,#8*4]
          ldp     d2,d3,[x0,#8*2]
          ldp     d0,d1,[x0,#8*0]

          bl      KeccakF1600_ce
          ldr     x30,[sp,#8]             // the call overwrote the link register

          // Write the permuted state back before d8-d15 are restored.
          str     d24,[x0,#8*24]
          stp     d22,d23,[x0,#8*22]
          stp     d20,d21,[x0,#8*20]
          stp     d18,d19,[x0,#8*18]
          stp     d16,d17,[x0,#8*16]
          stp     d14,d15,[x0,#8*14]
          stp     d12,d13,[x0,#8*12]
          stp     d10,d11,[x0,#8*10]
          stp     d8,d9,[x0,#8*8]
          stp     d6,d7,[x0,#8*6]
          stp     d4,d5,[x0,#8*4]
          stp     d2,d3,[x0,#8*2]
          stp     d0,d1,[x0,#8*0]

          ldp     d14,d15,[sp,#64]
          ldp     d12,d13,[sp,#48]
          ldp     d10,d11,[sp,#32]
          ldp     d8,d9,[sp,#16]
          ldr     x29,[sp],#80            // x30 was already reloaded above
          .long   0xd50323bf              // autiasp
          ret
  718. .globl _SHA3_absorb_cext
  719. .align 5
  // SHA3_absorb_cext: SIMD-register variant of the absorb loop. XORs whole
  // bsz-byte blocks of input into the state and runs KeccakF1600_ce after
  // each block, until fewer than bsz bytes remain.
  //
  // In:  x0 = pointer to the 25-lane state   x1 = input pointer
  //      x2 = len                            x3 = bsz
  // Out: x0 = number of input bytes left unprocessed.
  // The state lives in d0-d24 for the whole call; v31 is the input scratch.
  // x0-x3 stay live across `bl KeccakF1600_ce` because that helper only uses
  // x9-x11; x30 is clobbered by the calls and restored by the final ldp.
  // d8-d15 are saved/restored around the routine.
  // NOTE(review): presumably bsz is a non-zero multiple of 8 no larger than
  // 200; nothing here checks it - confirm against the callers.
  720. _SHA3_absorb_cext:
  721. .long 0xd503233f // paciasp
  722. stp x29,x30,[sp,#-80]!
  723. add x29,sp,#0
  724. stp d8,d9,[sp,#16] // per ABI requirement
  725. stp d10,d11,[sp,#32]
  726. stp d12,d13,[sp,#48]
  727. stp d14,d15,[sp,#64]
  728. ldp d0,d1,[x0,#8*0]
  729. ldp d2,d3,[x0,#8*2]
  730. ldp d4,d5,[x0,#8*4]
  731. ldp d6,d7,[x0,#8*6]
  732. ldp d8,d9,[x0,#8*8]
  733. ldp d10,d11,[x0,#8*10]
  734. ldp d12,d13,[x0,#8*12]
  735. ldp d14,d15,[x0,#8*14]
  736. ldp d16,d17,[x0,#8*16]
  737. ldp d18,d19,[x0,#8*18]
  738. ldp d20,d21,[x0,#8*20]
  739. ldp d22,d23,[x0,#8*22]
  740. ldr d24,[x0,#8*24]
  741. b Loop_absorb_ce
  742. .align 4
  // One iteration per input block. x2 is decremented by bsz up front, so it
  // has wrapped below zero by exactly bsz when the loop exits. Lanes are
  // absorbed two at a time: one `cmp x3,#8*(k+2)` serves both the `blo`
  // after lane k and the `beq` after lane k+1, since the ldr/rev64/eor in
  // between do not touch the flags.
  743. Loop_absorb_ce:
  744. subs x2,x2,x3 // len - bsz
  745. blo Labsorbed_ce
  746. ldr d31,[x1],#8 // *inp++
  747. #ifdef __AARCH64EB__
  748. rev64 v31.16b,v31.16b
  749. #endif
  750. eor v0.16b,v0.16b,v31.16b
  751. cmp x3,#8*(0+2)
  752. blo Lprocess_block_ce
  753. ldr d31,[x1],#8 // *inp++
  754. #ifdef __AARCH64EB__
  755. rev64 v31.16b,v31.16b
  756. #endif
  757. eor v1.16b,v1.16b,v31.16b
  758. beq Lprocess_block_ce
  759. ldr d31,[x1],#8 // *inp++
  760. #ifdef __AARCH64EB__
  761. rev64 v31.16b,v31.16b
  762. #endif
  763. eor v2.16b,v2.16b,v31.16b
  764. cmp x3,#8*(2+2)
  765. blo Lprocess_block_ce
  766. ldr d31,[x1],#8 // *inp++
  767. #ifdef __AARCH64EB__
  768. rev64 v31.16b,v31.16b
  769. #endif
  770. eor v3.16b,v3.16b,v31.16b
  771. beq Lprocess_block_ce
  772. ldr d31,[x1],#8 // *inp++
  773. #ifdef __AARCH64EB__
  774. rev64 v31.16b,v31.16b
  775. #endif
  776. eor v4.16b,v4.16b,v31.16b
  777. cmp x3,#8*(4+2)
  778. blo Lprocess_block_ce
  779. ldr d31,[x1],#8 // *inp++
  780. #ifdef __AARCH64EB__
  781. rev64 v31.16b,v31.16b
  782. #endif
  783. eor v5.16b,v5.16b,v31.16b
  784. beq Lprocess_block_ce
  785. ldr d31,[x1],#8 // *inp++
  786. #ifdef __AARCH64EB__
  787. rev64 v31.16b,v31.16b
  788. #endif
  789. eor v6.16b,v6.16b,v31.16b
  790. cmp x3,#8*(6+2)
  791. blo Lprocess_block_ce
  792. ldr d31,[x1],#8 // *inp++
  793. #ifdef __AARCH64EB__
  794. rev64 v31.16b,v31.16b
  795. #endif
  796. eor v7.16b,v7.16b,v31.16b
  797. beq Lprocess_block_ce
  798. ldr d31,[x1],#8 // *inp++
  799. #ifdef __AARCH64EB__
  800. rev64 v31.16b,v31.16b
  801. #endif
  802. eor v8.16b,v8.16b,v31.16b
  803. cmp x3,#8*(8+2)
  804. blo Lprocess_block_ce
  805. ldr d31,[x1],#8 // *inp++
  806. #ifdef __AARCH64EB__
  807. rev64 v31.16b,v31.16b
  808. #endif
  809. eor v9.16b,v9.16b,v31.16b
  810. beq Lprocess_block_ce
  811. ldr d31,[x1],#8 // *inp++
  812. #ifdef __AARCH64EB__
  813. rev64 v31.16b,v31.16b
  814. #endif
  815. eor v10.16b,v10.16b,v31.16b
  816. cmp x3,#8*(10+2)
  817. blo Lprocess_block_ce
  818. ldr d31,[x1],#8 // *inp++
  819. #ifdef __AARCH64EB__
  820. rev64 v31.16b,v31.16b
  821. #endif
  822. eor v11.16b,v11.16b,v31.16b
  823. beq Lprocess_block_ce
  824. ldr d31,[x1],#8 // *inp++
  825. #ifdef __AARCH64EB__
  826. rev64 v31.16b,v31.16b
  827. #endif
  828. eor v12.16b,v12.16b,v31.16b
  829. cmp x3,#8*(12+2)
  830. blo Lprocess_block_ce
  831. ldr d31,[x1],#8 // *inp++
  832. #ifdef __AARCH64EB__
  833. rev64 v31.16b,v31.16b
  834. #endif
  835. eor v13.16b,v13.16b,v31.16b
  836. beq Lprocess_block_ce
  837. ldr d31,[x1],#8 // *inp++
  838. #ifdef __AARCH64EB__
  839. rev64 v31.16b,v31.16b
  840. #endif
  841. eor v14.16b,v14.16b,v31.16b
  842. cmp x3,#8*(14+2)
  843. blo Lprocess_block_ce
  844. ldr d31,[x1],#8 // *inp++
  845. #ifdef __AARCH64EB__
  846. rev64 v31.16b,v31.16b
  847. #endif
  848. eor v15.16b,v15.16b,v31.16b
  849. beq Lprocess_block_ce
  850. ldr d31,[x1],#8 // *inp++
  851. #ifdef __AARCH64EB__
  852. rev64 v31.16b,v31.16b
  853. #endif
  854. eor v16.16b,v16.16b,v31.16b
  855. cmp x3,#8*(16+2)
  856. blo Lprocess_block_ce
  857. ldr d31,[x1],#8 // *inp++
  858. #ifdef __AARCH64EB__
  859. rev64 v31.16b,v31.16b
  860. #endif
  861. eor v17.16b,v17.16b,v31.16b
  862. beq Lprocess_block_ce
  863. ldr d31,[x1],#8 // *inp++
  864. #ifdef __AARCH64EB__
  865. rev64 v31.16b,v31.16b
  866. #endif
  867. eor v18.16b,v18.16b,v31.16b
  868. cmp x3,#8*(18+2)
  869. blo Lprocess_block_ce
  870. ldr d31,[x1],#8 // *inp++
  871. #ifdef __AARCH64EB__
  872. rev64 v31.16b,v31.16b
  873. #endif
  874. eor v19.16b,v19.16b,v31.16b
  875. beq Lprocess_block_ce
  876. ldr d31,[x1],#8 // *inp++
  877. #ifdef __AARCH64EB__
  878. rev64 v31.16b,v31.16b
  879. #endif
  880. eor v20.16b,v20.16b,v31.16b
  881. cmp x3,#8*(20+2)
  882. blo Lprocess_block_ce
  883. ldr d31,[x1],#8 // *inp++
  884. #ifdef __AARCH64EB__
  885. rev64 v31.16b,v31.16b
  886. #endif
  887. eor v21.16b,v21.16b,v31.16b
  888. beq Lprocess_block_ce
  889. ldr d31,[x1],#8 // *inp++
  890. #ifdef __AARCH64EB__
  891. rev64 v31.16b,v31.16b
  892. #endif
  893. eor v22.16b,v22.16b,v31.16b
  894. cmp x3,#8*(22+2)
  895. blo Lprocess_block_ce
  896. ldr d31,[x1],#8 // *inp++
  897. #ifdef __AARCH64EB__
  898. rev64 v31.16b,v31.16b
  899. #endif
  900. eor v23.16b,v23.16b,v31.16b
  901. beq Lprocess_block_ce
  902. ldr d31,[x1],#8 // *inp++
  903. #ifdef __AARCH64EB__
  904. rev64 v31.16b,v31.16b
  905. #endif
  906. eor v24.16b,v24.16b,v31.16b
  907. Lprocess_block_ce:
  908. bl KeccakF1600_ce
  909. b Loop_absorb_ce
  910. .align 4
  // Store the state back and undo the final (wrapping) subtraction so the
  // return value is the count of bytes that were not absorbed.
  911. Labsorbed_ce:
  912. stp d0,d1,[x0,#8*0]
  913. stp d2,d3,[x0,#8*2]
  914. stp d4,d5,[x0,#8*4]
  915. stp d6,d7,[x0,#8*6]
  916. stp d8,d9,[x0,#8*8]
  917. stp d10,d11,[x0,#8*10]
  918. stp d12,d13,[x0,#8*12]
  919. stp d14,d15,[x0,#8*14]
  920. stp d16,d17,[x0,#8*16]
  921. stp d18,d19,[x0,#8*18]
  922. stp d20,d21,[x0,#8*20]
  923. stp d22,d23,[x0,#8*22]
  924. str d24,[x0,#8*24]
  925. add x0,x2,x3 // return value
  926. ldp d8,d9,[sp,#16]
  927. ldp d10,d11,[sp,#32]
  928. ldp d12,d13,[sp,#48]
  929. ldp d14,d15,[sp,#64]
  930. ldp x29,x30,[sp],#80
  931. .long 0xd50323bf // autiasp
  932. ret
  .globl _SHA3_squeeze_cext
  .align 5
  // SHA3_squeeze_cext: copy output bytes out of the state, re-running the
  // permutation (KeccakF1600_cext) each time a full block has been emitted.
  //
  // In:  x0 = pointer to the 25-lane state
  //      x1 = output pointer (advanced as bytes are written)
  //      x2 = number of bytes to produce
  //      x3 = block size in bytes
  // Out: nothing is returned in a register; x2 bytes are written at x1.
  // Working registers: x9 = read cursor into the state, x10 = bytes left in
  // the current block, x4 = current lane. x0-x3 are relied upon to survive
  // the call to KeccakF1600_cext, which is why x9/x10 are re-derived from
  // x0/x3 afterwards.
  //
  // Fix over the previous revision: a request for 0 bytes used to fall into
  // the byte-wise tail, store a byte before the first counter test, wrap the
  // counter to 2^64-1 and end up writing 7 bytes past the output buffer. The
  // routine now returns immediately when x2 == 0.
  _SHA3_squeeze_cext:
          .long 0xd503233f // paciasp
          stp x29,x30,[sp,#-16]!
          add x29,sp,#0
          mov x9,x0
          mov x10,x3
          cbz x2,Lsqueeze_done_ce // zero-length request: write nothing
  Loop_squeeze_ce:
          ldr x4,[x9],#8 // next state lane
          cmp x2,#8
          blo Lsqueeze_tail_ce // fewer than 8 bytes wanted: emit bytewise
#ifdef __AARCH64EB__
          rev x4,x4 // output is little-endian lane by lane
#endif
          str x4,[x1],#8
          beq Lsqueeze_done_ce // flags still from cmp: exactly 8 were left
          sub x2,x2,#8
          subs x10,x10,#8 // bytes left in the current block
          bhi Loop_squeeze_ce
          // Block exhausted: permute and restart from the first lane.
          bl KeccakF1600_cext
          ldr x30,[sp,#8] // the call overwrote the link register
          mov x9,x0
          mov x10,x3
          b Loop_squeeze_ce
  .align 4
  // 1..7 bytes remain (x2 != 0 here): peel them off x4 low byte first.
  Lsqueeze_tail_ce:
          strb w4,[x1],#1
          lsr x4,x4,#8
          subs x2,x2,#1
          beq Lsqueeze_done_ce
          strb w4,[x1],#1
          lsr x4,x4,#8
          subs x2,x2,#1
          beq Lsqueeze_done_ce
          strb w4,[x1],#1
          lsr x4,x4,#8
          subs x2,x2,#1
          beq Lsqueeze_done_ce
          strb w4,[x1],#1
          lsr x4,x4,#8
          subs x2,x2,#1
          beq Lsqueeze_done_ce
          strb w4,[x1],#1
          lsr x4,x4,#8
          subs x2,x2,#1
          beq Lsqueeze_done_ce
          strb w4,[x1],#1
          lsr x4,x4,#8
          subs x2,x2,#1
          beq Lsqueeze_done_ce
          strb w4,[x1],#1 // 7th and last possible tail byte
  Lsqueeze_done_ce:
          ldr x29,[sp],#16 // x30 is already valid (entry value or reloaded)
          .long 0xd50323bf // autiasp
          ret
  // NUL-terminated ASCII identification string embedded in the object:
  // "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  989. .byte 75,101,99,99,97,107,45,49,54,48,48,32,97,98,115,111,114,98,32,97,110,100,32,115,113,117,101,101,122,101,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  // Re-align after the odd-length string.
  990. .align 2