aesv8-armx.S 15 KB


  // AES for ARMv8 (AArch64, GNU as syntax) built on the Cryptography
  // Extension instructions aese/aesd/aesmc/aesimc.
  1. #include "arm_arch.h"
  // Everything up to the closing #endif is assembled only when
  // __ARM_MAX_ARCH__ is at least 7 (presumably set by arm_arch.h -- confirm).
  2. #if __ARM_MAX_ARCH__>=7
  3. .text
  4. .arch armv8-a+crypto
  5. .align 5
  // Key-schedule constants, read by aes_v8_set_encrypt_key through "adr":
  //   row 0: round constant 0x01 in every 32-bit lane; the key schedule
  //          doubles it with "shl" after each use
  //   row 1: tbl index mask; within each lane it selects source bytes
  //          13,14,15,12, i.e. the last 32-bit word rotated by one byte
  //          and copied to all four lanes
  //   row 2: round constant 0x1b in every lane, loaded by the AES-128
  //          path after row 0 has been used eight times
  6. .Lrcon:
  7. .long 0x01,0x01,0x01,0x01
  8. .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
  9. .long 0x1b,0x1b,0x1b,0x1b
  //---------------------------------------------------------------------
  // aes_v8_set_encrypt_key -- expand a user key into an AES encryption
  // key schedule.
  // In:    x0 = user key bytes (16, 24 or 32 bytes are read)
  //        w1 = key length in bits: 128, 192 or 256
  //        x2 = output key schedule
  // Out:   x0 =  0 on success
  //             -1 if x0 or x2 is NULL
  //             -2 if w1 is below 128, above 256 or not a multiple of 64
  //        On success the round keys are stored from [x2] onwards and
  //        the round count (10, 12 or 14) is stored as a 32-bit word at
  //        byte offset 240 of the schedule.
  // Also:  on success x2 is left pointing at that offset-240 word and
  //        w12 holds the round count; aes_v8_set_decrypt_key enters at
  //        .Lenc_key and relies on both.
  // Clobb: x1, x3, x12, v0-v6, flags (x0 and x2 as described above)
  // Stack: 16 bytes for the x29/x30 pair. Only x29 is reloaded on exit
  //        because x30 is never overwritten in this function.
  //---------------------------------------------------------------------
  10. .globl aes_v8_set_encrypt_key
  11. .type aes_v8_set_encrypt_key,%function
  12. .align 5
  13. aes_v8_set_encrypt_key:
  14. .Lenc_key:
  15. stp x29,x30,[sp,#-16]!
  16. add x29,sp,#0
  // Argument validation. x3 carries the prospective return code.
  17. mov x3,#-1
  18. cmp x0,#0
  19. b.eq .Lenc_key_abort
  20. cmp x2,#0
  21. b.eq .Lenc_key_abort
  22. mov x3,#-2
  23. cmp w1,#128
  24. b.lt .Lenc_key_abort
  25. cmp w1,#256
  26. b.gt .Lenc_key_abort
  27. tst w1,#0x3f
  28. b.ne .Lenc_key_abort
  // Common setup. The flags produced by "cmp w1,#192" stay live until the
  // three-way branch below; none of the instructions in between write
  // them. v0 is zeroed here and is never written again in this function.
  29. adr x3,.Lrcon
  30. cmp w1,#192
  31. eor v0.16b,v0.16b,v0.16b
  32. ld1 {v3.16b},[x0],#16
  33. mov w1,#8 // reuse w1
  34. ld1 {v1.4s,v2.4s},[x3],#32
  35. b.lt .Loop128
  36. b.eq .L192
  37. b .L256
  // ---- 128-bit key: ten rounds ----------------------------------------
  // Each pass stores the current round key (v3) and derives the next one:
  //   v6 = last word of v3, rotated and splatted (tbl with mask v2); aese
  //        with the all-zero v0 then substitutes every byte (ShiftRows
  //        changes nothing because all four columns are equal), and the
  //        round constant v1 is xored in
  //   v5 = v3 shifted up by one word with zero fill (ext against v0);
  //        the three ext/eor steps turn v3 into the running xor of its
  //        own words
  // The loop runs eight times (w1 = 8) with round constants 0x01..0x80.
  38. .align 4
  39. .Loop128:
  40. tbl v6.16b,{v3.16b},v2.16b
  41. ext v5.16b,v0.16b,v3.16b,#12
  42. st1 {v3.4s},[x2],#16
  43. aese v6.16b,v0.16b
  44. subs w1,w1,#1
  45. eor v3.16b,v3.16b,v5.16b
  46. ext v5.16b,v0.16b,v5.16b,#12
  47. eor v3.16b,v3.16b,v5.16b
  48. ext v5.16b,v0.16b,v5.16b,#12
  49. eor v6.16b,v6.16b,v1.16b
  50. eor v3.16b,v3.16b,v5.16b
  51. shl v1.16b,v1.16b,#1
  52. eor v3.16b,v3.16b,v6.16b
  53. b.ne .Loop128
  // Two more rounds, unrolled, using the third .Lrcon row: 0x1b, then
  // 0x36 after the shl.
  54. ld1 {v1.4s},[x3]
  55. tbl v6.16b,{v3.16b},v2.16b
  56. ext v5.16b,v0.16b,v3.16b,#12
  57. st1 {v3.4s},[x2],#16
  58. aese v6.16b,v0.16b
  59. eor v3.16b,v3.16b,v5.16b
  60. ext v5.16b,v0.16b,v5.16b,#12
  61. eor v3.16b,v3.16b,v5.16b
  62. ext v5.16b,v0.16b,v5.16b,#12
  63. eor v6.16b,v6.16b,v1.16b
  64. eor v3.16b,v3.16b,v5.16b
  65. shl v1.16b,v1.16b,#1
  66. eor v3.16b,v3.16b,v6.16b
  67. tbl v6.16b,{v3.16b},v2.16b
  68. ext v5.16b,v0.16b,v3.16b,#12
  69. st1 {v3.4s},[x2],#16
  70. aese v6.16b,v0.16b
  71. eor v3.16b,v3.16b,v5.16b
  72. ext v5.16b,v0.16b,v5.16b,#12
  73. eor v3.16b,v3.16b,v5.16b
  74. ext v5.16b,v0.16b,v5.16b,#12
  75. eor v6.16b,v6.16b,v1.16b
  76. eor v3.16b,v3.16b,v5.16b
  77. eor v3.16b,v3.16b,v6.16b
  // Final round key goes to schedule offset 160 without post-increment;
  // adding 0x50 then moves x2 to offset 240 for the store at .Ldone.
  78. st1 {v3.4s},[x2]
  79. add x2,x2,#0x50
  80. mov w12,#10
  81. b .Ldone
  // ---- 192-bit key: twelve rounds -------------------------------------
  // v4 receives key bytes 16..23 in its low half. Subtracting 8 from
  // every mask byte makes the tbl select bytes 5,6,7,4, i.e. word 1 of
  // v4 instead of word 3.
  82. .align 4
  83. .L192:
  84. ld1 {v4.8b},[x0],#8
  85. movi v6.16b,#8 // borrow v6.16b
  86. st1 {v3.4s},[x2],#16
  87. sub v2.16b,v2.16b,v6.16b // adjust the mask
  // Each pass advances x2 by 24 bytes: 8 bytes of v4, then 16 bytes of
  // v3. Big-endian builds store v4 word-wise (presumably to keep the
  // 32-bit word byte order -- confirm) and step x2 back by 8, so the net
  // advance is the same. Eight passes (w1 = 8).
  88. .Loop192:
  89. tbl v6.16b,{v4.16b},v2.16b
  90. ext v5.16b,v0.16b,v3.16b,#12
  91. #ifdef __ARMEB__
  92. st1 {v4.4s},[x2],#16
  93. sub x2,x2,#8
  94. #else
  95. st1 {v4.8b},[x2],#8
  96. #endif
  97. aese v6.16b,v0.16b
  98. subs w1,w1,#1
  99. eor v3.16b,v3.16b,v5.16b
  100. ext v5.16b,v0.16b,v5.16b,#12
  101. eor v3.16b,v3.16b,v5.16b
  102. ext v5.16b,v0.16b,v5.16b,#12
  103. eor v3.16b,v3.16b,v5.16b
  104. dup v5.4s,v3.s[3]
  105. eor v5.16b,v5.16b,v4.16b
  106. eor v6.16b,v6.16b,v1.16b
  107. ext v4.16b,v0.16b,v4.16b,#12
  108. shl v1.16b,v1.16b,#1
  109. eor v4.16b,v4.16b,v5.16b
  110. eor v3.16b,v3.16b,v6.16b
  111. eor v4.16b,v4.16b,v6.16b
  112. st1 {v3.4s},[x2],#16
  113. b.ne .Loop192
  // x2 is at offset 16 + 8*24 = 208 here; 0x20 more reaches offset 240.
  114. mov w12,#12
  115. add x2,x2,#0x20
  116. b .Ldone
  // ---- 256-bit key: fourteen rounds -----------------------------------
  // v3 = key bytes 0..15, v4 = key bytes 16..31. Each pass stores v4 and
  // the freshly derived v3 (32 bytes); seven passes after the initial
  // 16-byte store leave x2 at offset 240 with no further adjustment.
  117. .align 4
  118. .L256:
  119. ld1 {v4.16b},[x0]
  120. mov w1,#7
  121. mov w12,#14
  122. st1 {v3.4s},[x2],#16
  123. .Loop256:
  124. tbl v6.16b,{v4.16b},v2.16b
  125. ext v5.16b,v0.16b,v3.16b,#12
  126. st1 {v4.4s},[x2],#16
  127. aese v6.16b,v0.16b
  128. subs w1,w1,#1
  129. eor v3.16b,v3.16b,v5.16b
  130. ext v5.16b,v0.16b,v5.16b,#12
  131. eor v3.16b,v3.16b,v5.16b
  132. ext v5.16b,v0.16b,v5.16b,#12
  133. eor v6.16b,v6.16b,v1.16b
  134. eor v3.16b,v3.16b,v5.16b
  135. shl v1.16b,v1.16b,#1
  136. eor v3.16b,v3.16b,v6.16b
  137. st1 {v3.4s},[x2],#16
  // Exit test uses the flags from "subs w1" above; nothing in between
  // writes them.
  138. b.eq .Ldone
  // Second half of the pass: update v4 from the new v3. The last word of
  // v3 is splatted without rotation and no round constant is applied.
  139. dup v6.4s,v3.s[3] // just splat
  140. ext v5.16b,v0.16b,v4.16b,#12
  141. aese v6.16b,v0.16b
  142. eor v4.16b,v4.16b,v5.16b
  143. ext v5.16b,v0.16b,v5.16b,#12
  144. eor v4.16b,v4.16b,v5.16b
  145. ext v5.16b,v0.16b,v5.16b,#12
  146. eor v4.16b,v4.16b,v5.16b
  147. eor v4.16b,v4.16b,v6.16b
  148. b .Loop256
  // Success: record the round count at schedule offset 240, return 0.
  149. .Ldone:
  150. str w12,[x2]
  151. mov x3,#0
  // Common exit; x3 holds 0, -1 or -2.
  152. .Lenc_key_abort:
  153. mov x0,x3 // return value
  154. ldr x29,[sp],#16
  155. ret
  156. .size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
  //---------------------------------------------------------------------
  // aes_v8_set_decrypt_key -- build an AES decryption key schedule.
  // In:    x0 = user key bytes, w1 = key length in bits, x2 = output key
  //        schedule (passed through unchanged to .Lenc_key)
  // Out:   x0 = 0 on success, otherwise the nonzero code returned by the
  //        encryption key setup (-1 or -2)
  // The encryption schedule is generated first, then rewritten in place:
  // the order of the round keys is reversed and aesimc is applied to
  // every round key except the first and the last.
  // Clobb: everything aes_v8_set_encrypt_key clobbers, plus x4
  // Stack: 16 bytes for the x29/x30 pair; x30 is saved and restored here
  //        because the "bl" overwrites it.
  //---------------------------------------------------------------------
  157. .globl aes_v8_set_decrypt_key
  158. .type aes_v8_set_decrypt_key,%function
  159. .align 5
  160. aes_v8_set_decrypt_key:
  // Pointer-authentication sign/authenticate of the return address are
  // emitted as raw encodings (presumably so that assemblers without that
  // extension still accept the file -- confirm).
  161. .inst 0xd503233f // paciasp
  162. stp x29,x30,[sp,#-16]!
  163. add x29,sp,#0
  164. bl .Lenc_key
  165. cmp x0,#0
  166. b.ne .Ldec_key_abort
  // .Lenc_key returns with x2 at schedule offset 240 and the round count
  // in w12. x2 walks up from the first round key, x0 walks down from the
  // last one (x4 = -16 is the post-index step for x0).
  167. sub x2,x2,#240 // restore original x2
  168. mov x4,#-16
  169. add x0,x2,x12,lsl#4 // end of key schedule
  // Swap the first and last round keys as they are (no aesimc).
  170. ld1 {v0.4s},[x2]
  171. ld1 {v1.4s},[x0]
  172. st1 {v0.4s},[x0],x4
  173. st1 {v1.4s},[x2],#16
  // Swap the remaining pairs, applying aesimc to both members, for as
  // long as the upper pointer is still above the lower one.
  174. .Loop_imc:
  175. ld1 {v0.4s},[x2]
  176. ld1 {v1.4s},[x0]
  177. aesimc v0.16b,v0.16b
  178. aesimc v1.16b,v1.16b
  179. st1 {v0.4s},[x0],x4
  180. st1 {v1.4s},[x2],#16
  181. cmp x0,x2
  182. b.hi .Loop_imc
  // Middle round key: transformed once and written back.
  183. ld1 {v0.4s},[x2]
  184. aesimc v0.16b,v0.16b
  185. st1 {v0.4s},[x0]
  186. eor x0,x0,x0 // return value
  187. .Ldec_key_abort:
  188. ldp x29,x30,[sp],#16
  189. .inst 0xd50323bf // autiasp
  190. ret
  191. .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
  //---------------------------------------------------------------------
  // aes_v8_encrypt -- encrypt a single 16-byte block.
  // In:    x0 = input block, x1 = output block,
  //        x2 = key schedule; the round count is read from [x2,#240]
  // Out:   16 bytes stored at [x1]; no register result
  // Clobb: x2 (advanced through the schedule), x3, v0-v2, flags
  // Stack: none (leaf function)
  //---------------------------------------------------------------------
  192. .globl aes_v8_encrypt
  193. .type aes_v8_encrypt,%function
  194. .align 5
  195. aes_v8_encrypt:
  196. ldr w3,[x2,#240]
  197. ld1 {v0.4s},[x2],#16
  198. ld1 {v2.16b},[x0]
  // w3 = rounds - 2: the loop performs two rounds per pass and the last
  // two rounds are handled after it.
  199. sub w3,w3,#2
  200. ld1 {v1.4s},[x2],#16
  // v0/v1 alternate as the current round key; each is reloaded from the
  // schedule right after it has been used.
  201. .Loop_enc:
  202. aese v2.16b,v0.16b
  203. aesmc v2.16b,v2.16b
  204. ld1 {v0.4s},[x2],#16
  205. subs w3,w3,#2
  206. aese v2.16b,v1.16b
  207. aesmc v2.16b,v2.16b
  208. ld1 {v1.4s},[x2],#16
  209. b.gt .Loop_enc
  // Last two rounds: the final one has no aesmc and is followed by an
  // xor with the last round key.
  210. aese v2.16b,v0.16b
  211. aesmc v2.16b,v2.16b
  212. ld1 {v0.4s},[x2]
  213. aese v2.16b,v1.16b
  214. eor v2.16b,v2.16b,v0.16b
  215. st1 {v2.16b},[x1]
  216. ret
  217. .size aes_v8_encrypt,.-aes_v8_encrypt
  //---------------------------------------------------------------------
  // aes_v8_decrypt -- decrypt a single 16-byte block.
  // In:    x0 = input block, x1 = output block,
  //        x2 = key schedule; the round count is read from [x2,#240].
  //        Presumably the schedule must come from aes_v8_set_decrypt_key
  //        -- confirm against callers.
  // Out:   16 bytes stored at [x1]; no register result
  // Clobb: x2 (advanced through the schedule), x3, v0-v2, flags
  // Stack: none (leaf function)
  //---------------------------------------------------------------------
  218. .globl aes_v8_decrypt
  219. .type aes_v8_decrypt,%function
  220. .align 5
  221. aes_v8_decrypt:
  222. ldr w3,[x2,#240]
  223. ld1 {v0.4s},[x2],#16
  224. ld1 {v2.16b},[x0]
  // w3 = rounds - 2: the loop performs two rounds per pass and the last
  // two rounds are handled after it.
  225. sub w3,w3,#2
  226. ld1 {v1.4s},[x2],#16
  // Same structure as .Loop_enc, with aesd/aesimc in place of aese/aesmc.
  227. .Loop_dec:
  228. aesd v2.16b,v0.16b
  229. aesimc v2.16b,v2.16b
  230. ld1 {v0.4s},[x2],#16
  231. subs w3,w3,#2
  232. aesd v2.16b,v1.16b
  233. aesimc v2.16b,v2.16b
  234. ld1 {v1.4s},[x2],#16
  235. b.gt .Loop_dec
  // Last two rounds: the final one has no aesimc and is followed by an
  // xor with the last round key.
  236. aesd v2.16b,v0.16b
  237. aesimc v2.16b,v2.16b
  238. ld1 {v0.4s},[x2]
  239. aesd v2.16b,v1.16b
  240. eor v2.16b,v2.16b,v0.16b
  241. st1 {v2.16b},[x1]
  242. ret
  243. .size aes_v8_decrypt,.-aes_v8_decrypt
  //---------------------------------------------------------------------
  // aes_v8_cbc_encrypt -- AES-CBC encryption or decryption.
  // In:    x0 = input, x1 = output, x2 = length in bytes,
  //        x3 = key schedule; the round count is read from [x3,#240],
  //        x4 = 16-byte IV, read on entry,
  //        w5 = nonzero to encrypt, zero to decrypt
  // Out:   no register result. On normal exit [x4] is overwritten with
  //        the last ciphertext block, i.e. the chaining value for a
  //        following call.
  //        If x2 is below 16 the function returns without reading the
  //        input or writing the output or the IV. Otherwise the length is
  //        rounded down to a multiple of 16.
  // Clobb: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23, flags
  // Stack: 16 bytes for the x29/x30 pair; only x29 is reloaded because
  //        x30 is never overwritten here.
  //---------------------------------------------------------------------
  244. .globl aes_v8_cbc_encrypt
  245. .type aes_v8_cbc_encrypt,%function
  246. .align 5
  247. aes_v8_cbc_encrypt:
  248. stp x29,x30,[sp,#-16]!
  249. add x29,sp,#0
  // x2 = length - 16. x8 is the post-increment applied to the input
  // pointer; it becomes 0 when only one block remains, so that the
  // look-ahead loads never move past the last input block.
  250. subs x2,x2,#16
  251. mov x8,#16
  252. b.lo .Lcbc_abort
  253. csel x8,xzr,x8,eq
  // The flags from this cmp select the direction at "b.eq .Lcbc_dec"
  // below; none of the instructions in between write the flags.
  254. cmp w5,#0 // en- or decrypting?
  255. ldr w5,[x3,#240]
  256. and x2,x2,#-16
  257. ld1 {v6.16b},[x4]
  258. ld1 {v0.16b},[x0],x8
  // v16,v17 = round keys 0,1; v18-v23 and v7 = the last seven round keys
  // (v7 is the very last). w5 ends up as rounds - 8: 2, 4 or 6.
  259. ld1 {v16.4s,v17.4s},[x3] // load key schedule...
  260. sub w5,w5,#6
  261. add x7,x3,x5,lsl#4 // pointer to last 7 round keys
  262. sub w5,w5,#2
  263. ld1 {v18.4s,v19.4s},[x7],#32
  264. ld1 {v20.4s,v21.4s},[x7],#32
  265. ld1 {v22.4s,v23.4s},[x7],#32
  266. ld1 {v7.4s},[x7]
  267. add x7,x3,#32
  268. mov w6,w5
  269. b.eq .Lcbc_dec
  // ---- encryption -----------------------------------------------------
  // v0 = first plaintext block xor IV. v5 = rndkey[0] xor last round key:
  // the following plaintext block is xored with v5 and then used as the
  // "key" of the first aese of the next pass, applied to the state that
  // has not yet had the last round key added. That one aese therefore
  // covers the last-round-key xor, the CBC chaining xor and round key 0.
  270. cmp w5,#2
  271. eor v0.16b,v0.16b,v6.16b
  272. eor v5.16b,v16.16b,v7.16b
  273. b.eq .Lcbc_enc128
  // 192/256-bit keys: v2,v3 = round keys 2,3. x7, x6, x12, x14 and x3 are
  // repointed at round keys 1, 4, 5, 6 and 7, which are reloaded on every
  // pass because v16/v17 are reused.
  274. ld1 {v2.4s,v3.4s},[x7]
  275. add x7,x3,#16
  276. add x6,x3,#16*4
  277. add x12,x3,#16*5
  278. aese v0.16b,v16.16b
  279. aesmc v0.16b,v0.16b
  280. add x14,x3,#16*6
  281. add x3,x3,#16*7
  282. b .Lenter_cbc_enc
  283. .align 4
  // Top of the loop: v16 holds the next plaintext block xor v5, v6 the
  // ciphertext block produced by the previous pass, which is stored here.
  284. .Loop_cbc_enc:
  285. aese v0.16b,v16.16b
  286. aesmc v0.16b,v0.16b
  287. st1 {v6.16b},[x1],#16
  288. .Lenter_cbc_enc:
  289. aese v0.16b,v17.16b
  290. aesmc v0.16b,v0.16b
  291. aese v0.16b,v2.16b
  292. aesmc v0.16b,v0.16b
  293. ld1 {v16.4s},[x6]
  // w5 == 4 means a 192-bit key: skip the two rounds that only a 256-bit
  // key has.
  294. cmp w5,#4
  295. aese v0.16b,v3.16b
  296. aesmc v0.16b,v0.16b
  297. ld1 {v17.4s},[x12]
  298. b.eq .Lcbc_enc192
  299. aese v0.16b,v16.16b
  300. aesmc v0.16b,v0.16b
  301. ld1 {v16.4s},[x14]
  302. aese v0.16b,v17.16b
  303. aesmc v0.16b,v0.16b
  304. ld1 {v17.4s},[x3]
  305. nop
  306. .Lcbc_enc192:
  307. aese v0.16b,v16.16b
  308. aesmc v0.16b,v0.16b
  // The flags from this subs drive both the csel below and the
  // "b.hs .Loop_cbc_enc" at the end of the pass.
  309. subs x2,x2,#16
  310. aese v0.16b,v17.16b
  311. aesmc v0.16b,v0.16b
  312. csel x8,xzr,x8,eq
  313. aese v0.16b,v18.16b
  314. aesmc v0.16b,v0.16b
  315. aese v0.16b,v19.16b
  316. aesmc v0.16b,v0.16b
  // Look-ahead load of the next plaintext block into v16.
  317. ld1 {v16.16b},[x0],x8
  318. aese v0.16b,v20.16b
  319. aesmc v0.16b,v0.16b
  320. eor v16.16b,v16.16b,v5.16b
  321. aese v0.16b,v21.16b
  322. aesmc v0.16b,v0.16b
  323. ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
  324. aese v0.16b,v22.16b
  325. aesmc v0.16b,v0.16b
  326. aese v0.16b,v23.16b
  // v6 = finished ciphertext block; v0 itself is left without the last
  // round key for the next pass.
  327. eor v6.16b,v0.16b,v7.16b
  328. b.hs .Loop_cbc_enc
  329. st1 {v6.16b},[x1],#16
  330. b .Lcbc_done
  // 128-bit keys: all round keys stay in registers (v17, v2, v3, v18-v23,
  // v7), so nothing is reloaded inside the loop.
  331. .align 5
  332. .Lcbc_enc128:
  333. ld1 {v2.4s,v3.4s},[x7]
  334. aese v0.16b,v16.16b
  335. aesmc v0.16b,v0.16b
  336. b .Lenter_cbc_enc128
  337. .Loop_cbc_enc128:
  338. aese v0.16b,v16.16b
  339. aesmc v0.16b,v0.16b
  340. st1 {v6.16b},[x1],#16
  341. .Lenter_cbc_enc128:
  342. aese v0.16b,v17.16b
  343. aesmc v0.16b,v0.16b
  344. subs x2,x2,#16
  345. aese v0.16b,v2.16b
  346. aesmc v0.16b,v0.16b
  347. csel x8,xzr,x8,eq
  348. aese v0.16b,v3.16b
  349. aesmc v0.16b,v0.16b
  350. aese v0.16b,v18.16b
  351. aesmc v0.16b,v0.16b
  352. aese v0.16b,v19.16b
  353. aesmc v0.16b,v0.16b
  354. ld1 {v16.16b},[x0],x8
  355. aese v0.16b,v20.16b
  356. aesmc v0.16b,v0.16b
  357. aese v0.16b,v21.16b
  358. aesmc v0.16b,v0.16b
  359. aese v0.16b,v22.16b
  360. aesmc v0.16b,v0.16b
  361. eor v16.16b,v16.16b,v5.16b
  362. aese v0.16b,v23.16b
  363. eor v6.16b,v0.16b,v7.16b
  364. b.hs .Loop_cbc_enc128
  365. st1 {v6.16b},[x1],#16
  366. b .Lcbc_done
  // ---- decryption -----------------------------------------------------
  // Blocks are processed three at a time in v0, v1, v18, with a tail for
  // one or two blocks. v2, v3, v19 keep copies of the ciphertext blocks,
  // which are needed as chaining values once v0, v1, v18 have been
  // decrypted in place. v6 holds the IV or the previous ciphertext block.
  // x2 is biased by a further 32, so it is negative when fewer than
  // three blocks remain. w6 = w5 + 2 counts the rounds taken from the
  // schedule through x7, two per pass.
  367. .align 5
  368. .Lcbc_dec:
  369. ld1 {v18.16b},[x0],#16
  370. subs x2,x2,#32 // bias
  371. add w6,w5,#2
  372. orr v3.16b,v0.16b,v0.16b
  373. orr v1.16b,v0.16b,v0.16b
  374. orr v19.16b,v18.16b,v18.16b
  375. b.lo .Lcbc_dec_tail
  376. orr v1.16b,v18.16b,v18.16b
  377. ld1 {v18.16b},[x0],#16
  378. orr v2.16b,v0.16b,v0.16b
  379. orr v3.16b,v1.16b,v1.16b
  380. orr v19.16b,v18.16b,v18.16b
  381. .Loop3x_cbc_dec:
  382. aesd v0.16b,v16.16b
  383. aesimc v0.16b,v0.16b
  384. aesd v1.16b,v16.16b
  385. aesimc v1.16b,v1.16b
  386. aesd v18.16b,v16.16b
  387. aesimc v18.16b,v18.16b
  388. ld1 {v16.4s},[x7],#16
  389. subs w6,w6,#2
  390. aesd v0.16b,v17.16b
  391. aesimc v0.16b,v0.16b
  392. aesd v1.16b,v17.16b
  393. aesimc v1.16b,v1.16b
  394. aesd v18.16b,v17.16b
  395. aesimc v18.16b,v18.16b
  396. ld1 {v17.4s},[x7],#16
  397. b.gt .Loop3x_cbc_dec
  398. aesd v0.16b,v16.16b
  399. aesimc v0.16b,v0.16b
  400. aesd v1.16b,v16.16b
  401. aesimc v1.16b,v1.16b
  402. aesd v18.16b,v16.16b
  403. aesimc v18.16b,v18.16b
  // v4, v5 (and v17 below) = chaining value xor last round key, so that
  // one eor after the final aesd applies both. The flags from this subs
  // feed the csel and also the "b.hs .Loop3x_cbc_dec" at the end of the
  // pass.
  404. eor v4.16b,v6.16b,v7.16b
  405. subs x2,x2,#0x30
  406. eor v5.16b,v2.16b,v7.16b
  407. csel x6,x2,x6,lo // x6, w6, is zero at this point
  408. aesd v0.16b,v17.16b
  409. aesimc v0.16b,v0.16b
  410. aesd v1.16b,v17.16b
  411. aesimc v1.16b,v1.16b
  412. aesd v18.16b,v17.16b
  413. aesimc v18.16b,v18.16b
  414. eor v17.16b,v3.16b,v7.16b
  415. add x0,x0,x6 // x0 is adjusted in such way that
  416. // at exit from the loop v1.16b-v18.16b
  417. // are loaded with last "words"
  // v6 = last ciphertext block of this group, the next chaining value.
  // x7 is rewound to the start of the schedule for the reloads below.
  418. orr v6.16b,v19.16b,v19.16b
  419. mov x7,x3
  420. aesd v0.16b,v20.16b
  421. aesimc v0.16b,v0.16b
  422. aesd v1.16b,v20.16b
  423. aesimc v1.16b,v1.16b
  424. aesd v18.16b,v20.16b
  425. aesimc v18.16b,v18.16b
  426. ld1 {v2.16b},[x0],#16
  427. aesd v0.16b,v21.16b
  428. aesimc v0.16b,v0.16b
  429. aesd v1.16b,v21.16b
  430. aesimc v1.16b,v1.16b
  431. aesd v18.16b,v21.16b
  432. aesimc v18.16b,v18.16b
  433. ld1 {v3.16b},[x0],#16
  434. aesd v0.16b,v22.16b
  435. aesimc v0.16b,v0.16b
  436. aesd v1.16b,v22.16b
  437. aesimc v1.16b,v1.16b
  438. aesd v18.16b,v22.16b
  439. aesimc v18.16b,v18.16b
  440. ld1 {v19.16b},[x0],#16
  441. aesd v0.16b,v23.16b
  442. aesd v1.16b,v23.16b
  443. aesd v18.16b,v23.16b
  444. ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
  445. add w6,w5,#2
  446. eor v4.16b,v4.16b,v0.16b
  447. eor v5.16b,v5.16b,v1.16b
  448. eor v18.16b,v18.16b,v17.16b
  449. ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
  450. st1 {v4.16b},[x1],#16
  451. orr v0.16b,v2.16b,v2.16b
  452. st1 {v5.16b},[x1],#16
  453. orr v1.16b,v3.16b,v3.16b
  454. st1 {v18.16b},[x1],#16
  455. orr v18.16b,v19.16b,v19.16b
  456. b.hs .Loop3x_cbc_dec
  // x2 == -0x30 means nothing is left; otherwise one block (x2 == -0x20)
  // or two blocks remain for the tail.
  457. cmn x2,#0x30
  458. b.eq .Lcbc_done
  459. nop
  // Tail: same round structure, two lanes only (v1 and v18).
  460. .Lcbc_dec_tail:
  461. aesd v1.16b,v16.16b
  462. aesimc v1.16b,v1.16b
  463. aesd v18.16b,v16.16b
  464. aesimc v18.16b,v18.16b
  465. ld1 {v16.4s},[x7],#16
  466. subs w6,w6,#2
  467. aesd v1.16b,v17.16b
  468. aesimc v1.16b,v1.16b
  469. aesd v18.16b,v17.16b
  470. aesimc v18.16b,v18.16b
  471. ld1 {v17.4s},[x7],#16
  472. b.gt .Lcbc_dec_tail
  473. aesd v1.16b,v16.16b
  474. aesimc v1.16b,v1.16b
  475. aesd v18.16b,v16.16b
  476. aesimc v18.16b,v18.16b
  477. aesd v1.16b,v17.16b
  478. aesimc v1.16b,v1.16b
  479. aesd v18.16b,v17.16b
  480. aesimc v18.16b,v18.16b
  481. aesd v1.16b,v20.16b
  482. aesimc v1.16b,v1.16b
  483. aesd v18.16b,v20.16b
  484. aesimc v18.16b,v18.16b
  // The flags from this cmn are consumed by "b.eq .Lcbc_dec_one" below.
  485. cmn x2,#0x20
  486. aesd v1.16b,v21.16b
  487. aesimc v1.16b,v1.16b
  488. aesd v18.16b,v21.16b
  489. aesimc v18.16b,v18.16b
  490. eor v5.16b,v6.16b,v7.16b
  491. aesd v1.16b,v22.16b
  492. aesimc v1.16b,v1.16b
  493. aesd v18.16b,v22.16b
  494. aesimc v18.16b,v18.16b
  495. eor v17.16b,v3.16b,v7.16b
  496. aesd v1.16b,v23.16b
  497. aesd v18.16b,v23.16b
  498. b.eq .Lcbc_dec_one
  // Two blocks left: v1 chains from v6, v18 chains from v3.
  499. eor v5.16b,v5.16b,v1.16b
  500. eor v17.16b,v17.16b,v18.16b
  501. orr v6.16b,v19.16b,v19.16b
  502. st1 {v5.16b},[x1],#16
  503. st1 {v17.16b},[x1],#16
  504. b .Lcbc_done
  // One block left: only v18 is used, chained from v6.
  505. .Lcbc_dec_one:
  506. eor v5.16b,v5.16b,v18.16b
  507. orr v6.16b,v19.16b,v19.16b
  508. st1 {v5.16b},[x1],#16
  // Write the new chaining value back to the IV buffer.
  509. .Lcbc_done:
  510. st1 {v6.16b},[x4]
  511. .Lcbc_abort:
  512. ldr x29,[sp],#16
  513. ret
  514. .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
  //---------------------------------------------------------------------
  // aes_v8_ctr32_encrypt_blocks -- AES-CTR over whole blocks with a
  // 32-bit counter.
  // In:    x0 = input, x1 = output, x2 = number of 16-byte blocks,
  //        x3 = key schedule; the round count is read from [x3,#240],
  //        x4 = 16-byte counter block whose last four bytes hold the
  //             counter in big-endian byte order
  // Out:   no register result; x2 blocks are written to [x1].
  //        The counter is incremented with 32-bit arithmetic and only
  //        lane 3 of the counter block is replaced, so it wraps modulo
  //        2^32 without carrying into the other twelve bytes. The counter
  //        block at [x4] is not written back.
  //        x2 == 0 returns immediately without touching input or output.
  //        Without that guard a zero count would fall into the tail,
  //        which always stores one block and stores a second one unless
  //        x2 == 1, i.e. it would write 32 bytes for an empty request.
  // Clobb: x0-x2, x5-x10, x12, v0-v7, v16-v23, flags
  // Stack: 16 bytes for the x29/x30 pair; only x29 is reloaded because
  //        x30 is never overwritten here.
  //---------------------------------------------------------------------
  515. .globl aes_v8_ctr32_encrypt_blocks
  516. .type aes_v8_ctr32_encrypt_blocks,%function
  517. .align 5
  518. aes_v8_ctr32_encrypt_blocks:
  519. stp x29,x30,[sp,#-16]!
  520. add x29,sp,#0
  // Nothing to do for zero blocks; cbz leaves the flags untouched and
  // the shared epilogue releases the frame.
  cbz x2,.Lctr32_done
  521. ldr w5,[x3,#240]
  522. ldr w8, [x4, #12]
  523. #ifdef __ARMEB__
  524. ld1 {v0.16b},[x4]
  525. #else
  526. ld1 {v0.4s},[x4]
  527. #endif
  // v16,v17 = round keys 0,1; v20-v23 and v7 = the last five round keys
  // (v7 is the very last). w5 ends up as rounds - 6 and is copied to w6,
  // which counts the rounds taken from the schedule through x7.
  // The flags from "cmp x2,#2" are consumed by the csel below and by
  // "b.ls .Lctr32_tail"; nothing in between writes them.
  528. ld1 {v16.4s,v17.4s},[x3] // load key schedule...
  529. sub w5,w5,#4
  530. mov x12,#16
  531. cmp x2,#2
  532. add x7,x3,x5,lsl#4 // pointer to last 5 round keys
  533. sub w5,w5,#2
  534. ld1 {v20.4s,v21.4s},[x7],#32
  535. ld1 {v22.4s,v23.4s},[x7],#32
  536. ld1 {v7.4s},[x7]
  537. add x7,x3,#32
  538. mov w6,w5
  // x12 = input post-increment used by the tail: 0 when fewer than two
  // blocks are requested, so its second load stays on the same block.
  539. csel x12,xzr,x12,lo
  // w8 = counter in host byte order. v0, v1, v18 become the counter
  // blocks for counter, counter+1 and counter+2; v6 keeps a pristine copy
  // of the counter block as the template for later ones.
  540. #ifndef __ARMEB__
  541. rev w8, w8
  542. #endif
  543. orr v1.16b,v0.16b,v0.16b
  544. add w10, w8, #1
  545. orr v18.16b,v0.16b,v0.16b
  546. add w8, w8, #2
  547. orr v6.16b,v0.16b,v0.16b
  548. rev w10, w10
  549. mov v1.s[3],w10
  550. b.ls .Lctr32_tail
  551. rev w12, w8
  552. sub x2,x2,#3 // bias
  553. mov v18.s[3],w12
  554. b .Loop3x_ctr32
  // ---- three blocks per pass ------------------------------------------
  555. .align 4
  556. .Loop3x_ctr32:
  557. aese v0.16b,v16.16b
  558. aesmc v0.16b,v0.16b
  559. aese v1.16b,v16.16b
  560. aesmc v1.16b,v1.16b
  561. aese v18.16b,v16.16b
  562. aesmc v18.16b,v18.16b
  563. ld1 {v16.4s},[x7],#16
  564. subs w6,w6,#2
  565. aese v0.16b,v17.16b
  566. aesmc v0.16b,v0.16b
  567. aese v1.16b,v17.16b
  568. aesmc v1.16b,v1.16b
  569. aese v18.16b,v17.16b
  570. aesmc v18.16b,v18.16b
  571. ld1 {v17.4s},[x7],#16
  572. b.gt .Loop3x_ctr32
  // Remaining rounds. The three states move to v4, v5, v17 so that v0,
  // v1, v18 can already be rebuilt from the template v6 with the next
  // three counter values while the last rounds are in flight. The input
  // blocks (v2, v3, v19) are xored with the last round key v7 ahead of
  // time, so one eor per block finishes the job.
  573. aese v0.16b,v16.16b
  574. aesmc v4.16b,v0.16b
  575. aese v1.16b,v16.16b
  576. aesmc v5.16b,v1.16b
  577. ld1 {v2.16b},[x0],#16
  578. orr v0.16b,v6.16b,v6.16b
  579. aese v18.16b,v16.16b
  580. aesmc v18.16b,v18.16b
  581. ld1 {v3.16b},[x0],#16
  582. orr v1.16b,v6.16b,v6.16b
  583. aese v4.16b,v17.16b
  584. aesmc v4.16b,v4.16b
  585. aese v5.16b,v17.16b
  586. aesmc v5.16b,v5.16b
  587. ld1 {v19.16b},[x0],#16
  588. mov x7,x3
  589. aese v18.16b,v17.16b
  590. aesmc v17.16b,v18.16b
  591. orr v18.16b,v6.16b,v6.16b
  592. add w9,w8,#1
  593. aese v4.16b,v20.16b
  594. aesmc v4.16b,v4.16b
  595. aese v5.16b,v20.16b
  596. aesmc v5.16b,v5.16b
  597. eor v2.16b,v2.16b,v7.16b
  598. add w10,w8,#2
  599. aese v17.16b,v20.16b
  600. aesmc v17.16b,v17.16b
  601. eor v3.16b,v3.16b,v7.16b
  602. add w8,w8,#3
  603. aese v4.16b,v21.16b
  604. aesmc v4.16b,v4.16b
  605. aese v5.16b,v21.16b
  606. aesmc v5.16b,v5.16b
  607. eor v19.16b,v19.16b,v7.16b
  608. rev w9,w9
  609. aese v17.16b,v21.16b
  610. aesmc v17.16b,v17.16b
  611. mov v0.s[3], w9
  612. rev w10,w10
  613. aese v4.16b,v22.16b
  614. aesmc v4.16b,v4.16b
  615. aese v5.16b,v22.16b
  616. aesmc v5.16b,v5.16b
  617. mov v1.s[3], w10
  618. rev w12,w8
  619. aese v17.16b,v22.16b
  620. aesmc v17.16b,v17.16b
  621. mov v18.s[3], w12
  // The flags from this subs are consumed by "b.hs .Loop3x_ctr32" below.
  622. subs x2,x2,#3
  623. aese v4.16b,v23.16b
  624. aese v5.16b,v23.16b
  625. aese v17.16b,v23.16b
  626. eor v2.16b,v2.16b,v4.16b
  627. ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
  628. st1 {v2.16b},[x1],#16
  629. eor v3.16b,v3.16b,v5.16b
  630. mov w6,w5
  631. st1 {v3.16b},[x1],#16
  632. eor v19.16b,v19.16b,v17.16b
  633. ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
  634. st1 {v19.16b},[x1],#16
  635. b.hs .Loop3x_ctr32
  // Undo the bias: x2 is now 0, 1 or 2 remaining blocks. With exactly
  // one block left x12 is 0 so the second tail load does not advance.
  636. adds x2,x2,#3
  637. b.eq .Lctr32_done
  638. cmp x2,#1
  639. mov x12,#16
  640. csel x12,xzr,x12,eq
  // ---- tail: one or two blocks (v0 and v1) ----------------------------
  641. .Lctr32_tail:
  642. aese v0.16b,v16.16b
  643. aesmc v0.16b,v0.16b
  644. aese v1.16b,v16.16b
  645. aesmc v1.16b,v1.16b
  646. ld1 {v16.4s},[x7],#16
  647. subs w6,w6,#2
  648. aese v0.16b,v17.16b
  649. aesmc v0.16b,v0.16b
  650. aese v1.16b,v17.16b
  651. aesmc v1.16b,v1.16b
  652. ld1 {v17.4s},[x7],#16
  653. b.gt .Lctr32_tail
  654. aese v0.16b,v16.16b
  655. aesmc v0.16b,v0.16b
  656. aese v1.16b,v16.16b
  657. aesmc v1.16b,v1.16b
  658. aese v0.16b,v17.16b
  659. aesmc v0.16b,v0.16b
  660. aese v1.16b,v17.16b
  661. aesmc v1.16b,v1.16b
  662. ld1 {v2.16b},[x0],x12
  663. aese v0.16b,v20.16b
  664. aesmc v0.16b,v0.16b
  665. aese v1.16b,v20.16b
  666. aesmc v1.16b,v1.16b
  667. ld1 {v3.16b},[x0]
  668. aese v0.16b,v21.16b
  669. aesmc v0.16b,v0.16b
  670. aese v1.16b,v21.16b
  671. aesmc v1.16b,v1.16b
  672. eor v2.16b,v2.16b,v7.16b
  673. aese v0.16b,v22.16b
  674. aesmc v0.16b,v0.16b
  675. aese v1.16b,v22.16b
  676. aesmc v1.16b,v1.16b
  677. eor v3.16b,v3.16b,v7.16b
  678. aese v0.16b,v23.16b
  679. aese v1.16b,v23.16b
  // The first block is always stored; the second one only when more than
  // one block was left.
  680. cmp x2,#1
  681. eor v2.16b,v2.16b,v0.16b
  682. eor v3.16b,v3.16b,v1.16b
  683. st1 {v2.16b},[x1],#16
  684. b.eq .Lctr32_done
  685. st1 {v3.16b},[x1]
  686. .Lctr32_done:
  687. ldr x29,[sp],#16
  688. ret
  689. .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
  690. #endif