// aesv8-armx.S
//
// AES for AArch64 built on the ARMv8 AES instructions (AESE/AESD/AESMC/AESIMC):
// key-schedule setup, single-block encrypt/decrypt, CBC, and CTR with a
// 32-bit counter. Exported symbols carry a leading underscore.
  #include "arm_arch.h"

  // The rest of this file is assembled only when __ARM_MAX_ARCH__ is at
  // least 7; the matching #endif is the last line of the file.
  #if __ARM_MAX_ARCH__>=7
  .text

  // Lrcon: constants for the key-expansion code, which reaches them with
  // "adr x3,Lrcon" and loads rows 0 and 1 together, then row 2 separately.
  //   row 0 (+0)   round constant 0x01 in each 32-bit lane; the expansion
  //                loops double it byte-wise (shl ... #1) once per step
  //   row 1 (+16)  TBL byte-index mask: picks bytes 13,14,15,12 into every
  //                lane, i.e. rotates the last word and splats it
  //   row 2 (+32)  round constant 0x1b, loaded by the 128-bit path after
  //                its eight loop steps (the next doubling gives 0x36)
  .align 5
  Lrcon:
          .long   0x01,0x01,0x01,0x01
          .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
          .long   0x1b,0x1b,0x1b,0x1b
  // ---------------------------------------------------------------------------
  // _aes_v8_set_encrypt_key -- expand a 128/192/256-bit AES key into the
  // encryption round-key schedule.
  //
  // In:    x0 = pointer to the raw key bytes
  //        w1 = key length in bits
  //        x2 = pointer to the output key schedule
  // Out:   x0 =  0  success
  //            = -1  x0 or x2 was NULL
  //            = -2  w1 below 128, above 256, or not a multiple of 64
  // On success the round keys are stored from the original x2 upward and the
  // round count (10, 12 or 14) is stored as a 32-bit word at byte offset 240.
  // On return x2 points at that word and w12 still holds the round count;
  // _aes_v8_set_decrypt_key depends on both.
  //
  // Uses:  x0 and x2 (advanced), w1, x3, w12, v0-v6, flags.
  // Frame: x29/x30 are pushed, but only x29 is reloaded on exit. x30 is never
  //        modified here, so "ret" uses the incoming link register.
  //
  // Lenc_key is the internal entry used by "bl Lenc_key" in
  // _aes_v8_set_decrypt_key.
  // ---------------------------------------------------------------------------
  .globl _aes_v8_set_encrypt_key
  .align 5
  _aes_v8_set_encrypt_key:
  Lenc_key:
          stp     x29,x30,[sp,#-16]!
          add     x29,sp,#0
          mov     x3,#-1                  // result if a pointer is NULL
          cmp     x0,#0
          b.eq    Lenc_key_abort
          cmp     x2,#0
          b.eq    Lenc_key_abort
          mov     x3,#-2                  // result if the bit length is rejected
          cmp     w1,#128
          b.lt    Lenc_key_abort
          cmp     w1,#256
          b.gt    Lenc_key_abort
          tst     w1,#0x3f                // must be a multiple of 64
          b.ne    Lenc_key_abort

          adr     x3,Lrcon
          cmp     w1,#192                 // flags stay live until the branches below

          eor     v0.16b,v0.16b,v0.16b    // v0 = 0 for the whole routine
          ld1     {v3.16b},[x0],#16       // first 16 key bytes
          mov     w1,#8 // reuse w1
          ld1     {v1.4s,v2.4s},[x3],#32  // v1 = round constant, v2 = TBL mask

          b.lt    Loop128
          b.eq    L192
          b       L256

  // ---- 128-bit key: 8 looped steps, then 2 unrolled steps ------------------
  // Per step: v6 = rotated+splatted last word (tbl) pushed through the S-box
  // (aese with the all-zero key v0) and XORed with the round constant; the
  // ext/eor chain builds the running XOR of the previous round key words.
  .align 4
  Loop128:
          tbl     v6.16b,{v3.16b},v2.16b
          ext     v5.16b,v0.16b,v3.16b,#12
          st1     {v3.4s},[x2],#16
          aese    v6.16b,v0.16b
          subs    w1,w1,#1

          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v6.16b,v6.16b,v1.16b
          eor     v3.16b,v3.16b,v5.16b
          shl     v1.16b,v1.16b,#1        // next round constant (byte-wise doubling)
          eor     v3.16b,v3.16b,v6.16b
          b.ne    Loop128

          ld1     {v1.4s},[x3]            // third Lrcon row: 0x1b

          tbl     v6.16b,{v3.16b},v2.16b
          ext     v5.16b,v0.16b,v3.16b,#12
          st1     {v3.4s},[x2],#16
          aese    v6.16b,v0.16b

          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v6.16b,v6.16b,v1.16b
          eor     v3.16b,v3.16b,v5.16b
          shl     v1.16b,v1.16b,#1
          eor     v3.16b,v3.16b,v6.16b

          tbl     v6.16b,{v3.16b},v2.16b
          ext     v5.16b,v0.16b,v3.16b,#12
          st1     {v3.4s},[x2],#16
          aese    v6.16b,v0.16b

          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v6.16b,v6.16b,v1.16b
          eor     v3.16b,v3.16b,v5.16b
          eor     v3.16b,v3.16b,v6.16b
          st1     {v3.4s},[x2]            // last round key, no post-increment
          add     x2,x2,#0x50             // 160 + 80 = offset 240

          mov     w12,#10
          b       Ldone

  // ---- 192-bit key: 8 steps, each storing 8 + 16 bytes ---------------------
  .align 4
  L192:
          ld1     {v4.8b},[x0],#8         // remaining 8 key bytes
          movi    v6.16b,#8 // borrow v6.16b
          st1     {v3.4s},[x2],#16
          sub     v2.16b,v2.16b,v6.16b // adjust the mask

  Loop192:
          tbl     v6.16b,{v4.16b},v2.16b
          ext     v5.16b,v0.16b,v3.16b,#12
  #ifdef __ARMEB__
          st1     {v4.4s},[x2],#16
          sub     x2,x2,#8
  #else
          st1     {v4.8b},[x2],#8
  #endif
          aese    v6.16b,v0.16b
          subs    w1,w1,#1

          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v3.16b,v3.16b,v5.16b

          dup     v5.4s,v3.s[3]
          eor     v5.16b,v5.16b,v4.16b
          eor     v6.16b,v6.16b,v1.16b
          ext     v4.16b,v0.16b,v4.16b,#12
          shl     v1.16b,v1.16b,#1
          eor     v4.16b,v4.16b,v5.16b
          eor     v3.16b,v3.16b,v6.16b
          eor     v4.16b,v4.16b,v6.16b
          st1     {v3.4s},[x2],#16
          b.ne    Loop192

          mov     w12,#12
          add     x2,x2,#0x20             // 16 + 8*24 + 32 = offset 240
          b       Ldone

  // ---- 256-bit key: 7 steps; the final step leaves through b.eq Ldone ------
  .align 4
  L256:
          ld1     {v4.16b},[x0]           // second 16 key bytes
          mov     w1,#7
          mov     w12,#14
          st1     {v3.4s},[x2],#16

  Loop256:
          tbl     v6.16b,{v4.16b},v2.16b
          ext     v5.16b,v0.16b,v3.16b,#12
          st1     {v4.4s},[x2],#16
          aese    v6.16b,v0.16b
          subs    w1,w1,#1

          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v3.16b,v3.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v6.16b,v6.16b,v1.16b
          eor     v3.16b,v3.16b,v5.16b
          shl     v1.16b,v1.16b,#1
          eor     v3.16b,v3.16b,v6.16b
          st1     {v3.4s},[x2],#16
          b.eq    Ldone                   // flags from the subs above; x2 is at offset 240

          dup     v6.4s,v3.s[3] // just splat
          ext     v5.16b,v0.16b,v4.16b,#12
          aese    v6.16b,v0.16b

          eor     v4.16b,v4.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v4.16b,v4.16b,v5.16b
          ext     v5.16b,v0.16b,v5.16b,#12
          eor     v4.16b,v4.16b,v5.16b

          eor     v4.16b,v4.16b,v6.16b
          b       Loop256

  Ldone:
          str     w12,[x2]                // round count at schedule offset 240
          mov     x3,#0

  Lenc_key_abort:
          mov     x0,x3 // return value
          ldr     x29,[sp],#16
          ret
  // ---------------------------------------------------------------------------
  // _aes_v8_set_decrypt_key -- build the decryption key schedule.
  //
  // In:    x0 = pointer to the raw key bytes
  //        w1 = key length in bits
  //        x2 = pointer to the output key schedule
  // Out:   x0 = 0 on success, otherwise the non-zero code returned by Lenc_key
  //
  // Runs the encryption key expansion through Lenc_key, then converts the
  // result in place: the round keys are reversed end-for-end, and every key
  // except the first and last is passed through AESIMC.
  //
  // Relies on Lenc_key returning with x2 = schedule + 240 and the round count
  // in w12 (x12).
  // Uses:  x0, x2, x4, v0, v1, flags, plus everything Lenc_key uses.
  // The link register is signed on entry and authenticated before "ret"; both
  // instructions are emitted as raw .long encodings.
  // ---------------------------------------------------------------------------
  .globl _aes_v8_set_decrypt_key
  .align 5
  _aes_v8_set_decrypt_key:
          .long   0xd503233f // paciasp
          stp     x29,x30,[sp,#-16]!
          add     x29,sp,#0
          bl      Lenc_key

          cmp     x0,#0
          b.ne    Ldec_key_abort          // propagate the key-setup error code

          sub     x2,x2,#240 // restore original x2
          mov     x4,#-16                 // post-index step for the descending pointer
          add     x0,x2,x12,lsl#4 // end of key schedule

          // Swap the first and last round keys unchanged.
          ld1     {v0.4s},[x2]
          ld1     {v1.4s},[x0]
          st1     {v0.4s},[x0],x4
          st1     {v1.4s},[x2],#16

          // Walk inward from both ends: AESIMC each pair and store it swapped.
  Loop_imc:
          ld1     {v0.4s},[x2]
          ld1     {v1.4s},[x0]
          aesimc  v0.16b,v0.16b
          aesimc  v1.16b,v1.16b
          st1     {v0.4s},[x0],x4
          st1     {v1.4s},[x2],#16
          cmp     x0,x2
          b.hi    Loop_imc                // continue while the pointers have not met

          // Here x0 == x2: the middle round key is transformed in place.
          ld1     {v0.4s},[x2]
          aesimc  v0.16b,v0.16b
          st1     {v0.4s},[x0]

          eor     x0,x0,x0 // return value
  Ldec_key_abort:
          ldp     x29,x30,[sp],#16
          .long   0xd50323bf // autiasp
          ret
  // ---------------------------------------------------------------------------
  // _aes_v8_encrypt -- encrypt one 16-byte block.
  //
  // In:    x0 = pointer to the 16-byte input block
  //        x1 = pointer to the 16-byte output block
  //        x2 = pointer to the key schedule; the round count is read from
  //             byte offset 240
  // Out:   16 bytes stored at [x1]; no return value is set.
  // Uses:  x2 (advanced), w3, v0-v2, flags. No stack frame.
  //
  // The loop performs two rounds per pass. With w3 preset to rounds-2 it
  // leaves exactly two rounds for the straight-line tail, whose last AESE is
  // not followed by AESMC and is completed by XOR with the final round key.
  // ---------------------------------------------------------------------------
  .globl _aes_v8_encrypt
  .align 5
  _aes_v8_encrypt:
          ldr     w3,[x2,#240]
          ld1     {v0.4s},[x2],#16        // round key 0
          ld1     {v2.16b},[x0]           // state = input block
          sub     w3,w3,#2
          ld1     {v1.4s},[x2],#16        // round key 1

  Loop_enc:
          aese    v2.16b,v0.16b
          aesmc   v2.16b,v2.16b
          ld1     {v0.4s},[x2],#16
          subs    w3,w3,#2
          aese    v2.16b,v1.16b
          aesmc   v2.16b,v2.16b
          ld1     {v1.4s},[x2],#16
          b.gt    Loop_enc

          aese    v2.16b,v0.16b
          aesmc   v2.16b,v2.16b
          ld1     {v0.4s},[x2]            // final round key
          aese    v2.16b,v1.16b
          eor     v2.16b,v2.16b,v0.16b

          st1     {v2.16b},[x1]
          ret
  // ---------------------------------------------------------------------------
  // _aes_v8_decrypt -- decrypt one 16-byte block.
  //
  // In:    x0 = pointer to the 16-byte input block
  //        x1 = pointer to the 16-byte output block
  //        x2 = pointer to the key schedule; the round count is read from
  //             byte offset 240. The round keys are consumed in ascending
  //             address order, so the schedule must already be in
  //             decryption order (as _aes_v8_set_decrypt_key leaves it).
  // Out:   16 bytes stored at [x1]; no return value is set.
  // Uses:  x2 (advanced), w3, v0-v2, flags. No stack frame.
  //
  // Same shape as the encrypt routine: two rounds per loop pass, then a
  // two-round tail whose last AESD is completed by XOR with the final key.
  // ---------------------------------------------------------------------------
  .globl _aes_v8_decrypt
  .align 5
  _aes_v8_decrypt:
          ldr     w3,[x2,#240]
          ld1     {v0.4s},[x2],#16        // round key 0
          ld1     {v2.16b},[x0]           // state = input block
          sub     w3,w3,#2
          ld1     {v1.4s},[x2],#16        // round key 1

  Loop_dec:
          aesd    v2.16b,v0.16b
          aesimc  v2.16b,v2.16b
          ld1     {v0.4s},[x2],#16
          subs    w3,w3,#2
          aesd    v2.16b,v1.16b
          aesimc  v2.16b,v2.16b
          ld1     {v1.4s},[x2],#16
          b.gt    Loop_dec

          aesd    v2.16b,v0.16b
          aesimc  v2.16b,v2.16b
          ld1     {v0.4s},[x2]            // final round key
          aesd    v2.16b,v1.16b
          eor     v2.16b,v2.16b,v0.16b

          st1     {v2.16b},[x1]
          ret
  // ---------------------------------------------------------------------------
  // _aes_v8_cbc_encrypt -- AES-CBC encryption or decryption of whole blocks.
  //
  // In:    x0 = input pointer
  //        x1 = output pointer
  //        x2 = length in bytes; below 16 returns immediately without touching
  //             memory, otherwise a trailing partial block is ignored
  //        x3 = key schedule (round count read from byte offset 240)
  //        x4 = pointer to the 16-byte IV; read on entry, overwritten at
  //             Lcbc_done with the last ciphertext block
  //        w5 = direction: zero selects decryption, non-zero encryption
  // Out:   output blocks at [x1], updated IV at [x4]; no return value is set.
  //
  // Uses:  x0-x3 (modified), w5, x6, x7, x8, x12, x14, v0-v7, v16-v23, flags.
  // Frame: x29/x30 are pushed, but only x29 is reloaded on exit. x30 is never
  //        modified here.
  //
  // Register roles after setup:
  //   v16,v17  first two round keys (reloaded as the rounds advance)
  //   v18-v23  the six round keys before the last;  v7 = last round key
  //   v6       IV, later the most recent ciphertext block
  //   x8       input post-index step: 16, or 0 once the last block has been
  //            fetched so the look-ahead load does not advance past the input
  //   w5       round count minus 8 (2, 4 or 6 for 10, 12 or 14 rounds)
  // ---------------------------------------------------------------------------
  .globl _aes_v8_cbc_encrypt
  .align 5
  _aes_v8_cbc_encrypt:
          stp     x29,x30,[sp,#-16]!
          add     x29,sp,#0
          subs    x2,x2,#16
          mov     x8,#16
          b.lo    Lcbc_abort              // fewer than 16 bytes: nothing to do
          csel    x8,xzr,x8,eq            // exactly one block: do not advance x0

          cmp     w5,#0 // en- or decrypting?
          ldr     w5,[x3,#240]
          and     x2,x2,#-16
          ld1     {v6.16b},[x4]
          ld1     {v0.16b},[x0],x8

          ld1     {v16.4s,v17.4s},[x3] // load key schedule...
          sub     w5,w5,#6
          add     x7,x3,x5,lsl#4 // pointer to last 7 round keys
          sub     w5,w5,#2
          ld1     {v18.4s,v19.4s},[x7],#32
          ld1     {v20.4s,v21.4s},[x7],#32
          ld1     {v22.4s,v23.4s},[x7],#32
          ld1     {v7.4s},[x7]

          add     x7,x3,#32
          mov     w6,w5
          b.eq    Lcbc_dec                // flags still from "cmp w5,#0" above

          // ---- encryption ---------------------------------------------------
          // v5 = rndkey[0] ^ last round key. Each look-ahead plaintext block is
          // XORed with v5, so the AESE at the loop top, applied to the state
          // left by the final AESE (last key not yet added), performs both the
          // CBC chaining XOR and the round-0 key addition.
          cmp     w5,#2
          eor     v0.16b,v0.16b,v6.16b    // first block ^ IV
          eor     v5.16b,v16.16b,v7.16b
          b.eq    Lcbc_enc128

          // 12/14 rounds: keep addresses of round keys 1 and 4..7 for reloads.
          ld1     {v2.4s,v3.4s},[x7]
          add     x7,x3,#16
          add     x6,x3,#16*4
          add     x12,x3,#16*5
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          add     x14,x3,#16*6
          add     x3,x3,#16*7
          b       Lenter_cbc_enc

  .align 4
  Loop_cbc_enc:
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          st1     {v6.16b},[x1],#16       // store previous ciphertext block
  Lenter_cbc_enc:
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v2.16b
          aesmc   v0.16b,v0.16b
          ld1     {v16.4s},[x6]
          cmp     w5,#4
          aese    v0.16b,v3.16b
          aesmc   v0.16b,v0.16b
          ld1     {v17.4s},[x12]
          b.eq    Lcbc_enc192             // 12 rounds: skip the two extra rounds

          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          ld1     {v16.4s},[x14]
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          ld1     {v17.4s},[x3]
          nop

  Lcbc_enc192:
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          subs    x2,x2,#16
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          csel    x8,xzr,x8,eq
          aese    v0.16b,v18.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v19.16b
          aesmc   v0.16b,v0.16b
          ld1     {v16.16b},[x0],x8       // look-ahead: next plaintext block
          aese    v0.16b,v20.16b
          aesmc   v0.16b,v0.16b
          eor     v16.16b,v16.16b,v5.16b
          aese    v0.16b,v21.16b
          aesmc   v0.16b,v0.16b
          ld1     {v17.4s},[x7] // re-pre-load rndkey[1]
          aese    v0.16b,v22.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v23.16b
          eor     v6.16b,v0.16b,v7.16b    // v6 = ciphertext block
          b.hs    Loop_cbc_enc            // flags from the subs above

          st1     {v6.16b},[x1],#16
          b       Lcbc_done

  // 10-round encryption: every round key except rndkey[0] stays in registers.
  .align 5
  Lcbc_enc128:
          ld1     {v2.4s,v3.4s},[x7]
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          b       Lenter_cbc_enc128
  Loop_cbc_enc128:
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          st1     {v6.16b},[x1],#16       // store previous ciphertext block
  Lenter_cbc_enc128:
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          subs    x2,x2,#16
          aese    v0.16b,v2.16b
          aesmc   v0.16b,v0.16b
          csel    x8,xzr,x8,eq
          aese    v0.16b,v3.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v18.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v19.16b
          aesmc   v0.16b,v0.16b
          ld1     {v16.16b},[x0],x8       // look-ahead: next plaintext block
          aese    v0.16b,v20.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v21.16b
          aesmc   v0.16b,v0.16b
          aese    v0.16b,v22.16b
          aesmc   v0.16b,v0.16b
          eor     v16.16b,v16.16b,v5.16b
          aese    v0.16b,v23.16b
          eor     v6.16b,v0.16b,v7.16b    // v6 = ciphertext block
          b.hs    Loop_cbc_enc128

          st1     {v6.16b},[x1],#16
          b       Lcbc_done

  // ---- decryption -----------------------------------------------------------
  // Three blocks are processed per pass of Loop3x_cbc_dec; Lcbc_dec_tail
  // finishes the remaining one or two blocks. The final AESD of each block is
  // completed by XOR with (previous ciphertext ^ last round key), which adds
  // the last key and undoes the CBC chaining in one step.
  .align 5
  Lcbc_dec:
          ld1     {v18.16b},[x0],#16
          subs    x2,x2,#32 // bias
          add     w6,w5,#2
          orr     v3.16b,v0.16b,v0.16b
          orr     v1.16b,v0.16b,v0.16b
          orr     v19.16b,v18.16b,v18.16b
          b.lo    Lcbc_dec_tail           // one or two blocks in total

          orr     v1.16b,v18.16b,v18.16b
          ld1     {v18.16b},[x0],#16
          orr     v2.16b,v0.16b,v0.16b
          orr     v3.16b,v1.16b,v1.16b
          orr     v19.16b,v18.16b,v18.16b

  Loop3x_cbc_dec:
          aesd    v0.16b,v16.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v16.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v16.16b
          aesimc  v18.16b,v18.16b
          ld1     {v16.4s},[x7],#16
          subs    w6,w6,#2
          aesd    v0.16b,v17.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v17.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v17.16b
          aesimc  v18.16b,v18.16b
          ld1     {v17.4s},[x7],#16
          b.gt    Loop3x_cbc_dec

          aesd    v0.16b,v16.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v16.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v16.16b
          aesimc  v18.16b,v18.16b
          eor     v4.16b,v6.16b,v7.16b
          subs    x2,x2,#0x30
          eor     v5.16b,v2.16b,v7.16b
          csel    x6,x2,x6,lo // x6, w6, is zero at this point
          aesd    v0.16b,v17.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v17.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v17.16b
          aesimc  v18.16b,v18.16b
          eor     v17.16b,v3.16b,v7.16b
          add     x0,x0,x6 // x0 is adjusted in such way that
          // at exit from the loop v1.16b-v18.16b
          // are loaded with last "words"
          orr     v6.16b,v19.16b,v19.16b
          mov     x7,x3
          aesd    v0.16b,v20.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v20.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v20.16b
          aesimc  v18.16b,v18.16b
          ld1     {v2.16b},[x0],#16
          aesd    v0.16b,v21.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v21.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v21.16b
          aesimc  v18.16b,v18.16b
          ld1     {v3.16b},[x0],#16
          aesd    v0.16b,v22.16b
          aesimc  v0.16b,v0.16b
          aesd    v1.16b,v22.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v22.16b
          aesimc  v18.16b,v18.16b
          ld1     {v19.16b},[x0],#16
          aesd    v0.16b,v23.16b
          aesd    v1.16b,v23.16b
          aesd    v18.16b,v23.16b
          ld1     {v16.4s},[x7],#16 // re-pre-load rndkey[0]
          add     w6,w5,#2
          eor     v4.16b,v4.16b,v0.16b
          eor     v5.16b,v5.16b,v1.16b
          eor     v18.16b,v18.16b,v17.16b
          ld1     {v17.4s},[x7],#16 // re-pre-load rndkey[1]
          st1     {v4.16b},[x1],#16
          orr     v0.16b,v2.16b,v2.16b
          st1     {v5.16b},[x1],#16
          orr     v1.16b,v3.16b,v3.16b
          st1     {v18.16b},[x1],#16
          orr     v18.16b,v19.16b,v19.16b
          b.hs    Loop3x_cbc_dec          // flags from "subs x2,x2,#0x30" above

          cmn     x2,#0x30
          b.eq    Lcbc_done               // length was a multiple of three blocks
          nop

  Lcbc_dec_tail:
          aesd    v1.16b,v16.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v16.16b
          aesimc  v18.16b,v18.16b
          ld1     {v16.4s},[x7],#16
          subs    w6,w6,#2
          aesd    v1.16b,v17.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v17.16b
          aesimc  v18.16b,v18.16b
          ld1     {v17.4s},[x7],#16
          b.gt    Lcbc_dec_tail

          aesd    v1.16b,v16.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v16.16b
          aesimc  v18.16b,v18.16b
          aesd    v1.16b,v17.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v17.16b
          aesimc  v18.16b,v18.16b
          aesd    v1.16b,v20.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v20.16b
          aesimc  v18.16b,v18.16b
          cmn     x2,#0x20                // Z set when x2 == -0x20: one block left
          aesd    v1.16b,v21.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v21.16b
          aesimc  v18.16b,v18.16b
          eor     v5.16b,v6.16b,v7.16b
          aesd    v1.16b,v22.16b
          aesimc  v1.16b,v1.16b
          aesd    v18.16b,v22.16b
          aesimc  v18.16b,v18.16b
          eor     v17.16b,v3.16b,v7.16b
          aesd    v1.16b,v23.16b
          aesd    v18.16b,v23.16b
          b.eq    Lcbc_dec_one
          eor     v5.16b,v5.16b,v1.16b
          eor     v17.16b,v17.16b,v18.16b
          orr     v6.16b,v19.16b,v19.16b
          st1     {v5.16b},[x1],#16
          st1     {v17.16b},[x1],#16
          b       Lcbc_done

  Lcbc_dec_one:
          eor     v5.16b,v5.16b,v18.16b
          orr     v6.16b,v19.16b,v19.16b
          st1     {v5.16b},[x1],#16

  Lcbc_done:
          st1     {v6.16b},[x4]           // write back the chaining value as the new IV
  Lcbc_abort:
          ldr     x29,[sp],#16
          ret
  // ---------------------------------------------------------------------------
  // _aes_v8_ctr32_encrypt_blocks -- AES-CTR over whole blocks with a 32-bit
  // counter.
  //
  // In:    x0 = input pointer
  //        x1 = output pointer
  //        x2 = number of 16-byte blocks
  //        x3 = key schedule (round count read from byte offset 240)
  //        x4 = pointer to the 16-byte counter block. Its last 32-bit word is
  //             the counter; it is byte-reversed into w8 on little-endian
  //             builds, incremented there, and reversed back before being
  //             inserted into lane 3 of each counter block.
  // Out:   x2 blocks stored at [x1]; no return value is set. The counter
  //        block at [x4] is only read, never written back.
  //
  // Uses:  x0-x2 (modified), w5, w6, x7, w8, w9, w10, x12, v0-v7, v16-v23,
  //        flags.
  // Frame: x29/x30 are pushed, but only x29 is reloaded on exit. x30 is never
  //        modified here.
  //
  // A block count of zero returns immediately. Without the guard the
  // "b.ls Lctr32_tail" path would still load from [x0] and store 32 bytes to
  // [x1].
  //
  // Three counter blocks are encrypted per pass of Loop3x_ctr32; Lctr32_tail
  // finishes the remaining one or two. x12 is the input post-index step for
  // the tail: 0 when a single block remains, so the second load does not
  // advance past the input, otherwise 16.
  // ---------------------------------------------------------------------------
  .globl _aes_v8_ctr32_encrypt_blocks
  .align 5
  _aes_v8_ctr32_encrypt_blocks:
          stp     x29,x30,[sp,#-16]!
          add     x29,sp,#0
          cbz     x2,Lctr32_done          // zero blocks: touch no memory
          ldr     w5,[x3,#240]

          ldr     w8, [x4, #12]           // counter word
  #ifdef __ARMEB__
          ld1     {v0.16b},[x4]
  #else
          ld1     {v0.4s},[x4]
  #endif
          ld1     {v16.4s,v17.4s},[x3] // load key schedule...
          sub     w5,w5,#4
          mov     x12,#16
          cmp     x2,#2                   // flags stay live until "b.ls" below
          add     x7,x3,x5,lsl#4 // pointer to last 5 round keys
          sub     w5,w5,#2
          ld1     {v20.4s,v21.4s},[x7],#32
          ld1     {v22.4s,v23.4s},[x7],#32
          ld1     {v7.4s},[x7]
          add     x7,x3,#32
          mov     w6,w5
          csel    x12,xzr,x12,lo          // single block: do not advance x0 in the tail
  #ifndef __ARMEB__
          rev     w8, w8
  #endif
          orr     v1.16b,v0.16b,v0.16b
          add     w10, w8, #1
          orr     v18.16b,v0.16b,v0.16b
          add     w8, w8, #2
          orr     v6.16b,v0.16b,v0.16b    // v6 = template counter block
          rev     w10, w10
          mov     v1.s[3],w10             // v1 = counter + 1
          b.ls    Lctr32_tail             // one or two blocks in total
          rev     w12, w8
          sub     x2,x2,#3 // bias
          mov     v18.s[3],w12            // v18 = counter + 2
          b       Loop3x_ctr32

  .align 4
  Loop3x_ctr32:
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v16.16b
          aesmc   v1.16b,v1.16b
          aese    v18.16b,v16.16b
          aesmc   v18.16b,v18.16b
          ld1     {v16.4s},[x7],#16
          subs    w6,w6,#2
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v17.16b
          aesmc   v1.16b,v1.16b
          aese    v18.16b,v17.16b
          aesmc   v18.16b,v18.16b
          ld1     {v17.4s},[x7],#16
          b.gt    Loop3x_ctr32

          // Final rounds continue in v4, v5 and v17 while v0, v1 and v18 are
          // rebuilt from the template v6 as the next three counter blocks.
          aese    v0.16b,v16.16b
          aesmc   v4.16b,v0.16b
          aese    v1.16b,v16.16b
          aesmc   v5.16b,v1.16b
          ld1     {v2.16b},[x0],#16
          orr     v0.16b,v6.16b,v6.16b
          aese    v18.16b,v16.16b
          aesmc   v18.16b,v18.16b
          ld1     {v3.16b},[x0],#16
          orr     v1.16b,v6.16b,v6.16b
          aese    v4.16b,v17.16b
          aesmc   v4.16b,v4.16b
          aese    v5.16b,v17.16b
          aesmc   v5.16b,v5.16b
          ld1     {v19.16b},[x0],#16
          mov     x7,x3
          aese    v18.16b,v17.16b
          aesmc   v17.16b,v18.16b
          orr     v18.16b,v6.16b,v6.16b
          add     w9,w8,#1
          aese    v4.16b,v20.16b
          aesmc   v4.16b,v4.16b
          aese    v5.16b,v20.16b
          aesmc   v5.16b,v5.16b
          eor     v2.16b,v2.16b,v7.16b    // fold the last round key into the input
          add     w10,w8,#2
          aese    v17.16b,v20.16b
          aesmc   v17.16b,v17.16b
          eor     v3.16b,v3.16b,v7.16b
          add     w8,w8,#3
          aese    v4.16b,v21.16b
          aesmc   v4.16b,v4.16b
          aese    v5.16b,v21.16b
          aesmc   v5.16b,v5.16b
          eor     v19.16b,v19.16b,v7.16b
          rev     w9,w9
          aese    v17.16b,v21.16b
          aesmc   v17.16b,v17.16b
          mov     v0.s[3], w9
          rev     w10,w10
          aese    v4.16b,v22.16b
          aesmc   v4.16b,v4.16b
          aese    v5.16b,v22.16b
          aesmc   v5.16b,v5.16b
          mov     v1.s[3], w10
          rev     w12,w8
          aese    v17.16b,v22.16b
          aesmc   v17.16b,v17.16b
          mov     v18.s[3], w12
          subs    x2,x2,#3
          aese    v4.16b,v23.16b
          aese    v5.16b,v23.16b
          aese    v17.16b,v23.16b

          eor     v2.16b,v2.16b,v4.16b
          ld1     {v16.4s},[x7],#16 // re-pre-load rndkey[0]
          st1     {v2.16b},[x1],#16
          eor     v3.16b,v3.16b,v5.16b
          mov     w6,w5
          st1     {v3.16b},[x1],#16
          eor     v19.16b,v19.16b,v17.16b
          ld1     {v17.4s},[x7],#16 // re-pre-load rndkey[1]
          st1     {v19.16b},[x1],#16
          b.hs    Loop3x_ctr32            // flags from "subs x2,x2,#3" above

          adds    x2,x2,#3                // undo the bias: x2 = blocks left (0..2)
          b.eq    Lctr32_done
          cmp     x2,#1
          mov     x12,#16
          csel    x12,xzr,x12,eq

  Lctr32_tail:
          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v16.16b
          aesmc   v1.16b,v1.16b
          ld1     {v16.4s},[x7],#16
          subs    w6,w6,#2
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v17.16b
          aesmc   v1.16b,v1.16b
          ld1     {v17.4s},[x7],#16
          b.gt    Lctr32_tail

          aese    v0.16b,v16.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v16.16b
          aesmc   v1.16b,v1.16b
          aese    v0.16b,v17.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v17.16b
          aesmc   v1.16b,v1.16b
          ld1     {v2.16b},[x0],x12
          aese    v0.16b,v20.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v20.16b
          aesmc   v1.16b,v1.16b
          ld1     {v3.16b},[x0]
          aese    v0.16b,v21.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v21.16b
          aesmc   v1.16b,v1.16b
          eor     v2.16b,v2.16b,v7.16b
          aese    v0.16b,v22.16b
          aesmc   v0.16b,v0.16b
          aese    v1.16b,v22.16b
          aesmc   v1.16b,v1.16b
          eor     v3.16b,v3.16b,v7.16b
          aese    v0.16b,v23.16b
          aese    v1.16b,v23.16b

          cmp     x2,#1
          eor     v2.16b,v2.16b,v0.16b
          eor     v3.16b,v3.16b,v1.16b
          st1     {v2.16b},[x1],#16
          b.eq    Lctr32_done             // only one block was left
          st1     {v3.16b},[x1]

  Lctr32_done:
          ldr     x29,[sp],#16
          ret
  #endif