aesni-mb-x86_64.s

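# Multi-buffer AES-NI CBC routines for x86_64 (AT&T/GAS syntax). Two public
# entry points, aesni_multi_cbc_encrypt and aesni_multi_cbc_decrypt, each
# process up to 4 independent CBC streams with SSE/AES-NI, or up to 8 streams
# via the AVX variants further down. The OPENSSL_ia32cap_P reference and the
# overall layout suggest this listing is perlasm-generated OpenSSL code.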
  1. .text
  2. .globl aesni_multi_cbc_encrypt
  3. .type aesni_multi_cbc_encrypt,@function
  4. .align 32
  5. aesni_multi_cbc_encrypt:
  6. .cfi_startproc
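# Dispatch: if the caller asked for at least two 4-lane groups (%edx >= 2) and
# bit 28 of OPENSSL_ia32cap_P+4 (the AVX feature flag) is set, take the 8-lane
# AVX path; otherwise fall through to the 4-lane SSE path below.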
  7. cmpl $2,%edx
  8. jb .Lenc_non_avx
  9. movl OPENSSL_ia32cap_P+4(%rip),%ecx
  10. testl $268435456,%ecx
  11. jnz _avx_cbc_enc_shortcut
  12. jmp .Lenc_non_avx
  13. .align 16
  14. .Lenc_non_avx:
  15. movq %rsp,%rax
  16. .cfi_def_cfa_register %rax
  17. pushq %rbx
  18. .cfi_offset %rbx,-16
  19. pushq %rbp
  20. .cfi_offset %rbp,-24
  21. pushq %r12
  22. .cfi_offset %r12,-32
  23. pushq %r13
  24. .cfi_offset %r13,-40
  25. pushq %r14
  26. .cfi_offset %r14,-48
  27. pushq %r15
  28. .cfi_offset %r15,-56
  29. subq $48,%rsp
  30. andq $-64,%rsp
  31. movq %rax,16(%rsp)
  32. .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
  33. .Lenc4x_body:
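# Per-lane descriptors are read from %rdi (biased by +80 above); the offsets
# used below suggest 40-byte records of the form {input ptr, output ptr,
# block count, 16-byte IV}. %rsi points 120 bytes into the AES key schedule,
# with round key 0 kept in %xmm12 and the round count at 240-120(%rsi).
# %xmm2-%xmm5 carry the four CBC states.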
  34. movdqu (%rsi),%xmm12
  35. leaq 120(%rsi),%rsi
  36. leaq 80(%rdi),%rdi
  37. .Lenc4x_loop_grande:
  38. movl %edx,24(%rsp)
  39. xorl %edx,%edx
  40. movl -64(%rdi),%ecx
  41. movq -80(%rdi),%r8
  42. cmpl %edx,%ecx
  43. movq -72(%rdi),%r12
  44. cmovgl %ecx,%edx
  45. testl %ecx,%ecx
  46. movdqu -56(%rdi),%xmm2
  47. movl %ecx,32(%rsp)
  48. cmovleq %rsp,%r8
  49. movl -24(%rdi),%ecx
  50. movq -40(%rdi),%r9
  51. cmpl %edx,%ecx
  52. movq -32(%rdi),%r13
  53. cmovgl %ecx,%edx
  54. testl %ecx,%ecx
  55. movdqu -16(%rdi),%xmm3
  56. movl %ecx,36(%rsp)
  57. cmovleq %rsp,%r9
  58. movl 16(%rdi),%ecx
  59. movq 0(%rdi),%r10
  60. cmpl %edx,%ecx
  61. movq 8(%rdi),%r14
  62. cmovgl %ecx,%edx
  63. testl %ecx,%ecx
  64. movdqu 24(%rdi),%xmm4
  65. movl %ecx,40(%rsp)
  66. cmovleq %rsp,%r10
  67. movl 56(%rdi),%ecx
  68. movq 40(%rdi),%r11
  69. cmpl %edx,%ecx
  70. movq 48(%rdi),%r15
  71. cmovgl %ecx,%edx
  72. testl %ecx,%ecx
  73. movdqu 64(%rdi),%xmm5
  74. movl %ecx,44(%rsp)
  75. cmovleq %rsp,%r11
  76. testl %edx,%edx
  77. jz .Lenc4x_done
  78. movups 16-120(%rsi),%xmm1
  79. pxor %xmm12,%xmm2
  80. movups 32-120(%rsi),%xmm0
  81. pxor %xmm12,%xmm3
  82. movl 240-120(%rsi),%eax
  83. pxor %xmm12,%xmm4
  84. movdqu (%r8),%xmm6
  85. pxor %xmm12,%xmm5
  86. movdqu (%r9),%xmm7
  87. pxor %xmm6,%xmm2
  88. movdqu (%r10),%xmm8
  89. pxor %xmm7,%xmm3
  90. movdqu (%r11),%xmm9
  91. pxor %xmm8,%xmm4
  92. pxor %xmm9,%xmm5
  93. movdqa 32(%rsp),%xmm10
  94. xorq %rbx,%rbx
  95. jmp .Loop_enc4x
  96. .align 32
  97. .Loop_enc4x:
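# One 16-byte block per lane per iteration, four AES streams interleaved.
# The .byte 102,15,56,220,NN sequences are hand-encoded aesenc (66 0F 38 DC)
# and 221,NN is aesenclast. Lanes whose block count is exhausted get their
# pointers redirected to scratch stack space (cmovge/cmovg with %rbp), and the
# per-lane counters at 32(%rsp) are decremented with the pcmpgtd/paddd pair.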
  98. addq $16,%rbx
  99. leaq 16(%rsp),%rbp
  100. movl $1,%ecx
  101. subq %rbx,%rbp
  102. .byte 102,15,56,220,209
  103. prefetcht0 31(%r8,%rbx,1)
  104. prefetcht0 31(%r9,%rbx,1)
  105. .byte 102,15,56,220,217
  106. prefetcht0 31(%r10,%rbx,1)
  107. prefetcht0 31(%r11,%rbx,1)
  108. .byte 102,15,56,220,225
  109. .byte 102,15,56,220,233
  110. movups 48-120(%rsi),%xmm1
  111. cmpl 32(%rsp),%ecx
  112. .byte 102,15,56,220,208
  113. .byte 102,15,56,220,216
  114. .byte 102,15,56,220,224
  115. cmovgeq %rbp,%r8
  116. cmovgq %rbp,%r12
  117. .byte 102,15,56,220,232
  118. movups -56(%rsi),%xmm0
  119. cmpl 36(%rsp),%ecx
  120. .byte 102,15,56,220,209
  121. .byte 102,15,56,220,217
  122. .byte 102,15,56,220,225
  123. cmovgeq %rbp,%r9
  124. cmovgq %rbp,%r13
  125. .byte 102,15,56,220,233
  126. movups -40(%rsi),%xmm1
  127. cmpl 40(%rsp),%ecx
  128. .byte 102,15,56,220,208
  129. .byte 102,15,56,220,216
  130. .byte 102,15,56,220,224
  131. cmovgeq %rbp,%r10
  132. cmovgq %rbp,%r14
  133. .byte 102,15,56,220,232
  134. movups -24(%rsi),%xmm0
  135. cmpl 44(%rsp),%ecx
  136. .byte 102,15,56,220,209
  137. .byte 102,15,56,220,217
  138. .byte 102,15,56,220,225
  139. cmovgeq %rbp,%r11
  140. cmovgq %rbp,%r15
  141. .byte 102,15,56,220,233
  142. movups -8(%rsi),%xmm1
  143. movdqa %xmm10,%xmm11
  144. .byte 102,15,56,220,208
  145. prefetcht0 15(%r12,%rbx,1)
  146. prefetcht0 15(%r13,%rbx,1)
  147. .byte 102,15,56,220,216
  148. prefetcht0 15(%r14,%rbx,1)
  149. prefetcht0 15(%r15,%rbx,1)
  150. .byte 102,15,56,220,224
  151. .byte 102,15,56,220,232
  152. movups 128-120(%rsi),%xmm0
  153. pxor %xmm12,%xmm12
  154. .byte 102,15,56,220,209
  155. pcmpgtd %xmm12,%xmm11
  156. movdqu -120(%rsi),%xmm12
  157. .byte 102,15,56,220,217
  158. paddd %xmm11,%xmm10
  159. movdqa %xmm10,32(%rsp)
  160. .byte 102,15,56,220,225
  161. .byte 102,15,56,220,233
  162. movups 144-120(%rsi),%xmm1
  163. cmpl $11,%eax
  164. .byte 102,15,56,220,208
  165. .byte 102,15,56,220,216
  166. .byte 102,15,56,220,224
  167. .byte 102,15,56,220,232
  168. movups 160-120(%rsi),%xmm0
  169. jb .Lenc4x_tail
  170. .byte 102,15,56,220,209
  171. .byte 102,15,56,220,217
  172. .byte 102,15,56,220,225
  173. .byte 102,15,56,220,233
  174. movups 176-120(%rsi),%xmm1
  175. .byte 102,15,56,220,208
  176. .byte 102,15,56,220,216
  177. .byte 102,15,56,220,224
  178. .byte 102,15,56,220,232
  179. movups 192-120(%rsi),%xmm0
  180. je .Lenc4x_tail
  181. .byte 102,15,56,220,209
  182. .byte 102,15,56,220,217
  183. .byte 102,15,56,220,225
  184. .byte 102,15,56,220,233
  185. movups 208-120(%rsi),%xmm1
  186. .byte 102,15,56,220,208
  187. .byte 102,15,56,220,216
  188. .byte 102,15,56,220,224
  189. .byte 102,15,56,220,232
  190. movups 224-120(%rsi),%xmm0
  191. jmp .Lenc4x_tail
  192. .align 32
  193. .Lenc4x_tail:
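# Last round: aesenclast, then each state is chained with the next plaintext
# block (pre-XORed with round key 0 in %xmm12) and the ciphertext is stored 16
# bytes behind the advanced output pointers.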
  194. .byte 102,15,56,220,209
  195. .byte 102,15,56,220,217
  196. .byte 102,15,56,220,225
  197. .byte 102,15,56,220,233
  198. movdqu (%r8,%rbx,1),%xmm6
  199. movdqu 16-120(%rsi),%xmm1
  200. .byte 102,15,56,221,208
  201. movdqu (%r9,%rbx,1),%xmm7
  202. pxor %xmm12,%xmm6
  203. .byte 102,15,56,221,216
  204. movdqu (%r10,%rbx,1),%xmm8
  205. pxor %xmm12,%xmm7
  206. .byte 102,15,56,221,224
  207. movdqu (%r11,%rbx,1),%xmm9
  208. pxor %xmm12,%xmm8
  209. .byte 102,15,56,221,232
  210. movdqu 32-120(%rsi),%xmm0
  211. pxor %xmm12,%xmm9
  212. movups %xmm2,-16(%r12,%rbx,1)
  213. pxor %xmm6,%xmm2
  214. movups %xmm3,-16(%r13,%rbx,1)
  215. pxor %xmm7,%xmm3
  216. movups %xmm4,-16(%r14,%rbx,1)
  217. pxor %xmm8,%xmm4
  218. movups %xmm5,-16(%r15,%rbx,1)
  219. pxor %xmm9,%xmm5
  220. decl %edx
  221. jnz .Loop_enc4x
  222. movq 16(%rsp),%rax
  223. .cfi_def_cfa %rax,8
  224. movl 24(%rsp),%edx
  225. leaq 160(%rdi),%rdi
  226. decl %edx
  227. jnz .Lenc4x_loop_grande
  228. .Lenc4x_done:
  229. movq -48(%rax),%r15
  230. .cfi_restore %r15
  231. movq -40(%rax),%r14
  232. .cfi_restore %r14
  233. movq -32(%rax),%r13
  234. .cfi_restore %r13
  235. movq -24(%rax),%r12
  236. .cfi_restore %r12
  237. movq -16(%rax),%rbp
  238. .cfi_restore %rbp
  239. movq -8(%rax),%rbx
  240. .cfi_restore %rbx
  241. leaq (%rax),%rsp
  242. .cfi_def_cfa_register %rsp
  243. .Lenc4x_epilogue:
  244. .byte 0xf3,0xc3
  245. .cfi_endproc
  246. .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
  247. .globl aesni_multi_cbc_decrypt
  248. .type aesni_multi_cbc_decrypt,@function
  249. .align 32
  250. aesni_multi_cbc_decrypt:
  251. .cfi_startproc
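# Same dispatch as the encrypt entry: 8-lane AVX path when %edx >= 2 and the
# AVX bit of OPENSSL_ia32cap_P is set, else the 4-lane SSE path below.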
  252. cmpl $2,%edx
  253. jb .Ldec_non_avx
  254. movl OPENSSL_ia32cap_P+4(%rip),%ecx
  255. testl $268435456,%ecx
  256. jnz _avx_cbc_dec_shortcut
  257. jmp .Ldec_non_avx
  258. .align 16
  259. .Ldec_non_avx:
  260. movq %rsp,%rax
  261. .cfi_def_cfa_register %rax
  262. pushq %rbx
  263. .cfi_offset %rbx,-16
  264. pushq %rbp
  265. .cfi_offset %rbp,-24
  266. pushq %r12
  267. .cfi_offset %r12,-32
  268. pushq %r13
  269. .cfi_offset %r13,-40
  270. pushq %r14
  271. .cfi_offset %r14,-48
  272. pushq %r15
  273. .cfi_offset %r15,-56
  274. subq $48,%rsp
  275. andq $-64,%rsp
  276. movq %rax,16(%rsp)
  277. .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
  278. .Ldec4x_body:
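# Mirrors .Lenc4x_body, except that %xmm2-%xmm5 hold the ciphertext blocks
# being decrypted and %xmm6-%xmm9 keep each lane's previous ciphertext (or IV)
# for CBC chaining.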
  279. movdqu (%rsi),%xmm12
  280. leaq 120(%rsi),%rsi
  281. leaq 80(%rdi),%rdi
  282. .Ldec4x_loop_grande:
  283. movl %edx,24(%rsp)
  284. xorl %edx,%edx
  285. movl -64(%rdi),%ecx
  286. movq -80(%rdi),%r8
  287. cmpl %edx,%ecx
  288. movq -72(%rdi),%r12
  289. cmovgl %ecx,%edx
  290. testl %ecx,%ecx
  291. movdqu -56(%rdi),%xmm6
  292. movl %ecx,32(%rsp)
  293. cmovleq %rsp,%r8
  294. movl -24(%rdi),%ecx
  295. movq -40(%rdi),%r9
  296. cmpl %edx,%ecx
  297. movq -32(%rdi),%r13
  298. cmovgl %ecx,%edx
  299. testl %ecx,%ecx
  300. movdqu -16(%rdi),%xmm7
  301. movl %ecx,36(%rsp)
  302. cmovleq %rsp,%r9
  303. movl 16(%rdi),%ecx
  304. movq 0(%rdi),%r10
  305. cmpl %edx,%ecx
  306. movq 8(%rdi),%r14
  307. cmovgl %ecx,%edx
  308. testl %ecx,%ecx
  309. movdqu 24(%rdi),%xmm8
  310. movl %ecx,40(%rsp)
  311. cmovleq %rsp,%r10
  312. movl 56(%rdi),%ecx
  313. movq 40(%rdi),%r11
  314. cmpl %edx,%ecx
  315. movq 48(%rdi),%r15
  316. cmovgl %ecx,%edx
  317. testl %ecx,%ecx
  318. movdqu 64(%rdi),%xmm9
  319. movl %ecx,44(%rsp)
  320. cmovleq %rsp,%r11
  321. testl %edx,%edx
  322. jz .Ldec4x_done
  323. movups 16-120(%rsi),%xmm1
  324. movups 32-120(%rsi),%xmm0
  325. movl 240-120(%rsi),%eax
  326. movdqu (%r8),%xmm2
  327. movdqu (%r9),%xmm3
  328. pxor %xmm12,%xmm2
  329. movdqu (%r10),%xmm4
  330. pxor %xmm12,%xmm3
  331. movdqu (%r11),%xmm5
  332. pxor %xmm12,%xmm4
  333. pxor %xmm12,%xmm5
  334. movdqa 32(%rsp),%xmm10
  335. xorq %rbx,%rbx
  336. jmp .Loop_dec4x
  337. .align 32
  338. .Loop_dec4x:
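# Four-lane interleaved decryption; .byte 102,15,56,222,NN encodes aesdec
# (66 0F 38 DE) and 223,NN aesdeclast. Flow control (counter compares,
# pointer parking, prefetches) matches .Loop_enc4x.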
  339. addq $16,%rbx
  340. leaq 16(%rsp),%rbp
  341. movl $1,%ecx
  342. subq %rbx,%rbp
  343. .byte 102,15,56,222,209
  344. prefetcht0 31(%r8,%rbx,1)
  345. prefetcht0 31(%r9,%rbx,1)
  346. .byte 102,15,56,222,217
  347. prefetcht0 31(%r10,%rbx,1)
  348. prefetcht0 31(%r11,%rbx,1)
  349. .byte 102,15,56,222,225
  350. .byte 102,15,56,222,233
  351. movups 48-120(%rsi),%xmm1
  352. cmpl 32(%rsp),%ecx
  353. .byte 102,15,56,222,208
  354. .byte 102,15,56,222,216
  355. .byte 102,15,56,222,224
  356. cmovgeq %rbp,%r8
  357. cmovgq %rbp,%r12
  358. .byte 102,15,56,222,232
  359. movups -56(%rsi),%xmm0
  360. cmpl 36(%rsp),%ecx
  361. .byte 102,15,56,222,209
  362. .byte 102,15,56,222,217
  363. .byte 102,15,56,222,225
  364. cmovgeq %rbp,%r9
  365. cmovgq %rbp,%r13
  366. .byte 102,15,56,222,233
  367. movups -40(%rsi),%xmm1
  368. cmpl 40(%rsp),%ecx
  369. .byte 102,15,56,222,208
  370. .byte 102,15,56,222,216
  371. .byte 102,15,56,222,224
  372. cmovgeq %rbp,%r10
  373. cmovgq %rbp,%r14
  374. .byte 102,15,56,222,232
  375. movups -24(%rsi),%xmm0
  376. cmpl 44(%rsp),%ecx
  377. .byte 102,15,56,222,209
  378. .byte 102,15,56,222,217
  379. .byte 102,15,56,222,225
  380. cmovgeq %rbp,%r11
  381. cmovgq %rbp,%r15
  382. .byte 102,15,56,222,233
  383. movups -8(%rsi),%xmm1
  384. movdqa %xmm10,%xmm11
  385. .byte 102,15,56,222,208
  386. prefetcht0 15(%r12,%rbx,1)
  387. prefetcht0 15(%r13,%rbx,1)
  388. .byte 102,15,56,222,216
  389. prefetcht0 15(%r14,%rbx,1)
  390. prefetcht0 15(%r15,%rbx,1)
  391. .byte 102,15,56,222,224
  392. .byte 102,15,56,222,232
  393. movups 128-120(%rsi),%xmm0
  394. pxor %xmm12,%xmm12
  395. .byte 102,15,56,222,209
  396. pcmpgtd %xmm12,%xmm11
  397. movdqu -120(%rsi),%xmm12
  398. .byte 102,15,56,222,217
  399. paddd %xmm11,%xmm10
  400. movdqa %xmm10,32(%rsp)
  401. .byte 102,15,56,222,225
  402. .byte 102,15,56,222,233
  403. movups 144-120(%rsi),%xmm1
  404. cmpl $11,%eax
  405. .byte 102,15,56,222,208
  406. .byte 102,15,56,222,216
  407. .byte 102,15,56,222,224
  408. .byte 102,15,56,222,232
  409. movups 160-120(%rsi),%xmm0
  410. jb .Ldec4x_tail
  411. .byte 102,15,56,222,209
  412. .byte 102,15,56,222,217
  413. .byte 102,15,56,222,225
  414. .byte 102,15,56,222,233
  415. movups 176-120(%rsi),%xmm1
  416. .byte 102,15,56,222,208
  417. .byte 102,15,56,222,216
  418. .byte 102,15,56,222,224
  419. .byte 102,15,56,222,232
  420. movups 192-120(%rsi),%xmm0
  421. je .Ldec4x_tail
  422. .byte 102,15,56,222,209
  423. .byte 102,15,56,222,217
  424. .byte 102,15,56,222,225
  425. .byte 102,15,56,222,233
  426. movups 208-120(%rsi),%xmm1
  427. .byte 102,15,56,222,208
  428. .byte 102,15,56,222,216
  429. .byte 102,15,56,222,224
  430. .byte 102,15,56,222,232
  431. movups 224-120(%rsi),%xmm0
  432. jmp .Ldec4x_tail
  433. .align 32
  434. .Ldec4x_tail:
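# Last round: the chaining blocks in %xmm6-%xmm9 are pre-XORed with the final
# round key, so aesdeclast lands directly on the plaintext, which is stored 16
# bytes behind the advanced output pointers while the just-consumed ciphertext
# becomes the next chaining value.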
  435. .byte 102,15,56,222,209
  436. .byte 102,15,56,222,217
  437. .byte 102,15,56,222,225
  438. pxor %xmm0,%xmm6
  439. pxor %xmm0,%xmm7
  440. .byte 102,15,56,222,233
  441. movdqu 16-120(%rsi),%xmm1
  442. pxor %xmm0,%xmm8
  443. pxor %xmm0,%xmm9
  444. movdqu 32-120(%rsi),%xmm0
  445. .byte 102,15,56,223,214
  446. .byte 102,15,56,223,223
  447. movdqu -16(%r8,%rbx,1),%xmm6
  448. movdqu -16(%r9,%rbx,1),%xmm7
  449. .byte 102,65,15,56,223,224
  450. .byte 102,65,15,56,223,233
  451. movdqu -16(%r10,%rbx,1),%xmm8
  452. movdqu -16(%r11,%rbx,1),%xmm9
  453. movups %xmm2,-16(%r12,%rbx,1)
  454. movdqu (%r8,%rbx,1),%xmm2
  455. movups %xmm3,-16(%r13,%rbx,1)
  456. movdqu (%r9,%rbx,1),%xmm3
  457. pxor %xmm12,%xmm2
  458. movups %xmm4,-16(%r14,%rbx,1)
  459. movdqu (%r10,%rbx,1),%xmm4
  460. pxor %xmm12,%xmm3
  461. movups %xmm5,-16(%r15,%rbx,1)
  462. movdqu (%r11,%rbx,1),%xmm5
  463. pxor %xmm12,%xmm4
  464. pxor %xmm12,%xmm5
  465. decl %edx
  466. jnz .Loop_dec4x
  467. movq 16(%rsp),%rax
  468. .cfi_def_cfa %rax,8
  469. movl 24(%rsp),%edx
  470. leaq 160(%rdi),%rdi
  471. decl %edx
  472. jnz .Ldec4x_loop_grande
  473. .Ldec4x_done:
  474. movq -48(%rax),%r15
  475. .cfi_restore %r15
  476. movq -40(%rax),%r14
  477. .cfi_restore %r14
  478. movq -32(%rax),%r13
  479. .cfi_restore %r13
  480. movq -24(%rax),%r12
  481. .cfi_restore %r12
  482. movq -16(%rax),%rbp
  483. .cfi_restore %rbp
  484. movq -8(%rax),%rbx
  485. .cfi_restore %rbx
  486. leaq (%rax),%rsp
  487. .cfi_def_cfa_register %rsp
  488. .Ldec4x_epilogue:
  489. .byte 0xf3,0xc3
  490. .cfi_endproc
  491. .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
  492. .type aesni_multi_cbc_encrypt_avx,@function
  493. .align 32
  494. aesni_multi_cbc_encrypt_avx:
  495. .cfi_startproc
  496. _avx_cbc_enc_shortcut:
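# 8-lane AVX variant, reached from aesni_multi_cbc_encrypt when AVX is
# available. Eight 40-byte descriptors are scanned; the CBC states live in
# %xmm2-%xmm9, round key 0 in %xmm15, and the (output - input) delta of each
# lane is cached at 64(%rsp)+8*lane so a single register per lane can address
# both streams.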
  497. movq %rsp,%rax
  498. .cfi_def_cfa_register %rax
  499. pushq %rbx
  500. .cfi_offset %rbx,-16
  501. pushq %rbp
  502. .cfi_offset %rbp,-24
  503. pushq %r12
  504. .cfi_offset %r12,-32
  505. pushq %r13
  506. .cfi_offset %r13,-40
  507. pushq %r14
  508. .cfi_offset %r14,-48
  509. pushq %r15
  510. .cfi_offset %r15,-56
  511. subq $192,%rsp
  512. andq $-128,%rsp
  513. movq %rax,16(%rsp)
  514. .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
  515. .Lenc8x_body:
  516. vzeroupper
  517. vmovdqu (%rsi),%xmm15
  518. leaq 120(%rsi),%rsi
  519. leaq 160(%rdi),%rdi
  520. shrl $1,%edx
  521. .Lenc8x_loop_grande:
  522. xorl %edx,%edx
  523. movl -144(%rdi),%ecx
  524. movq -160(%rdi),%r8
  525. cmpl %edx,%ecx
  526. movq -152(%rdi),%rbx
  527. cmovgl %ecx,%edx
  528. testl %ecx,%ecx
  529. vmovdqu -136(%rdi),%xmm2
  530. movl %ecx,32(%rsp)
  531. cmovleq %rsp,%r8
  532. subq %r8,%rbx
  533. movq %rbx,64(%rsp)
  534. movl -104(%rdi),%ecx
  535. movq -120(%rdi),%r9
  536. cmpl %edx,%ecx
  537. movq -112(%rdi),%rbp
  538. cmovgl %ecx,%edx
  539. testl %ecx,%ecx
  540. vmovdqu -96(%rdi),%xmm3
  541. movl %ecx,36(%rsp)
  542. cmovleq %rsp,%r9
  543. subq %r9,%rbp
  544. movq %rbp,72(%rsp)
  545. movl -64(%rdi),%ecx
  546. movq -80(%rdi),%r10
  547. cmpl %edx,%ecx
  548. movq -72(%rdi),%rbp
  549. cmovgl %ecx,%edx
  550. testl %ecx,%ecx
  551. vmovdqu -56(%rdi),%xmm4
  552. movl %ecx,40(%rsp)
  553. cmovleq %rsp,%r10
  554. subq %r10,%rbp
  555. movq %rbp,80(%rsp)
  556. movl -24(%rdi),%ecx
  557. movq -40(%rdi),%r11
  558. cmpl %edx,%ecx
  559. movq -32(%rdi),%rbp
  560. cmovgl %ecx,%edx
  561. testl %ecx,%ecx
  562. vmovdqu -16(%rdi),%xmm5
  563. movl %ecx,44(%rsp)
  564. cmovleq %rsp,%r11
  565. subq %r11,%rbp
  566. movq %rbp,88(%rsp)
  567. movl 16(%rdi),%ecx
  568. movq 0(%rdi),%r12
  569. cmpl %edx,%ecx
  570. movq 8(%rdi),%rbp
  571. cmovgl %ecx,%edx
  572. testl %ecx,%ecx
  573. vmovdqu 24(%rdi),%xmm6
  574. movl %ecx,48(%rsp)
  575. cmovleq %rsp,%r12
  576. subq %r12,%rbp
  577. movq %rbp,96(%rsp)
  578. movl 56(%rdi),%ecx
  579. movq 40(%rdi),%r13
  580. cmpl %edx,%ecx
  581. movq 48(%rdi),%rbp
  582. cmovgl %ecx,%edx
  583. testl %ecx,%ecx
  584. vmovdqu 64(%rdi),%xmm7
  585. movl %ecx,52(%rsp)
  586. cmovleq %rsp,%r13
  587. subq %r13,%rbp
  588. movq %rbp,104(%rsp)
  589. movl 96(%rdi),%ecx
  590. movq 80(%rdi),%r14
  591. cmpl %edx,%ecx
  592. movq 88(%rdi),%rbp
  593. cmovgl %ecx,%edx
  594. testl %ecx,%ecx
  595. vmovdqu 104(%rdi),%xmm8
  596. movl %ecx,56(%rsp)
  597. cmovleq %rsp,%r14
  598. subq %r14,%rbp
  599. movq %rbp,112(%rsp)
  600. movl 136(%rdi),%ecx
  601. movq 120(%rdi),%r15
  602. cmpl %edx,%ecx
  603. movq 128(%rdi),%rbp
  604. cmovgl %ecx,%edx
  605. testl %ecx,%ecx
  606. vmovdqu 144(%rdi),%xmm9
  607. movl %ecx,60(%rsp)
  608. cmovleq %rsp,%r15
  609. subq %r15,%rbp
  610. movq %rbp,120(%rsp)
  611. testl %edx,%edx
  612. jz .Lenc8x_done
  613. vmovups 16-120(%rsi),%xmm1
  614. vmovups 32-120(%rsi),%xmm0
  615. movl 240-120(%rsi),%eax
  616. vpxor (%r8),%xmm15,%xmm10
  617. leaq 128(%rsp),%rbp
  618. vpxor (%r9),%xmm15,%xmm11
  619. vpxor (%r10),%xmm15,%xmm12
  620. vpxor (%r11),%xmm15,%xmm13
  621. vpxor %xmm10,%xmm2,%xmm2
  622. vpxor (%r12),%xmm15,%xmm10
  623. vpxor %xmm11,%xmm3,%xmm3
  624. vpxor (%r13),%xmm15,%xmm11
  625. vpxor %xmm12,%xmm4,%xmm4
  626. vpxor (%r14),%xmm15,%xmm12
  627. vpxor %xmm13,%xmm5,%xmm5
  628. vpxor (%r15),%xmm15,%xmm13
  629. vpxor %xmm10,%xmm6,%xmm6
  630. movl $1,%ecx
  631. vpxor %xmm11,%xmm7,%xmm7
  632. vpxor %xmm12,%xmm8,%xmm8
  633. vpxor %xmm13,%xmm9,%xmm9
  634. jmp .Loop_enc8x
  635. .align 32
  636. .Loop_enc8x:
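# Per lane and per round-key pair: compare the block counter, prefetch, park
# exhausted lanes on %rsp, and XOR the next plaintext block with round key 0
# into %xmm10-%xmm13 (the first four lanes are parked at (%rbp), the last four
# stay in registers) so it can be folded in after the final round.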
  637. vaesenc %xmm1,%xmm2,%xmm2
  638. cmpl 32+0(%rsp),%ecx
  639. vaesenc %xmm1,%xmm3,%xmm3
  640. prefetcht0 31(%r8)
  641. vaesenc %xmm1,%xmm4,%xmm4
  642. vaesenc %xmm1,%xmm5,%xmm5
  643. leaq (%r8,%rbx,1),%rbx
  644. cmovgeq %rsp,%r8
  645. vaesenc %xmm1,%xmm6,%xmm6
  646. cmovgq %rsp,%rbx
  647. vaesenc %xmm1,%xmm7,%xmm7
  648. subq %r8,%rbx
  649. vaesenc %xmm1,%xmm8,%xmm8
  650. vpxor 16(%r8),%xmm15,%xmm10
  651. movq %rbx,64+0(%rsp)
  652. vaesenc %xmm1,%xmm9,%xmm9
  653. vmovups -72(%rsi),%xmm1
  654. leaq 16(%r8,%rbx,1),%r8
  655. vmovdqu %xmm10,0(%rbp)
  656. vaesenc %xmm0,%xmm2,%xmm2
  657. cmpl 32+4(%rsp),%ecx
  658. movq 64+8(%rsp),%rbx
  659. vaesenc %xmm0,%xmm3,%xmm3
  660. prefetcht0 31(%r9)
  661. vaesenc %xmm0,%xmm4,%xmm4
  662. vaesenc %xmm0,%xmm5,%xmm5
  663. leaq (%r9,%rbx,1),%rbx
  664. cmovgeq %rsp,%r9
  665. vaesenc %xmm0,%xmm6,%xmm6
  666. cmovgq %rsp,%rbx
  667. vaesenc %xmm0,%xmm7,%xmm7
  668. subq %r9,%rbx
  669. vaesenc %xmm0,%xmm8,%xmm8
  670. vpxor 16(%r9),%xmm15,%xmm11
  671. movq %rbx,64+8(%rsp)
  672. vaesenc %xmm0,%xmm9,%xmm9
  673. vmovups -56(%rsi),%xmm0
  674. leaq 16(%r9,%rbx,1),%r9
  675. vmovdqu %xmm11,16(%rbp)
  676. vaesenc %xmm1,%xmm2,%xmm2
  677. cmpl 32+8(%rsp),%ecx
  678. movq 64+16(%rsp),%rbx
  679. vaesenc %xmm1,%xmm3,%xmm3
  680. prefetcht0 31(%r10)
  681. vaesenc %xmm1,%xmm4,%xmm4
  682. prefetcht0 15(%r8)
  683. vaesenc %xmm1,%xmm5,%xmm5
  684. leaq (%r10,%rbx,1),%rbx
  685. cmovgeq %rsp,%r10
  686. vaesenc %xmm1,%xmm6,%xmm6
  687. cmovgq %rsp,%rbx
  688. vaesenc %xmm1,%xmm7,%xmm7
  689. subq %r10,%rbx
  690. vaesenc %xmm1,%xmm8,%xmm8
  691. vpxor 16(%r10),%xmm15,%xmm12
  692. movq %rbx,64+16(%rsp)
  693. vaesenc %xmm1,%xmm9,%xmm9
  694. vmovups -40(%rsi),%xmm1
  695. leaq 16(%r10,%rbx,1),%r10
  696. vmovdqu %xmm12,32(%rbp)
  697. vaesenc %xmm0,%xmm2,%xmm2
  698. cmpl 32+12(%rsp),%ecx
  699. movq 64+24(%rsp),%rbx
  700. vaesenc %xmm0,%xmm3,%xmm3
  701. prefetcht0 31(%r11)
  702. vaesenc %xmm0,%xmm4,%xmm4
  703. prefetcht0 15(%r9)
  704. vaesenc %xmm0,%xmm5,%xmm5
  705. leaq (%r11,%rbx,1),%rbx
  706. cmovgeq %rsp,%r11
  707. vaesenc %xmm0,%xmm6,%xmm6
  708. cmovgq %rsp,%rbx
  709. vaesenc %xmm0,%xmm7,%xmm7
  710. subq %r11,%rbx
  711. vaesenc %xmm0,%xmm8,%xmm8
  712. vpxor 16(%r11),%xmm15,%xmm13
  713. movq %rbx,64+24(%rsp)
  714. vaesenc %xmm0,%xmm9,%xmm9
  715. vmovups -24(%rsi),%xmm0
  716. leaq 16(%r11,%rbx,1),%r11
  717. vmovdqu %xmm13,48(%rbp)
  718. vaesenc %xmm1,%xmm2,%xmm2
  719. cmpl 32+16(%rsp),%ecx
  720. movq 64+32(%rsp),%rbx
  721. vaesenc %xmm1,%xmm3,%xmm3
  722. prefetcht0 31(%r12)
  723. vaesenc %xmm1,%xmm4,%xmm4
  724. prefetcht0 15(%r10)
  725. vaesenc %xmm1,%xmm5,%xmm5
  726. leaq (%r12,%rbx,1),%rbx
  727. cmovgeq %rsp,%r12
  728. vaesenc %xmm1,%xmm6,%xmm6
  729. cmovgq %rsp,%rbx
  730. vaesenc %xmm1,%xmm7,%xmm7
  731. subq %r12,%rbx
  732. vaesenc %xmm1,%xmm8,%xmm8
  733. vpxor 16(%r12),%xmm15,%xmm10
  734. movq %rbx,64+32(%rsp)
  735. vaesenc %xmm1,%xmm9,%xmm9
  736. vmovups -8(%rsi),%xmm1
  737. leaq 16(%r12,%rbx,1),%r12
  738. vaesenc %xmm0,%xmm2,%xmm2
  739. cmpl 32+20(%rsp),%ecx
  740. movq 64+40(%rsp),%rbx
  741. vaesenc %xmm0,%xmm3,%xmm3
  742. prefetcht0 31(%r13)
  743. vaesenc %xmm0,%xmm4,%xmm4
  744. prefetcht0 15(%r11)
  745. vaesenc %xmm0,%xmm5,%xmm5
  746. leaq (%rbx,%r13,1),%rbx
  747. cmovgeq %rsp,%r13
  748. vaesenc %xmm0,%xmm6,%xmm6
  749. cmovgq %rsp,%rbx
  750. vaesenc %xmm0,%xmm7,%xmm7
  751. subq %r13,%rbx
  752. vaesenc %xmm0,%xmm8,%xmm8
  753. vpxor 16(%r13),%xmm15,%xmm11
  754. movq %rbx,64+40(%rsp)
  755. vaesenc %xmm0,%xmm9,%xmm9
  756. vmovups 8(%rsi),%xmm0
  757. leaq 16(%r13,%rbx,1),%r13
  758. vaesenc %xmm1,%xmm2,%xmm2
  759. cmpl 32+24(%rsp),%ecx
  760. movq 64+48(%rsp),%rbx
  761. vaesenc %xmm1,%xmm3,%xmm3
  762. prefetcht0 31(%r14)
  763. vaesenc %xmm1,%xmm4,%xmm4
  764. prefetcht0 15(%r12)
  765. vaesenc %xmm1,%xmm5,%xmm5
  766. leaq (%r14,%rbx,1),%rbx
  767. cmovgeq %rsp,%r14
  768. vaesenc %xmm1,%xmm6,%xmm6
  769. cmovgq %rsp,%rbx
  770. vaesenc %xmm1,%xmm7,%xmm7
  771. subq %r14,%rbx
  772. vaesenc %xmm1,%xmm8,%xmm8
  773. vpxor 16(%r14),%xmm15,%xmm12
  774. movq %rbx,64+48(%rsp)
  775. vaesenc %xmm1,%xmm9,%xmm9
  776. vmovups 24(%rsi),%xmm1
  777. leaq 16(%r14,%rbx,1),%r14
  778. vaesenc %xmm0,%xmm2,%xmm2
  779. cmpl 32+28(%rsp),%ecx
  780. movq 64+56(%rsp),%rbx
  781. vaesenc %xmm0,%xmm3,%xmm3
  782. prefetcht0 31(%r15)
  783. vaesenc %xmm0,%xmm4,%xmm4
  784. prefetcht0 15(%r13)
  785. vaesenc %xmm0,%xmm5,%xmm5
  786. leaq (%r15,%rbx,1),%rbx
  787. cmovgeq %rsp,%r15
  788. vaesenc %xmm0,%xmm6,%xmm6
  789. cmovgq %rsp,%rbx
  790. vaesenc %xmm0,%xmm7,%xmm7
  791. subq %r15,%rbx
  792. vaesenc %xmm0,%xmm8,%xmm8
  793. vpxor 16(%r15),%xmm15,%xmm13
  794. movq %rbx,64+56(%rsp)
  795. vaesenc %xmm0,%xmm9,%xmm9
  796. vmovups 40(%rsi),%xmm0
  797. leaq 16(%r15,%rbx,1),%r15
  798. vmovdqu 32(%rsp),%xmm14
  799. prefetcht0 15(%r14)
  800. prefetcht0 15(%r15)
  801. cmpl $11,%eax
  802. jb .Lenc8x_tail
  803. vaesenc %xmm1,%xmm2,%xmm2
  804. vaesenc %xmm1,%xmm3,%xmm3
  805. vaesenc %xmm1,%xmm4,%xmm4
  806. vaesenc %xmm1,%xmm5,%xmm5
  807. vaesenc %xmm1,%xmm6,%xmm6
  808. vaesenc %xmm1,%xmm7,%xmm7
  809. vaesenc %xmm1,%xmm8,%xmm8
  810. vaesenc %xmm1,%xmm9,%xmm9
  811. vmovups 176-120(%rsi),%xmm1
  812. vaesenc %xmm0,%xmm2,%xmm2
  813. vaesenc %xmm0,%xmm3,%xmm3
  814. vaesenc %xmm0,%xmm4,%xmm4
  815. vaesenc %xmm0,%xmm5,%xmm5
  816. vaesenc %xmm0,%xmm6,%xmm6
  817. vaesenc %xmm0,%xmm7,%xmm7
  818. vaesenc %xmm0,%xmm8,%xmm8
  819. vaesenc %xmm0,%xmm9,%xmm9
  820. vmovups 192-120(%rsi),%xmm0
  821. je .Lenc8x_tail
  822. vaesenc %xmm1,%xmm2,%xmm2
  823. vaesenc %xmm1,%xmm3,%xmm3
  824. vaesenc %xmm1,%xmm4,%xmm4
  825. vaesenc %xmm1,%xmm5,%xmm5
  826. vaesenc %xmm1,%xmm6,%xmm6
  827. vaesenc %xmm1,%xmm7,%xmm7
  828. vaesenc %xmm1,%xmm8,%xmm8
  829. vaesenc %xmm1,%xmm9,%xmm9
  830. vmovups 208-120(%rsi),%xmm1
  831. vaesenc %xmm0,%xmm2,%xmm2
  832. vaesenc %xmm0,%xmm3,%xmm3
  833. vaesenc %xmm0,%xmm4,%xmm4
  834. vaesenc %xmm0,%xmm5,%xmm5
  835. vaesenc %xmm0,%xmm6,%xmm6
  836. vaesenc %xmm0,%xmm7,%xmm7
  837. vaesenc %xmm0,%xmm8,%xmm8
  838. vaesenc %xmm0,%xmm9,%xmm9
  839. vmovups 224-120(%rsi),%xmm0
  840. .Lenc8x_tail:
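# Final rounds: aesenclast, store each ciphertext block at output-16, flip the
# lane registers back to input addressing via the cached deltas, and XOR in
# the staged next-block material to start the following block.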
  841. vaesenc %xmm1,%xmm2,%xmm2
  842. vpxor %xmm15,%xmm15,%xmm15
  843. vaesenc %xmm1,%xmm3,%xmm3
  844. vaesenc %xmm1,%xmm4,%xmm4
  845. vpcmpgtd %xmm15,%xmm14,%xmm15
  846. vaesenc %xmm1,%xmm5,%xmm5
  847. vaesenc %xmm1,%xmm6,%xmm6
  848. vpaddd %xmm14,%xmm15,%xmm15
  849. vmovdqu 48(%rsp),%xmm14
  850. vaesenc %xmm1,%xmm7,%xmm7
  851. movq 64(%rsp),%rbx
  852. vaesenc %xmm1,%xmm8,%xmm8
  853. vaesenc %xmm1,%xmm9,%xmm9
  854. vmovups 16-120(%rsi),%xmm1
  855. vaesenclast %xmm0,%xmm2,%xmm2
  856. vmovdqa %xmm15,32(%rsp)
  857. vpxor %xmm15,%xmm15,%xmm15
  858. vaesenclast %xmm0,%xmm3,%xmm3
  859. vaesenclast %xmm0,%xmm4,%xmm4
  860. vpcmpgtd %xmm15,%xmm14,%xmm15
  861. vaesenclast %xmm0,%xmm5,%xmm5
  862. vaesenclast %xmm0,%xmm6,%xmm6
  863. vpaddd %xmm15,%xmm14,%xmm14
  864. vmovdqu -120(%rsi),%xmm15
  865. vaesenclast %xmm0,%xmm7,%xmm7
  866. vaesenclast %xmm0,%xmm8,%xmm8
  867. vmovdqa %xmm14,48(%rsp)
  868. vaesenclast %xmm0,%xmm9,%xmm9
  869. vmovups 32-120(%rsi),%xmm0
  870. vmovups %xmm2,-16(%r8)
  871. subq %rbx,%r8
  872. vpxor 0(%rbp),%xmm2,%xmm2
  873. vmovups %xmm3,-16(%r9)
  874. subq 72(%rsp),%r9
  875. vpxor 16(%rbp),%xmm3,%xmm3
  876. vmovups %xmm4,-16(%r10)
  877. subq 80(%rsp),%r10
  878. vpxor 32(%rbp),%xmm4,%xmm4
  879. vmovups %xmm5,-16(%r11)
  880. subq 88(%rsp),%r11
  881. vpxor 48(%rbp),%xmm5,%xmm5
  882. vmovups %xmm6,-16(%r12)
  883. subq 96(%rsp),%r12
  884. vpxor %xmm10,%xmm6,%xmm6
  885. vmovups %xmm7,-16(%r13)
  886. subq 104(%rsp),%r13
  887. vpxor %xmm11,%xmm7,%xmm7
  888. vmovups %xmm8,-16(%r14)
  889. subq 112(%rsp),%r14
  890. vpxor %xmm12,%xmm8,%xmm8
  891. vmovups %xmm9,-16(%r15)
  892. subq 120(%rsp),%r15
  893. vpxor %xmm13,%xmm9,%xmm9
  894. decl %edx
  895. jnz .Loop_enc8x
  896. movq 16(%rsp),%rax
  897. .cfi_def_cfa %rax,8
  898. .Lenc8x_done:
  899. vzeroupper
  900. movq -48(%rax),%r15
  901. .cfi_restore %r15
  902. movq -40(%rax),%r14
  903. .cfi_restore %r14
  904. movq -32(%rax),%r13
  905. .cfi_restore %r13
  906. movq -24(%rax),%r12
  907. .cfi_restore %r12
  908. movq -16(%rax),%rbp
  909. .cfi_restore %rbp
  910. movq -8(%rax),%rbx
  911. .cfi_restore %rbx
  912. leaq (%rax),%rsp
  913. .cfi_def_cfa_register %rsp
  914. .Lenc8x_epilogue:
  915. .byte 0xf3,0xc3
  916. .cfi_endproc
  917. .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
  918. .type aesni_multi_cbc_decrypt_avx,@function
  919. .align 32
  920. aesni_multi_cbc_decrypt_avx:
  921. .cfi_startproc
  922. _avx_cbc_dec_shortcut:
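# 8-lane AVX decrypt. Setup parallels the encrypt variant, but the initial
# IVs/ciphertexts are also copied into a 256-byte double buffer at 192(%rsp)
# whose halves are toggled with xorq $0x80,%rbp, keeping each lane's previous
# ciphertext available to XOR against after decryption.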
  923. movq %rsp,%rax
  924. .cfi_def_cfa_register %rax
  925. pushq %rbx
  926. .cfi_offset %rbx,-16
  927. pushq %rbp
  928. .cfi_offset %rbp,-24
  929. pushq %r12
  930. .cfi_offset %r12,-32
  931. pushq %r13
  932. .cfi_offset %r13,-40
  933. pushq %r14
  934. .cfi_offset %r14,-48
  935. pushq %r15
  936. .cfi_offset %r15,-56
  937. subq $256,%rsp
  938. andq $-256,%rsp
  939. subq $192,%rsp
  940. movq %rax,16(%rsp)
  941. .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
  942. .Ldec8x_body:
  943. vzeroupper
  944. vmovdqu (%rsi),%xmm15
  945. leaq 120(%rsi),%rsi
  946. leaq 160(%rdi),%rdi
  947. shrl $1,%edx
  948. .Ldec8x_loop_grande:
  949. xorl %edx,%edx
  950. movl -144(%rdi),%ecx
  951. movq -160(%rdi),%r8
  952. cmpl %edx,%ecx
  953. movq -152(%rdi),%rbx
  954. cmovgl %ecx,%edx
  955. testl %ecx,%ecx
  956. vmovdqu -136(%rdi),%xmm2
  957. movl %ecx,32(%rsp)
  958. cmovleq %rsp,%r8
  959. subq %r8,%rbx
  960. movq %rbx,64(%rsp)
  961. vmovdqu %xmm2,192(%rsp)
  962. movl -104(%rdi),%ecx
  963. movq -120(%rdi),%r9
  964. cmpl %edx,%ecx
  965. movq -112(%rdi),%rbp
  966. cmovgl %ecx,%edx
  967. testl %ecx,%ecx
  968. vmovdqu -96(%rdi),%xmm3
  969. movl %ecx,36(%rsp)
  970. cmovleq %rsp,%r9
  971. subq %r9,%rbp
  972. movq %rbp,72(%rsp)
  973. vmovdqu %xmm3,208(%rsp)
  974. movl -64(%rdi),%ecx
  975. movq -80(%rdi),%r10
  976. cmpl %edx,%ecx
  977. movq -72(%rdi),%rbp
  978. cmovgl %ecx,%edx
  979. testl %ecx,%ecx
  980. vmovdqu -56(%rdi),%xmm4
  981. movl %ecx,40(%rsp)
  982. cmovleq %rsp,%r10
  983. subq %r10,%rbp
  984. movq %rbp,80(%rsp)
  985. vmovdqu %xmm4,224(%rsp)
  986. movl -24(%rdi),%ecx
  987. movq -40(%rdi),%r11
  988. cmpl %edx,%ecx
  989. movq -32(%rdi),%rbp
  990. cmovgl %ecx,%edx
  991. testl %ecx,%ecx
  992. vmovdqu -16(%rdi),%xmm5
  993. movl %ecx,44(%rsp)
  994. cmovleq %rsp,%r11
  995. subq %r11,%rbp
  996. movq %rbp,88(%rsp)
  997. vmovdqu %xmm5,240(%rsp)
  998. movl 16(%rdi),%ecx
  999. movq 0(%rdi),%r12
  1000. cmpl %edx,%ecx
  1001. movq 8(%rdi),%rbp
  1002. cmovgl %ecx,%edx
  1003. testl %ecx,%ecx
  1004. vmovdqu 24(%rdi),%xmm6
  1005. movl %ecx,48(%rsp)
  1006. cmovleq %rsp,%r12
  1007. subq %r12,%rbp
  1008. movq %rbp,96(%rsp)
  1009. vmovdqu %xmm6,256(%rsp)
  1010. movl 56(%rdi),%ecx
  1011. movq 40(%rdi),%r13
  1012. cmpl %edx,%ecx
  1013. movq 48(%rdi),%rbp
  1014. cmovgl %ecx,%edx
  1015. testl %ecx,%ecx
  1016. vmovdqu 64(%rdi),%xmm7
  1017. movl %ecx,52(%rsp)
  1018. cmovleq %rsp,%r13
  1019. subq %r13,%rbp
  1020. movq %rbp,104(%rsp)
  1021. vmovdqu %xmm7,272(%rsp)
  1022. movl 96(%rdi),%ecx
  1023. movq 80(%rdi),%r14
  1024. cmpl %edx,%ecx
  1025. movq 88(%rdi),%rbp
  1026. cmovgl %ecx,%edx
  1027. testl %ecx,%ecx
  1028. vmovdqu 104(%rdi),%xmm8
  1029. movl %ecx,56(%rsp)
  1030. cmovleq %rsp,%r14
  1031. subq %r14,%rbp
  1032. movq %rbp,112(%rsp)
  1033. vmovdqu %xmm8,288(%rsp)
  1034. movl 136(%rdi),%ecx
  1035. movq 120(%rdi),%r15
  1036. cmpl %edx,%ecx
  1037. movq 128(%rdi),%rbp
  1038. cmovgl %ecx,%edx
  1039. testl %ecx,%ecx
  1040. vmovdqu 144(%rdi),%xmm9
  1041. movl %ecx,60(%rsp)
  1042. cmovleq %rsp,%r15
  1043. subq %r15,%rbp
  1044. movq %rbp,120(%rsp)
  1045. vmovdqu %xmm9,304(%rsp)
  1046. testl %edx,%edx
  1047. jz .Ldec8x_done
  1048. vmovups 16-120(%rsi),%xmm1
  1049. vmovups 32-120(%rsi),%xmm0
  1050. movl 240-120(%rsi),%eax
  1051. leaq 192+128(%rsp),%rbp
  1052. vmovdqu (%r8),%xmm2
  1053. vmovdqu (%r9),%xmm3
  1054. vmovdqu (%r10),%xmm4
  1055. vmovdqu (%r11),%xmm5
  1056. vmovdqu (%r12),%xmm6
  1057. vmovdqu (%r13),%xmm7
  1058. vmovdqu (%r14),%xmm8
  1059. vmovdqu (%r15),%xmm9
  1060. vmovdqu %xmm2,0(%rbp)
  1061. vpxor %xmm15,%xmm2,%xmm2
  1062. vmovdqu %xmm3,16(%rbp)
  1063. vpxor %xmm15,%xmm3,%xmm3
  1064. vmovdqu %xmm4,32(%rbp)
  1065. vpxor %xmm15,%xmm4,%xmm4
  1066. vmovdqu %xmm5,48(%rbp)
  1067. vpxor %xmm15,%xmm5,%xmm5
  1068. vmovdqu %xmm6,64(%rbp)
  1069. vpxor %xmm15,%xmm6,%xmm6
  1070. vmovdqu %xmm7,80(%rbp)
  1071. vpxor %xmm15,%xmm7,%xmm7
  1072. vmovdqu %xmm8,96(%rbp)
  1073. vpxor %xmm15,%xmm8,%xmm8
  1074. vmovdqu %xmm9,112(%rbp)
  1075. vpxor %xmm15,%xmm9,%xmm9
  1076. xorq $0x80,%rbp
  1077. movl $1,%ecx
  1078. jmp .Loop_dec8x
  1079. .align 32
  1080. .Loop_dec8x:
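# Same interleaving scheme as .Loop_enc8x, except the next ciphertext block of
# each lane is saved (staged at 128(%rsp) or kept in %xmm10-%xmm13) instead of
# XORed, since it becomes that lane's next chaining value.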
  1081. vaesdec %xmm1,%xmm2,%xmm2
  1082. cmpl 32+0(%rsp),%ecx
  1083. vaesdec %xmm1,%xmm3,%xmm3
  1084. prefetcht0 31(%r8)
  1085. vaesdec %xmm1,%xmm4,%xmm4
  1086. vaesdec %xmm1,%xmm5,%xmm5
  1087. leaq (%r8,%rbx,1),%rbx
  1088. cmovgeq %rsp,%r8
  1089. vaesdec %xmm1,%xmm6,%xmm6
  1090. cmovgq %rsp,%rbx
  1091. vaesdec %xmm1,%xmm7,%xmm7
  1092. subq %r8,%rbx
  1093. vaesdec %xmm1,%xmm8,%xmm8
  1094. vmovdqu 16(%r8),%xmm10
  1095. movq %rbx,64+0(%rsp)
  1096. vaesdec %xmm1,%xmm9,%xmm9
  1097. vmovups -72(%rsi),%xmm1
  1098. leaq 16(%r8,%rbx,1),%r8
  1099. vmovdqu %xmm10,128(%rsp)
  1100. vaesdec %xmm0,%xmm2,%xmm2
  1101. cmpl 32+4(%rsp),%ecx
  1102. movq 64+8(%rsp),%rbx
  1103. vaesdec %xmm0,%xmm3,%xmm3
  1104. prefetcht0 31(%r9)
  1105. vaesdec %xmm0,%xmm4,%xmm4
  1106. vaesdec %xmm0,%xmm5,%xmm5
  1107. leaq (%r9,%rbx,1),%rbx
  1108. cmovgeq %rsp,%r9
  1109. vaesdec %xmm0,%xmm6,%xmm6
  1110. cmovgq %rsp,%rbx
  1111. vaesdec %xmm0,%xmm7,%xmm7
  1112. subq %r9,%rbx
  1113. vaesdec %xmm0,%xmm8,%xmm8
  1114. vmovdqu 16(%r9),%xmm11
  1115. movq %rbx,64+8(%rsp)
  1116. vaesdec %xmm0,%xmm9,%xmm9
  1117. vmovups -56(%rsi),%xmm0
  1118. leaq 16(%r9,%rbx,1),%r9
  1119. vmovdqu %xmm11,144(%rsp)
  1120. vaesdec %xmm1,%xmm2,%xmm2
  1121. cmpl 32+8(%rsp),%ecx
  1122. movq 64+16(%rsp),%rbx
  1123. vaesdec %xmm1,%xmm3,%xmm3
  1124. prefetcht0 31(%r10)
  1125. vaesdec %xmm1,%xmm4,%xmm4
  1126. prefetcht0 15(%r8)
  1127. vaesdec %xmm1,%xmm5,%xmm5
  1128. leaq (%r10,%rbx,1),%rbx
  1129. cmovgeq %rsp,%r10
  1130. vaesdec %xmm1,%xmm6,%xmm6
  1131. cmovgq %rsp,%rbx
  1132. vaesdec %xmm1,%xmm7,%xmm7
  1133. subq %r10,%rbx
  1134. vaesdec %xmm1,%xmm8,%xmm8
  1135. vmovdqu 16(%r10),%xmm12
  1136. movq %rbx,64+16(%rsp)
  1137. vaesdec %xmm1,%xmm9,%xmm9
  1138. vmovups -40(%rsi),%xmm1
  1139. leaq 16(%r10,%rbx,1),%r10
  1140. vmovdqu %xmm12,160(%rsp)
  1141. vaesdec %xmm0,%xmm2,%xmm2
  1142. cmpl 32+12(%rsp),%ecx
  1143. movq 64+24(%rsp),%rbx
  1144. vaesdec %xmm0,%xmm3,%xmm3
  1145. prefetcht0 31(%r11)
  1146. vaesdec %xmm0,%xmm4,%xmm4
  1147. prefetcht0 15(%r9)
  1148. vaesdec %xmm0,%xmm5,%xmm5
  1149. leaq (%r11,%rbx,1),%rbx
  1150. cmovgeq %rsp,%r11
  1151. vaesdec %xmm0,%xmm6,%xmm6
  1152. cmovgq %rsp,%rbx
  1153. vaesdec %xmm0,%xmm7,%xmm7
  1154. subq %r11,%rbx
  1155. vaesdec %xmm0,%xmm8,%xmm8
  1156. vmovdqu 16(%r11),%xmm13
  1157. movq %rbx,64+24(%rsp)
  1158. vaesdec %xmm0,%xmm9,%xmm9
  1159. vmovups -24(%rsi),%xmm0
  1160. leaq 16(%r11,%rbx,1),%r11
  1161. vmovdqu %xmm13,176(%rsp)
  1162. vaesdec %xmm1,%xmm2,%xmm2
  1163. cmpl 32+16(%rsp),%ecx
  1164. movq 64+32(%rsp),%rbx
  1165. vaesdec %xmm1,%xmm3,%xmm3
  1166. prefetcht0 31(%r12)
  1167. vaesdec %xmm1,%xmm4,%xmm4
  1168. prefetcht0 15(%r10)
  1169. vaesdec %xmm1,%xmm5,%xmm5
  1170. leaq (%r12,%rbx,1),%rbx
  1171. cmovgeq %rsp,%r12
  1172. vaesdec %xmm1,%xmm6,%xmm6
  1173. cmovgq %rsp,%rbx
  1174. vaesdec %xmm1,%xmm7,%xmm7
  1175. subq %r12,%rbx
  1176. vaesdec %xmm1,%xmm8,%xmm8
  1177. vmovdqu 16(%r12),%xmm10
  1178. movq %rbx,64+32(%rsp)
  1179. vaesdec %xmm1,%xmm9,%xmm9
  1180. vmovups -8(%rsi),%xmm1
  1181. leaq 16(%r12,%rbx,1),%r12
  1182. vaesdec %xmm0,%xmm2,%xmm2
  1183. cmpl 32+20(%rsp),%ecx
  1184. movq 64+40(%rsp),%rbx
  1185. vaesdec %xmm0,%xmm3,%xmm3
  1186. prefetcht0 31(%r13)
  1187. vaesdec %xmm0,%xmm4,%xmm4
  1188. prefetcht0 15(%r11)
  1189. vaesdec %xmm0,%xmm5,%xmm5
  1190. leaq (%rbx,%r13,1),%rbx
  1191. cmovgeq %rsp,%r13
  1192. vaesdec %xmm0,%xmm6,%xmm6
  1193. cmovgq %rsp,%rbx
  1194. vaesdec %xmm0,%xmm7,%xmm7
  1195. subq %r13,%rbx
  1196. vaesdec %xmm0,%xmm8,%xmm8
  1197. vmovdqu 16(%r13),%xmm11
  1198. movq %rbx,64+40(%rsp)
  1199. vaesdec %xmm0,%xmm9,%xmm9
  1200. vmovups 8(%rsi),%xmm0
  1201. leaq 16(%r13,%rbx,1),%r13
  1202. vaesdec %xmm1,%xmm2,%xmm2
  1203. cmpl 32+24(%rsp),%ecx
  1204. movq 64+48(%rsp),%rbx
  1205. vaesdec %xmm1,%xmm3,%xmm3
  1206. prefetcht0 31(%r14)
  1207. vaesdec %xmm1,%xmm4,%xmm4
  1208. prefetcht0 15(%r12)
  1209. vaesdec %xmm1,%xmm5,%xmm5
  1210. leaq (%r14,%rbx,1),%rbx
  1211. cmovgeq %rsp,%r14
  1212. vaesdec %xmm1,%xmm6,%xmm6
  1213. cmovgq %rsp,%rbx
  1214. vaesdec %xmm1,%xmm7,%xmm7
  1215. subq %r14,%rbx
  1216. vaesdec %xmm1,%xmm8,%xmm8
  1217. vmovdqu 16(%r14),%xmm12
  1218. movq %rbx,64+48(%rsp)
  1219. vaesdec %xmm1,%xmm9,%xmm9
  1220. vmovups 24(%rsi),%xmm1
  1221. leaq 16(%r14,%rbx,1),%r14
  1222. vaesdec %xmm0,%xmm2,%xmm2
  1223. cmpl 32+28(%rsp),%ecx
  1224. movq 64+56(%rsp),%rbx
  1225. vaesdec %xmm0,%xmm3,%xmm3
  1226. prefetcht0 31(%r15)
  1227. vaesdec %xmm0,%xmm4,%xmm4
  1228. prefetcht0 15(%r13)
  1229. vaesdec %xmm0,%xmm5,%xmm5
  1230. leaq (%r15,%rbx,1),%rbx
  1231. cmovgeq %rsp,%r15
  1232. vaesdec %xmm0,%xmm6,%xmm6
  1233. cmovgq %rsp,%rbx
  1234. vaesdec %xmm0,%xmm7,%xmm7
  1235. subq %r15,%rbx
  1236. vaesdec %xmm0,%xmm8,%xmm8
  1237. vmovdqu 16(%r15),%xmm13
  1238. movq %rbx,64+56(%rsp)
  1239. vaesdec %xmm0,%xmm9,%xmm9
  1240. vmovups 40(%rsi),%xmm0
  1241. leaq 16(%r15,%rbx,1),%r15
  1242. vmovdqu 32(%rsp),%xmm14
  1243. prefetcht0 15(%r14)
  1244. prefetcht0 15(%r15)
  1245. cmpl $11,%eax
  1246. jb .Ldec8x_tail
  1247. vaesdec %xmm1,%xmm2,%xmm2
  1248. vaesdec %xmm1,%xmm3,%xmm3
  1249. vaesdec %xmm1,%xmm4,%xmm4
  1250. vaesdec %xmm1,%xmm5,%xmm5
  1251. vaesdec %xmm1,%xmm6,%xmm6
  1252. vaesdec %xmm1,%xmm7,%xmm7
  1253. vaesdec %xmm1,%xmm8,%xmm8
  1254. vaesdec %xmm1,%xmm9,%xmm9
  1255. vmovups 176-120(%rsi),%xmm1
  1256. vaesdec %xmm0,%xmm2,%xmm2
  1257. vaesdec %xmm0,%xmm3,%xmm3
  1258. vaesdec %xmm0,%xmm4,%xmm4
  1259. vaesdec %xmm0,%xmm5,%xmm5
  1260. vaesdec %xmm0,%xmm6,%xmm6
  1261. vaesdec %xmm0,%xmm7,%xmm7
  1262. vaesdec %xmm0,%xmm8,%xmm8
  1263. vaesdec %xmm0,%xmm9,%xmm9
  1264. vmovups 192-120(%rsi),%xmm0
  1265. je .Ldec8x_tail
  1266. vaesdec %xmm1,%xmm2,%xmm2
  1267. vaesdec %xmm1,%xmm3,%xmm3
  1268. vaesdec %xmm1,%xmm4,%xmm4
  1269. vaesdec %xmm1,%xmm5,%xmm5
  1270. vaesdec %xmm1,%xmm6,%xmm6
  1271. vaesdec %xmm1,%xmm7,%xmm7
  1272. vaesdec %xmm1,%xmm8,%xmm8
  1273. vaesdec %xmm1,%xmm9,%xmm9
  1274. vmovups 208-120(%rsi),%xmm1
  1275. vaesdec %xmm0,%xmm2,%xmm2
  1276. vaesdec %xmm0,%xmm3,%xmm3
  1277. vaesdec %xmm0,%xmm4,%xmm4
  1278. vaesdec %xmm0,%xmm5,%xmm5
  1279. vaesdec %xmm0,%xmm6,%xmm6
  1280. vaesdec %xmm0,%xmm7,%xmm7
  1281. vaesdec %xmm0,%xmm8,%xmm8
  1282. vaesdec %xmm0,%xmm9,%xmm9
  1283. vmovups 224-120(%rsi),%xmm0
  1284. .Ldec8x_tail:
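# Final rounds: aesdeclast, XOR each result with the lane's previous
# ciphertext from (%rbp) to recover the plaintext, store it at output-16, then
# rotate the saved current ciphertexts into the other buffer half for the
# next block.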
  1285. vaesdec %xmm1,%xmm2,%xmm2
  1286. vpxor %xmm15,%xmm15,%xmm15
  1287. vaesdec %xmm1,%xmm3,%xmm3
  1288. vaesdec %xmm1,%xmm4,%xmm4
  1289. vpcmpgtd %xmm15,%xmm14,%xmm15
  1290. vaesdec %xmm1,%xmm5,%xmm5
  1291. vaesdec %xmm1,%xmm6,%xmm6
  1292. vpaddd %xmm14,%xmm15,%xmm15
  1293. vmovdqu 48(%rsp),%xmm14
  1294. vaesdec %xmm1,%xmm7,%xmm7
  1295. movq 64(%rsp),%rbx
  1296. vaesdec %xmm1,%xmm8,%xmm8
  1297. vaesdec %xmm1,%xmm9,%xmm9
  1298. vmovups 16-120(%rsi),%xmm1
  1299. vaesdeclast %xmm0,%xmm2,%xmm2
  1300. vmovdqa %xmm15,32(%rsp)
  1301. vpxor %xmm15,%xmm15,%xmm15
  1302. vaesdeclast %xmm0,%xmm3,%xmm3
  1303. vpxor 0(%rbp),%xmm2,%xmm2
  1304. vaesdeclast %xmm0,%xmm4,%xmm4
  1305. vpxor 16(%rbp),%xmm3,%xmm3
  1306. vpcmpgtd %xmm15,%xmm14,%xmm15
  1307. vaesdeclast %xmm0,%xmm5,%xmm5
  1308. vpxor 32(%rbp),%xmm4,%xmm4
  1309. vaesdeclast %xmm0,%xmm6,%xmm6
  1310. vpxor 48(%rbp),%xmm5,%xmm5
  1311. vpaddd %xmm15,%xmm14,%xmm14
  1312. vmovdqu -120(%rsi),%xmm15
  1313. vaesdeclast %xmm0,%xmm7,%xmm7
  1314. vpxor 64(%rbp),%xmm6,%xmm6
  1315. vaesdeclast %xmm0,%xmm8,%xmm8
  1316. vpxor 80(%rbp),%xmm7,%xmm7
  1317. vmovdqa %xmm14,48(%rsp)
  1318. vaesdeclast %xmm0,%xmm9,%xmm9
  1319. vpxor 96(%rbp),%xmm8,%xmm8
  1320. vmovups 32-120(%rsi),%xmm0
  1321. vmovups %xmm2,-16(%r8)
  1322. subq %rbx,%r8
  1323. vmovdqu 128+0(%rsp),%xmm2
  1324. vpxor 112(%rbp),%xmm9,%xmm9
  1325. vmovups %xmm3,-16(%r9)
  1326. subq 72(%rsp),%r9
  1327. vmovdqu %xmm2,0(%rbp)
  1328. vpxor %xmm15,%xmm2,%xmm2
  1329. vmovdqu 128+16(%rsp),%xmm3
  1330. vmovups %xmm4,-16(%r10)
  1331. subq 80(%rsp),%r10
  1332. vmovdqu %xmm3,16(%rbp)
  1333. vpxor %xmm15,%xmm3,%xmm3
  1334. vmovdqu 128+32(%rsp),%xmm4
  1335. vmovups %xmm5,-16(%r11)
  1336. subq 88(%rsp),%r11
  1337. vmovdqu %xmm4,32(%rbp)
  1338. vpxor %xmm15,%xmm4,%xmm4
  1339. vmovdqu 128+48(%rsp),%xmm5
  1340. vmovups %xmm6,-16(%r12)
  1341. subq 96(%rsp),%r12
  1342. vmovdqu %xmm5,48(%rbp)
  1343. vpxor %xmm15,%xmm5,%xmm5
  1344. vmovdqu %xmm10,64(%rbp)
  1345. vpxor %xmm10,%xmm15,%xmm6
  1346. vmovups %xmm7,-16(%r13)
  1347. subq 104(%rsp),%r13
  1348. vmovdqu %xmm11,80(%rbp)
  1349. vpxor %xmm11,%xmm15,%xmm7
  1350. vmovups %xmm8,-16(%r14)
  1351. subq 112(%rsp),%r14
  1352. vmovdqu %xmm12,96(%rbp)
  1353. vpxor %xmm12,%xmm15,%xmm8
  1354. vmovups %xmm9,-16(%r15)
  1355. subq 120(%rsp),%r15
  1356. vmovdqu %xmm13,112(%rbp)
  1357. vpxor %xmm13,%xmm15,%xmm9
  1358. xorq $128,%rbp
  1359. decl %edx
  1360. jnz .Loop_dec8x
  1361. movq 16(%rsp),%rax
  1362. .cfi_def_cfa %rax,8
  1363. .Ldec8x_done:
  1364. vzeroupper
  1365. movq -48(%rax),%r15
  1366. .cfi_restore %r15
  1367. movq -40(%rax),%r14
  1368. .cfi_restore %r14
  1369. movq -32(%rax),%r13
  1370. .cfi_restore %r13
  1371. movq -24(%rax),%r12
  1372. .cfi_restore %r12
  1373. movq -16(%rax),%rbp
  1374. .cfi_restore %rbp
  1375. movq -8(%rax),%rbx
  1376. .cfi_restore %rbx
  1377. leaq (%rax),%rsp
  1378. .cfi_def_cfa_register %rsp
  1379. .Ldec8x_epilogue:
  1380. .byte 0xf3,0xc3
  1381. .cfi_endproc
  1382. .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx