chacha-x86_64.masm 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547
  ; Assembler setup and the constant tables used by the ChaCha20 routines
  ; that follow. The tables are emitted into the code segment itself.
  1. OPTION DOTNAME
  2. .text$ SEGMENT ALIGN(256) 'CODE'
  ; Read at offset +4 by ChaCha20_ctr32 to pick a code path at run time.
  3. EXTERN OPENSSL_ia32cap_P:NEAR
  4. ALIGN 64
  ; $L$zero: four zero dwords (not referenced in the part of the file
  ; reviewed here).
  5. $L$zero::
  6. DD 0,0,0,0
  ; $L$one: adds 1 to the low dword of a 16-byte row; the routines below use
  ; it to advance the block counter by one.
  7. $L$one::
  8. DD 1,0,0,0
  ; $L$inc: per-lane offsets 0..3, added to the broadcast counter dword on
  ; entry to the 4x path.
  9. $L$inc::
  10. DD 0,1,2,3
  ; $L$four: added to the per-lane counters on each outer iteration of the
  ; 4x path.
  11. $L$four::
  12. DD 4,4,4,4
  ; $L$incy / $L$eight: 8-dword tables, not referenced in the part of the
  ; file reviewed here (presumably for a 256-bit vector path -- confirm).
  13. $L$incy::
  14. DD 0,2,4,6,1,3,5,7
  15. $L$eight::
  16. DD 8,8,8,8,8,8,8,8
  ; $L$rot16: pshufb byte-shuffle mask; within every dword, destination bytes
  ; 0,1,2,3 take source bytes 2,3,0,1, i.e. a 16-bit rotate of each dword.
  17. $L$rot16::
  18. DB 02h,03h,00h,01h,06h,07h,04h,05h,0ah,0bh,08h,09h,0eh,0fh,0ch,0dh
  ; $L$rot24: pshufb byte-shuffle mask; destination bytes 0,1,2,3 take source
  ; bytes 3,0,1,2, i.e. rotate each dword left by 8 bits (right by 24).
  19. $L$rot24::
  20. DB 03h,00h,01h,02h,07h,04h,05h,06h,0bh,08h,09h,0ah,0fh,0ch,0dh,0eh
  ; $L$twoy: not referenced in the part of the file reviewed here.
  21. $L$twoy::
  22. DD 2,0,0,0,2,0,0,0
  23. ALIGN 64
  ; $L$zeroz / $L$fourz / $L$incz / $L$sixteen: 16-dword tables, not
  ; referenced in the part of the file reviewed here (presumably for a
  ; 512-bit vector path -- confirm).
  24. $L$zeroz::
  25. DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
  26. $L$fourz::
  27. DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
  28. $L$incz::
  29. DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
  30. $L$sixteen::
  31. DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
  ; $L$sigma: the 16 ASCII bytes "expand 32-byte k"; loaded as state row 0 by
  ; the SIMD paths. The scalar path uses the same four dwords as immediates.
  32. $L$sigma::
  33. DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
  34. DB 0
  ; NUL-terminated ASCII identification string:
  ; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  35. DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
  36. DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
  37. DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
  38. DB 108,46,111,114,103,62,0
  ;-----------------------------------------------------------------------
  ; ChaCha20_ctr32 -- public entry point, run-time dispatcher and scalar
  ; (general-purpose register) ChaCha20 implementation.
  ;
  ; ABI:  Microsoft x64. The five arguments arrive in rcx, rdx, r8, r9 and
  ;       [40+rsp]; the prologue moves them to rdi, rsi, rdx, rcx, r8.
  ; In (after that shuffle):
  ;       rdi = output pointer
  ;       rsi = input pointer
  ;       rdx = length in bytes (0 returns immediately)
  ;       rcx = pointer to 32 bytes that become state words 4..11
  ;             (presumably the 256-bit key)
  ;       r8  = pointer to 16 bytes that become state words 12..15; the low
  ;             dword is advanced by one for every 64-byte block
  ; Dispatch: if bit 9 (512) of the dword at OPENSSL_ia32cap_P+4 is set the
  ;       code jumps to $L$ChaCha20_ssse3 with r10 still holding the
  ;       capability word (presumably the SSSE3 CPUID flag -- confirm).
  ; Frame after "sub rsp,64+24":
  ;       [rsp+ 0..15]    keystream words 0..3 (written in the tail path only)
  ;       [rsp+16..31]    state words 4..7
  ;       [rsp+32..47]    state words 8..11; also spill slots during rounds
  ;       [rsp+48..63]    state words 12..15
  ;       [rsp+64/72/80]  saved length / input pointer / output pointer
  ; Registers inside the round loop:
  ;       eax,ebx,ecx,edx = words 0..3     r8d..r11d  = words 4..7
  ;       esi,edi         = two of words 8..11 (the other pair is spilled)
  ;       r12d..r15d      = words 12..15   ebp        = double-round counter
  ; Clobbers: rax, rcx, rdx, r8-r11, xmm0-xmm4, flags. rbx, rbp, rsi, rdi
  ;       and r12-r15 are saved and restored.
  ;-----------------------------------------------------------------------
  39. PUBLIC ChaCha20_ctr32
  40. ALIGN 64
  41. ChaCha20_ctr32 PROC PUBLIC
  ; rdi/rsi are callee-saved on Win64: park them in the caller's home space.
  42. mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
  43. mov QWORD PTR[16+rsp],rsi
  44. mov rax,rsp
  45. $L$SEH_begin_ChaCha20_ctr32::
  46. mov rdi,rcx
  47. mov rsi,rdx
  48. mov rdx,r8
  49. mov rcx,r9
  50. mov r8,QWORD PTR[40+rsp]
  ; Zero-length request: nothing to do. test sets ZF exactly as a compare
  ; against zero would, with a shorter encoding.
  51. test rdx,rdx
  52. je $L$no_data
  53. mov r10,QWORD PTR[((OPENSSL_ia32cap_P+4))]
  54. test r10d,512
  55. jnz $L$ChaCha20_ssse3
  56. push rbx
  57. push rbp
  58. push r12
  59. push r13
  60. push r14
  61. push r15
  62. sub rsp,64+24
  63. $L$ctr32_body::
  ; Copy the 32-byte and 16-byte inputs into the frame; xmm4 = (1,0,0,0) is
  ; the per-block counter increment.
  64. movdqu xmm1,XMMWORD PTR[rcx]
  65. movdqu xmm2,XMMWORD PTR[16+rcx]
  66. movdqu xmm3,XMMWORD PTR[r8]
  67. movdqa xmm4,XMMWORD PTR[$L$one]
  68. movdqa XMMWORD PTR[16+rsp],xmm1
  69. movdqa XMMWORD PTR[32+rsp],xmm2
  70. movdqa XMMWORD PTR[48+rsp],xmm3
  71. mov rbp,rdx
  72. jmp $L$oop_outer
  73. ALIGN 32
  ; One iteration per 64-byte block; rbp = bytes remaining (> 0 here).
  74. $L$oop_outer::
  ; Words 0..3 are the dwords of "expand 32-byte k".
  75. mov eax,061707865h
  76. mov ebx,03320646eh
  77. mov ecx,079622d32h
  78. mov edx,06b206574h
  79. mov r8d,DWORD PTR[16+rsp]
  80. mov r9d,DWORD PTR[20+rsp]
  81. mov r10d,DWORD PTR[24+rsp]
  82. mov r11d,DWORD PTR[28+rsp]
  ; Word 12 (the block counter) lives in xmm3 between blocks.
  83. movd r12d,xmm3
  84. mov r13d,DWORD PTR[52+rsp]
  85. mov r14d,DWORD PTR[56+rsp]
  86. mov r15d,DWORD PTR[60+rsp]
  ; rbp, rsi and rdi are needed as working registers: save len/inp/out.
  87. mov QWORD PTR[((64+0))+rsp],rbp
  88. mov ebp,10
  89. mov QWORD PTR[((64+8))+rsp],rsi
  ; Encoded "movq rsi,xmm2": esi = word 8, upper half = word 9.
  90. DB 102,72,15,126,214
  91. mov QWORD PTR[((64+16))+rsp],rdi
  92. mov rdi,rsi
  93. shr rdi,32
  94. jmp $L$oop
  95. ALIGN 32
  ; Ten iterations, each one pass over the columns followed by one pass over
  ; the diagonals. Every quarter-round is add/xor/rol with rotate counts
  ; 16, 12, 8, 7.
  96. $L$oop::
  ; Columns (0,4,8,12) and (1,5,9,13), interleaved.
  97. add eax,r8d
  98. xor r12d,eax
  99. rol r12d,16
  100. add ebx,r9d
  101. xor r13d,ebx
  102. rol r13d,16
  103. add esi,r12d
  104. xor r8d,esi
  105. rol r8d,12
  106. add edi,r13d
  107. xor r9d,edi
  108. rol r9d,12
  109. add eax,r8d
  110. xor r12d,eax
  111. rol r12d,8
  112. add ebx,r9d
  113. xor r13d,ebx
  114. rol r13d,8
  115. add esi,r12d
  116. xor r8d,esi
  117. rol r8d,7
  118. add edi,r13d
  119. xor r9d,edi
  120. rol r9d,7
  ; Swap the resident pair: spill words 8,9 and reload words 10,11.
  121. mov DWORD PTR[32+rsp],esi
  122. mov DWORD PTR[36+rsp],edi
  123. mov esi,DWORD PTR[40+rsp]
  124. mov edi,DWORD PTR[44+rsp]
  ; Columns (2,6,10,14) and (3,7,11,15).
  125. add ecx,r10d
  126. xor r14d,ecx
  127. rol r14d,16
  128. add edx,r11d
  129. xor r15d,edx
  130. rol r15d,16
  131. add esi,r14d
  132. xor r10d,esi
  133. rol r10d,12
  134. add edi,r15d
  135. xor r11d,edi
  136. rol r11d,12
  137. add ecx,r10d
  138. xor r14d,ecx
  139. rol r14d,8
  140. add edx,r11d
  141. xor r15d,edx
  142. rol r15d,8
  143. add esi,r14d
  144. xor r10d,esi
  145. rol r10d,7
  146. add edi,r15d
  147. xor r11d,edi
  148. rol r11d,7
  ; Diagonals (0,5,10,15) and (1,6,11,12); esi/edi still hold words 10,11.
  149. add eax,r9d
  150. xor r15d,eax
  151. rol r15d,16
  152. add ebx,r10d
  153. xor r12d,ebx
  154. rol r12d,16
  155. add esi,r15d
  156. xor r9d,esi
  157. rol r9d,12
  158. add edi,r12d
  159. xor r10d,edi
  160. rol r10d,12
  161. add eax,r9d
  162. xor r15d,eax
  163. rol r15d,8
  164. add ebx,r10d
  165. xor r12d,ebx
  166. rol r12d,8
  167. add esi,r15d
  168. xor r9d,esi
  169. rol r9d,7
  170. add edi,r12d
  171. xor r10d,edi
  172. rol r10d,7
  ; Swap back: spill words 10,11 and reload words 8,9.
  173. mov DWORD PTR[40+rsp],esi
  174. mov DWORD PTR[44+rsp],edi
  175. mov esi,DWORD PTR[32+rsp]
  176. mov edi,DWORD PTR[36+rsp]
  ; Diagonals (2,7,8,13) and (3,4,9,14).
  177. add ecx,r11d
  178. xor r13d,ecx
  179. rol r13d,16
  180. add edx,r8d
  181. xor r14d,edx
  182. rol r14d,16
  183. add esi,r13d
  184. xor r11d,esi
  185. rol r11d,12
  186. add edi,r14d
  187. xor r8d,edi
  188. rol r8d,12
  189. add ecx,r11d
  190. xor r13d,ecx
  191. rol r13d,8
  192. add edx,r8d
  193. xor r14d,edx
  194. rol r14d,8
  195. add esi,r13d
  196. xor r11d,esi
  197. rol r11d,7
  198. add edi,r14d
  199. xor r8d,edi
  200. rol r8d,7
  201. dec ebp
  202. jnz $L$oop
  ; Rounds done: park working words 8,9 so [32+rsp] holds working words
  ; 8..11, then restore len/inp/out and bump the counter for the next block.
  203. mov DWORD PTR[36+rsp],edi
  204. mov DWORD PTR[32+rsp],esi
  205. mov rbp,QWORD PTR[64+rsp]
  206. movdqa xmm1,xmm2
  207. mov rsi,QWORD PTR[((64+8))+rsp]
  208. paddd xmm3,xmm4
  209. mov rdi,QWORD PTR[((64+16))+rsp]
  ; Feed-forward: add the initial state to the working state. Words 8..11
  ; are handled in xmm1 (initial words in xmm2 plus working words in memory).
  210. add eax,061707865h
  211. add ebx,03320646eh
  212. add ecx,079622d32h
  213. add edx,06b206574h
  214. add r8d,DWORD PTR[16+rsp]
  215. add r9d,DWORD PTR[20+rsp]
  216. add r10d,DWORD PTR[24+rsp]
  217. add r11d,DWORD PTR[28+rsp]
  218. add r12d,DWORD PTR[48+rsp]
  219. add r13d,DWORD PTR[52+rsp]
  220. add r14d,DWORD PTR[56+rsp]
  221. add r15d,DWORD PTR[60+rsp]
  222. paddd xmm1,XMMWORD PTR[32+rsp]
  ; Fewer than 64 bytes left: finish byte by byte in the tail path.
  223. cmp rbp,64
  224. jb $L$tail
  ; Full block: keystream XOR input.
  225. xor eax,DWORD PTR[rsi]
  226. xor ebx,DWORD PTR[4+rsi]
  227. xor ecx,DWORD PTR[8+rsi]
  228. xor edx,DWORD PTR[12+rsi]
  229. xor r8d,DWORD PTR[16+rsi]
  230. xor r9d,DWORD PTR[20+rsi]
  231. xor r10d,DWORD PTR[24+rsi]
  232. xor r11d,DWORD PTR[28+rsi]
  233. movdqu xmm0,XMMWORD PTR[32+rsi]
  234. xor r12d,DWORD PTR[48+rsi]
  235. xor r13d,DWORD PTR[52+rsi]
  236. xor r14d,DWORD PTR[56+rsi]
  237. xor r15d,DWORD PTR[60+rsi]
  238. lea rsi,QWORD PTR[64+rsi]
  239. pxor xmm0,xmm1
  ; Re-initialise the spill area with the initial words 8..11 and store the
  ; incremented counter for the next block.
  240. movdqa XMMWORD PTR[32+rsp],xmm2
  241. movd DWORD PTR[48+rsp],xmm3
  242. mov DWORD PTR[rdi],eax
  243. mov DWORD PTR[4+rdi],ebx
  244. mov DWORD PTR[8+rdi],ecx
  245. mov DWORD PTR[12+rdi],edx
  246. mov DWORD PTR[16+rdi],r8d
  247. mov DWORD PTR[20+rdi],r9d
  248. mov DWORD PTR[24+rdi],r10d
  249. mov DWORD PTR[28+rdi],r11d
  250. movdqu XMMWORD PTR[32+rdi],xmm0
  251. mov DWORD PTR[48+rdi],r12d
  252. mov DWORD PTR[52+rdi],r13d
  253. mov DWORD PTR[56+rdi],r14d
  254. mov DWORD PTR[60+rdi],r15d
  255. lea rdi,QWORD PTR[64+rdi]
  256. sub rbp,64
  257. jnz $L$oop_outer
  258. jmp $L$done
  259. ALIGN 16
  ; Tail: write the full 64-byte keystream block to [rsp..rsp+63], then XOR
  ; the remaining rbp (1..63) bytes one at a time. rbx becomes the byte index.
  260. $L$tail::
  261. mov DWORD PTR[rsp],eax
  262. mov DWORD PTR[4+rsp],ebx
  263. xor rbx,rbx
  264. mov DWORD PTR[8+rsp],ecx
  265. mov DWORD PTR[12+rsp],edx
  266. mov DWORD PTR[16+rsp],r8d
  267. mov DWORD PTR[20+rsp],r9d
  268. mov DWORD PTR[24+rsp],r10d
  269. mov DWORD PTR[28+rsp],r11d
  270. movdqa XMMWORD PTR[32+rsp],xmm1
  271. mov DWORD PTR[48+rsp],r12d
  272. mov DWORD PTR[52+rsp],r13d
  273. mov DWORD PTR[56+rsp],r14d
  274. mov DWORD PTR[60+rsp],r15d
  275. $L$oop_tail::
  276. movzx eax,BYTE PTR[rbx*1+rsi]
  277. movzx edx,BYTE PTR[rbx*1+rsp]
  278. lea rbx,QWORD PTR[1+rbx]
  279. xor eax,edx
  280. mov BYTE PTR[((-1))+rbx*1+rdi],al
  281. dec rbp
  282. jnz $L$oop_tail
  ; Epilogue: rsi = rsp value at entry (above the six pushes); reload the
  ; callee-saved registers relative to it and unwind the frame.
  283. $L$done::
  284. lea rsi,QWORD PTR[((64+24+48))+rsp]
  285. mov r15,QWORD PTR[((-48))+rsi]
  286. mov r14,QWORD PTR[((-40))+rsi]
  287. mov r13,QWORD PTR[((-32))+rsi]
  288. mov r12,QWORD PTR[((-24))+rsi]
  289. mov rbp,QWORD PTR[((-16))+rsi]
  290. mov rbx,QWORD PTR[((-8))+rsi]
  291. lea rsp,QWORD PTR[rsi]
  292. $L$no_data::
  293. mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
  294. mov rsi,QWORD PTR[16+rsp]
  295. DB 0F3h,0C3h ;repret
  296. $L$SEH_end_ChaCha20_ctr32::
  297. ChaCha20_ctr32 ENDP
  ;-----------------------------------------------------------------------
  ; ChaCha20_ssse3 -- one-block-at-a-time SIMD ChaCha20 path that keeps the
  ; four 16-byte state rows in xmm0..xmm3.
  ;
  ; ABI:  Microsoft x64 at the PROC entry (same argument shuffle as
  ;       ChaCha20_ctr32). The path is also entered by jumps to
  ;       $L$ChaCha20_ssse3 and $L$do_sse3_after_all with the arguments
  ;       already shuffled.
  ; In:   rdi = output, rsi = input, rdx = length in bytes,
  ;       rcx = pointer to 32 bytes (state rows 1 and 2),
  ;       r8  = pointer to 16 bytes (state row 3; low dword is advanced by
  ;             one per 64-byte block),
  ;       r10 = capability word; bit 11 (2048) of r10d diverts to
  ;             $L$ChaCha20_4xop.
  ;       NOTE(review): the PROC entry itself never loads r10, so this path
  ;       relies on being reached from a caller that already did -- confirm
  ;       that nothing calls the PROC entry directly.
  ; Dispatch: rdx == 128 -> $L$ChaCha20_128, rdx > 128 -> $L$ChaCha20_4x.
  ; Frame: r9 = rsp at entry and acts as the frame pointer.
  ;       [rsp+0..63]  the four initial state rows (keystream in the tail)
  ;       [r9-40]      saved xmm6       [r9-24]  saved xmm7
  ;       64 + 40 bytes covers exactly this: 64 for the state, 32 for the
  ;       two callee-saved XMM registers, 8 so that rsp lands on a 16-byte
  ;       boundary given the usual entry alignment. The sibling 128 and 4x
  ;       paths size their frames by the same rule.
  ; Registers: xmm0..xmm3 = rows a,b,c,d; xmm4,xmm5 = temporaries;
  ;       xmm6 = $L$rot16 mask, xmm7 = $L$rot24 mask; r8 = round counter
  ;       (reused once the row-3 pointer has been consumed).
  ; Clobbers: rax, rcx, rdx, r8, r9, xmm0-xmm5, flags.
  ;-----------------------------------------------------------------------
  298. ALIGN 32
  299. ChaCha20_ssse3 PROC PRIVATE
  300. mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
  301. mov QWORD PTR[16+rsp],rsi
  302. mov rax,rsp
  303. $L$SEH_begin_ChaCha20_ssse3::
  304. mov rdi,rcx
  305. mov rsi,rdx
  306. mov rdx,r8
  307. mov rcx,r9
  308. mov r8,QWORD PTR[40+rsp]
  309. $L$ChaCha20_ssse3::
  310. mov r9,rsp
  311. test r10d,2048
  312. jnz $L$ChaCha20_4xop
  313. cmp rdx,128
  314. je $L$ChaCha20_128
  315. ja $L$ChaCha20_4x
  ; Also the landing point used by the 4x path when it decides to fall back
  ; to this routine; r9 already equals the entry rsp there.
  316. $L$do_sse3_after_all::
  317. sub rsp,64+40
  318. movaps XMMWORD PTR[(-40)+r9],xmm6
  319. movaps XMMWORD PTR[(-24)+r9],xmm7
  320. $L$ssse3_body::
  ; Build the initial state and keep a copy at [rsp..rsp+63].
  321. movdqa xmm0,XMMWORD PTR[$L$sigma]
  322. movdqu xmm1,XMMWORD PTR[rcx]
  323. movdqu xmm2,XMMWORD PTR[16+rcx]
  324. movdqu xmm3,XMMWORD PTR[r8]
  325. movdqa xmm6,XMMWORD PTR[$L$rot16]
  326. movdqa xmm7,XMMWORD PTR[$L$rot24]
  327. movdqa XMMWORD PTR[rsp],xmm0
  328. movdqa XMMWORD PTR[16+rsp],xmm1
  329. movdqa XMMWORD PTR[32+rsp],xmm2
  330. movdqa XMMWORD PTR[48+rsp],xmm3
  331. mov r8,10
  332. jmp $L$oop_ssse3
  333. ALIGN 32
  ; Subsequent blocks: reload rows 0..2, add 1 to the low dword of row 3 and
  ; write the new row 3 back to the frame.
  334. $L$oop_outer_ssse3::
  335. movdqa xmm3,XMMWORD PTR[$L$one]
  336. movdqa xmm0,XMMWORD PTR[rsp]
  337. movdqa xmm1,XMMWORD PTR[16+rsp]
  338. movdqa xmm2,XMMWORD PTR[32+rsp]
  339. paddd xmm3,XMMWORD PTR[48+rsp]
  340. mov r8,10
  341. movdqa XMMWORD PTR[48+rsp],xmm3
  342. jmp $L$oop_ssse3
  343. ALIGN 32
  ; Ten iterations of two passes. Each pass runs the four quarter-rounds in
  ; parallel, one per dword lane; rotates by 16 and 8 use pshufb, rotates by
  ; 12 and 7 use a shift pair plus por.
  344. $L$oop_ssse3::
  345. paddd xmm0,xmm1
  346. pxor xmm3,xmm0
  ; Encoded "pshufb xmm3,xmm6" (rotate by 16).
  347. DB 102,15,56,0,222
  348. paddd xmm2,xmm3
  349. pxor xmm1,xmm2
  350. movdqa xmm4,xmm1
  351. psrld xmm1,20
  352. pslld xmm4,12
  353. por xmm1,xmm4
  354. paddd xmm0,xmm1
  355. pxor xmm3,xmm0
  ; Encoded "pshufb xmm3,xmm7" (rotate left by 8).
  356. DB 102,15,56,0,223
  357. paddd xmm2,xmm3
  358. pxor xmm1,xmm2
  359. movdqa xmm4,xmm1
  360. psrld xmm1,25
  361. pslld xmm4,7
  362. por xmm1,xmm4
  ; Rotate the dwords of rows c, b, d by 2, 1 and 3 lanes so the second pass
  ; operates on the diagonals.
  363. pshufd xmm2,xmm2,78
  364. pshufd xmm1,xmm1,57
  365. pshufd xmm3,xmm3,147
  366. nop
  367. paddd xmm0,xmm1
  368. pxor xmm3,xmm0
  369. DB 102,15,56,0,222
  370. paddd xmm2,xmm3
  371. pxor xmm1,xmm2
  372. movdqa xmm4,xmm1
  373. psrld xmm1,20
  374. pslld xmm4,12
  375. por xmm1,xmm4
  376. paddd xmm0,xmm1
  377. pxor xmm3,xmm0
  378. DB 102,15,56,0,223
  379. paddd xmm2,xmm3
  380. pxor xmm1,xmm2
  381. movdqa xmm4,xmm1
  382. psrld xmm1,25
  383. pslld xmm4,7
  384. por xmm1,xmm4
  ; Undo the lane rotation (2, 3 and 1 lanes) to return to column order.
  385. pshufd xmm2,xmm2,78
  386. pshufd xmm1,xmm1,147
  387. pshufd xmm3,xmm3,57
  388. dec r8
  389. jnz $L$oop_ssse3
  ; Feed-forward: add the initial state saved in the frame.
  390. paddd xmm0,XMMWORD PTR[rsp]
  391. paddd xmm1,XMMWORD PTR[16+rsp]
  392. paddd xmm2,XMMWORD PTR[32+rsp]
  393. paddd xmm3,XMMWORD PTR[48+rsp]
  394. cmp rdx,64
  395. jb $L$tail_ssse3
  ; Full block: keystream XOR input, 16 bytes at a time (unaligned access).
  396. movdqu xmm4,XMMWORD PTR[rsi]
  397. movdqu xmm5,XMMWORD PTR[16+rsi]
  398. pxor xmm0,xmm4
  399. movdqu xmm4,XMMWORD PTR[32+rsi]
  400. pxor xmm1,xmm5
  401. movdqu xmm5,XMMWORD PTR[48+rsi]
  402. lea rsi,QWORD PTR[64+rsi]
  403. pxor xmm2,xmm4
  404. pxor xmm3,xmm5
  405. movdqu XMMWORD PTR[rdi],xmm0
  406. movdqu XMMWORD PTR[16+rdi],xmm1
  407. movdqu XMMWORD PTR[32+rdi],xmm2
  408. movdqu XMMWORD PTR[48+rdi],xmm3
  409. lea rdi,QWORD PTR[64+rdi]
  410. sub rdx,64
  411. jnz $L$oop_outer_ssse3
  412. jmp $L$done_ssse3
  413. ALIGN 16
  ; Tail: overwrite the frame copy with the keystream block and XOR the
  ; remaining rdx (1..63) bytes one at a time; r8 is the byte index.
  414. $L$tail_ssse3::
  415. movdqa XMMWORD PTR[rsp],xmm0
  416. movdqa XMMWORD PTR[16+rsp],xmm1
  417. movdqa XMMWORD PTR[32+rsp],xmm2
  418. movdqa XMMWORD PTR[48+rsp],xmm3
  419. xor r8,r8
  420. $L$oop_tail_ssse3::
  421. movzx eax,BYTE PTR[r8*1+rsi]
  422. movzx ecx,BYTE PTR[r8*1+rsp]
  423. lea r8,QWORD PTR[1+r8]
  424. xor eax,ecx
  425. mov BYTE PTR[((-1))+r8*1+rdi],al
  426. dec rdx
  427. jnz $L$oop_tail_ssse3
  ; Restore xmm6/xmm7 and the entry stack pointer from the r9 frame pointer.
  428. $L$done_ssse3::
  429. movaps xmm6,XMMWORD PTR[((-40))+r9]
  430. movaps xmm7,XMMWORD PTR[((-24))+r9]
  431. lea rsp,QWORD PTR[r9]
  432. $L$ssse3_epilogue::
  433. mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
  434. mov rsi,QWORD PTR[16+rsp]
  435. DB 0F3h,0C3h ;repret
  436. $L$SEH_end_ChaCha20_ssse3::
  437. ChaCha20_ssse3 ENDP
  ;-----------------------------------------------------------------------
  ; ChaCha20_128 -- SIMD path that produces exactly two 64-byte blocks
  ; (128 bytes) in one pass, interleaving the two blocks instruction by
  ; instruction. There is no length check or loop over the input here; the
  ; visible jump to $L$ChaCha20_128 is taken only when rdx == 128.
  ;
  ; ABI:  Microsoft x64 at the PROC entry (same argument shuffle as the other
  ;       entry points); also entered by a jump to $L$ChaCha20_128 with the
  ;       arguments already shuffled.
  ; In:   rdi = output, rsi = input,
  ;       rcx = pointer to 32 bytes (state rows 1 and 2),
  ;       r8  = pointer to 16 bytes (state row 3 of the first block; the
  ;             second block uses the same row with 1 added to its low dword).
  ; Frame: r9 = rsp at entry and acts as the frame pointer.
  ;       [rsp+0..63]        the four initial state rows
  ;       [r9-104 .. r9-9]   saved xmm6..xmm11 (callee-saved on Win64)
  ; Registers:
  ;       block 0 rows a,b,c,d = xmm8, xmm9,  xmm2, xmm3
  ;       block 1 rows a,b,c,d = xmm10, xmm11, xmm0, xmm1
  ;       xmm4, xmm5 = temporaries; xmm6 = $L$rot16, xmm7 = $L$rot24
  ;       r8 = round counter (reused once the row-3 pointer is consumed)
  ; Clobbers: r8, r9, xmm0-xmm5, flags.
  ;-----------------------------------------------------------------------
  438. ALIGN 32
  439. ChaCha20_128 PROC PRIVATE
  440. mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
  441. mov QWORD PTR[16+rsp],rsi
  442. mov rax,rsp
  443. $L$SEH_begin_ChaCha20_128::
  444. mov rdi,rcx
  445. mov rsi,rdx
  446. mov rdx,r8
  447. mov rcx,r9
  448. mov r8,QWORD PTR[40+rsp]
  449. $L$ChaCha20_128::
  450. mov r9,rsp
  451. sub rsp,64+104
  452. movaps XMMWORD PTR[(-104)+r9],xmm6
  453. movaps XMMWORD PTR[(-88)+r9],xmm7
  454. movaps XMMWORD PTR[(-72)+r9],xmm8
  455. movaps XMMWORD PTR[(-56)+r9],xmm9
  456. movaps XMMWORD PTR[(-40)+r9],xmm10
  457. movaps XMMWORD PTR[(-24)+r9],xmm11
  458. $L$128_body::
  ; Build block 0's state, copy rows a..c into block 1's registers, and form
  ; block 1's row d as row d + (1,0,0,0). The frame keeps block 0's rows.
  459. movdqa xmm8,XMMWORD PTR[$L$sigma]
  460. movdqu xmm9,XMMWORD PTR[rcx]
  461. movdqu xmm2,XMMWORD PTR[16+rcx]
  462. movdqu xmm3,XMMWORD PTR[r8]
  463. movdqa xmm1,XMMWORD PTR[$L$one]
  464. movdqa xmm6,XMMWORD PTR[$L$rot16]
  465. movdqa xmm7,XMMWORD PTR[$L$rot24]
  466. movdqa xmm10,xmm8
  467. movdqa XMMWORD PTR[rsp],xmm8
  468. movdqa xmm11,xmm9
  469. movdqa XMMWORD PTR[16+rsp],xmm9
  470. movdqa xmm0,xmm2
  471. movdqa XMMWORD PTR[32+rsp],xmm2
  472. paddd xmm1,xmm3
  473. movdqa XMMWORD PTR[48+rsp],xmm3
  474. mov r8,10
  475. jmp $L$oop_128
  476. ALIGN 32
  ; Ten iterations of two passes over both blocks. Rotates by 16 and 8 use
  ; pshufb with xmm6/xmm7; rotates by 12 and 7 use a shift pair plus por.
  477. $L$oop_128::
  478. paddd xmm8,xmm9
  479. pxor xmm3,xmm8
  480. paddd xmm10,xmm11
  481. pxor xmm1,xmm10
  ; Encoded "pshufb xmm3,xmm6" and "pshufb xmm1,xmm6" (rotate by 16).
  482. DB 102,15,56,0,222
  483. DB 102,15,56,0,206
  484. paddd xmm2,xmm3
  485. paddd xmm0,xmm1
  486. pxor xmm9,xmm2
  487. pxor xmm11,xmm0
  488. movdqa xmm4,xmm9
  489. psrld xmm9,20
  490. movdqa xmm5,xmm11
  491. pslld xmm4,12
  492. psrld xmm11,20
  493. por xmm9,xmm4
  494. pslld xmm5,12
  495. por xmm11,xmm5
  496. paddd xmm8,xmm9
  497. pxor xmm3,xmm8
  498. paddd xmm10,xmm11
  499. pxor xmm1,xmm10
  ; Encoded "pshufb xmm3,xmm7" and "pshufb xmm1,xmm7" (rotate left by 8).
  500. DB 102,15,56,0,223
  501. DB 102,15,56,0,207
  502. paddd xmm2,xmm3
  503. paddd xmm0,xmm1
  504. pxor xmm9,xmm2
  505. pxor xmm11,xmm0
  506. movdqa xmm4,xmm9
  507. psrld xmm9,25
  508. movdqa xmm5,xmm11
  509. pslld xmm4,7
  510. psrld xmm11,25
  511. por xmm9,xmm4
  512. pslld xmm5,7
  513. por xmm11,xmm5
  ; Rotate the dwords of rows c, b, d (both blocks) by 2, 1 and 3 lanes so
  ; the second pass operates on the diagonals.
  514. pshufd xmm2,xmm2,78
  515. pshufd xmm9,xmm9,57
  516. pshufd xmm3,xmm3,147
  517. pshufd xmm0,xmm0,78
  518. pshufd xmm11,xmm11,57
  519. pshufd xmm1,xmm1,147
  520. paddd xmm8,xmm9
  521. pxor xmm3,xmm8
  522. paddd xmm10,xmm11
  523. pxor xmm1,xmm10
  524. DB 102,15,56,0,222
  525. DB 102,15,56,0,206
  526. paddd xmm2,xmm3
  527. paddd xmm0,xmm1
  528. pxor xmm9,xmm2
  529. pxor xmm11,xmm0
  530. movdqa xmm4,xmm9
  531. psrld xmm9,20
  532. movdqa xmm5,xmm11
  533. pslld xmm4,12
  534. psrld xmm11,20
  535. por xmm9,xmm4
  536. pslld xmm5,12
  537. por xmm11,xmm5
  538. paddd xmm8,xmm9
  539. pxor xmm3,xmm8
  540. paddd xmm10,xmm11
  541. pxor xmm1,xmm10
  542. DB 102,15,56,0,223
  543. DB 102,15,56,0,207
  544. paddd xmm2,xmm3
  545. paddd xmm0,xmm1
  546. pxor xmm9,xmm2
  547. pxor xmm11,xmm0
  548. movdqa xmm4,xmm9
  549. psrld xmm9,25
  550. movdqa xmm5,xmm11
  551. pslld xmm4,7
  552. psrld xmm11,25
  553. por xmm9,xmm4
  554. pslld xmm5,7
  555. por xmm11,xmm5
  ; Undo the lane rotation (2, 3 and 1 lanes) to return to column order.
  556. pshufd xmm2,xmm2,78
  557. pshufd xmm9,xmm9,147
  558. pshufd xmm3,xmm3,57
  559. pshufd xmm0,xmm0,78
  560. pshufd xmm11,xmm11,147
  561. pshufd xmm1,xmm1,57
  562. dec r8
  563. jnz $L$oop_128
  ; Feed-forward. Block 1's row d needs the saved row d plus (1,0,0,0), hence
  ; the extra add of $L$one before the add from the frame.
  564. paddd xmm8,XMMWORD PTR[rsp]
  565. paddd xmm9,XMMWORD PTR[16+rsp]
  566. paddd xmm2,XMMWORD PTR[32+rsp]
  567. paddd xmm3,XMMWORD PTR[48+rsp]
  568. paddd xmm1,XMMWORD PTR[$L$one]
  569. paddd xmm10,XMMWORD PTR[rsp]
  570. paddd xmm11,XMMWORD PTR[16+rsp]
  571. paddd xmm0,XMMWORD PTR[32+rsp]
  572. paddd xmm1,XMMWORD PTR[48+rsp]
  ; XOR 128 bytes of input with the two keystream blocks (unaligned access).
  573. movdqu xmm4,XMMWORD PTR[rsi]
  574. movdqu xmm5,XMMWORD PTR[16+rsi]
  575. pxor xmm8,xmm4
  576. movdqu xmm4,XMMWORD PTR[32+rsi]
  577. pxor xmm9,xmm5
  578. movdqu xmm5,XMMWORD PTR[48+rsi]
  579. pxor xmm2,xmm4
  580. movdqu xmm4,XMMWORD PTR[64+rsi]
  581. pxor xmm3,xmm5
  582. movdqu xmm5,XMMWORD PTR[80+rsi]
  583. pxor xmm10,xmm4
  584. movdqu xmm4,XMMWORD PTR[96+rsi]
  585. pxor xmm11,xmm5
  586. movdqu xmm5,XMMWORD PTR[112+rsi]
  587. pxor xmm0,xmm4
  588. pxor xmm1,xmm5
  589. movdqu XMMWORD PTR[rdi],xmm8
  590. movdqu XMMWORD PTR[16+rdi],xmm9
  591. movdqu XMMWORD PTR[32+rdi],xmm2
  592. movdqu XMMWORD PTR[48+rdi],xmm3
  593. movdqu XMMWORD PTR[64+rdi],xmm10
  594. movdqu XMMWORD PTR[80+rdi],xmm11
  595. movdqu XMMWORD PTR[96+rdi],xmm0
  596. movdqu XMMWORD PTR[112+rdi],xmm1
  ; Restore xmm6..xmm11 and the entry stack pointer from the r9 frame pointer.
  597. movaps xmm6,XMMWORD PTR[((-104))+r9]
  598. movaps xmm7,XMMWORD PTR[((-88))+r9]
  599. movaps xmm8,XMMWORD PTR[((-72))+r9]
  600. movaps xmm9,XMMWORD PTR[((-56))+r9]
  601. movaps xmm10,XMMWORD PTR[((-40))+r9]
  602. movaps xmm11,XMMWORD PTR[((-24))+r9]
  603. lea rsp,QWORD PTR[r9]
  604. $L$128_epilogue::
  605. mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
  606. mov rsi,QWORD PTR[16+rsp]
  607. DB 0F3h,0C3h ;repret
  608. $L$SEH_end_ChaCha20_128::
  609. ChaCha20_128 ENDP
  610. ALIGN 32
  611. ChaCha20_4x PROC PRIVATE
  612. mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
  613. mov QWORD PTR[16+rsp],rsi
  614. mov rax,rsp
  615. $L$SEH_begin_ChaCha20_4x::
  616. mov rdi,rcx
  617. mov rsi,rdx
  618. mov rdx,r8
  619. mov rcx,r9
  620. mov r8,QWORD PTR[40+rsp]
  621. $L$ChaCha20_4x::
  622. mov r9,rsp
  623. mov r11,r10
  624. shr r10,32
  625. test r10,32
  626. jnz $L$ChaCha20_8x
  627. cmp rdx,192
  628. ja $L$proceed4x
  629. and r11,71303168
  630. cmp r11,4194304
  631. je $L$do_sse3_after_all
  632. $L$proceed4x::
  633. sub rsp,0140h+168
  634. movaps XMMWORD PTR[(-168)+r9],xmm6
  635. movaps XMMWORD PTR[(-152)+r9],xmm7
  636. movaps XMMWORD PTR[(-136)+r9],xmm8
  637. movaps XMMWORD PTR[(-120)+r9],xmm9
  638. movaps XMMWORD PTR[(-104)+r9],xmm10
  639. movaps XMMWORD PTR[(-88)+r9],xmm11
  640. movaps XMMWORD PTR[(-72)+r9],xmm12
  641. movaps XMMWORD PTR[(-56)+r9],xmm13
  642. movaps XMMWORD PTR[(-40)+r9],xmm14
  643. movaps XMMWORD PTR[(-24)+r9],xmm15
  644. $L$4x_body::
  645. movdqa xmm11,XMMWORD PTR[$L$sigma]
  646. movdqu xmm15,XMMWORD PTR[rcx]
  647. movdqu xmm7,XMMWORD PTR[16+rcx]
  648. movdqu xmm3,XMMWORD PTR[r8]
  649. lea rcx,QWORD PTR[256+rsp]
  650. lea r10,QWORD PTR[$L$rot16]
  651. lea r11,QWORD PTR[$L$rot24]
  652. pshufd xmm8,xmm11,000h
  653. pshufd xmm9,xmm11,055h
  654. movdqa XMMWORD PTR[64+rsp],xmm8
  655. pshufd xmm10,xmm11,0aah
  656. movdqa XMMWORD PTR[80+rsp],xmm9
  657. pshufd xmm11,xmm11,0ffh
  658. movdqa XMMWORD PTR[96+rsp],xmm10
  659. movdqa XMMWORD PTR[112+rsp],xmm11
  660. pshufd xmm12,xmm15,000h
  661. pshufd xmm13,xmm15,055h
  662. movdqa XMMWORD PTR[(128-256)+rcx],xmm12
  663. pshufd xmm14,xmm15,0aah
  664. movdqa XMMWORD PTR[(144-256)+rcx],xmm13
  665. pshufd xmm15,xmm15,0ffh
  666. movdqa XMMWORD PTR[(160-256)+rcx],xmm14
  667. movdqa XMMWORD PTR[(176-256)+rcx],xmm15
  668. pshufd xmm4,xmm7,000h
  669. pshufd xmm5,xmm7,055h
  670. movdqa XMMWORD PTR[(192-256)+rcx],xmm4
  671. pshufd xmm6,xmm7,0aah
  672. movdqa XMMWORD PTR[(208-256)+rcx],xmm5
  673. pshufd xmm7,xmm7,0ffh
  674. movdqa XMMWORD PTR[(224-256)+rcx],xmm6
  675. movdqa XMMWORD PTR[(240-256)+rcx],xmm7
  676. pshufd xmm0,xmm3,000h
  677. pshufd xmm1,xmm3,055h
  678. paddd xmm0,XMMWORD PTR[$L$inc]
  679. pshufd xmm2,xmm3,0aah
  680. movdqa XMMWORD PTR[(272-256)+rcx],xmm1
  681. pshufd xmm3,xmm3,0ffh
  682. movdqa XMMWORD PTR[(288-256)+rcx],xmm2
  683. movdqa XMMWORD PTR[(304-256)+rcx],xmm3
  684. jmp $L$oop_enter4x
  685. ALIGN 32
  686. $L$oop_outer4x::
  687. movdqa xmm8,XMMWORD PTR[64+rsp]
  688. movdqa xmm9,XMMWORD PTR[80+rsp]
  689. movdqa xmm10,XMMWORD PTR[96+rsp]
  690. movdqa xmm11,XMMWORD PTR[112+rsp]
  691. movdqa xmm12,XMMWORD PTR[((128-256))+rcx]
  692. movdqa xmm13,XMMWORD PTR[((144-256))+rcx]
  693. movdqa xmm14,XMMWORD PTR[((160-256))+rcx]
  694. movdqa xmm15,XMMWORD PTR[((176-256))+rcx]
  695. movdqa xmm4,XMMWORD PTR[((192-256))+rcx]
  696. movdqa xmm5,XMMWORD PTR[((208-256))+rcx]
  697. movdqa xmm6,XMMWORD PTR[((224-256))+rcx]
  698. movdqa xmm7,XMMWORD PTR[((240-256))+rcx]
  699. movdqa xmm0,XMMWORD PTR[((256-256))+rcx]
  700. movdqa xmm1,XMMWORD PTR[((272-256))+rcx]
  701. movdqa xmm2,XMMWORD PTR[((288-256))+rcx]
  702. movdqa xmm3,XMMWORD PTR[((304-256))+rcx]
  703. paddd xmm0,XMMWORD PTR[$L$four]
  704. $L$oop_enter4x::
  705. movdqa XMMWORD PTR[32+rsp],xmm6
  706. movdqa XMMWORD PTR[48+rsp],xmm7
  707. movdqa xmm7,XMMWORD PTR[r10]
  708. mov eax,10
  709. movdqa XMMWORD PTR[(256-256)+rcx],xmm0
  710. jmp $L$oop4x
  711. ALIGN 32
  712. $L$oop4x::
  713. paddd xmm8,xmm12
  714. paddd xmm9,xmm13
  715. pxor xmm0,xmm8
  716. pxor xmm1,xmm9
  717. DB 102,15,56,0,199
  718. DB 102,15,56,0,207
  719. paddd xmm4,xmm0
  720. paddd xmm5,xmm1
  721. pxor xmm12,xmm4
  722. pxor xmm13,xmm5
  723. movdqa xmm6,xmm12
  724. pslld xmm12,12
  725. psrld xmm6,20
  726. movdqa xmm7,xmm13
  727. pslld xmm13,12
  728. por xmm12,xmm6
  729. psrld xmm7,20
  730. movdqa xmm6,XMMWORD PTR[r11]
  731. por xmm13,xmm7
  732. paddd xmm8,xmm12
  733. paddd xmm9,xmm13
  734. pxor xmm0,xmm8
  735. pxor xmm1,xmm9
  736. DB 102,15,56,0,198
  737. DB 102,15,56,0,206
  738. paddd xmm4,xmm0
  739. paddd xmm5,xmm1
  740. pxor xmm12,xmm4
  741. pxor xmm13,xmm5
  742. movdqa xmm7,xmm12
  743. pslld xmm12,7
  744. psrld xmm7,25
  745. movdqa xmm6,xmm13
  746. pslld xmm13,7
  747. por xmm12,xmm7
  748. psrld xmm6,25
  749. movdqa xmm7,XMMWORD PTR[r10]
  750. por xmm13,xmm6
  751. movdqa XMMWORD PTR[rsp],xmm4
  752. movdqa XMMWORD PTR[16+rsp],xmm5
  753. movdqa xmm4,XMMWORD PTR[32+rsp]
  754. movdqa xmm5,XMMWORD PTR[48+rsp]
  755. paddd xmm10,xmm14
  756. paddd xmm11,xmm15
  757. pxor xmm2,xmm10
  758. pxor xmm3,xmm11
  759. DB 102,15,56,0,215
  760. DB 102,15,56,0,223
  761. paddd xmm4,xmm2
  762. paddd xmm5,xmm3
  763. pxor xmm14,xmm4
  764. pxor xmm15,xmm5
  765. movdqa xmm6,xmm14
  766. pslld xmm14,12
  767. psrld xmm6,20
  768. movdqa xmm7,xmm15
  769. pslld xmm15,12
  770. por xmm14,xmm6
  771. psrld xmm7,20
  772. movdqa xmm6,XMMWORD PTR[r11]
  773. por xmm15,xmm7
  774. paddd xmm10,xmm14
  775. paddd xmm11,xmm15
  776. pxor xmm2,xmm10
  777. pxor xmm3,xmm11
  778. DB 102,15,56,0,214
  779. DB 102,15,56,0,222
  780. paddd xmm4,xmm2
  781. paddd xmm5,xmm3
  782. pxor xmm14,xmm4
  783. pxor xmm15,xmm5
  784. movdqa xmm7,xmm14
  785. pslld xmm14,7
  786. psrld xmm7,25
  787. movdqa xmm6,xmm15
  788. pslld xmm15,7
  789. por xmm14,xmm7
  790. psrld xmm6,25
  791. movdqa xmm7,XMMWORD PTR[r10]
  792. por xmm15,xmm6
  793. paddd xmm8,xmm13
  794. paddd xmm9,xmm14
  795. pxor xmm3,xmm8
  796. pxor xmm0,xmm9
  797. DB 102,15,56,0,223
  798. DB 102,15,56,0,199
  799. paddd xmm4,xmm3
  800. paddd xmm5,xmm0
  801. pxor xmm13,xmm4
  802. pxor xmm14,xmm5
  803. movdqa xmm6,xmm13
  804. pslld xmm13,12
  805. psrld xmm6,20
  806. movdqa xmm7,xmm14
  807. pslld xmm14,12
  808. por xmm13,xmm6
  809. psrld xmm7,20
  810. movdqa xmm6,XMMWORD PTR[r11]
  811. por xmm14,xmm7
  812. paddd xmm8,xmm13
  813. paddd xmm9,xmm14
  814. pxor xmm3,xmm8
  815. pxor xmm0,xmm9
  816. DB 102,15,56,0,222
  817. DB 102,15,56,0,198
  818. paddd xmm4,xmm3
  819. paddd xmm5,xmm0
  820. pxor xmm13,xmm4
  821. pxor xmm14,xmm5
  822. movdqa xmm7,xmm13
  823. pslld xmm13,7
  824. psrld xmm7,25
  825. movdqa xmm6,xmm14
  826. pslld xmm14,7
  827. por xmm13,xmm7
  828. psrld xmm6,25
  829. movdqa xmm7,XMMWORD PTR[r10]
  830. por xmm14,xmm6
  831. movdqa XMMWORD PTR[32+rsp],xmm4
  832. movdqa XMMWORD PTR[48+rsp],xmm5
  833. movdqa xmm4,XMMWORD PTR[rsp]
  834. movdqa xmm5,XMMWORD PTR[16+rsp]
  835. paddd xmm10,xmm15
  836. paddd xmm11,xmm12
  837. pxor xmm1,xmm10
  838. pxor xmm2,xmm11
  839. DB 102,15,56,0,207
  840. DB 102,15,56,0,215
  841. paddd xmm4,xmm1
  842. paddd xmm5,xmm2
  843. pxor xmm15,xmm4
  844. pxor xmm12,xmm5
  845. movdqa xmm6,xmm15
  846. pslld xmm15,12
  847. psrld xmm6,20
  848. movdqa xmm7,xmm12
  849. pslld xmm12,12
  850. por xmm15,xmm6
  851. psrld xmm7,20
  852. movdqa xmm6,XMMWORD PTR[r11]
  853. por xmm12,xmm7
  854. paddd xmm10,xmm15
  855. paddd xmm11,xmm12
  856. pxor xmm1,xmm10
  857. pxor xmm2,xmm11
  858. DB 102,15,56,0,206
  859. DB 102,15,56,0,214
  860. paddd xmm4,xmm1
  861. paddd xmm5,xmm2
  862. pxor xmm15,xmm4
  863. pxor xmm12,xmm5
  864. movdqa xmm7,xmm15
  865. pslld xmm15,7
  866. psrld xmm7,25
  867. movdqa xmm6,xmm12
  868. pslld xmm12,7
  869. por xmm15,xmm7
  870. psrld xmm6,25
  871. movdqa xmm7,XMMWORD PTR[r10]
  872. por xmm12,xmm6
  873. dec eax
  874. jnz $L$oop4x
  875. paddd xmm8,XMMWORD PTR[64+rsp]
  876. paddd xmm9,XMMWORD PTR[80+rsp]
  877. paddd xmm10,XMMWORD PTR[96+rsp]
  878. paddd xmm11,XMMWORD PTR[112+rsp]
  879. movdqa xmm6,xmm8
  880. punpckldq xmm8,xmm9
  881. movdqa xmm7,xmm10
  882. punpckldq xmm10,xmm11
  883. punpckhdq xmm6,xmm9
  884. punpckhdq xmm7,xmm11
  885. movdqa xmm9,xmm8
  886. punpcklqdq xmm8,xmm10
  887. movdqa xmm11,xmm6
  888. punpcklqdq xmm6,xmm7
  889. punpckhqdq xmm9,xmm10
  890. punpckhqdq xmm11,xmm7
  891. paddd xmm12,XMMWORD PTR[((128-256))+rcx]
  892. paddd xmm13,XMMWORD PTR[((144-256))+rcx]
  893. paddd xmm14,XMMWORD PTR[((160-256))+rcx]
  894. paddd xmm15,XMMWORD PTR[((176-256))+rcx]
  895. movdqa XMMWORD PTR[rsp],xmm8
  896. movdqa XMMWORD PTR[16+rsp],xmm9
  897. movdqa xmm8,XMMWORD PTR[32+rsp]
  898. movdqa xmm9,XMMWORD PTR[48+rsp]
  899. movdqa xmm10,xmm12
  900. punpckldq xmm12,xmm13
  901. movdqa xmm7,xmm14
  902. punpckldq xmm14,xmm15
  903. punpckhdq xmm10,xmm13
  904. punpckhdq xmm7,xmm15
  905. movdqa xmm13,xmm12
  906. punpcklqdq xmm12,xmm14
  907. movdqa xmm15,xmm10
  908. punpcklqdq xmm10,xmm7
  909. punpckhqdq xmm13,xmm14
  910. punpckhqdq xmm15,xmm7
  911. paddd xmm4,XMMWORD PTR[((192-256))+rcx]
  912. paddd xmm5,XMMWORD PTR[((208-256))+rcx]
  913. paddd xmm8,XMMWORD PTR[((224-256))+rcx]
  914. paddd xmm9,XMMWORD PTR[((240-256))+rcx]
  915. movdqa XMMWORD PTR[32+rsp],xmm6
  916. movdqa XMMWORD PTR[48+rsp],xmm11
  917. movdqa xmm14,xmm4
  918. punpckldq xmm4,xmm5
  919. movdqa xmm7,xmm8
  920. punpckldq xmm8,xmm9
  921. punpckhdq xmm14,xmm5
  922. punpckhdq xmm7,xmm9
  923. movdqa xmm5,xmm4
  924. punpcklqdq xmm4,xmm8
  925. movdqa xmm9,xmm14
  926. punpcklqdq xmm14,xmm7
  927. punpckhqdq xmm5,xmm8
  928. punpckhqdq xmm9,xmm7
  929. paddd xmm0,XMMWORD PTR[((256-256))+rcx]
  930. paddd xmm1,XMMWORD PTR[((272-256))+rcx]
  931. paddd xmm2,XMMWORD PTR[((288-256))+rcx]
  932. paddd xmm3,XMMWORD PTR[((304-256))+rcx]
  933. movdqa xmm8,xmm0
  934. punpckldq xmm0,xmm1
  935. movdqa xmm7,xmm2
  936. punpckldq xmm2,xmm3
  937. punpckhdq xmm8,xmm1
  938. punpckhdq xmm7,xmm3
  939. movdqa xmm1,xmm0
  940. punpcklqdq xmm0,xmm2
  941. movdqa xmm3,xmm8
  942. punpcklqdq xmm8,xmm7
  943. punpckhqdq xmm1,xmm2
  944. punpckhqdq xmm3,xmm7
  945. cmp rdx,64*4
  946. jb $L$tail4x
  947. movdqu xmm6,XMMWORD PTR[rsi]
  948. movdqu xmm11,XMMWORD PTR[16+rsi]
  949. movdqu xmm2,XMMWORD PTR[32+rsi]
  950. movdqu xmm7,XMMWORD PTR[48+rsi]
  951. pxor xmm6,XMMWORD PTR[rsp]
  952. pxor xmm11,xmm12
  953. pxor xmm2,xmm4
  954. pxor xmm7,xmm0
  955. movdqu XMMWORD PTR[rdi],xmm6
  956. movdqu xmm6,XMMWORD PTR[64+rsi]
  957. movdqu XMMWORD PTR[16+rdi],xmm11
  958. movdqu xmm11,XMMWORD PTR[80+rsi]
  959. movdqu XMMWORD PTR[32+rdi],xmm2
  960. movdqu xmm2,XMMWORD PTR[96+rsi]
  961. movdqu XMMWORD PTR[48+rdi],xmm7
  962. movdqu xmm7,XMMWORD PTR[112+rsi]
  963. lea rsi,QWORD PTR[128+rsi]
  964. pxor xmm6,XMMWORD PTR[16+rsp]
  965. pxor xmm11,xmm13
  966. pxor xmm2,xmm5
  967. pxor xmm7,xmm1
  968. movdqu XMMWORD PTR[64+rdi],xmm6
  969. movdqu xmm6,XMMWORD PTR[rsi]
  970. movdqu XMMWORD PTR[80+rdi],xmm11
  971. movdqu xmm11,XMMWORD PTR[16+rsi]
  972. movdqu XMMWORD PTR[96+rdi],xmm2
  973. movdqu xmm2,XMMWORD PTR[32+rsi]
  974. movdqu XMMWORD PTR[112+rdi],xmm7
  975. lea rdi,QWORD PTR[128+rdi]
  976. movdqu xmm7,XMMWORD PTR[48+rsi]
  977. pxor xmm6,XMMWORD PTR[32+rsp]
  978. pxor xmm11,xmm10
  979. pxor xmm2,xmm14
  980. pxor xmm7,xmm8
  981. movdqu XMMWORD PTR[rdi],xmm6
  982. movdqu xmm6,XMMWORD PTR[64+rsi]
  983. movdqu XMMWORD PTR[16+rdi],xmm11
  984. movdqu xmm11,XMMWORD PTR[80+rsi]
  985. movdqu XMMWORD PTR[32+rdi],xmm2
  986. movdqu xmm2,XMMWORD PTR[96+rsi]
  987. movdqu XMMWORD PTR[48+rdi],xmm7
  988. movdqu xmm7,XMMWORD PTR[112+rsi]
  989. lea rsi,QWORD PTR[128+rsi]
  990. pxor xmm6,XMMWORD PTR[48+rsp]
  991. pxor xmm11,xmm15
  992. pxor xmm2,xmm9
  993. pxor xmm7,xmm3
  994. movdqu XMMWORD PTR[64+rdi],xmm6
  995. movdqu XMMWORD PTR[80+rdi],xmm11
  996. movdqu XMMWORD PTR[96+rdi],xmm2
  997. movdqu XMMWORD PTR[112+rdi],xmm7
  998. lea rdi,QWORD PTR[128+rdi]
  999. sub rdx,64*4
  1000. jnz $L$oop_outer4x
  1001. jmp $L$done4x
  1002. $L$tail4x::
  1003. cmp rdx,192
  1004. jae $L$192_or_more4x
  1005. cmp rdx,128
  1006. jae $L$128_or_more4x
  1007. cmp rdx,64
  1008. jae $L$64_or_more4x
  1009. xor r10,r10
  1010. movdqa XMMWORD PTR[16+rsp],xmm12
  1011. movdqa XMMWORD PTR[32+rsp],xmm4
  1012. movdqa XMMWORD PTR[48+rsp],xmm0
  1013. jmp $L$oop_tail4x
  1014. ALIGN 32
  1015. $L$64_or_more4x::
  1016. movdqu xmm6,XMMWORD PTR[rsi]
  1017. movdqu xmm11,XMMWORD PTR[16+rsi]
  1018. movdqu xmm2,XMMWORD PTR[32+rsi]
  1019. movdqu xmm7,XMMWORD PTR[48+rsi]
  1020. pxor xmm6,XMMWORD PTR[rsp]
  1021. pxor xmm11,xmm12
  1022. pxor xmm2,xmm4
  1023. pxor xmm7,xmm0
  1024. movdqu XMMWORD PTR[rdi],xmm6
  1025. movdqu XMMWORD PTR[16+rdi],xmm11
  1026. movdqu XMMWORD PTR[32+rdi],xmm2
  1027. movdqu XMMWORD PTR[48+rdi],xmm7
  1028. je $L$done4x
  1029. movdqa xmm6,XMMWORD PTR[16+rsp]
  1030. lea rsi,QWORD PTR[64+rsi]
  1031. xor r10,r10
  1032. movdqa XMMWORD PTR[rsp],xmm6
  1033. movdqa XMMWORD PTR[16+rsp],xmm13
  1034. lea rdi,QWORD PTR[64+rdi]
  1035. movdqa XMMWORD PTR[32+rsp],xmm5
  1036. sub rdx,64
  1037. movdqa XMMWORD PTR[48+rsp],xmm1
  1038. jmp $L$oop_tail4x
  1039. ALIGN 32
  1040. $L$128_or_more4x::
  1041. movdqu xmm6,XMMWORD PTR[rsi]
  1042. movdqu xmm11,XMMWORD PTR[16+rsi]
  1043. movdqu xmm2,XMMWORD PTR[32+rsi]
  1044. movdqu xmm7,XMMWORD PTR[48+rsi]
  1045. pxor xmm6,XMMWORD PTR[rsp]
  1046. pxor xmm11,xmm12
  1047. pxor xmm2,xmm4
  1048. pxor xmm7,xmm0
  1049. movdqu XMMWORD PTR[rdi],xmm6
  1050. movdqu xmm6,XMMWORD PTR[64+rsi]
  1051. movdqu XMMWORD PTR[16+rdi],xmm11
  1052. movdqu xmm11,XMMWORD PTR[80+rsi]
  1053. movdqu XMMWORD PTR[32+rdi],xmm2
  1054. movdqu xmm2,XMMWORD PTR[96+rsi]
  1055. movdqu XMMWORD PTR[48+rdi],xmm7
  1056. movdqu xmm7,XMMWORD PTR[112+rsi]
  1057. pxor xmm6,XMMWORD PTR[16+rsp]
  1058. pxor xmm11,xmm13
  1059. pxor xmm2,xmm5
  1060. pxor xmm7,xmm1
  1061. movdqu XMMWORD PTR[64+rdi],xmm6
  1062. movdqu XMMWORD PTR[80+rdi],xmm11
  1063. movdqu XMMWORD PTR[96+rdi],xmm2
  1064. movdqu XMMWORD PTR[112+rdi],xmm7
  1065. je $L$done4x
  1066. movdqa xmm6,XMMWORD PTR[32+rsp]
  1067. lea rsi,QWORD PTR[128+rsi]
  1068. xor r10,r10
  1069. movdqa XMMWORD PTR[rsp],xmm6
  1070. movdqa XMMWORD PTR[16+rsp],xmm10
  1071. lea rdi,QWORD PTR[128+rdi]
  1072. movdqa XMMWORD PTR[32+rsp],xmm14
  1073. sub rdx,128
  1074. movdqa XMMWORD PTR[48+rsp],xmm8
  1075. jmp $L$oop_tail4x
  1076. ALIGN 32
  1077. $L$192_or_more4x::
  1078. movdqu xmm6,XMMWORD PTR[rsi]
  1079. movdqu xmm11,XMMWORD PTR[16+rsi]
  1080. movdqu xmm2,XMMWORD PTR[32+rsi]
  1081. movdqu xmm7,XMMWORD PTR[48+rsi]
  1082. pxor xmm6,XMMWORD PTR[rsp]
  1083. pxor xmm11,xmm12
  1084. pxor xmm2,xmm4
  1085. pxor xmm7,xmm0
  1086. movdqu XMMWORD PTR[rdi],xmm6
  1087. movdqu xmm6,XMMWORD PTR[64+rsi]
  1088. movdqu XMMWORD PTR[16+rdi],xmm11
  1089. movdqu xmm11,XMMWORD PTR[80+rsi]
  1090. movdqu XMMWORD PTR[32+rdi],xmm2
  1091. movdqu xmm2,XMMWORD PTR[96+rsi]
  1092. movdqu XMMWORD PTR[48+rdi],xmm7
  1093. movdqu xmm7,XMMWORD PTR[112+rsi]
  1094. lea rsi,QWORD PTR[128+rsi]
  1095. pxor xmm6,XMMWORD PTR[16+rsp]
  1096. pxor xmm11,xmm13
  1097. pxor xmm2,xmm5
  1098. pxor xmm7,xmm1
  1099. movdqu XMMWORD PTR[64+rdi],xmm6
  1100. movdqu xmm6,XMMWORD PTR[rsi]
  1101. movdqu XMMWORD PTR[80+rdi],xmm11
  1102. movdqu xmm11,XMMWORD PTR[16+rsi]
  1103. movdqu XMMWORD PTR[96+rdi],xmm2
  1104. movdqu xmm2,XMMWORD PTR[32+rsi]
  1105. movdqu XMMWORD PTR[112+rdi],xmm7
  1106. lea rdi,QWORD PTR[128+rdi]
  1107. movdqu xmm7,XMMWORD PTR[48+rsi]
  1108. pxor xmm6,XMMWORD PTR[32+rsp]
  1109. pxor xmm11,xmm10
  1110. pxor xmm2,xmm14
  1111. pxor xmm7,xmm8
  1112. movdqu XMMWORD PTR[rdi],xmm6
  1113. movdqu XMMWORD PTR[16+rdi],xmm11
  1114. movdqu XMMWORD PTR[32+rdi],xmm2
  1115. movdqu XMMWORD PTR[48+rdi],xmm7
  1116. je $L$done4x
  1117. movdqa xmm6,XMMWORD PTR[48+rsp]
  1118. lea rsi,QWORD PTR[64+rsi]
  1119. xor r10,r10
  1120. movdqa XMMWORD PTR[rsp],xmm6
  1121. movdqa XMMWORD PTR[16+rsp],xmm15
  1122. lea rdi,QWORD PTR[64+rdi]
  1123. movdqa XMMWORD PTR[32+rsp],xmm9
  1124. sub rdx,192
  1125. movdqa XMMWORD PTR[48+rsp],xmm3
  1126. $L$oop_tail4x::
  1127. movzx eax,BYTE PTR[r10*1+rsi]
  1128. movzx ecx,BYTE PTR[r10*1+rsp]
  1129. lea r10,QWORD PTR[1+r10]
  1130. xor eax,ecx
  1131. mov BYTE PTR[((-1))+r10*1+rdi],al
  1132. dec rdx
  1133. jnz $L$oop_tail4x
  1134. $L$done4x::
  1135. movaps xmm6,XMMWORD PTR[((-168))+r9]
  1136. movaps xmm7,XMMWORD PTR[((-152))+r9]
  1137. movaps xmm8,XMMWORD PTR[((-136))+r9]
  1138. movaps xmm9,XMMWORD PTR[((-120))+r9]
  1139. movaps xmm10,XMMWORD PTR[((-104))+r9]
  1140. movaps xmm11,XMMWORD PTR[((-88))+r9]
  1141. movaps xmm12,XMMWORD PTR[((-72))+r9]
  1142. movaps xmm13,XMMWORD PTR[((-56))+r9]
  1143. movaps xmm14,XMMWORD PTR[((-40))+r9]
  1144. movaps xmm15,XMMWORD PTR[((-24))+r9]
  1145. lea rsp,QWORD PTR[r9]
  1146. $L$4x_epilogue::
  1147. mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
  1148. mov rsi,QWORD PTR[16+rsp]
  1149. DB 0F3h,0C3h ;repret
  1150. $L$SEH_end_ChaCha20_4x::
  1151. ChaCha20_4x ENDP
  1152. ALIGN 32
  ; ---------------------------------------------------------------------------
  ; ChaCha20_4xop
  ;
  ; Produces ChaCha20 keystream four 64-byte blocks at a time and XORs it with
  ; the input. Uses VEX-encoded 128-bit integer instructions plus AMD XOP
  ; rotates (vprotd, emitted below as raw DB bytes).
  ;
  ; Register roles after the Win64 argument remapping in the prologue:
  ;   rdi = output pointer       (all result stores go through rdi)
  ;   rsi = input pointer        (all data that gets XORed is read via rsi)
  ;   rdx = remaining byte count (compared against 64*4 / 192 / 128 / 64)
  ;   rcx = pointer to 32 bytes, read as two 16-byte halves
  ;         (presumably the key -- confirm against the caller)
  ;   r8  = pointer to 16 bytes whose first dword is advanced per block
  ;         (presumably the counter/nonce block -- confirm against the caller)
  ;
  ; NOTE(review): rdx == 0 on entry is not special-cased here; the byte-wise
  ; tail loop would decrement it past zero. Presumably the caller never passes
  ; a zero length -- confirm.
  ;
  ; Stack frame (rsp is lowered by 0140h+168 bytes; r9 keeps the entry rsp):
  ;   [rsp+0   .. rsp+63 ]  scratch: register spills during the transpose and
  ;                         the keystream block consumed by the tail loop
  ;   [rsp+64  .. rsp+127]  the four dwords of $L$sigma, each broadcast
  ;   [rsp+128 .. rsp+255]  the eight dwords read from [rcx], each broadcast
  ;   [rsp+256 .. rsp+319]  the four dwords read from [r8], each broadcast;
  ;                         the first one carries the per-lane block counter
  ;   [r9-168  .. r9-9   ]  saved xmm6-xmm15 (callee-saved in the Win64 ABI)
  ;
  ; Working-state layout inside the round loop: each xmm register holds one
  ; 32-bit state word for four blocks at once (one block per dword lane).
  ;   row a = xmm8..xmm11    row b = xmm0..xmm3
  ;   row c = xmm12..xmm15   row d = xmm4..xmm7
  ; ---------------------------------------------------------------------------
  1153. ChaCha20_4xop PROC PRIVATE
  ; Stash rdi/rsi (callee-saved on Win64) in the caller-provided home slots.
  1154. mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
  1155. mov QWORD PTR[16+rsp],rsi
  1156. mov rax,rsp
  1157. $L$SEH_begin_ChaCha20_4xop::
  ; Win64 passes arguments in rcx, rdx, r8, r9 and the fifth on the stack
  ; ([40+rsp] = return address + 32 bytes of shadow space). Move them into
  ; rdi, rsi, rdx, rcx, r8, which is what the body below expects.
  1158. mov rdi,rcx
  1159. mov rsi,rdx
  1160. mov rdx,r8
  1161. mov rcx,r9
  1162. mov r8,QWORD PTR[40+rsp]
  ; NOTE(review): this label sits after the argument remapping; presumably it
  ; is a secondary entry point for code elsewhere in the file -- not visible
  ; in this section.
  1163. $L$ChaCha20_4xop::
  ; r9 remembers the entry rsp; it is used to address the xmm save area and to
  ; restore rsp in the epilogue.
  1164. mov r9,rsp
  1165. sub rsp,0140h+168
  ; Preserve xmm6-xmm15 just below the entry rsp.
  1166. movaps XMMWORD PTR[(-168)+r9],xmm6
  1167. movaps XMMWORD PTR[(-152)+r9],xmm7
  1168. movaps XMMWORD PTR[(-136)+r9],xmm8
  1169. movaps XMMWORD PTR[(-120)+r9],xmm9
  1170. movaps XMMWORD PTR[(-104)+r9],xmm10
  1171. movaps XMMWORD PTR[(-88)+r9],xmm11
  1172. movaps XMMWORD PTR[(-72)+r9],xmm12
  1173. movaps XMMWORD PTR[(-56)+r9],xmm13
  1174. movaps XMMWORD PTR[(-40)+r9],xmm14
  1175. movaps XMMWORD PTR[(-24)+r9],xmm15
  1176. $L$4xop_body::
  1177. vzeroupper
  ; Load the four 16-byte input rows. $L$sigma (and $L$inc / $L$four used
  ; further down) are constants defined elsewhere in the file.
  1178. vmovdqa xmm11,XMMWORD PTR[$L$sigma]
  1179. vmovdqu xmm3,XMMWORD PTR[rcx]
  1180. vmovdqu xmm15,XMMWORD PTR[16+rcx]
  1181. vmovdqu xmm7,XMMWORD PTR[r8]
  ; From here on rcx is a frame pointer into the middle of the saved-state
  ; area (rsp+256), so the offsets below are written as (N-256)+rcx.
  1182. lea rcx,QWORD PTR[256+rsp]
  ; vpshufd with 000h/055h/0aah/0ffh replicates dword 0/1/2/3 into all four
  ; lanes. Each broadcast word is kept in a register for the first pass and
  ; also written to the stack so later passes can reload it.
  ; Row a: the $L$sigma words.
  1183. vpshufd xmm8,xmm11,000h
  1184. vpshufd xmm9,xmm11,055h
  1185. vmovdqa XMMWORD PTR[64+rsp],xmm8
  1186. vpshufd xmm10,xmm11,0aah
  1187. vmovdqa XMMWORD PTR[80+rsp],xmm9
  1188. vpshufd xmm11,xmm11,0ffh
  1189. vmovdqa XMMWORD PTR[96+rsp],xmm10
  1190. vmovdqa XMMWORD PTR[112+rsp],xmm11
  ; Row b: first 16 bytes read from the original rcx argument.
  1191. vpshufd xmm0,xmm3,000h
  1192. vpshufd xmm1,xmm3,055h
  1193. vmovdqa XMMWORD PTR[(128-256)+rcx],xmm0
  1194. vpshufd xmm2,xmm3,0aah
  1195. vmovdqa XMMWORD PTR[(144-256)+rcx],xmm1
  1196. vpshufd xmm3,xmm3,0ffh
  1197. vmovdqa XMMWORD PTR[(160-256)+rcx],xmm2
  1198. vmovdqa XMMWORD PTR[(176-256)+rcx],xmm3
  ; Row c: second 16 bytes read from the original rcx argument.
  1199. vpshufd xmm12,xmm15,000h
  1200. vpshufd xmm13,xmm15,055h
  1201. vmovdqa XMMWORD PTR[(192-256)+rcx],xmm12
  1202. vpshufd xmm14,xmm15,0aah
  1203. vmovdqa XMMWORD PTR[(208-256)+rcx],xmm13
  1204. vpshufd xmm15,xmm15,0ffh
  1205. vmovdqa XMMWORD PTR[(224-256)+rcx],xmm14
  1206. vmovdqa XMMWORD PTR[(240-256)+rcx],xmm15
  ; Row d: the 16 bytes at [r8]. The first word gets $L$inc added so the lanes
  ; differ (presumably offsets 0,1,2,3 -- constant not visible here); it is
  ; stored to (256-256)+rcx at $L$oop_enter4xop rather than here.
  1207. vpshufd xmm4,xmm7,000h
  1208. vpshufd xmm5,xmm7,055h
  1209. vpaddd xmm4,xmm4,XMMWORD PTR[$L$inc]
  1210. vpshufd xmm6,xmm7,0aah
  1211. vmovdqa XMMWORD PTR[(272-256)+rcx],xmm5
  1212. vpshufd xmm7,xmm7,0ffh
  1213. vmovdqa XMMWORD PTR[(288-256)+rcx],xmm6
  1214. vmovdqa XMMWORD PTR[(304-256)+rcx],xmm7
  ; First pass: the registers already hold the state, skip the reload.
  1215. jmp $L$oop_enter4xop
  1216. ALIGN 32
  ; Start of every subsequent 256-byte pass: reload all sixteen broadcast
  ; state words from the stack and advance the counter lanes by $L$four.
  1217. $L$oop_outer4xop::
  1218. vmovdqa xmm8,XMMWORD PTR[64+rsp]
  1219. vmovdqa xmm9,XMMWORD PTR[80+rsp]
  1220. vmovdqa xmm10,XMMWORD PTR[96+rsp]
  1221. vmovdqa xmm11,XMMWORD PTR[112+rsp]
  1222. vmovdqa xmm0,XMMWORD PTR[((128-256))+rcx]
  1223. vmovdqa xmm1,XMMWORD PTR[((144-256))+rcx]
  1224. vmovdqa xmm2,XMMWORD PTR[((160-256))+rcx]
  1225. vmovdqa xmm3,XMMWORD PTR[((176-256))+rcx]
  1226. vmovdqa xmm12,XMMWORD PTR[((192-256))+rcx]
  1227. vmovdqa xmm13,XMMWORD PTR[((208-256))+rcx]
  1228. vmovdqa xmm14,XMMWORD PTR[((224-256))+rcx]
  1229. vmovdqa xmm15,XMMWORD PTR[((240-256))+rcx]
  1230. vmovdqa xmm4,XMMWORD PTR[((256-256))+rcx]
  1231. vmovdqa xmm5,XMMWORD PTR[((272-256))+rcx]
  1232. vmovdqa xmm6,XMMWORD PTR[((288-256))+rcx]
  1233. vmovdqa xmm7,XMMWORD PTR[((304-256))+rcx]
  1234. vpaddd xmm4,xmm4,XMMWORD PTR[$L$four]
  1235. $L$oop_enter4xop::
  ; eax counts iterations of $L$oop4xop; each iteration is one column half
  ; followed by one diagonal half, so 10 iterations give 20 rounds.
  1236. mov eax,10
  ; Save the counter lanes used for this pass; they are added back after the
  ; rounds and serve as the base for the next pass.
  1237. vmovdqa XMMWORD PTR[(256-256)+rcx],xmm4
  1238. jmp $L$oop4xop
  1239. ALIGN 32
  1240. $L$oop4xop::
  ; The DB sequences in this loop, 143,232,120,194,<modrm>,<imm>, are the XOP
  ; encoding of "vprotd xmmN,xmmN,imm" (rotate every dword left by imm); the
  ; ModRM byte selects the same register as source and destination. They are
  ; presumably spelled as bytes because the assembler lacks XOP mnemonics.
  ;
  ; ---- Column half: four quarter-rounds on (a[i], b[i], c[i], d[i]) ----
  ; a += b ; d ^= a
  1241. vpaddd xmm8,xmm8,xmm0
  1242. vpaddd xmm9,xmm9,xmm1
  1243. vpaddd xmm10,xmm10,xmm2
  1244. vpaddd xmm11,xmm11,xmm3
  1245. vpxor xmm4,xmm8,xmm4
  1246. vpxor xmm5,xmm9,xmm5
  1247. vpxor xmm6,xmm10,xmm6
  1248. vpxor xmm7,xmm11,xmm7
  ; d <<<= 16  (xmm4, xmm5, xmm6, xmm7)
  1249. DB 143,232,120,194,228,16
  1250. DB 143,232,120,194,237,16
  1251. DB 143,232,120,194,246,16
  1252. DB 143,232,120,194,255,16
  ; c += d ; b ^= c
  1253. vpaddd xmm12,xmm12,xmm4
  1254. vpaddd xmm13,xmm13,xmm5
  1255. vpaddd xmm14,xmm14,xmm6
  1256. vpaddd xmm15,xmm15,xmm7
  1257. vpxor xmm0,xmm12,xmm0
  1258. vpxor xmm1,xmm13,xmm1
  1259. vpxor xmm2,xmm2,xmm14
  1260. vpxor xmm3,xmm3,xmm15
  ; b <<<= 12  (xmm0, xmm1, xmm2, xmm3)
  1261. DB 143,232,120,194,192,12
  1262. DB 143,232,120,194,201,12
  1263. DB 143,232,120,194,210,12
  1264. DB 143,232,120,194,219,12
  ; a += b ; d ^= a
  1265. vpaddd xmm8,xmm0,xmm8
  1266. vpaddd xmm9,xmm1,xmm9
  1267. vpaddd xmm10,xmm10,xmm2
  1268. vpaddd xmm11,xmm11,xmm3
  1269. vpxor xmm4,xmm8,xmm4
  1270. vpxor xmm5,xmm9,xmm5
  1271. vpxor xmm6,xmm10,xmm6
  1272. vpxor xmm7,xmm11,xmm7
  ; d <<<= 8
  1273. DB 143,232,120,194,228,8
  1274. DB 143,232,120,194,237,8
  1275. DB 143,232,120,194,246,8
  1276. DB 143,232,120,194,255,8
  ; c += d ; b ^= c
  1277. vpaddd xmm12,xmm12,xmm4
  1278. vpaddd xmm13,xmm13,xmm5
  1279. vpaddd xmm14,xmm14,xmm6
  1280. vpaddd xmm15,xmm15,xmm7
  1281. vpxor xmm0,xmm12,xmm0
  1282. vpxor xmm1,xmm13,xmm1
  1283. vpxor xmm2,xmm2,xmm14
  1284. vpxor xmm3,xmm3,xmm15
  ; b <<<= 7
  1285. DB 143,232,120,194,192,7
  1286. DB 143,232,120,194,201,7
  1287. DB 143,232,120,194,210,7
  1288. DB 143,232,120,194,219,7
  ; ---- Diagonal half: same quarter-round, with rows b, c, d taken at
  ; rotated positions: (a0,b1,c2,d3) (a1,b2,c3,d0) (a2,b3,c0,d1) (a3,b0,c1,d2).
  ; No data is moved; only the register pairing changes.
  ; a += b ; d ^= a
  1289. vpaddd xmm8,xmm8,xmm1
  1290. vpaddd xmm9,xmm9,xmm2
  1291. vpaddd xmm10,xmm10,xmm3
  1292. vpaddd xmm11,xmm11,xmm0
  1293. vpxor xmm7,xmm8,xmm7
  1294. vpxor xmm4,xmm9,xmm4
  1295. vpxor xmm5,xmm10,xmm5
  1296. vpxor xmm6,xmm11,xmm6
  ; d <<<= 16  (xmm7, xmm4, xmm5, xmm6)
  1297. DB 143,232,120,194,255,16
  1298. DB 143,232,120,194,228,16
  1299. DB 143,232,120,194,237,16
  1300. DB 143,232,120,194,246,16
  ; c += d ; b ^= c
  1301. vpaddd xmm14,xmm14,xmm7
  1302. vpaddd xmm15,xmm15,xmm4
  1303. vpaddd xmm12,xmm12,xmm5
  1304. vpaddd xmm13,xmm13,xmm6
  1305. vpxor xmm1,xmm14,xmm1
  1306. vpxor xmm2,xmm15,xmm2
  1307. vpxor xmm3,xmm3,xmm12
  1308. vpxor xmm0,xmm0,xmm13
  ; b <<<= 12  (xmm1, xmm2, xmm3, xmm0)
  1309. DB 143,232,120,194,201,12
  1310. DB 143,232,120,194,210,12
  1311. DB 143,232,120,194,219,12
  1312. DB 143,232,120,194,192,12
  ; a += b ; d ^= a
  1313. vpaddd xmm8,xmm1,xmm8
  1314. vpaddd xmm9,xmm2,xmm9
  1315. vpaddd xmm10,xmm10,xmm3
  1316. vpaddd xmm11,xmm11,xmm0
  1317. vpxor xmm7,xmm8,xmm7
  1318. vpxor xmm4,xmm9,xmm4
  1319. vpxor xmm5,xmm10,xmm5
  1320. vpxor xmm6,xmm11,xmm6
  ; d <<<= 8
  1321. DB 143,232,120,194,255,8
  1322. DB 143,232,120,194,228,8
  1323. DB 143,232,120,194,237,8
  1324. DB 143,232,120,194,246,8
  ; c += d ; b ^= c
  1325. vpaddd xmm14,xmm14,xmm7
  1326. vpaddd xmm15,xmm15,xmm4
  1327. vpaddd xmm12,xmm12,xmm5
  1328. vpaddd xmm13,xmm13,xmm6
  1329. vpxor xmm1,xmm14,xmm1
  1330. vpxor xmm2,xmm15,xmm2
  1331. vpxor xmm3,xmm3,xmm12
  1332. vpxor xmm0,xmm0,xmm13
  ; b <<<= 7
  1333. DB 143,232,120,194,201,7
  1334. DB 143,232,120,194,210,7
  1335. DB 143,232,120,194,219,7
  1336. DB 143,232,120,194,192,7
  1337. dec eax
  1338. jnz $L$oop4xop
  ; Rounds finished. For each row: add the saved input words back in, then
  ; transpose the 4x4 dword matrix (vpunpck{l,h}dq followed by
  ; vpunpck{l,h}qdq) so every register holds 16 contiguous bytes of a single
  ; block instead of one word of four blocks.
  ;
  ; Row a. xmm14/xmm15 (row c words 2 and 3) are parked at [32+rsp]/[48+rsp]
  ; first so they can serve as transpose temporaries.
  1339. vpaddd xmm8,xmm8,XMMWORD PTR[64+rsp]
  1340. vpaddd xmm9,xmm9,XMMWORD PTR[80+rsp]
  1341. vpaddd xmm10,xmm10,XMMWORD PTR[96+rsp]
  1342. vpaddd xmm11,xmm11,XMMWORD PTR[112+rsp]
  1343. vmovdqa XMMWORD PTR[32+rsp],xmm14
  1344. vmovdqa XMMWORD PTR[48+rsp],xmm15
  1345. vpunpckldq xmm14,xmm8,xmm9
  1346. vpunpckldq xmm15,xmm10,xmm11
  1347. vpunpckhdq xmm8,xmm8,xmm9
  1348. vpunpckhdq xmm10,xmm10,xmm11
  ; Row a result: block0 = xmm9, block1 = xmm14, block2 = xmm11, block3 = xmm8
  1349. vpunpcklqdq xmm9,xmm14,xmm15
  1350. vpunpckhqdq xmm14,xmm14,xmm15
  1351. vpunpcklqdq xmm11,xmm8,xmm10
  1352. vpunpckhqdq xmm8,xmm8,xmm10
  ; Row b.
  1353. vpaddd xmm0,xmm0,XMMWORD PTR[((128-256))+rcx]
  1354. vpaddd xmm1,xmm1,XMMWORD PTR[((144-256))+rcx]
  1355. vpaddd xmm2,xmm2,XMMWORD PTR[((160-256))+rcx]
  1356. vpaddd xmm3,xmm3,XMMWORD PTR[((176-256))+rcx]
  ; Park row a of blocks 0 and 1 at [rsp]/[16+rsp] and bring the parked row c
  ; words back, now in xmm9/xmm14.
  1357. vmovdqa XMMWORD PTR[rsp],xmm9
  1358. vmovdqa XMMWORD PTR[16+rsp],xmm14
  1359. vmovdqa xmm9,XMMWORD PTR[32+rsp]
  1360. vmovdqa xmm14,XMMWORD PTR[48+rsp]
  1361. vpunpckldq xmm10,xmm0,xmm1
  1362. vpunpckldq xmm15,xmm2,xmm3
  1363. vpunpckhdq xmm0,xmm0,xmm1
  1364. vpunpckhdq xmm2,xmm2,xmm3
  ; Row b result: block0 = xmm1, block1 = xmm10, block2 = xmm3, block3 = xmm0
  1365. vpunpcklqdq xmm1,xmm10,xmm15
  1366. vpunpckhqdq xmm10,xmm10,xmm15
  1367. vpunpcklqdq xmm3,xmm0,xmm2
  1368. vpunpckhqdq xmm0,xmm0,xmm2
  ; Row c (words 2 and 3 live in xmm9/xmm14 at this point).
  1369. vpaddd xmm12,xmm12,XMMWORD PTR[((192-256))+rcx]
  1370. vpaddd xmm13,xmm13,XMMWORD PTR[((208-256))+rcx]
  1371. vpaddd xmm9,xmm9,XMMWORD PTR[((224-256))+rcx]
  1372. vpaddd xmm14,xmm14,XMMWORD PTR[((240-256))+rcx]
  1373. vpunpckldq xmm2,xmm12,xmm13
  1374. vpunpckldq xmm15,xmm9,xmm14
  1375. vpunpckhdq xmm12,xmm12,xmm13
  1376. vpunpckhdq xmm9,xmm9,xmm14
  ; Row c result: block0 = xmm13, block1 = xmm2, block2 = xmm14, block3 = xmm12
  1377. vpunpcklqdq xmm13,xmm2,xmm15
  1378. vpunpckhqdq xmm2,xmm2,xmm15
  1379. vpunpcklqdq xmm14,xmm12,xmm9
  1380. vpunpckhqdq xmm12,xmm12,xmm9
  ; Row d (the first word is added to the counter lanes saved for this pass).
  1381. vpaddd xmm4,xmm4,XMMWORD PTR[((256-256))+rcx]
  1382. vpaddd xmm5,xmm5,XMMWORD PTR[((272-256))+rcx]
  1383. vpaddd xmm6,xmm6,XMMWORD PTR[((288-256))+rcx]
  1384. vpaddd xmm7,xmm7,XMMWORD PTR[((304-256))+rcx]
  1385. vpunpckldq xmm9,xmm4,xmm5
  1386. vpunpckldq xmm15,xmm6,xmm7
  1387. vpunpckhdq xmm4,xmm4,xmm5
  1388. vpunpckhdq xmm6,xmm6,xmm7
  ; Row d result: block0 = xmm5, block1 = xmm9, block2 = xmm7, block3 = xmm4
  1389. vpunpcklqdq xmm5,xmm9,xmm15
  1390. vpunpckhqdq xmm9,xmm9,xmm15
  1391. vpunpcklqdq xmm7,xmm4,xmm6
  1392. vpunpckhqdq xmm4,xmm4,xmm6
  ; Reload row a of blocks 0 and 1. Keystream is now laid out as:
  ;   block0 = xmm6,  xmm1,  xmm13, xmm5
  ;   block1 = xmm15, xmm10, xmm2,  xmm9
  ;   block2 = xmm11, xmm3,  xmm14, xmm7
  ;   block3 = xmm8,  xmm0,  xmm12, xmm4
  1393. vmovdqa xmm6,XMMWORD PTR[rsp]
  1394. vmovdqa xmm15,XMMWORD PTR[16+rsp]
  ; Fewer than 256 bytes left: take the partial-output path.
  1395. cmp rdx,64*4
  1396. jb $L$tail4xop
  ; Full 256 bytes: XOR all four keystream blocks with the input...
  1397. vpxor xmm6,xmm6,XMMWORD PTR[rsi]
  1398. vpxor xmm1,xmm1,XMMWORD PTR[16+rsi]
  1399. vpxor xmm13,xmm13,XMMWORD PTR[32+rsi]
  1400. vpxor xmm5,xmm5,XMMWORD PTR[48+rsi]
  1401. vpxor xmm15,xmm15,XMMWORD PTR[64+rsi]
  1402. vpxor xmm10,xmm10,XMMWORD PTR[80+rsi]
  1403. vpxor xmm2,xmm2,XMMWORD PTR[96+rsi]
  1404. vpxor xmm9,xmm9,XMMWORD PTR[112+rsi]
  1405. lea rsi,QWORD PTR[128+rsi]
  1406. vpxor xmm11,xmm11,XMMWORD PTR[rsi]
  1407. vpxor xmm3,xmm3,XMMWORD PTR[16+rsi]
  1408. vpxor xmm14,xmm14,XMMWORD PTR[32+rsi]
  1409. vpxor xmm7,xmm7,XMMWORD PTR[48+rsi]
  1410. vpxor xmm8,xmm8,XMMWORD PTR[64+rsi]
  1411. vpxor xmm0,xmm0,XMMWORD PTR[80+rsi]
  1412. vpxor xmm12,xmm12,XMMWORD PTR[96+rsi]
  1413. vpxor xmm4,xmm4,XMMWORD PTR[112+rsi]
  1414. lea rsi,QWORD PTR[128+rsi]
  ; ...and write the 256 result bytes.
  1415. vmovdqu XMMWORD PTR[rdi],xmm6
  1416. vmovdqu XMMWORD PTR[16+rdi],xmm1
  1417. vmovdqu XMMWORD PTR[32+rdi],xmm13
  1418. vmovdqu XMMWORD PTR[48+rdi],xmm5
  1419. vmovdqu XMMWORD PTR[64+rdi],xmm15
  1420. vmovdqu XMMWORD PTR[80+rdi],xmm10
  1421. vmovdqu XMMWORD PTR[96+rdi],xmm2
  1422. vmovdqu XMMWORD PTR[112+rdi],xmm9
  1423. lea rdi,QWORD PTR[128+rdi]
  1424. vmovdqu XMMWORD PTR[rdi],xmm11
  1425. vmovdqu XMMWORD PTR[16+rdi],xmm3
  1426. vmovdqu XMMWORD PTR[32+rdi],xmm14
  1427. vmovdqu XMMWORD PTR[48+rdi],xmm7
  1428. vmovdqu XMMWORD PTR[64+rdi],xmm8
  1429. vmovdqu XMMWORD PTR[80+rdi],xmm0
  1430. vmovdqu XMMWORD PTR[96+rdi],xmm12
  1431. vmovdqu XMMWORD PTR[112+rdi],xmm4
  1432. lea rdi,QWORD PTR[128+rdi]
  ; Another pass if any bytes remain, otherwise finish.
  1433. sub rdx,64*4
  1434. jnz $L$oop_outer4xop
  1435. jmp $L$done4xop
  1436. ALIGN 32
  ; Partial pass (rdx < 256). Whole 64-byte blocks are handled with vector
  ; XORs; the keystream block that covers the remaining 1..63 bytes is copied
  ; to [rsp .. rsp+63] for the byte-wise loop at $L$oop_tail4xop.
  1437. $L$tail4xop::
  1438. cmp rdx,192
  1439. jae $L$192_or_more4xop
  1440. cmp rdx,128
  1441. jae $L$128_or_more4xop
  1442. cmp rdx,64
  1443. jae $L$64_or_more4xop
  ; rdx < 64: only keystream block 0 is needed. r10 = byte index for the loop.
  1444. xor r10,r10
  1445. vmovdqa XMMWORD PTR[rsp],xmm6
  1446. vmovdqa XMMWORD PTR[16+rsp],xmm1
  1447. vmovdqa XMMWORD PTR[32+rsp],xmm13
  1448. vmovdqa XMMWORD PTR[48+rsp],xmm5
  1449. jmp $L$oop_tail4xop
  1450. ALIGN 32
  ; 64 <= rdx < 128: emit block 0 with vector XORs.
  1451. $L$64_or_more4xop::
  1452. vpxor xmm6,xmm6,XMMWORD PTR[rsi]
  1453. vpxor xmm1,xmm1,XMMWORD PTR[16+rsi]
  1454. vpxor xmm13,xmm13,XMMWORD PTR[32+rsi]
  1455. vpxor xmm5,xmm5,XMMWORD PTR[48+rsi]
  1456. vmovdqu XMMWORD PTR[rdi],xmm6
  1457. vmovdqu XMMWORD PTR[16+rdi],xmm1
  1458. vmovdqu XMMWORD PTR[32+rdi],xmm13
  1459. vmovdqu XMMWORD PTR[48+rdi],xmm5
  ; ZF still comes from "cmp rdx,64" in $L$tail4xop (the vector instructions
  ; in between do not write flags): exactly 64 bytes means nothing is left.
  1460. je $L$done4xop
  ; Otherwise stage keystream block 1 for the byte loop and step past the 64
  ; bytes already handled.
  1461. lea rsi,QWORD PTR[64+rsi]
  1462. vmovdqa XMMWORD PTR[rsp],xmm15
  1463. xor r10,r10
  1464. vmovdqa XMMWORD PTR[16+rsp],xmm10
  1465. lea rdi,QWORD PTR[64+rdi]
  1466. vmovdqa XMMWORD PTR[32+rsp],xmm2
  1467. sub rdx,64
  1468. vmovdqa XMMWORD PTR[48+rsp],xmm9
  1469. jmp $L$oop_tail4xop
  1470. ALIGN 32
  ; 128 <= rdx < 192: emit blocks 0 and 1 with vector XORs.
  1471. $L$128_or_more4xop::
  1472. vpxor xmm6,xmm6,XMMWORD PTR[rsi]
  1473. vpxor xmm1,xmm1,XMMWORD PTR[16+rsi]
  1474. vpxor xmm13,xmm13,XMMWORD PTR[32+rsi]
  1475. vpxor xmm5,xmm5,XMMWORD PTR[48+rsi]
  1476. vpxor xmm15,xmm15,XMMWORD PTR[64+rsi]
  1477. vpxor xmm10,xmm10,XMMWORD PTR[80+rsi]
  1478. vpxor xmm2,xmm2,XMMWORD PTR[96+rsi]
  1479. vpxor xmm9,xmm9,XMMWORD PTR[112+rsi]
  1480. vmovdqu XMMWORD PTR[rdi],xmm6
  1481. vmovdqu XMMWORD PTR[16+rdi],xmm1
  1482. vmovdqu XMMWORD PTR[32+rdi],xmm13
  1483. vmovdqu XMMWORD PTR[48+rdi],xmm5
  1484. vmovdqu XMMWORD PTR[64+rdi],xmm15
  1485. vmovdqu XMMWORD PTR[80+rdi],xmm10
  1486. vmovdqu XMMWORD PTR[96+rdi],xmm2
  1487. vmovdqu XMMWORD PTR[112+rdi],xmm9
  ; ZF from "cmp rdx,128": exactly 128 bytes means nothing is left.
  1488. je $L$done4xop
  ; Otherwise stage keystream block 2 for the byte loop and step past the 128
  ; bytes already handled.
  1489. lea rsi,QWORD PTR[128+rsi]
  1490. vmovdqa XMMWORD PTR[rsp],xmm11
  1491. xor r10,r10
  1492. vmovdqa XMMWORD PTR[16+rsp],xmm3
  1493. lea rdi,QWORD PTR[128+rdi]
  1494. vmovdqa XMMWORD PTR[32+rsp],xmm14
  1495. sub rdx,128
  1496. vmovdqa XMMWORD PTR[48+rsp],xmm7
  1497. jmp $L$oop_tail4xop
  1498. ALIGN 32
  ; 192 <= rdx < 256: emit blocks 0, 1 and 2 with vector XORs.
  1499. $L$192_or_more4xop::
  1500. vpxor xmm6,xmm6,XMMWORD PTR[rsi]
  1501. vpxor xmm1,xmm1,XMMWORD PTR[16+rsi]
  1502. vpxor xmm13,xmm13,XMMWORD PTR[32+rsi]
  1503. vpxor xmm5,xmm5,XMMWORD PTR[48+rsi]
  1504. vpxor xmm15,xmm15,XMMWORD PTR[64+rsi]
  1505. vpxor xmm10,xmm10,XMMWORD PTR[80+rsi]
  1506. vpxor xmm2,xmm2,XMMWORD PTR[96+rsi]
  1507. vpxor xmm9,xmm9,XMMWORD PTR[112+rsi]
  ; lea does not modify flags, so ZF from "cmp rdx,192" survives to the je.
  1508. lea rsi,QWORD PTR[128+rsi]
  1509. vpxor xmm11,xmm11,XMMWORD PTR[rsi]
  1510. vpxor xmm3,xmm3,XMMWORD PTR[16+rsi]
  1511. vpxor xmm14,xmm14,XMMWORD PTR[32+rsi]
  1512. vpxor xmm7,xmm7,XMMWORD PTR[48+rsi]
  1513. vmovdqu XMMWORD PTR[rdi],xmm6
  1514. vmovdqu XMMWORD PTR[16+rdi],xmm1
  1515. vmovdqu XMMWORD PTR[32+rdi],xmm13
  1516. vmovdqu XMMWORD PTR[48+rdi],xmm5
  1517. vmovdqu XMMWORD PTR[64+rdi],xmm15
  1518. vmovdqu XMMWORD PTR[80+rdi],xmm10
  1519. vmovdqu XMMWORD PTR[96+rdi],xmm2
  1520. vmovdqu XMMWORD PTR[112+rdi],xmm9
  1521. lea rdi,QWORD PTR[128+rdi]
  1522. vmovdqu XMMWORD PTR[rdi],xmm11
  1523. vmovdqu XMMWORD PTR[16+rdi],xmm3
  1524. vmovdqu XMMWORD PTR[32+rdi],xmm14
  1525. vmovdqu XMMWORD PTR[48+rdi],xmm7
  ; ZF from "cmp rdx,192": exactly 192 bytes means nothing is left.
  1526. je $L$done4xop
  ; Otherwise stage keystream block 3 for the byte loop. rsi/rdi were already
  ; advanced by 128 above, so 64 more steps past all 192 handled bytes.
  1527. lea rsi,QWORD PTR[64+rsi]
  1528. vmovdqa XMMWORD PTR[rsp],xmm8
  1529. xor r10,r10
  1530. vmovdqa XMMWORD PTR[16+rsp],xmm0
  1531. lea rdi,QWORD PTR[64+rdi]
  1532. vmovdqa XMMWORD PTR[32+rsp],xmm12
  1533. sub rdx,192
  1534. vmovdqa XMMWORD PTR[48+rsp],xmm4
  ; Byte-wise tail: out[r10] = in[r10] XOR keystream[r10] for the remaining
  ; rdx bytes, with the keystream read from [rsp + r10].
  ; rcx is reused as scratch here; every path leaving this loop goes to
  ; $L$done4xop, so the state pointer it held is no longer needed.
  1535. $L$oop_tail4xop::
  1536. movzx eax,BYTE PTR[r10*1+rsi]
  1537. movzx ecx,BYTE PTR[r10*1+rsp]
  1538. lea r10,QWORD PTR[1+r10]
  1539. xor eax,ecx
  ; r10 was already incremented, hence the -1 displacement.
  1540. mov BYTE PTR[((-1))+r10*1+rdi],al
  1541. dec rdx
  1542. jnz $L$oop_tail4xop
  ; Common exit: restore the callee-saved xmm registers and the entry rsp.
  1543. $L$done4xop::
  1544. vzeroupper
  1545. movaps xmm6,XMMWORD PTR[((-168))+r9]
  1546. movaps xmm7,XMMWORD PTR[((-152))+r9]
  1547. movaps xmm8,XMMWORD PTR[((-136))+r9]
  1548. movaps xmm9,XMMWORD PTR[((-120))+r9]
  1549. movaps xmm10,XMMWORD PTR[((-104))+r9]
  1550. movaps xmm11,XMMWORD PTR[((-88))+r9]
  1551. movaps xmm12,XMMWORD PTR[((-72))+r9]
  1552. movaps xmm13,XMMWORD PTR[((-56))+r9]
  1553. movaps xmm14,XMMWORD PTR[((-40))+r9]
  1554. movaps xmm15,XMMWORD PTR[((-24))+r9]
  1555. lea rsp,QWORD PTR[r9]
  1556. $L$4xop_epilogue::
  ; Reload rdi/rsi from the home slots written in the prologue.
  1557. mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
  1558. mov rsi,QWORD PTR[16+rsp]
  ; 0F3h,0C3h encodes "rep ret".
  1559. DB 0F3h,0C3h ;repret
  1560. $L$SEH_end_ChaCha20_4xop::
  1561. ChaCha20_4xop ENDP
  1562. ALIGN 32
  1563. ChaCha20_8x PROC PRIVATE
  1564. mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
  1565. mov QWORD PTR[16+rsp],rsi
  1566. mov rax,rsp
  1567. $L$SEH_begin_ChaCha20_8x::
  1568. mov rdi,rcx
  1569. mov rsi,rdx
  1570. mov rdx,r8
  1571. mov rcx,r9
  1572. mov r8,QWORD PTR[40+rsp]
  1573. $L$ChaCha20_8x::
  1574. mov r9,rsp
  1575. sub rsp,0280h+168
  1576. and rsp,-32
  1577. movaps XMMWORD PTR[(-168)+r9],xmm6
  1578. movaps XMMWORD PTR[(-152)+r9],xmm7
  1579. movaps XMMWORD PTR[(-136)+r9],xmm8
  1580. movaps XMMWORD PTR[(-120)+r9],xmm9
  1581. movaps XMMWORD PTR[(-104)+r9],xmm10
  1582. movaps XMMWORD PTR[(-88)+r9],xmm11
  1583. movaps XMMWORD PTR[(-72)+r9],xmm12
  1584. movaps XMMWORD PTR[(-56)+r9],xmm13
  1585. movaps XMMWORD PTR[(-40)+r9],xmm14
  1586. movaps XMMWORD PTR[(-24)+r9],xmm15
  1587. $L$8x_body::
  1588. vzeroupper
  1589. vbroadcasti128 ymm11,XMMWORD PTR[$L$sigma]
  1590. vbroadcasti128 ymm3,XMMWORD PTR[rcx]
  1591. vbroadcasti128 ymm15,XMMWORD PTR[16+rcx]
  1592. vbroadcasti128 ymm7,XMMWORD PTR[r8]
  1593. lea rcx,QWORD PTR[256+rsp]
  1594. lea rax,QWORD PTR[512+rsp]
  1595. lea r10,QWORD PTR[$L$rot16]
  1596. lea r11,QWORD PTR[$L$rot24]
  1597. vpshufd ymm8,ymm11,000h
  1598. vpshufd ymm9,ymm11,055h
  1599. vmovdqa YMMWORD PTR[(128-256)+rcx],ymm8
  1600. vpshufd ymm10,ymm11,0aah
  1601. vmovdqa YMMWORD PTR[(160-256)+rcx],ymm9
  1602. vpshufd ymm11,ymm11,0ffh
  1603. vmovdqa YMMWORD PTR[(192-256)+rcx],ymm10
  1604. vmovdqa YMMWORD PTR[(224-256)+rcx],ymm11
  1605. vpshufd ymm0,ymm3,000h
  1606. vpshufd ymm1,ymm3,055h
  1607. vmovdqa YMMWORD PTR[(256-256)+rcx],ymm0
  1608. vpshufd ymm2,ymm3,0aah
  1609. vmovdqa YMMWORD PTR[(288-256)+rcx],ymm1
  1610. vpshufd ymm3,ymm3,0ffh
  1611. vmovdqa YMMWORD PTR[(320-256)+rcx],ymm2
  1612. vmovdqa YMMWORD PTR[(352-256)+rcx],ymm3
  1613. vpshufd ymm12,ymm15,000h
  1614. vpshufd ymm13,ymm15,055h
  1615. vmovdqa YMMWORD PTR[(384-512)+rax],ymm12
  1616. vpshufd ymm14,ymm15,0aah
  1617. vmovdqa YMMWORD PTR[(416-512)+rax],ymm13
  1618. vpshufd ymm15,ymm15,0ffh
  1619. vmovdqa YMMWORD PTR[(448-512)+rax],ymm14
  1620. vmovdqa YMMWORD PTR[(480-512)+rax],ymm15
  1621. vpshufd ymm4,ymm7,000h
  1622. vpshufd ymm5,ymm7,055h
  1623. vpaddd ymm4,ymm4,YMMWORD PTR[$L$incy]
  1624. vpshufd ymm6,ymm7,0aah
  1625. vmovdqa YMMWORD PTR[(544-512)+rax],ymm5
  1626. vpshufd ymm7,ymm7,0ffh
  1627. vmovdqa YMMWORD PTR[(576-512)+rax],ymm6
  1628. vmovdqa YMMWORD PTR[(608-512)+rax],ymm7
  1629. jmp $L$oop_enter8x
  1630. ALIGN 32
  1631. $L$oop_outer8x::
  1632. vmovdqa ymm8,YMMWORD PTR[((128-256))+rcx]
  1633. vmovdqa ymm9,YMMWORD PTR[((160-256))+rcx]
  1634. vmovdqa ymm10,YMMWORD PTR[((192-256))+rcx]
  1635. vmovdqa ymm11,YMMWORD PTR[((224-256))+rcx]
  1636. vmovdqa ymm0,YMMWORD PTR[((256-256))+rcx]
  1637. vmovdqa ymm1,YMMWORD PTR[((288-256))+rcx]
  1638. vmovdqa ymm2,YMMWORD PTR[((320-256))+rcx]
  1639. vmovdqa ymm3,YMMWORD PTR[((352-256))+rcx]
  1640. vmovdqa ymm12,YMMWORD PTR[((384-512))+rax]
  1641. vmovdqa ymm13,YMMWORD PTR[((416-512))+rax]
  1642. vmovdqa ymm14,YMMWORD PTR[((448-512))+rax]
  1643. vmovdqa ymm15,YMMWORD PTR[((480-512))+rax]
  1644. vmovdqa ymm4,YMMWORD PTR[((512-512))+rax]
  1645. vmovdqa ymm5,YMMWORD PTR[((544-512))+rax]
  1646. vmovdqa ymm6,YMMWORD PTR[((576-512))+rax]
  1647. vmovdqa ymm7,YMMWORD PTR[((608-512))+rax]
  1648. vpaddd ymm4,ymm4,YMMWORD PTR[$L$eight]
  1649. $L$oop_enter8x::
  1650. vmovdqa YMMWORD PTR[64+rsp],ymm14
  1651. vmovdqa YMMWORD PTR[96+rsp],ymm15
  1652. vbroadcasti128 ymm15,XMMWORD PTR[r10]
  1653. vmovdqa YMMWORD PTR[(512-512)+rax],ymm4
  1654. mov eax,10
  1655. jmp $L$oop8x
  1656. ALIGN 32
  1657. $L$oop8x::
  1658. vpaddd ymm8,ymm8,ymm0
  1659. vpxor ymm4,ymm8,ymm4
  1660. vpshufb ymm4,ymm4,ymm15
  1661. vpaddd ymm9,ymm9,ymm1
  1662. vpxor ymm5,ymm9,ymm5
  1663. vpshufb ymm5,ymm5,ymm15
  1664. vpaddd ymm12,ymm12,ymm4
  1665. vpxor ymm0,ymm12,ymm0
  1666. vpslld ymm14,ymm0,12
  1667. vpsrld ymm0,ymm0,20
  1668. vpor ymm0,ymm14,ymm0
  1669. vbroadcasti128 ymm14,XMMWORD PTR[r11]
  1670. vpaddd ymm13,ymm13,ymm5
  1671. vpxor ymm1,ymm13,ymm1
  1672. vpslld ymm15,ymm1,12
  1673. vpsrld ymm1,ymm1,20
  1674. vpor ymm1,ymm15,ymm1
  1675. vpaddd ymm8,ymm8,ymm0
  1676. vpxor ymm4,ymm8,ymm4
  1677. vpshufb ymm4,ymm4,ymm14
  1678. vpaddd ymm9,ymm9,ymm1
  1679. vpxor ymm5,ymm9,ymm5
  1680. vpshufb ymm5,ymm5,ymm14
  1681. vpaddd ymm12,ymm12,ymm4
  1682. vpxor ymm0,ymm12,ymm0
  1683. vpslld ymm15,ymm0,7
  1684. vpsrld ymm0,ymm0,25
  1685. vpor ymm0,ymm15,ymm0
  1686. vbroadcasti128 ymm15,XMMWORD PTR[r10]
  1687. vpaddd ymm13,ymm13,ymm5
  1688. vpxor ymm1,ymm13,ymm1
  1689. vpslld ymm14,ymm1,7
  1690. vpsrld ymm1,ymm1,25
  1691. vpor ymm1,ymm14,ymm1
  1692. vmovdqa YMMWORD PTR[rsp],ymm12
  1693. vmovdqa YMMWORD PTR[32+rsp],ymm13
  1694. vmovdqa ymm12,YMMWORD PTR[64+rsp]
  1695. vmovdqa ymm13,YMMWORD PTR[96+rsp]
  1696. vpaddd ymm10,ymm10,ymm2
  1697. vpxor ymm6,ymm10,ymm6
  1698. vpshufb ymm6,ymm6,ymm15
  1699. vpaddd ymm11,ymm11,ymm3
  1700. vpxor ymm7,ymm11,ymm7
  1701. vpshufb ymm7,ymm7,ymm15
  1702. vpaddd ymm12,ymm12,ymm6
  1703. vpxor ymm2,ymm12,ymm2
  1704. vpslld ymm14,ymm2,12
  1705. vpsrld ymm2,ymm2,20
  1706. vpor ymm2,ymm14,ymm2
  1707. vbroadcasti128 ymm14,XMMWORD PTR[r11]
  1708. vpaddd ymm13,ymm13,ymm7
  1709. vpxor ymm3,ymm13,ymm3
  1710. vpslld ymm15,ymm3,12
  1711. vpsrld ymm3,ymm3,20
  1712. vpor ymm3,ymm15,ymm3
  1713. vpaddd ymm10,ymm10,ymm2
  1714. vpxor ymm6,ymm10,ymm6
  1715. vpshufb ymm6,ymm6,ymm14
  1716. vpaddd ymm11,ymm11,ymm3
  1717. vpxor ymm7,ymm11,ymm7
  1718. vpshufb ymm7,ymm7,ymm14
  1719. vpaddd ymm12,ymm12,ymm6
  1720. vpxor ymm2,ymm12,ymm2
  1721. vpslld ymm15,ymm2,7
  1722. vpsrld ymm2,ymm2,25
  1723. vpor ymm2,ymm15,ymm2
  1724. vbroadcasti128 ymm15,XMMWORD PTR[r10]
  1725. vpaddd ymm13,ymm13,ymm7
  1726. vpxor ymm3,ymm13,ymm3
  1727. vpslld ymm14,ymm3,7
  1728. vpsrld ymm3,ymm3,25
  1729. vpor ymm3,ymm14,ymm3
  1730. vpaddd ymm8,ymm8,ymm1
  1731. vpxor ymm7,ymm8,ymm7
  1732. vpshufb ymm7,ymm7,ymm15
  1733. vpaddd ymm9,ymm9,ymm2
  1734. vpxor ymm4,ymm9,ymm4
  1735. vpshufb ymm4,ymm4,ymm15
  1736. vpaddd ymm12,ymm12,ymm7
  1737. vpxor ymm1,ymm12,ymm1
  1738. vpslld ymm14,ymm1,12
  1739. vpsrld ymm1,ymm1,20
  1740. vpor ymm1,ymm14,ymm1
  1741. vbroadcasti128 ymm14,XMMWORD PTR[r11]
  1742. vpaddd ymm13,ymm13,ymm4
  1743. vpxor ymm2,ymm13,ymm2
  1744. vpslld ymm15,ymm2,12
  1745. vpsrld ymm2,ymm2,20
  1746. vpor ymm2,ymm15,ymm2
  1747. vpaddd ymm8,ymm8,ymm1
  1748. vpxor ymm7,ymm8,ymm7
  1749. vpshufb ymm7,ymm7,ymm14
  1750. vpaddd ymm9,ymm9,ymm2
  1751. vpxor ymm4,ymm9,ymm4
  1752. vpshufb ymm4,ymm4,ymm14
  1753. vpaddd ymm12,ymm12,ymm7
  1754. vpxor ymm1,ymm12,ymm1
  1755. vpslld ymm15,ymm1,7
  1756. vpsrld ymm1,ymm1,25
  1757. vpor ymm1,ymm15,ymm1
  1758. vbroadcasti128 ymm15,XMMWORD PTR[r10]
  1759. vpaddd ymm13,ymm13,ymm4
  1760. vpxor ymm2,ymm13,ymm2
  1761. vpslld ymm14,ymm2,7
  1762. vpsrld ymm2,ymm2,25
  1763. vpor ymm2,ymm14,ymm2
  1764. vmovdqa YMMWORD PTR[64+rsp],ymm12
  1765. vmovdqa YMMWORD PTR[96+rsp],ymm13
  1766. vmovdqa ymm12,YMMWORD PTR[rsp]
  1767. vmovdqa ymm13,YMMWORD PTR[32+rsp]
  1768. vpaddd ymm10,ymm10,ymm3
  1769. vpxor ymm5,ymm10,ymm5
  1770. vpshufb ymm5,ymm5,ymm15
  1771. vpaddd ymm11,ymm11,ymm0
  1772. vpxor ymm6,ymm11,ymm6
  1773. vpshufb ymm6,ymm6,ymm15
  1774. vpaddd ymm12,ymm12,ymm5
  1775. vpxor ymm3,ymm12,ymm3
  1776. vpslld ymm14,ymm3,12
  1777. vpsrld ymm3,ymm3,20
  1778. vpor ymm3,ymm14,ymm3
  1779. vbroadcasti128 ymm14,XMMWORD PTR[r11]
  1780. vpaddd ymm13,ymm13,ymm6
  1781. vpxor ymm0,ymm13,ymm0
  1782. vpslld ymm15,ymm0,12
  1783. vpsrld ymm0,ymm0,20
  1784. vpor ymm0,ymm15,ymm0
  1785. vpaddd ymm10,ymm10,ymm3
  1786. vpxor ymm5,ymm10,ymm5
  1787. vpshufb ymm5,ymm5,ymm14
  1788. vpaddd ymm11,ymm11,ymm0
  1789. vpxor ymm6,ymm11,ymm6
  1790. vpshufb ymm6,ymm6,ymm14
  1791. vpaddd ymm12,ymm12,ymm5
  1792. vpxor ymm3,ymm12,ymm3
  1793. vpslld ymm15,ymm3,7
  1794. vpsrld ymm3,ymm3,25
  1795. vpor ymm3,ymm15,ymm3
  1796. vbroadcasti128 ymm15,XMMWORD PTR[r10]
  1797. vpaddd ymm13,ymm13,ymm6
  1798. vpxor ymm0,ymm13,ymm0
  1799. vpslld ymm14,ymm0,7
  1800. vpsrld ymm0,ymm0,25
  1801. vpor ymm0,ymm14,ymm0
  1802. dec eax
  1803. jnz $L$oop8x
  1804. lea rax,QWORD PTR[512+rsp]
  1805. vpaddd ymm8,ymm8,YMMWORD PTR[((128-256))+rcx]
  1806. vpaddd ymm9,ymm9,YMMWORD PTR[((160-256))+rcx]
  1807. vpaddd ymm10,ymm10,YMMWORD PTR[((192-256))+rcx]
  1808. vpaddd ymm11,ymm11,YMMWORD PTR[((224-256))+rcx]
  1809. vpunpckldq ymm14,ymm8,ymm9
  1810. vpunpckldq ymm15,ymm10,ymm11
  1811. vpunpckhdq ymm8,ymm8,ymm9
  1812. vpunpckhdq ymm10,ymm10,ymm11
  1813. vpunpcklqdq ymm9,ymm14,ymm15
  1814. vpunpckhqdq ymm14,ymm14,ymm15
  1815. vpunpcklqdq ymm11,ymm8,ymm10
  1816. vpunpckhqdq ymm8,ymm8,ymm10
  1817. vpaddd ymm0,ymm0,YMMWORD PTR[((256-256))+rcx]
  1818. vpaddd ymm1,ymm1,YMMWORD PTR[((288-256))+rcx]
  1819. vpaddd ymm2,ymm2,YMMWORD PTR[((320-256))+rcx]
  1820. vpaddd ymm3,ymm3,YMMWORD PTR[((352-256))+rcx]
  1821. vpunpckldq ymm10,ymm0,ymm1
  1822. vpunpckldq ymm15,ymm2,ymm3
  1823. vpunpckhdq ymm0,ymm0,ymm1
  1824. vpunpckhdq ymm2,ymm2,ymm3
  1825. vpunpcklqdq ymm1,ymm10,ymm15
  1826. vpunpckhqdq ymm10,ymm10,ymm15
  1827. vpunpcklqdq ymm3,ymm0,ymm2
  1828. vpunpckhqdq ymm0,ymm0,ymm2
  1829. vperm2i128 ymm15,ymm9,ymm1,020h
  1830. vperm2i128 ymm1,ymm9,ymm1,031h
  1831. vperm2i128 ymm9,ymm14,ymm10,020h
  1832. vperm2i128 ymm10,ymm14,ymm10,031h
  1833. vperm2i128 ymm14,ymm11,ymm3,020h
  1834. vperm2i128 ymm3,ymm11,ymm3,031h
  1835. vperm2i128 ymm11,ymm8,ymm0,020h
  1836. vperm2i128 ymm0,ymm8,ymm0,031h
  1837. vmovdqa YMMWORD PTR[rsp],ymm15
  1838. vmovdqa YMMWORD PTR[32+rsp],ymm9
  1839. vmovdqa ymm15,YMMWORD PTR[64+rsp]
  1840. vmovdqa ymm9,YMMWORD PTR[96+rsp]
  1841. vpaddd ymm12,ymm12,YMMWORD PTR[((384-512))+rax]
  1842. vpaddd ymm13,ymm13,YMMWORD PTR[((416-512))+rax]
  1843. vpaddd ymm15,ymm15,YMMWORD PTR[((448-512))+rax]
  1844. vpaddd ymm9,ymm9,YMMWORD PTR[((480-512))+rax]
  1845. vpunpckldq ymm2,ymm12,ymm13
  1846. vpunpckldq ymm8,ymm15,ymm9
  1847. vpunpckhdq ymm12,ymm12,ymm13
  1848. vpunpckhdq ymm15,ymm15,ymm9
  1849. vpunpcklqdq ymm13,ymm2,ymm8
  1850. vpunpckhqdq ymm2,ymm2,ymm8
  1851. vpunpcklqdq ymm9,ymm12,ymm15
  1852. vpunpckhqdq ymm12,ymm12,ymm15
  1853. vpaddd ymm4,ymm4,YMMWORD PTR[((512-512))+rax]
  1854. vpaddd ymm5,ymm5,YMMWORD PTR[((544-512))+rax]
  1855. vpaddd ymm6,ymm6,YMMWORD PTR[((576-512))+rax]
  1856. vpaddd ymm7,ymm7,YMMWORD PTR[((608-512))+rax]
  1857. vpunpckldq ymm15,ymm4,ymm5
  1858. vpunpckldq ymm8,ymm6,ymm7
  1859. vpunpckhdq ymm4,ymm4,ymm5
  1860. vpunpckhdq ymm6,ymm6,ymm7
  1861. vpunpcklqdq ymm5,ymm15,ymm8
  1862. vpunpckhqdq ymm15,ymm15,ymm8
  1863. vpunpcklqdq ymm7,ymm4,ymm6
  1864. vpunpckhqdq ymm4,ymm4,ymm6
  1865. vperm2i128 ymm8,ymm13,ymm5,020h
  1866. vperm2i128 ymm5,ymm13,ymm5,031h
  1867. vperm2i128 ymm13,ymm2,ymm15,020h
  1868. vperm2i128 ymm15,ymm2,ymm15,031h
  1869. vperm2i128 ymm2,ymm9,ymm7,020h
  1870. vperm2i128 ymm7,ymm9,ymm7,031h
  1871. vperm2i128 ymm9,ymm12,ymm4,020h
  1872. vperm2i128 ymm4,ymm12,ymm4,031h
  1873. vmovdqa ymm6,YMMWORD PTR[rsp]
  1874. vmovdqa ymm12,YMMWORD PTR[32+rsp]
  1875. cmp rdx,64*8
  1876. jb $L$tail8x
  1877. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  1878. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  1879. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  1880. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  1881. lea rsi,QWORD PTR[128+rsi]
  1882. vmovdqu YMMWORD PTR[rdi],ymm6
  1883. vmovdqu YMMWORD PTR[32+rdi],ymm8
  1884. vmovdqu YMMWORD PTR[64+rdi],ymm1
  1885. vmovdqu YMMWORD PTR[96+rdi],ymm5
  1886. lea rdi,QWORD PTR[128+rdi]
  1887. vpxor ymm12,ymm12,YMMWORD PTR[rsi]
  1888. vpxor ymm13,ymm13,YMMWORD PTR[32+rsi]
  1889. vpxor ymm10,ymm10,YMMWORD PTR[64+rsi]
  1890. vpxor ymm15,ymm15,YMMWORD PTR[96+rsi]
  1891. lea rsi,QWORD PTR[128+rsi]
  1892. vmovdqu YMMWORD PTR[rdi],ymm12
  1893. vmovdqu YMMWORD PTR[32+rdi],ymm13
  1894. vmovdqu YMMWORD PTR[64+rdi],ymm10
  1895. vmovdqu YMMWORD PTR[96+rdi],ymm15
  1896. lea rdi,QWORD PTR[128+rdi]
  1897. vpxor ymm14,ymm14,YMMWORD PTR[rsi]
  1898. vpxor ymm2,ymm2,YMMWORD PTR[32+rsi]
  1899. vpxor ymm3,ymm3,YMMWORD PTR[64+rsi]
  1900. vpxor ymm7,ymm7,YMMWORD PTR[96+rsi]
  1901. lea rsi,QWORD PTR[128+rsi]
  1902. vmovdqu YMMWORD PTR[rdi],ymm14
  1903. vmovdqu YMMWORD PTR[32+rdi],ymm2
  1904. vmovdqu YMMWORD PTR[64+rdi],ymm3
  1905. vmovdqu YMMWORD PTR[96+rdi],ymm7
  1906. lea rdi,QWORD PTR[128+rdi]
  1907. vpxor ymm11,ymm11,YMMWORD PTR[rsi]
  1908. vpxor ymm9,ymm9,YMMWORD PTR[32+rsi]
  1909. vpxor ymm0,ymm0,YMMWORD PTR[64+rsi]
  1910. vpxor ymm4,ymm4,YMMWORD PTR[96+rsi]
  1911. lea rsi,QWORD PTR[128+rsi]
  1912. vmovdqu YMMWORD PTR[rdi],ymm11
  1913. vmovdqu YMMWORD PTR[32+rdi],ymm9
  1914. vmovdqu YMMWORD PTR[64+rdi],ymm0
  1915. vmovdqu YMMWORD PTR[96+rdi],ymm4
  1916. lea rdi,QWORD PTR[128+rdi]
  1917. sub rdx,64*8
  1918. jnz $L$oop_outer8x
  1919. jmp $L$done8x
  1920. $L$tail8x::
  1921. cmp rdx,448
  1922. jae $L$448_or_more8x
  1923. cmp rdx,384
  1924. jae $L$384_or_more8x
  1925. cmp rdx,320
  1926. jae $L$320_or_more8x
  1927. cmp rdx,256
  1928. jae $L$256_or_more8x
  1929. cmp rdx,192
  1930. jae $L$192_or_more8x
  1931. cmp rdx,128
  1932. jae $L$128_or_more8x
  1933. cmp rdx,64
  1934. jae $L$64_or_more8x
  1935. xor r10,r10
  1936. vmovdqa YMMWORD PTR[rsp],ymm6
  1937. vmovdqa YMMWORD PTR[32+rsp],ymm8
  1938. jmp $L$oop_tail8x
  1939. ALIGN 32
  1940. $L$64_or_more8x::
  1941. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  1942. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  1943. vmovdqu YMMWORD PTR[rdi],ymm6
  1944. vmovdqu YMMWORD PTR[32+rdi],ymm8
  1945. je $L$done8x
  1946. lea rsi,QWORD PTR[64+rsi]
  1947. xor r10,r10
  1948. vmovdqa YMMWORD PTR[rsp],ymm1
  1949. lea rdi,QWORD PTR[64+rdi]
  1950. sub rdx,64
  1951. vmovdqa YMMWORD PTR[32+rsp],ymm5
  1952. jmp $L$oop_tail8x
  1953. ALIGN 32
  1954. $L$128_or_more8x::
  1955. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  1956. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  1957. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  1958. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  1959. vmovdqu YMMWORD PTR[rdi],ymm6
  1960. vmovdqu YMMWORD PTR[32+rdi],ymm8
  1961. vmovdqu YMMWORD PTR[64+rdi],ymm1
  1962. vmovdqu YMMWORD PTR[96+rdi],ymm5
  1963. je $L$done8x
  1964. lea rsi,QWORD PTR[128+rsi]
  1965. xor r10,r10
  1966. vmovdqa YMMWORD PTR[rsp],ymm12
  1967. lea rdi,QWORD PTR[128+rdi]
  1968. sub rdx,128
  1969. vmovdqa YMMWORD PTR[32+rsp],ymm13
  1970. jmp $L$oop_tail8x
  1971. ALIGN 32
  1972. $L$192_or_more8x::
  1973. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  1974. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  1975. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  1976. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  1977. vpxor ymm12,ymm12,YMMWORD PTR[128+rsi]
  1978. vpxor ymm13,ymm13,YMMWORD PTR[160+rsi]
  1979. vmovdqu YMMWORD PTR[rdi],ymm6
  1980. vmovdqu YMMWORD PTR[32+rdi],ymm8
  1981. vmovdqu YMMWORD PTR[64+rdi],ymm1
  1982. vmovdqu YMMWORD PTR[96+rdi],ymm5
  1983. vmovdqu YMMWORD PTR[128+rdi],ymm12
  1984. vmovdqu YMMWORD PTR[160+rdi],ymm13
  1985. je $L$done8x
  1986. lea rsi,QWORD PTR[192+rsi]
  1987. xor r10,r10
  1988. vmovdqa YMMWORD PTR[rsp],ymm10
  1989. lea rdi,QWORD PTR[192+rdi]
  1990. sub rdx,192
  1991. vmovdqa YMMWORD PTR[32+rsp],ymm15
  1992. jmp $L$oop_tail8x
  1993. ALIGN 32
  1994. $L$256_or_more8x::
  1995. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  1996. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  1997. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  1998. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  1999. vpxor ymm12,ymm12,YMMWORD PTR[128+rsi]
  2000. vpxor ymm13,ymm13,YMMWORD PTR[160+rsi]
  2001. vpxor ymm10,ymm10,YMMWORD PTR[192+rsi]
  2002. vpxor ymm15,ymm15,YMMWORD PTR[224+rsi]
  2003. vmovdqu YMMWORD PTR[rdi],ymm6
  2004. vmovdqu YMMWORD PTR[32+rdi],ymm8
  2005. vmovdqu YMMWORD PTR[64+rdi],ymm1
  2006. vmovdqu YMMWORD PTR[96+rdi],ymm5
  2007. vmovdqu YMMWORD PTR[128+rdi],ymm12
  2008. vmovdqu YMMWORD PTR[160+rdi],ymm13
  2009. vmovdqu YMMWORD PTR[192+rdi],ymm10
  2010. vmovdqu YMMWORD PTR[224+rdi],ymm15
  2011. je $L$done8x
  2012. lea rsi,QWORD PTR[256+rsi]
  2013. xor r10,r10
  2014. vmovdqa YMMWORD PTR[rsp],ymm14
  2015. lea rdi,QWORD PTR[256+rdi]
  2016. sub rdx,256
  2017. vmovdqa YMMWORD PTR[32+rsp],ymm2
  2018. jmp $L$oop_tail8x
  2019. ALIGN 32
  2020. $L$320_or_more8x::
  2021. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  2022. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  2023. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  2024. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  2025. vpxor ymm12,ymm12,YMMWORD PTR[128+rsi]
  2026. vpxor ymm13,ymm13,YMMWORD PTR[160+rsi]
  2027. vpxor ymm10,ymm10,YMMWORD PTR[192+rsi]
  2028. vpxor ymm15,ymm15,YMMWORD PTR[224+rsi]
  2029. vpxor ymm14,ymm14,YMMWORD PTR[256+rsi]
  2030. vpxor ymm2,ymm2,YMMWORD PTR[288+rsi]
  2031. vmovdqu YMMWORD PTR[rdi],ymm6
  2032. vmovdqu YMMWORD PTR[32+rdi],ymm8
  2033. vmovdqu YMMWORD PTR[64+rdi],ymm1
  2034. vmovdqu YMMWORD PTR[96+rdi],ymm5
  2035. vmovdqu YMMWORD PTR[128+rdi],ymm12
  2036. vmovdqu YMMWORD PTR[160+rdi],ymm13
  2037. vmovdqu YMMWORD PTR[192+rdi],ymm10
  2038. vmovdqu YMMWORD PTR[224+rdi],ymm15
  2039. vmovdqu YMMWORD PTR[256+rdi],ymm14
  2040. vmovdqu YMMWORD PTR[288+rdi],ymm2
  2041. je $L$done8x
  2042. lea rsi,QWORD PTR[320+rsi]
  2043. xor r10,r10
  2044. vmovdqa YMMWORD PTR[rsp],ymm3
  2045. lea rdi,QWORD PTR[320+rdi]
  2046. sub rdx,320
  2047. vmovdqa YMMWORD PTR[32+rsp],ymm7
  2048. jmp $L$oop_tail8x
  2049. ALIGN 32
  2050. $L$384_or_more8x::
  2051. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  2052. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  2053. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  2054. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  2055. vpxor ymm12,ymm12,YMMWORD PTR[128+rsi]
  2056. vpxor ymm13,ymm13,YMMWORD PTR[160+rsi]
  2057. vpxor ymm10,ymm10,YMMWORD PTR[192+rsi]
  2058. vpxor ymm15,ymm15,YMMWORD PTR[224+rsi]
  2059. vpxor ymm14,ymm14,YMMWORD PTR[256+rsi]
  2060. vpxor ymm2,ymm2,YMMWORD PTR[288+rsi]
  2061. vpxor ymm3,ymm3,YMMWORD PTR[320+rsi]
  2062. vpxor ymm7,ymm7,YMMWORD PTR[352+rsi]
  2063. vmovdqu YMMWORD PTR[rdi],ymm6
  2064. vmovdqu YMMWORD PTR[32+rdi],ymm8
  2065. vmovdqu YMMWORD PTR[64+rdi],ymm1
  2066. vmovdqu YMMWORD PTR[96+rdi],ymm5
  2067. vmovdqu YMMWORD PTR[128+rdi],ymm12
  2068. vmovdqu YMMWORD PTR[160+rdi],ymm13
  2069. vmovdqu YMMWORD PTR[192+rdi],ymm10
  2070. vmovdqu YMMWORD PTR[224+rdi],ymm15
  2071. vmovdqu YMMWORD PTR[256+rdi],ymm14
  2072. vmovdqu YMMWORD PTR[288+rdi],ymm2
  2073. vmovdqu YMMWORD PTR[320+rdi],ymm3
  2074. vmovdqu YMMWORD PTR[352+rdi],ymm7
  2075. je $L$done8x
  2076. lea rsi,QWORD PTR[384+rsi]
  2077. xor r10,r10
  2078. vmovdqa YMMWORD PTR[rsp],ymm11
  2079. lea rdi,QWORD PTR[384+rdi]
  2080. sub rdx,384
  2081. vmovdqa YMMWORD PTR[32+rsp],ymm9
  2082. jmp $L$oop_tail8x
  2083. ALIGN 32
  2084. $L$448_or_more8x::
  2085. vpxor ymm6,ymm6,YMMWORD PTR[rsi]
  2086. vpxor ymm8,ymm8,YMMWORD PTR[32+rsi]
  2087. vpxor ymm1,ymm1,YMMWORD PTR[64+rsi]
  2088. vpxor ymm5,ymm5,YMMWORD PTR[96+rsi]
  2089. vpxor ymm12,ymm12,YMMWORD PTR[128+rsi]
  2090. vpxor ymm13,ymm13,YMMWORD PTR[160+rsi]
  2091. vpxor ymm10,ymm10,YMMWORD PTR[192+rsi]
  2092. vpxor ymm15,ymm15,YMMWORD PTR[224+rsi]
  2093. vpxor ymm14,ymm14,YMMWORD PTR[256+rsi]
  2094. vpxor ymm2,ymm2,YMMWORD PTR[288+rsi]
  2095. vpxor ymm3,ymm3,YMMWORD PTR[320+rsi]
  2096. vpxor ymm7,ymm7,YMMWORD PTR[352+rsi]
  2097. vpxor ymm11,ymm11,YMMWORD PTR[384+rsi]
  2098. vpxor ymm9,ymm9,YMMWORD PTR[416+rsi]
  2099. vmovdqu YMMWORD PTR[rdi],ymm6
  2100. vmovdqu YMMWORD PTR[32+rdi],ymm8
  2101. vmovdqu YMMWORD PTR[64+rdi],ymm1
  2102. vmovdqu YMMWORD PTR[96+rdi],ymm5
  2103. vmovdqu YMMWORD PTR[128+rdi],ymm12
  2104. vmovdqu YMMWORD PTR[160+rdi],ymm13
  2105. vmovdqu YMMWORD PTR[192+rdi],ymm10
  2106. vmovdqu YMMWORD PTR[224+rdi],ymm15
  2107. vmovdqu YMMWORD PTR[256+rdi],ymm14
  2108. vmovdqu YMMWORD PTR[288+rdi],ymm2
  2109. vmovdqu YMMWORD PTR[320+rdi],ymm3
  2110. vmovdqu YMMWORD PTR[352+rdi],ymm7
  2111. vmovdqu YMMWORD PTR[384+rdi],ymm11
  2112. vmovdqu YMMWORD PTR[416+rdi],ymm9
  2113. je $L$done8x
  2114. lea rsi,QWORD PTR[448+rsi]
  2115. xor r10,r10
  2116. vmovdqa YMMWORD PTR[rsp],ymm0
  2117. lea rdi,QWORD PTR[448+rdi]
  2118. sub rdx,448
  2119. vmovdqa YMMWORD PTR[32+rsp],ymm4
  2120. $L$oop_tail8x::
  2121. movzx eax,BYTE PTR[r10*1+rsi]
  2122. movzx ecx,BYTE PTR[r10*1+rsp]
  2123. lea r10,QWORD PTR[1+r10]
  2124. xor eax,ecx
  2125. mov BYTE PTR[((-1))+r10*1+rdi],al
  2126. dec rdx
  2127. jnz $L$oop_tail8x
  2128. $L$done8x::
  2129. vzeroall
  2130. movaps xmm6,XMMWORD PTR[((-168))+r9]
  2131. movaps xmm7,XMMWORD PTR[((-152))+r9]
  2132. movaps xmm8,XMMWORD PTR[((-136))+r9]
  2133. movaps xmm9,XMMWORD PTR[((-120))+r9]
  2134. movaps xmm10,XMMWORD PTR[((-104))+r9]
  2135. movaps xmm11,XMMWORD PTR[((-88))+r9]
  2136. movaps xmm12,XMMWORD PTR[((-72))+r9]
  2137. movaps xmm13,XMMWORD PTR[((-56))+r9]
  2138. movaps xmm14,XMMWORD PTR[((-40))+r9]
  2139. movaps xmm15,XMMWORD PTR[((-24))+r9]
  2140. lea rsp,QWORD PTR[r9]
  2141. $L$8x_epilogue::
  2142. mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
  2143. mov rsi,QWORD PTR[16+rsp]
  2144. DB 0F3h,0C3h ;repret
  2145. $L$SEH_end_ChaCha20_8x::
  2146. ChaCha20_8x ENDP
  2147. EXTERN __imp_RtlVirtualUnwind:NEAR
  2148. ALIGN 16
  ; se_handler: exception handler named in the .xdata record of ChaCha20_ctr32.
  ; It rewrites the CONTEXT record so the interrupted function appears to be
  ; back in its caller-visible state, copies that record into the dispatcher
  ; context, calls RtlVirtualUnwind, and returns 1 in eax.
  ; Inputs as read below: r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT.
  ; NOTE(review): the structure offsets are annotated per the Win64 CONTEXT and
  ; DISPATCHER_CONTEXT layouts from winnt.h -- confirm against the SDK headers.
  ; The label $L$common_seh_tail is also a jump target of simd_handler.
  2149. se_handler PROC PRIVATE
  2150. push rsi ; save every register the body modifies ...
  2151. push rdi
  2152. push rbx
  2153. push rbp
  2154. push r12
  2155. push r13
  2156. push r14
  2157. push r15
  2158. pushfq ; ... including flags: the cld executed below changes DF
  2159. sub rsp,64 ; 32 bytes of shadow space + four stack arguments for the call below
  2160. mov rax,QWORD PTR[120+r8] ; rax = context->Rax
  2161. mov rbx,QWORD PTR[248+r8] ; rbx = context->Rip (address being unwound)
  2162. mov rsi,QWORD PTR[8+r9] ; disp->ImageBase; not read here, rsi is reloaded in the tail
  2163. mov r11,QWORD PTR[56+r9] ; disp->HandlerData; not read here, r11 is reloaded before the call
  2164. lea r10,QWORD PTR[$L$ctr32_body] ; label is defined outside this view
  2165. cmp rbx,r10
  2166. jb $L$common_seh_tail ; Rip below $L$ctr32_body: tail runs with rax = context->Rax (presumably the entry rsp -- confirm in the prologue)
  2167. mov rax,QWORD PTR[152+r8] ; rax = context->Rsp
  2168. lea r10,QWORD PTR[$L$no_data] ; label is defined outside this view
  2169. cmp rbx,r10
  2170. jae $L$common_seh_tail ; Rip at or past $L$no_data: tail runs with context->Rsp unchanged
  2171. lea rax,QWORD PTR[((64+24+48))+rax] ; skip 136 bytes of frame; the six saved registers sit just below the new rax
  2172. mov rbx,QWORD PTR[((-8))+rax] ; reload the values the function saved on its stack
  2173. mov rbp,QWORD PTR[((-16))+rax]
  2174. mov r12,QWORD PTR[((-24))+rax]
  2175. mov r13,QWORD PTR[((-32))+rax]
  2176. mov r14,QWORD PTR[((-40))+rax]
  2177. mov r15,QWORD PTR[((-48))+rax]
  2178. mov QWORD PTR[144+r8],rbx ; context->Rbx
  2179. mov QWORD PTR[160+r8],rbp ; context->Rbp
  2180. mov QWORD PTR[216+r8],r12 ; context->R12
  2181. mov QWORD PTR[224+r8],r13 ; context->R13
  2182. mov QWORD PTR[232+r8],r14 ; context->R14
  2183. mov QWORD PTR[240+r8],r15 ; context->R15
  ; Shared tail (also entered from simd_handler): on arrival rax is the stack
  ; pointer value to publish and r8/r9 still hold the handler arguments.
  2184. $L$common_seh_tail::
  2185. mov rdi,QWORD PTR[8+rax] ; same slots the visible ChaCha20_8x epilogue reloads rdi/rsi from
  2186. mov rsi,QWORD PTR[16+rax]
  2187. mov QWORD PTR[152+r8],rax ; context->Rsp
  2188. mov QWORD PTR[168+r8],rsi ; context->Rsi
  2189. mov QWORD PTR[176+r8],rdi ; context->Rdi
  2190. mov rdi,QWORD PTR[40+r9] ; copy destination = disp->ContextRecord
  2191. mov rsi,r8 ; copy source = the patched context
  2192. mov ecx,154 ; 154 qwords = 1232 bytes (presumably sizeof(CONTEXT) -- confirm)
  2193. DD 0a548f3fch ; little-endian bytes FC F3 48 A5 = cld ; rep movsq
  2194. mov rsi,r9 ; rsi = disp, because r8/r9 are about to become call arguments
  2195. xor rcx,rcx ; arg1 = 0 (HandlerType; 0 is UNW_FLAG_NHANDLER)
  2196. mov rdx,QWORD PTR[8+rsi] ; arg2 = disp->ImageBase
  2197. mov r8,QWORD PTR[rsi] ; arg3 = disp->ControlPc
  2198. mov r9,QWORD PTR[16+rsi] ; arg4 = disp->FunctionEntry
  2199. mov r10,QWORD PTR[40+rsi] ; disp->ContextRecord
  2200. lea r11,QWORD PTR[56+rsi] ; &disp->HandlerData
  2201. lea r12,QWORD PTR[24+rsi] ; &disp->EstablisherFrame
  2202. mov QWORD PTR[32+rsp],r10 ; arg5 = ContextRecord
  2203. mov QWORD PTR[40+rsp],r11 ; arg6 = &HandlerData
  2204. mov QWORD PTR[48+rsp],r12 ; arg7 = &EstablisherFrame
  2205. mov QWORD PTR[56+rsp],rcx ; arg8 = 0 (no ContextPointers)
  2206. call QWORD PTR[__imp_RtlVirtualUnwind]
  2207. mov eax,1 ; return value 1 (ExceptionContinueSearch in EXCEPTION_DISPOSITION)
  2208. add rsp,64 ; release the call area, then undo the pushes in reverse order
  2209. popfq
  2210. pop r15
  2211. pop r14
  2212. pop r13
  2213. pop r12
  2214. pop rbp
  2215. pop rbx
  2216. pop rdi
  2217. pop rsi
  2218. DB 0F3h,0C3h ; hand-encoded "rep ret" (F3 C3)
  2219. se_handler ENDP
  2220. ALIGN 16
  ; simd_handler: exception handler named in the .xdata records of the SIMD
  ; entry points (ssse3, 128, 4x, 4xop, 8x). It uses the same register save
  ; sequence as se_handler and has no epilogue of its own: every path ends by
  ; jumping to $L$common_seh_tail inside se_handler, which restores and returns.
  ; Inputs as read below: r8 = CONTEXT record, r9 = DISPATCHER_CONTEXT.
  ; The handler data reached through r11 is read as three DWORDs:
  ;   [0] image-relative address of the function's body label
  ;   [4] image-relative address of the function's epilogue label
  ;   [8] size in bytes of the saved-XMM area to copy back into the context
  ; NOTE(review): structure offsets are annotated per the Win64 CONTEXT and
  ; DISPATCHER_CONTEXT layouts from winnt.h -- confirm against the SDK headers.
  2221. simd_handler PROC PRIVATE
  2222. push rsi ; must mirror se_handler's pushes exactly, since its epilogue pops them
  2223. push rdi
  2224. push rbx
  2225. push rbp
  2226. push r12
  2227. push r13
  2228. push r14
  2229. push r15
  2230. pushfq
  2231. sub rsp,64 ; same call area that the shared tail releases with add rsp,64
  2232. mov rax,QWORD PTR[120+r8] ; rax = context->Rax
  2233. mov rbx,QWORD PTR[248+r8] ; rbx = context->Rip (address being unwound)
  2234. mov rsi,QWORD PTR[8+r9] ; rsi = disp->ImageBase
  2235. mov r11,QWORD PTR[56+r9] ; r11 = disp->HandlerData
  2236. mov r10d,DWORD PTR[r11] ; HandlerData[0]: body label, image-relative
  2237. lea r10,QWORD PTR[r10*1+rsi] ; convert to an absolute address
  2238. cmp rbx,r10
  2239. jb $L$common_seh_tail ; Rip below the body label: tail runs with rax = context->Rax
  2240. mov rax,QWORD PTR[192+r8] ; rax = context->R9; the visible 8x epilogue restores rsp from r9
  2241. mov r10d,DWORD PTR[4+r11] ; HandlerData[1]: epilogue label, image-relative
  2242. mov ecx,DWORD PTR[8+r11] ; HandlerData[2]: saved-XMM area size in bytes
  2243. lea r10,QWORD PTR[r10*1+rsi] ; convert to an absolute address
  2244. cmp rbx,r10
  2245. jae $L$common_seh_tail ; Rip at or past the epilogue label: tail runs with rax = context->R9, no XMM copy
  2246. neg rcx ; rcx = -size, so the lea below can subtract it
  2247. lea rsi,QWORD PTR[((-8))+rcx*1+rax] ; copy source = rax - 8 - size (start of the saved XMM block)
  2248. lea rdi,QWORD PTR[512+r8] ; copy destination = &context->Xmm6
  2249. neg ecx ; back to +size; the 32-bit write clears the upper half of rcx
  2250. shr ecx,3 ; byte count -> qword count
  2251. DD 0a548f3fch ; little-endian bytes FC F3 48 A5 = cld ; rep movsq
  2252. jmp $L$common_seh_tail ; finish in se_handler's tail with rax = context->R9
  2253. simd_handler ENDP
  2254. .text$ ENDS
  ; .pdata: function table consumed by the Win64 unwinder. Each entry is three
  ; image-relative DWORDs: start of the function, end of the function, and the
  ; matching $L$SEH_info_* record in .xdata.
  2255. .pdata SEGMENT READONLY ALIGN(4)
  2256. ALIGN 4
  2257. DD imagerel $L$SEH_begin_ChaCha20_ctr32 ; entry for ChaCha20_ctr32
  2258. DD imagerel $L$SEH_end_ChaCha20_ctr32
  2259. DD imagerel $L$SEH_info_ChaCha20_ctr32
  2260. DD imagerel $L$SEH_begin_ChaCha20_ssse3 ; entry for ChaCha20_ssse3
  2261. DD imagerel $L$SEH_end_ChaCha20_ssse3
  2262. DD imagerel $L$SEH_info_ChaCha20_ssse3
  2263. DD imagerel $L$SEH_begin_ChaCha20_128 ; entry for ChaCha20_128
  2264. DD imagerel $L$SEH_end_ChaCha20_128
  2265. DD imagerel $L$SEH_info_ChaCha20_128
  2266. DD imagerel $L$SEH_begin_ChaCha20_4x ; entry for ChaCha20_4x
  2267. DD imagerel $L$SEH_end_ChaCha20_4x
  2268. DD imagerel $L$SEH_info_ChaCha20_4x
  2269. DD imagerel $L$SEH_begin_ChaCha20_4xop ; entry for ChaCha20_4xop
  2270. DD imagerel $L$SEH_end_ChaCha20_4xop
  2271. DD imagerel $L$SEH_info_ChaCha20_4xop
  2272. DD imagerel $L$SEH_begin_ChaCha20_8x ; entry for ChaCha20_8x
  2273. DD imagerel $L$SEH_end_ChaCha20_8x
  2274. DD imagerel $L$SEH_info_ChaCha20_8x
  2275. .pdata ENDS
  ; .xdata: one unwind record per function, pointed to from .pdata.
  ; Record layout as written here:
  ;   DB 9,0,0,0   header bytes; 9 = version 1 with the exception-handler flag
  ;                (UNW_FLAG_EHANDLER), followed by zero prologue size, zero
  ;                unwind codes and no frame register
  ;   DD handler   image-relative address of the handler procedure
  ;   DD ...       handler data (SIMD records only): body label, epilogue label,
  ;                saved-XMM area size in bytes, and a trailing 0 that
  ;                simd_handler does not read
  2276. .xdata SEGMENT READONLY ALIGN(8)
  2277. ALIGN 8
  2278. $L$SEH_info_ChaCha20_ctr32::
  2279. DB 9,0,0,0
  2280. DD imagerel se_handler ; no handler data: se_handler uses fixed labels instead
  2281. $L$SEH_info_ChaCha20_ssse3::
  2282. DB 9,0,0,0
  2283. DD imagerel simd_handler
  2284. DD imagerel $L$ssse3_body,imagerel $L$ssse3_epilogue
  2285. DD 020h,0 ; 32 bytes = two 16-byte XMM slots
  2286. $L$SEH_info_ChaCha20_128::
  2287. DB 9,0,0,0
  2288. DD imagerel simd_handler
  2289. DD imagerel $L$128_body,imagerel $L$128_epilogue
  2290. DD 060h,0 ; 96 bytes = six 16-byte XMM slots
  2291. $L$SEH_info_ChaCha20_4x::
  2292. DB 9,0,0,0
  2293. DD imagerel simd_handler
  2294. DD imagerel $L$4x_body,imagerel $L$4x_epilogue
  2295. DD 0a0h,0 ; 160 bytes = ten 16-byte XMM slots
  2296. $L$SEH_info_ChaCha20_4xop::
  2297. DB 9,0,0,0
  2298. DD imagerel simd_handler
  2299. DD imagerel $L$4xop_body,imagerel $L$4xop_epilogue
  2300. DD 0a0h,0 ; 160 bytes = ten 16-byte XMM slots
  2301. $L$SEH_info_ChaCha20_8x::
  2302. DB 9,0,0,0
  2303. DD imagerel simd_handler
  2304. DD imagerel $L$8x_body,imagerel $L$8x_epilogue
  2305. DD 0a0h,0 ; 160 bytes = ten 16-byte XMM slots; matches the xmm6..xmm15 reloads at -168..-24 from r9 in the 8x epilogue
  2306. .xdata ENDS
  2307. END