poly1305-x86_64.s 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554
  # Everything below is assembled into the code section.  Each of the three
  # public entry points is declared .globl and then .private_extern.
  # NOTE(review): the leading-underscore symbols and .private_extern suggest a
  # Mach-O (macOS) target, where .private_extern keeps a symbol linkable across
  # object files but out of the final image's export list -- confirm.
  1. .text
  2. .globl _poly1305_init
  3. .private_extern _poly1305_init
  4. .globl _poly1305_blocks
  5. .private_extern _poly1305_blocks
  6. .globl _poly1305_emit
  7. .private_extern _poly1305_emit
  # _poly1305_init -- zero the accumulator, clamp and store the key, and hand
  # back the block/emit routines selected for this CPU.
  #
  # In:   %rdi = state; quads at +0/+8/+16 are zeroed, the clamped key is
  #              written to +24/+32
  #       %rsi = pointer to 16 key bytes, or 0 to only zero the accumulator
  #       %rdx = two-quad table that receives the selected blocks/emit addresses
  # Out:  %rax = 0 when %rsi was 0, 1 on the key path visible here (the
  #              L$init_base2_44 path is not visible in this part of the file)
  # Uses: %rcx, %r9, %r10, %r11, flags
  # NOTE(review): argument roles are read off the register usage below and
  # match the System V AMD64 argument order -- confirm against the C prototype.
  8. .p2align 5
  9. _poly1305_init:
  # Clear the three accumulator quads; %rax stays 0 as the "no key" return value.
  10. xorq %rax,%rax
  11. movq %rax,0(%rdi)
  12. movq %rax,8(%rdi)
  13. movq %rax,16(%rdi)
  14. cmpq $0,%rsi
  15. je L$no_key
  # Default to the scalar routines, then upgrade according to capability bits.
  16. leaq _poly1305_blocks(%rip),%r10
  17. leaq _poly1305_emit(%rip),%r11
  # %r9 = 8 bytes of the capability vector starting at byte offset 4.
  18. movq _OPENSSL_ia32cap_P+4(%rip),%r9
  19. leaq poly1305_blocks_avx(%rip),%rax
  20. leaq poly1305_emit_avx(%rip),%rcx
  # Bit 28 set -> use the *_avx variants for both blocks and emit.
  # NOTE(review): presumably the AVX feature flag -- confirm against the
  # OPENSSL_ia32cap bit layout.
  21. btq $28,%r9
  22. cmovcq %rax,%r10
  23. cmovcq %rcx,%r11
  # Bit 37 set -> use poly1305_blocks_avx2 for blocks (emit is left as chosen above).
  # NOTE(review): presumably the AVX2 feature flag -- confirm.
  24. leaq poly1305_blocks_avx2(%rip),%rax
  25. btq $37,%r9
  26. cmovcq %rax,%r10
  # 2149646336 = 0x80210000 = bits 31, 21 and 16.  If all three are set in the
  # upper dword of %r9, initialization continues at L$init_base2_44 instead.
  # NOTE(review): presumably the AVX512 VL/IFMA/F flags -- confirm.
  27. movq $2149646336,%rax
  28. shrq $32,%r9
  29. andq %rax,%r9
  30. cmpq %rax,%r9
  31. je L$init_base2_44
  # Clamp the key: AND the two key quads with the masks below and keep the
  # result at +24 (low quad) and +32 (high quad) of the state.
  32. movq $0x0ffffffc0fffffff,%rax
  33. movq $0x0ffffffc0ffffffc,%rcx
  34. andq 0(%rsi),%rax
  35. andq 8(%rsi),%rcx
  36. movq %rax,24(%rdi)
  37. movq %rcx,32(%rdi)
  # Publish the selected routine addresses: [0] = blocks, [1] = emit.
  38. movq %r10,0(%rdx)
  39. movq %r11,8(%rdx)
  40. movl $1,%eax
  41. L$no_key:
  # 0xf3,0xc3 encodes "rep ret" (REP prefix followed by RET).
  42. .byte 0xf3,0xc3
  # _poly1305_blocks -- scalar block function: for every 16-byte block m of the
  # input, h = (h + m + pad*2^128) * r, partially reduced modulo 2^130 - 5.
  #
  # In:   %rdi = state: h0/h1/h2 at +0/+8/+16, key quads r0/r1 at +24/+32
  #       %rsi = input pointer
  #       %rdx = input length in bytes; only whole 16-byte blocks are consumed
  #       %rcx = value added into the top accumulator word for every block
  #              (presumably the 2^128 pad bit, 1 for full blocks -- confirm
  #              against the caller)
  # Out:  updated h0/h1/h2 written back to the state
  # Saves %rbx, %rbp, %r12-%r15 on the stack; uses %rax, %rdx, %rsi, %r8-%r11.
  #
  # Registers inside the loop:
  #   %r14:%rbx:%rbp = h0:h1:h2      %r11 = r0     %r12 = r1
  #   %r13 = s1 = r1 + (r1 >> 2)     %r15 = blocks remaining
  #   %rax = r1 on entry to every iteration
  43. .p2align 5
  44. _poly1305_blocks:
  # L$blocks is also the target of a jump from poly1305_blocks_avx.
  45. L$blocks:
  # Convert the byte count to a block count; nothing to do if it is zero.
  46. shrq $4,%rdx
  47. jz L$no_data
  48. pushq %rbx
  49. pushq %rbp
  50. pushq %r12
  51. pushq %r13
  52. pushq %r14
  53. pushq %r15
  54. L$blocks_body:
  55. movq %rdx,%r15
  56. movq 24(%rdi),%r11
  57. movq 32(%rdi),%r13
  58. movq 0(%rdi),%r14
  59. movq 8(%rdi),%rbx
  60. movq 16(%rdi),%rbp
  # s1 = r1 + (r1 >> 2).  _poly1305_init masks off the two low bits of r1, so
  # the shift is exact and s1 = 5*r1/4.
  61. movq %r13,%r12
  62. shrq $2,%r13
  63. movq %r12,%rax
  64. addq %r12,%r13
  65. jmp L$oop
  66. .p2align 5
  67. L$oop:
  # h += block; leaq advances the input pointer without touching the carry
  # flag, and %rcx plus the carry goes into h2.
  68. addq 0(%rsi),%r14
  69. adcq 8(%rsi),%rbx
  70. leaq 16(%rsi),%rsi
  71. adcq %rcx,%rbp
  # %r10:%r9 = h0 * r1
  72. mulq %r14
  73. movq %rax,%r9
  74. movq %r11,%rax
  75. movq %rdx,%r10
  # %r8:%r14 = h0 * r0
  76. mulq %r14
  77. movq %rax,%r14
  78. movq %r11,%rax
  79. movq %rdx,%r8
  # %r10:%r9 += h1 * r0
  80. mulq %rbx
  81. addq %rax,%r9
  82. movq %r13,%rax
  83. adcq %rdx,%r10
  # %r8:%r14 += h1 * s1
  84. mulq %rbx
  85. movq %rbp,%rbx
  86. addq %rax,%r14
  87. adcq %rdx,%r8
  # %r10:%r9 += h2 * s1 (single-word product)
  88. imulq %r13,%rbx
  89. addq %rbx,%r9
  90. movq %r8,%rbx
  91. adcq $0,%r10
  # New h1 = %r8 + %r9; the top word %r10 picks up h2 * r0 plus the carry.
  92. imulq %r11,%rbp
  93. addq %r9,%rbx
  94. movq $-4,%rax
  95. adcq %rbp,%r10
  # Partial reduction: keep the low two bits of the top word as h2 and fold
  # the rest back into h0 as (t & ~3) + (t >> 2) = 5 * (t >> 2), using
  # 2^130 = 5 (mod 2^130 - 5).
  96. andq %r10,%rax
  97. movq %r10,%rbp
  98. shrq $2,%r10
  99. andq $3,%rbp
  100. addq %r10,%rax
  101. addq %rax,%r14
  102. adcq $0,%rbx
  103. adcq $0,%rbp
  # Reload %rax = r1 for the next iteration's first multiply.
  104. movq %r12,%rax
  105. decq %r15
  106. jnz L$oop
  # Write the accumulator back to the state.
  107. movq %r14,0(%rdi)
  108. movq %rbx,8(%rdi)
  109. movq %rbp,16(%rdi)
  # Restore the saved registers by loading them from the stack (the last one
  # pushed, %r15, sits at 0(%rsp)), then release the 48 bytes.
  110. movq 0(%rsp),%r15
  111. movq 8(%rsp),%r14
  112. movq 16(%rsp),%r13
  113. movq 24(%rsp),%r12
  114. movq 32(%rsp),%rbp
  115. movq 40(%rsp),%rbx
  116. leaq 48(%rsp),%rsp
  117. L$no_data:
  118. L$blocks_epilogue:
  # 0xf3,0xc3 encodes "rep ret".
  119. .byte 0xf3,0xc3
  # _poly1305_emit -- final reduction of the accumulator and tag output.
  #
  # In:   %rdi = state, accumulator h0/h1/h2 at +0/+8/+16
  #       %rsi = output buffer, 16 bytes are written
  #       %rdx = pointer to two quads that are added to the reduced accumulator
  #              (presumably the nonce / "s" half of the key -- confirm against
  #              the caller)
  # Out:  16 bytes at (%rsi) = low 128 bits of ((h mod 2^130 - 5) + value at %rdx)
  # Uses: %rax, %rcx, %r8, %r9, %r10, flags.  No stack usage.
  120. .p2align 5
  121. _poly1305_emit:
  122. L$emit:
  123. movq 0(%rdi),%r8
  124. movq 8(%rdi),%r9
  125. movq 16(%rdi),%r10
  # Compute h + 5 in %r8:%r9:%r10 while keeping the original low 128 bits in
  # %rax:%rcx.  The interleaved movq does not modify flags, so the carry chain
  # addq -> adcq -> adcq stays intact.
  126. movq %r8,%rax
  127. addq $5,%r8
  128. movq %r9,%rcx
  129. adcq $0,%r9
  130. adcq $0,%r10
  # If h + 5 reached bit 130 (top word >> 2 is non-zero) then h >= 2^130 - 5,
  # and the low 128 bits of h + 5 are the reduced value; select them.
  131. shrq $2,%r10
  132. cmovnzq %r8,%rax
  133. cmovnzq %r9,%rcx
  # Add the 128-bit value at %rdx (carry out of the high quad is dropped) and
  # store the result.
  134. addq 0(%rdx),%rax
  135. adcq 8(%rdx),%rcx
  136. movq %rax,0(%rsi)
  137. movq %rcx,8(%rsi)
  # 0xf3,0xc3 encodes "rep ret".
  138. .byte 0xf3,0xc3
  # __poly1305_block -- internal helper: h = h * r with partial reduction
  # modulo 2^130 - 5.  Same instruction sequence as the multiply/reduce part of
  # the L$oop body in _poly1305_blocks.  Uses a private register contract, not
  # the C calling convention.
  #
  # In:   %r14:%rbx:%rbp = h0:h1:h2
  #       %r11 = r0
  #       %r13 = s1 = r1 + (r1 >> 2)
  #       %rax = r1 (every visible call site loads it right before the call)
  # Out:  %r14:%rbx:%rbp = updated h0:h1:h2, with h2 reduced to a small value
  # Clobbers: %rax, %rdx, %r8, %r9, %r10, flags.  %r11 and %r13 are preserved;
  #           %rax is NOT restored to r1 on return.
  139. .p2align 5
  140. __poly1305_block:
  # %r10:%r9 = h0 * r1
  141. mulq %r14
  142. movq %rax,%r9
  143. movq %r11,%rax
  144. movq %rdx,%r10
  # %r8:%r14 = h0 * r0
  145. mulq %r14
  146. movq %rax,%r14
  147. movq %r11,%rax
  148. movq %rdx,%r8
  # %r10:%r9 += h1 * r0
  149. mulq %rbx
  150. addq %rax,%r9
  151. movq %r13,%rax
  152. adcq %rdx,%r10
  # %r8:%r14 += h1 * s1
  153. mulq %rbx
  154. movq %rbp,%rbx
  155. addq %rax,%r14
  156. adcq %rdx,%r8
  # %r10:%r9 += h2 * s1 (single-word product)
  157. imulq %r13,%rbx
  158. addq %rbx,%r9
  159. movq %r8,%rbx
  160. adcq $0,%r10
  # New h1 = %r8 + %r9; the top word %r10 picks up h2 * r0 plus the carry.
  161. imulq %r11,%rbp
  162. addq %r9,%rbx
  163. movq $-4,%rax
  164. adcq %rbp,%r10
  # Keep the low two bits of the top word as h2 and fold the rest into h0 as
  # (t & ~3) + (t >> 2) = 5 * (t >> 2), then propagate the carry.
  165. andq %r10,%rax
  166. movq %r10,%rbp
  167. shrq $2,%r10
  168. andq $3,%rbp
  169. addq %r10,%rax
  170. addq %rax,%r14
  171. adcq $0,%rbx
  172. adcq $0,%rbp
  # 0xf3,0xc3 encodes "rep ret".
  173. .byte 0xf3,0xc3
  # __poly1305_init_avx -- internal helper: compute r^2, r^3 and r^4 with
  # __poly1305_block and store r..r^4 as 26-bit limbs in a table inside the
  # state.  Uses a private register contract, not the C calling convention.
  #
  # In:   %rdi = state pointer
  #       %r11 = r0, %r12 = r1, %r13 = s1 = r1 + (r1 >> 2)
  # Out:  table written at state+48 .. state+191; %rdi restored to its entry value
  # Clobbers: %rax, %rdx, %r8, %r9, %r10, %r14, %rbx, %rbp, flags
  #
  # Table layout (offsets relative to %rdi after it is advanced by 112 bytes).
  # Each 16-byte row holds four 32-bit lanes in the order
  #       +0: r^2    +4: r    +8: r^4    +12: r^3
  # and the rows are
  #       -64: limb0   -48: limb1   -32: 5*limb1   -16: limb2     0: 5*limb2
  #        16: limb3    32: 5*limb3  48: limb4      64: 5*limb4
  # where limb k is bits 26k .. 26k+25 of the 130-bit value.
  174. .p2align 5
  175. __poly1305_init_avx:
  # h = r (top word zero), so the first multiplication below yields r^2.
  176. movq %r11,%r14
  177. movq %r12,%rbx
  178. xorq %rbp,%rbp
  # Bias %rdi by 48+64 so the table can be addressed with small displacements.
  179. leaq 48+64(%rdi),%rdi
  180. movq %r12,%rax
  181. call __poly1305_block
  # --- r^2 (in %r14:%rbx:%rbp) and r (in %r11:%r12) are split in parallel ---
  # limb0 = low 26 bits of the low quad.
  182. movl $0x3ffffff,%eax
  183. movl $0x3ffffff,%edx
  184. movq %r14,%r8
  185. andl %r14d,%eax
  186. movq %r11,%r9
  187. andl %r11d,%edx
  188. movl %eax,-64(%rdi)
  189. shrq $26,%r8
  190. movl %edx,-60(%rdi)
  191. shrq $26,%r9
  # limb1 = bits 26..51; leal (x,x,4) computes 5*x for the "times 5" row.
  192. movl $0x3ffffff,%eax
  193. movl $0x3ffffff,%edx
  194. andl %r8d,%eax
  195. andl %r9d,%edx
  196. movl %eax,-48(%rdi)
  197. leal (%rax,%rax,4),%eax
  198. movl %edx,-44(%rdi)
  199. leal (%rdx,%rdx,4),%edx
  200. movl %eax,-32(%rdi)
  201. shrq $26,%r8
  202. movl %edx,-28(%rdi)
  203. shrq $26,%r9
  # limb2 = ((high quad << 12) | (low quad >> 52)) & 0x3ffffff, i.e. bits 52..77.
  204. movq %rbx,%rax
  205. movq %r12,%rdx
  206. shlq $12,%rax
  207. shlq $12,%rdx
  208. orq %r8,%rax
  209. orq %r9,%rdx
  210. andl $0x3ffffff,%eax
  211. andl $0x3ffffff,%edx
  212. movl %eax,-16(%rdi)
  213. leal (%rax,%rax,4),%eax
  214. movl %edx,-12(%rdi)
  215. leal (%rdx,%rdx,4),%edx
  216. movl %eax,0(%rdi)
  217. movq %rbx,%r8
  218. movl %edx,4(%rdi)
  219. movq %r12,%r9
  # limb3 = (high quad >> 14) & 0x3ffffff, i.e. bits 78..103.
  220. movl $0x3ffffff,%eax
  221. movl $0x3ffffff,%edx
  222. shrq $14,%r8
  223. shrq $14,%r9
  224. andl %r8d,%eax
  225. andl %r9d,%edx
  226. movl %eax,16(%rdi)
  227. leal (%rax,%rax,4),%eax
  228. movl %edx,20(%rdi)
  229. leal (%rdx,%rdx,4),%edx
  230. movl %eax,32(%rdi)
  231. shrq $26,%r8
  232. movl %edx,36(%rdi)
  233. shrq $26,%r9
  # limb4 = (high quad >> 40), plus (top word << 24) for r^2; r itself has no
  # top word, so its limb4 is just %r12 >> 40.
  234. movq %rbp,%rax
  235. shlq $24,%rax
  236. orq %rax,%r8
  237. movl %r8d,48(%rdi)
  238. leaq (%r8,%r8,4),%r8
  239. movl %r9d,52(%rdi)
  240. leaq (%r9,%r9,4),%r9
  241. movl %r8d,64(%rdi)
  242. movl %r9d,68(%rdi)
  # --- h = r^2 * r = r^3; stored in lane +12 of every row ---
  243. movq %r12,%rax
  244. call __poly1305_block
  245. movl $0x3ffffff,%eax
  246. movq %r14,%r8
  247. andl %r14d,%eax
  248. shrq $26,%r8
  249. movl %eax,-52(%rdi)
  250. movl $0x3ffffff,%edx
  251. andl %r8d,%edx
  252. movl %edx,-36(%rdi)
  253. leal (%rdx,%rdx,4),%edx
  254. shrq $26,%r8
  255. movl %edx,-20(%rdi)
  256. movq %rbx,%rax
  257. shlq $12,%rax
  258. orq %r8,%rax
  259. andl $0x3ffffff,%eax
  260. movl %eax,-4(%rdi)
  261. leal (%rax,%rax,4),%eax
  262. movq %rbx,%r8
  263. movl %eax,12(%rdi)
  264. movl $0x3ffffff,%edx
  265. shrq $14,%r8
  266. andl %r8d,%edx
  267. movl %edx,28(%rdi)
  268. leal (%rdx,%rdx,4),%edx
  269. shrq $26,%r8
  270. movl %edx,44(%rdi)
  271. movq %rbp,%rax
  272. shlq $24,%rax
  273. orq %rax,%r8
  274. movl %r8d,60(%rdi)
  275. leaq (%r8,%r8,4),%r8
  276. movl %r8d,76(%rdi)
  # --- h = r^3 * r = r^4; stored in lane +8 of every row ---
  277. movq %r12,%rax
  278. call __poly1305_block
  279. movl $0x3ffffff,%eax
  280. movq %r14,%r8
  281. andl %r14d,%eax
  282. shrq $26,%r8
  283. movl %eax,-56(%rdi)
  284. movl $0x3ffffff,%edx
  285. andl %r8d,%edx
  286. movl %edx,-40(%rdi)
  287. leal (%rdx,%rdx,4),%edx
  288. shrq $26,%r8
  289. movl %edx,-24(%rdi)
  290. movq %rbx,%rax
  291. shlq $12,%rax
  292. orq %r8,%rax
  293. andl $0x3ffffff,%eax
  294. movl %eax,-8(%rdi)
  295. leal (%rax,%rax,4),%eax
  296. movq %rbx,%r8
  297. movl %eax,8(%rdi)
  298. movl $0x3ffffff,%edx
  299. shrq $14,%r8
  300. andl %r8d,%edx
  301. movl %edx,24(%rdi)
  302. leal (%rdx,%rdx,4),%edx
  303. shrq $26,%r8
  304. movl %edx,40(%rdi)
  305. movq %rbp,%rax
  306. shlq $24,%rax
  307. orq %rax,%r8
  308. movl %r8d,56(%rdi)
  309. leaq (%r8,%r8,4),%r8
  310. movl %r8d,72(%rdi)
  # Undo the 48+64 bias so the caller gets its state pointer back.
  311. leaq -48-64(%rdi),%rdi
  # 0xf3,0xc3 encodes "rep ret".
  312. .byte 0xf3,0xc3
  313. .p2align 5
  314. poly1305_blocks_avx:
  315. movl 20(%rdi),%r8d
  316. cmpq $128,%rdx
  317. jae L$blocks_avx
  318. testl %r8d,%r8d
  319. jz L$blocks
  320. L$blocks_avx:
  321. andq $-16,%rdx
  322. jz L$no_data_avx
  323. vzeroupper
  324. testl %r8d,%r8d
  325. jz L$base2_64_avx
  326. testq $31,%rdx
  327. jz L$even_avx
  328. pushq %rbx
  329. pushq %rbp
  330. pushq %r12
  331. pushq %r13
  332. pushq %r14
  333. pushq %r15
  334. L$blocks_avx_body:
  335. movq %rdx,%r15
  336. movq 0(%rdi),%r8
  337. movq 8(%rdi),%r9
  338. movl 16(%rdi),%ebp
  339. movq 24(%rdi),%r11
  340. movq 32(%rdi),%r13
  341. movl %r8d,%r14d
  342. andq $-2147483648,%r8
  343. movq %r9,%r12
  344. movl %r9d,%ebx
  345. andq $-2147483648,%r9
  346. shrq $6,%r8
  347. shlq $52,%r12
  348. addq %r8,%r14
  349. shrq $12,%rbx
  350. shrq $18,%r9
  351. addq %r12,%r14
  352. adcq %r9,%rbx
  353. movq %rbp,%r8
  354. shlq $40,%r8
  355. shrq $24,%rbp
  356. addq %r8,%rbx
  357. adcq $0,%rbp
  358. movq $-4,%r9
  359. movq %rbp,%r8
  360. andq %rbp,%r9
  361. shrq $2,%r8
  362. andq $3,%rbp
  363. addq %r9,%r8
  364. addq %r8,%r14
  365. adcq $0,%rbx
  366. adcq $0,%rbp
  367. movq %r13,%r12
  368. movq %r13,%rax
  369. shrq $2,%r13
  370. addq %r12,%r13
  371. addq 0(%rsi),%r14
  372. adcq 8(%rsi),%rbx
  373. leaq 16(%rsi),%rsi
  374. adcq %rcx,%rbp
  375. call __poly1305_block
  376. testq %rcx,%rcx
  377. jz L$store_base2_64_avx
  378. movq %r14,%rax
  379. movq %r14,%rdx
  380. shrq $52,%r14
  381. movq %rbx,%r11
  382. movq %rbx,%r12
  383. shrq $26,%rdx
  384. andq $0x3ffffff,%rax
  385. shlq $12,%r11
  386. andq $0x3ffffff,%rdx
  387. shrq $14,%rbx
  388. orq %r11,%r14
  389. shlq $24,%rbp
  390. andq $0x3ffffff,%r14
  391. shrq $40,%r12
  392. andq $0x3ffffff,%rbx
  393. orq %r12,%rbp
  394. subq $16,%r15
  395. jz L$store_base2_26_avx
  396. vmovd %eax,%xmm0
  397. vmovd %edx,%xmm1
  398. vmovd %r14d,%xmm2
  399. vmovd %ebx,%xmm3
  400. vmovd %ebp,%xmm4
  401. jmp L$proceed_avx
  402. .p2align 5
  403. L$store_base2_64_avx:
  404. movq %r14,0(%rdi)
  405. movq %rbx,8(%rdi)
  406. movq %rbp,16(%rdi)
  407. jmp L$done_avx
  408. .p2align 4
  409. L$store_base2_26_avx:
  410. movl %eax,0(%rdi)
  411. movl %edx,4(%rdi)
  412. movl %r14d,8(%rdi)
  413. movl %ebx,12(%rdi)
  414. movl %ebp,16(%rdi)
  415. .p2align 4
  416. L$done_avx:
  417. movq 0(%rsp),%r15
  418. movq 8(%rsp),%r14
  419. movq 16(%rsp),%r13
  420. movq 24(%rsp),%r12
  421. movq 32(%rsp),%rbp
  422. movq 40(%rsp),%rbx
  423. leaq 48(%rsp),%rsp
  424. L$no_data_avx:
  425. L$blocks_avx_epilogue:
  426. .byte 0xf3,0xc3
  427. .p2align 5
  428. L$base2_64_avx:
  429. pushq %rbx
  430. pushq %rbp
  431. pushq %r12
  432. pushq %r13
  433. pushq %r14
  434. pushq %r15
  435. L$base2_64_avx_body:
  436. movq %rdx,%r15
  437. movq 24(%rdi),%r11
  438. movq 32(%rdi),%r13
  439. movq 0(%rdi),%r14
  440. movq 8(%rdi),%rbx
  441. movl 16(%rdi),%ebp
  442. movq %r13,%r12
  443. movq %r13,%rax
  444. shrq $2,%r13
  445. addq %r12,%r13
  446. testq $31,%rdx
  447. jz L$init_avx
  448. addq 0(%rsi),%r14
  449. adcq 8(%rsi),%rbx
  450. leaq 16(%rsi),%rsi
  451. adcq %rcx,%rbp
  452. subq $16,%r15
  453. call __poly1305_block
  454. L$init_avx:
  455. movq %r14,%rax
  456. movq %r14,%rdx
  457. shrq $52,%r14
  458. movq %rbx,%r8
  459. movq %rbx,%r9
  460. shrq $26,%rdx
  461. andq $0x3ffffff,%rax
  462. shlq $12,%r8
  463. andq $0x3ffffff,%rdx
  464. shrq $14,%rbx
  465. orq %r8,%r14
  466. shlq $24,%rbp
  467. andq $0x3ffffff,%r14
  468. shrq $40,%r9
  469. andq $0x3ffffff,%rbx
  470. orq %r9,%rbp
  471. vmovd %eax,%xmm0
  472. vmovd %edx,%xmm1
  473. vmovd %r14d,%xmm2
  474. vmovd %ebx,%xmm3
  475. vmovd %ebp,%xmm4
  476. movl $1,20(%rdi)
  477. call __poly1305_init_avx
  478. L$proceed_avx:
  479. movq %r15,%rdx
  480. movq 0(%rsp),%r15
  481. movq 8(%rsp),%r14
  482. movq 16(%rsp),%r13
  483. movq 24(%rsp),%r12
  484. movq 32(%rsp),%rbp
  485. movq 40(%rsp),%rbx
  486. leaq 48(%rsp),%rax
  487. leaq 48(%rsp),%rsp
  488. L$base2_64_avx_epilogue:
  489. jmp L$do_avx
  490. .p2align 5
  491. L$even_avx:
  492. vmovd 0(%rdi),%xmm0
  493. vmovd 4(%rdi),%xmm1
  494. vmovd 8(%rdi),%xmm2
  495. vmovd 12(%rdi),%xmm3
  496. vmovd 16(%rdi),%xmm4
  497. L$do_avx:
  498. leaq -88(%rsp),%r11
  499. subq $0x178,%rsp
  500. subq $64,%rdx
  501. leaq -32(%rsi),%rax
  502. cmovcq %rax,%rsi
  503. vmovdqu 48(%rdi),%xmm14
  504. leaq 112(%rdi),%rdi
  505. leaq L$const(%rip),%rcx
  506. vmovdqu 32(%rsi),%xmm5
  507. vmovdqu 48(%rsi),%xmm6
  508. vmovdqa 64(%rcx),%xmm15
  509. vpsrldq $6,%xmm5,%xmm7
  510. vpsrldq $6,%xmm6,%xmm8
  511. vpunpckhqdq %xmm6,%xmm5,%xmm9
  512. vpunpcklqdq %xmm6,%xmm5,%xmm5
  513. vpunpcklqdq %xmm8,%xmm7,%xmm8
  514. vpsrlq $40,%xmm9,%xmm9
  515. vpsrlq $26,%xmm5,%xmm6
  516. vpand %xmm15,%xmm5,%xmm5
  517. vpsrlq $4,%xmm8,%xmm7
  518. vpand %xmm15,%xmm6,%xmm6
  519. vpsrlq $30,%xmm8,%xmm8
  520. vpand %xmm15,%xmm7,%xmm7
  521. vpand %xmm15,%xmm8,%xmm8
  522. vpor 32(%rcx),%xmm9,%xmm9
  523. jbe L$skip_loop_avx
  524. vmovdqu -48(%rdi),%xmm11
  525. vmovdqu -32(%rdi),%xmm12
  526. vpshufd $0xEE,%xmm14,%xmm13
  527. vpshufd $0x44,%xmm14,%xmm10
  528. vmovdqa %xmm13,-144(%r11)
  529. vmovdqa %xmm10,0(%rsp)
  530. vpshufd $0xEE,%xmm11,%xmm14
  531. vmovdqu -16(%rdi),%xmm10
  532. vpshufd $0x44,%xmm11,%xmm11
  533. vmovdqa %xmm14,-128(%r11)
  534. vmovdqa %xmm11,16(%rsp)
  535. vpshufd $0xEE,%xmm12,%xmm13
  536. vmovdqu 0(%rdi),%xmm11
  537. vpshufd $0x44,%xmm12,%xmm12
  538. vmovdqa %xmm13,-112(%r11)
  539. vmovdqa %xmm12,32(%rsp)
  540. vpshufd $0xEE,%xmm10,%xmm14
  541. vmovdqu 16(%rdi),%xmm12
  542. vpshufd $0x44,%xmm10,%xmm10
  543. vmovdqa %xmm14,-96(%r11)
  544. vmovdqa %xmm10,48(%rsp)
  545. vpshufd $0xEE,%xmm11,%xmm13
  546. vmovdqu 32(%rdi),%xmm10
  547. vpshufd $0x44,%xmm11,%xmm11
  548. vmovdqa %xmm13,-80(%r11)
  549. vmovdqa %xmm11,64(%rsp)
  550. vpshufd $0xEE,%xmm12,%xmm14
  551. vmovdqu 48(%rdi),%xmm11
  552. vpshufd $0x44,%xmm12,%xmm12
  553. vmovdqa %xmm14,-64(%r11)
  554. vmovdqa %xmm12,80(%rsp)
  555. vpshufd $0xEE,%xmm10,%xmm13
  556. vmovdqu 64(%rdi),%xmm12
  557. vpshufd $0x44,%xmm10,%xmm10
  558. vmovdqa %xmm13,-48(%r11)
  559. vmovdqa %xmm10,96(%rsp)
  560. vpshufd $0xEE,%xmm11,%xmm14
  561. vpshufd $0x44,%xmm11,%xmm11
  562. vmovdqa %xmm14,-32(%r11)
  563. vmovdqa %xmm11,112(%rsp)
  564. vpshufd $0xEE,%xmm12,%xmm13
  565. vmovdqa 0(%rsp),%xmm14
  566. vpshufd $0x44,%xmm12,%xmm12
  567. vmovdqa %xmm13,-16(%r11)
  568. vmovdqa %xmm12,128(%rsp)
  569. jmp L$oop_avx
  570. .p2align 5
  571. L$oop_avx:
  572. vpmuludq %xmm5,%xmm14,%xmm10
  573. vpmuludq %xmm6,%xmm14,%xmm11
  574. vmovdqa %xmm2,32(%r11)
  575. vpmuludq %xmm7,%xmm14,%xmm12
  576. vmovdqa 16(%rsp),%xmm2
  577. vpmuludq %xmm8,%xmm14,%xmm13
  578. vpmuludq %xmm9,%xmm14,%xmm14
  579. vmovdqa %xmm0,0(%r11)
  580. vpmuludq 32(%rsp),%xmm9,%xmm0
  581. vmovdqa %xmm1,16(%r11)
  582. vpmuludq %xmm8,%xmm2,%xmm1
  583. vpaddq %xmm0,%xmm10,%xmm10
  584. vpaddq %xmm1,%xmm14,%xmm14
  585. vmovdqa %xmm3,48(%r11)
  586. vpmuludq %xmm7,%xmm2,%xmm0
  587. vpmuludq %xmm6,%xmm2,%xmm1
  588. vpaddq %xmm0,%xmm13,%xmm13
  589. vmovdqa 48(%rsp),%xmm3
  590. vpaddq %xmm1,%xmm12,%xmm12
  591. vmovdqa %xmm4,64(%r11)
  592. vpmuludq %xmm5,%xmm2,%xmm2
  593. vpmuludq %xmm7,%xmm3,%xmm0
  594. vpaddq %xmm2,%xmm11,%xmm11
  595. vmovdqa 64(%rsp),%xmm4
  596. vpaddq %xmm0,%xmm14,%xmm14
  597. vpmuludq %xmm6,%xmm3,%xmm1
  598. vpmuludq %xmm5,%xmm3,%xmm3
  599. vpaddq %xmm1,%xmm13,%xmm13
  600. vmovdqa 80(%rsp),%xmm2
  601. vpaddq %xmm3,%xmm12,%xmm12
  602. vpmuludq %xmm9,%xmm4,%xmm0
  603. vpmuludq %xmm8,%xmm4,%xmm4
  604. vpaddq %xmm0,%xmm11,%xmm11
  605. vmovdqa 96(%rsp),%xmm3
  606. vpaddq %xmm4,%xmm10,%xmm10
  607. vmovdqa 128(%rsp),%xmm4
  608. vpmuludq %xmm6,%xmm2,%xmm1
  609. vpmuludq %xmm5,%xmm2,%xmm2
  610. vpaddq %xmm1,%xmm14,%xmm14
  611. vpaddq %xmm2,%xmm13,%xmm13
  612. vpmuludq %xmm9,%xmm3,%xmm0
  613. vpmuludq %xmm8,%xmm3,%xmm1
  614. vpaddq %xmm0,%xmm12,%xmm12
  615. vmovdqu 0(%rsi),%xmm0
  616. vpaddq %xmm1,%xmm11,%xmm11
  617. vpmuludq %xmm7,%xmm3,%xmm3
  618. vpmuludq %xmm7,%xmm4,%xmm7
  619. vpaddq %xmm3,%xmm10,%xmm10
  620. vmovdqu 16(%rsi),%xmm1
  621. vpaddq %xmm7,%xmm11,%xmm11
  622. vpmuludq %xmm8,%xmm4,%xmm8
  623. vpmuludq %xmm9,%xmm4,%xmm9
  624. vpsrldq $6,%xmm0,%xmm2
  625. vpaddq %xmm8,%xmm12,%xmm12
  626. vpaddq %xmm9,%xmm13,%xmm13
  627. vpsrldq $6,%xmm1,%xmm3
  628. vpmuludq 112(%rsp),%xmm5,%xmm9
  629. vpmuludq %xmm6,%xmm4,%xmm5
  630. vpunpckhqdq %xmm1,%xmm0,%xmm4
  631. vpaddq %xmm9,%xmm14,%xmm14
  632. vmovdqa -144(%r11),%xmm9
  633. vpaddq %xmm5,%xmm10,%xmm10
  634. vpunpcklqdq %xmm1,%xmm0,%xmm0
  635. vpunpcklqdq %xmm3,%xmm2,%xmm3
  636. vpsrldq $5,%xmm4,%xmm4
  637. vpsrlq $26,%xmm0,%xmm1
  638. vpand %xmm15,%xmm0,%xmm0
  639. vpsrlq $4,%xmm3,%xmm2
  640. vpand %xmm15,%xmm1,%xmm1
  641. vpand 0(%rcx),%xmm4,%xmm4
  642. vpsrlq $30,%xmm3,%xmm3
  643. vpand %xmm15,%xmm2,%xmm2
  644. vpand %xmm15,%xmm3,%xmm3
  645. vpor 32(%rcx),%xmm4,%xmm4
  646. vpaddq 0(%r11),%xmm0,%xmm0
  647. vpaddq 16(%r11),%xmm1,%xmm1
  648. vpaddq 32(%r11),%xmm2,%xmm2
  649. vpaddq 48(%r11),%xmm3,%xmm3
  650. vpaddq 64(%r11),%xmm4,%xmm4
  651. leaq 32(%rsi),%rax
  652. leaq 64(%rsi),%rsi
  653. subq $64,%rdx
  654. cmovcq %rax,%rsi
  655. vpmuludq %xmm0,%xmm9,%xmm5
  656. vpmuludq %xmm1,%xmm9,%xmm6
  657. vpaddq %xmm5,%xmm10,%xmm10
  658. vpaddq %xmm6,%xmm11,%xmm11
  659. vmovdqa -128(%r11),%xmm7
  660. vpmuludq %xmm2,%xmm9,%xmm5
  661. vpmuludq %xmm3,%xmm9,%xmm6
  662. vpaddq %xmm5,%xmm12,%xmm12
  663. vpaddq %xmm6,%xmm13,%xmm13
  664. vpmuludq %xmm4,%xmm9,%xmm9
  665. vpmuludq -112(%r11),%xmm4,%xmm5
  666. vpaddq %xmm9,%xmm14,%xmm14
  667. vpaddq %xmm5,%xmm10,%xmm10
  668. vpmuludq %xmm2,%xmm7,%xmm6
  669. vpmuludq %xmm3,%xmm7,%xmm5
  670. vpaddq %xmm6,%xmm13,%xmm13
  671. vmovdqa -96(%r11),%xmm8
  672. vpaddq %xmm5,%xmm14,%xmm14
  673. vpmuludq %xmm1,%xmm7,%xmm6
  674. vpmuludq %xmm0,%xmm7,%xmm7
  675. vpaddq %xmm6,%xmm12,%xmm12
  676. vpaddq %xmm7,%xmm11,%xmm11
  677. vmovdqa -80(%r11),%xmm9
  678. vpmuludq %xmm2,%xmm8,%xmm5
  679. vpmuludq %xmm1,%xmm8,%xmm6
  680. vpaddq %xmm5,%xmm14,%xmm14
  681. vpaddq %xmm6,%xmm13,%xmm13
  682. vmovdqa -64(%r11),%xmm7
  683. vpmuludq %xmm0,%xmm8,%xmm8
  684. vpmuludq %xmm4,%xmm9,%xmm5
  685. vpaddq %xmm8,%xmm12,%xmm12
  686. vpaddq %xmm5,%xmm11,%xmm11
  687. vmovdqa -48(%r11),%xmm8
  688. vpmuludq %xmm3,%xmm9,%xmm9
  689. vpmuludq %xmm1,%xmm7,%xmm6
  690. vpaddq %xmm9,%xmm10,%xmm10
  691. vmovdqa -16(%r11),%xmm9
  692. vpaddq %xmm6,%xmm14,%xmm14
  693. vpmuludq %xmm0,%xmm7,%xmm7
  694. vpmuludq %xmm4,%xmm8,%xmm5
  695. vpaddq %xmm7,%xmm13,%xmm13
  696. vpaddq %xmm5,%xmm12,%xmm12
  697. vmovdqu 32(%rsi),%xmm5
  698. vpmuludq %xmm3,%xmm8,%xmm7
  699. vpmuludq %xmm2,%xmm8,%xmm8
  700. vpaddq %xmm7,%xmm11,%xmm11
  701. vmovdqu 48(%rsi),%xmm6
  702. vpaddq %xmm8,%xmm10,%xmm10
  703. vpmuludq %xmm2,%xmm9,%xmm2
  704. vpmuludq %xmm3,%xmm9,%xmm3
  705. vpsrldq $6,%xmm5,%xmm7
  706. vpaddq %xmm2,%xmm11,%xmm11
  707. vpmuludq %xmm4,%xmm9,%xmm4
  708. vpsrldq $6,%xmm6,%xmm8
  709. vpaddq %xmm3,%xmm12,%xmm2
  710. vpaddq %xmm4,%xmm13,%xmm3
  711. vpmuludq -32(%r11),%xmm0,%xmm4
  712. vpmuludq %xmm1,%xmm9,%xmm0
  713. vpunpckhqdq %xmm6,%xmm5,%xmm9
  714. vpaddq %xmm4,%xmm14,%xmm4
  715. vpaddq %xmm0,%xmm10,%xmm0
  716. vpunpcklqdq %xmm6,%xmm5,%xmm5
  717. vpunpcklqdq %xmm8,%xmm7,%xmm8
  718. vpsrldq $5,%xmm9,%xmm9
  719. vpsrlq $26,%xmm5,%xmm6
  720. vmovdqa 0(%rsp),%xmm14
  721. vpand %xmm15,%xmm5,%xmm5
  722. vpsrlq $4,%xmm8,%xmm7
  723. vpand %xmm15,%xmm6,%xmm6
  724. vpand 0(%rcx),%xmm9,%xmm9
  725. vpsrlq $30,%xmm8,%xmm8
  726. vpand %xmm15,%xmm7,%xmm7
  727. vpand %xmm15,%xmm8,%xmm8
  728. vpor 32(%rcx),%xmm9,%xmm9
  729. vpsrlq $26,%xmm3,%xmm13
  730. vpand %xmm15,%xmm3,%xmm3
  731. vpaddq %xmm13,%xmm4,%xmm4
  732. vpsrlq $26,%xmm0,%xmm10
  733. vpand %xmm15,%xmm0,%xmm0
  734. vpaddq %xmm10,%xmm11,%xmm1
  735. vpsrlq $26,%xmm4,%xmm10
  736. vpand %xmm15,%xmm4,%xmm4
  737. vpsrlq $26,%xmm1,%xmm11
  738. vpand %xmm15,%xmm1,%xmm1
  739. vpaddq %xmm11,%xmm2,%xmm2
  740. vpaddq %xmm10,%xmm0,%xmm0
  741. vpsllq $2,%xmm10,%xmm10
  742. vpaddq %xmm10,%xmm0,%xmm0
  743. vpsrlq $26,%xmm2,%xmm12
  744. vpand %xmm15,%xmm2,%xmm2
  745. vpaddq %xmm12,%xmm3,%xmm3
  746. vpsrlq $26,%xmm0,%xmm10
  747. vpand %xmm15,%xmm0,%xmm0
  748. vpaddq %xmm10,%xmm1,%xmm1
  749. vpsrlq $26,%xmm3,%xmm13
  750. vpand %xmm15,%xmm3,%xmm3
  751. vpaddq %xmm13,%xmm4,%xmm4
  752. ja L$oop_avx
  753. L$skip_loop_avx:
  754. vpshufd $0x10,%xmm14,%xmm14
  755. addq $32,%rdx
  756. jnz L$ong_tail_avx
  757. vpaddq %xmm2,%xmm7,%xmm7
  758. vpaddq %xmm0,%xmm5,%xmm5
  759. vpaddq %xmm1,%xmm6,%xmm6
  760. vpaddq %xmm3,%xmm8,%xmm8
  761. vpaddq %xmm4,%xmm9,%xmm9
  762. L$ong_tail_avx:
  763. vmovdqa %xmm2,32(%r11)
  764. vmovdqa %xmm0,0(%r11)
  765. vmovdqa %xmm1,16(%r11)
  766. vmovdqa %xmm3,48(%r11)
  767. vmovdqa %xmm4,64(%r11)
  768. vpmuludq %xmm7,%xmm14,%xmm12
  769. vpmuludq %xmm5,%xmm14,%xmm10
  770. vpshufd $0x10,-48(%rdi),%xmm2
  771. vpmuludq %xmm6,%xmm14,%xmm11
  772. vpmuludq %xmm8,%xmm14,%xmm13
  773. vpmuludq %xmm9,%xmm14,%xmm14
  774. vpmuludq %xmm8,%xmm2,%xmm0
  775. vpaddq %xmm0,%xmm14,%xmm14
  776. vpshufd $0x10,-32(%rdi),%xmm3
  777. vpmuludq %xmm7,%xmm2,%xmm1
  778. vpaddq %xmm1,%xmm13,%xmm13
  779. vpshufd $0x10,-16(%rdi),%xmm4
  780. vpmuludq %xmm6,%xmm2,%xmm0
  781. vpaddq %xmm0,%xmm12,%xmm12
  782. vpmuludq %xmm5,%xmm2,%xmm2
  783. vpaddq %xmm2,%xmm11,%xmm11
  784. vpmuludq %xmm9,%xmm3,%xmm3
  785. vpaddq %xmm3,%xmm10,%xmm10
  786. vpshufd $0x10,0(%rdi),%xmm2
  787. vpmuludq %xmm7,%xmm4,%xmm1
  788. vpaddq %xmm1,%xmm14,%xmm14
  789. vpmuludq %xmm6,%xmm4,%xmm0
  790. vpaddq %xmm0,%xmm13,%xmm13
  791. vpshufd $0x10,16(%rdi),%xmm3
  792. vpmuludq %xmm5,%xmm4,%xmm4
  793. vpaddq %xmm4,%xmm12,%xmm12
  794. vpmuludq %xmm9,%xmm2,%xmm1
  795. vpaddq %xmm1,%xmm11,%xmm11
  796. vpshufd $0x10,32(%rdi),%xmm4
  797. vpmuludq %xmm8,%xmm2,%xmm2
  798. vpaddq %xmm2,%xmm10,%xmm10
  799. vpmuludq %xmm6,%xmm3,%xmm0
  800. vpaddq %xmm0,%xmm14,%xmm14
  801. vpmuludq %xmm5,%xmm3,%xmm3
  802. vpaddq %xmm3,%xmm13,%xmm13
  803. vpshufd $0x10,48(%rdi),%xmm2
  804. vpmuludq %xmm9,%xmm4,%xmm1
  805. vpaddq %xmm1,%xmm12,%xmm12
  806. vpshufd $0x10,64(%rdi),%xmm3
  807. vpmuludq %xmm8,%xmm4,%xmm0
  808. vpaddq %xmm0,%xmm11,%xmm11
  809. vpmuludq %xmm7,%xmm4,%xmm4
  810. vpaddq %xmm4,%xmm10,%xmm10
  811. vpmuludq %xmm5,%xmm2,%xmm2
  812. vpaddq %xmm2,%xmm14,%xmm14
  813. vpmuludq %xmm9,%xmm3,%xmm1
  814. vpaddq %xmm1,%xmm13,%xmm13
  815. vpmuludq %xmm8,%xmm3,%xmm0
  816. vpaddq %xmm0,%xmm12,%xmm12
  817. vpmuludq %xmm7,%xmm3,%xmm1
  818. vpaddq %xmm1,%xmm11,%xmm11
  819. vpmuludq %xmm6,%xmm3,%xmm3
  820. vpaddq %xmm3,%xmm10,%xmm10
  821. jz L$short_tail_avx
  822. vmovdqu 0(%rsi),%xmm0
  823. vmovdqu 16(%rsi),%xmm1
  824. vpsrldq $6,%xmm0,%xmm2
  825. vpsrldq $6,%xmm1,%xmm3
  826. vpunpckhqdq %xmm1,%xmm0,%xmm4
  827. vpunpcklqdq %xmm1,%xmm0,%xmm0
  828. vpunpcklqdq %xmm3,%xmm2,%xmm3
  829. vpsrlq $40,%xmm4,%xmm4
  830. vpsrlq $26,%xmm0,%xmm1
  831. vpand %xmm15,%xmm0,%xmm0
  832. vpsrlq $4,%xmm3,%xmm2
  833. vpand %xmm15,%xmm1,%xmm1
  834. vpsrlq $30,%xmm3,%xmm3
  835. vpand %xmm15,%xmm2,%xmm2
  836. vpand %xmm15,%xmm3,%xmm3
  837. vpor 32(%rcx),%xmm4,%xmm4
  838. vpshufd $0x32,-64(%rdi),%xmm9
  839. vpaddq 0(%r11),%xmm0,%xmm0
  840. vpaddq 16(%r11),%xmm1,%xmm1
  841. vpaddq 32(%r11),%xmm2,%xmm2
  842. vpaddq 48(%r11),%xmm3,%xmm3
  843. vpaddq 64(%r11),%xmm4,%xmm4
  844. vpmuludq %xmm0,%xmm9,%xmm5
  845. vpaddq %xmm5,%xmm10,%xmm10
  846. vpmuludq %xmm1,%xmm9,%xmm6
  847. vpaddq %xmm6,%xmm11,%xmm11
  848. vpmuludq %xmm2,%xmm9,%xmm5
  849. vpaddq %xmm5,%xmm12,%xmm12
  850. vpshufd $0x32,-48(%rdi),%xmm7
  851. vpmuludq %xmm3,%xmm9,%xmm6
  852. vpaddq %xmm6,%xmm13,%xmm13
  853. vpmuludq %xmm4,%xmm9,%xmm9
  854. vpaddq %xmm9,%xmm14,%xmm14
  855. vpmuludq %xmm3,%xmm7,%xmm5
  856. vpaddq %xmm5,%xmm14,%xmm14
  857. vpshufd $0x32,-32(%rdi),%xmm8
  858. vpmuludq %xmm2,%xmm7,%xmm6
  859. vpaddq %xmm6,%xmm13,%xmm13
  860. vpshufd $0x32,-16(%rdi),%xmm9
  861. vpmuludq %xmm1,%xmm7,%xmm5
  862. vpaddq %xmm5,%xmm12,%xmm12
  863. vpmuludq %xmm0,%xmm7,%xmm7
  864. vpaddq %xmm7,%xmm11,%xmm11
  865. vpmuludq %xmm4,%xmm8,%xmm8
  866. vpaddq %xmm8,%xmm10,%xmm10
  867. vpshufd $0x32,0(%rdi),%xmm7
  868. vpmuludq %xmm2,%xmm9,%xmm6
  869. vpaddq %xmm6,%xmm14,%xmm14
  870. vpmuludq %xmm1,%xmm9,%xmm5
  871. vpaddq %xmm5,%xmm13,%xmm13
  872. vpshufd $0x32,16(%rdi),%xmm8
  873. vpmuludq %xmm0,%xmm9,%xmm9
  874. vpaddq %xmm9,%xmm12,%xmm12
  875. vpmuludq %xmm4,%xmm7,%xmm6
  876. vpaddq %xmm6,%xmm11,%xmm11
  877. vpshufd $0x32,32(%rdi),%xmm9
  878. vpmuludq %xmm3,%xmm7,%xmm7
  879. vpaddq %xmm7,%xmm10,%xmm10
  880. vpmuludq %xmm1,%xmm8,%xmm5
  881. vpaddq %xmm5,%xmm14,%xmm14
  882. vpmuludq %xmm0,%xmm8,%xmm8
  883. vpaddq %xmm8,%xmm13,%xmm13
  884. vpshufd $0x32,48(%rdi),%xmm7
  885. vpmuludq %xmm4,%xmm9,%xmm6
  886. vpaddq %xmm6,%xmm12,%xmm12
  887. vpshufd $0x32,64(%rdi),%xmm8
  888. vpmuludq %xmm3,%xmm9,%xmm5
  889. vpaddq %xmm5,%xmm11,%xmm11
  890. vpmuludq %xmm2,%xmm9,%xmm9
  891. vpaddq %xmm9,%xmm10,%xmm10
  892. vpmuludq %xmm0,%xmm7,%xmm7
  893. vpaddq %xmm7,%xmm14,%xmm14
  894. vpmuludq %xmm4,%xmm8,%xmm6
  895. vpaddq %xmm6,%xmm13,%xmm13
  896. vpmuludq %xmm3,%xmm8,%xmm5
  897. vpaddq %xmm5,%xmm12,%xmm12
  898. vpmuludq %xmm2,%xmm8,%xmm6
  899. vpaddq %xmm6,%xmm11,%xmm11
  900. vpmuludq %xmm1,%xmm8,%xmm8
  901. vpaddq %xmm8,%xmm10,%xmm10
  902. L$short_tail_avx:
  903. vpsrldq $8,%xmm14,%xmm9
  904. vpsrldq $8,%xmm13,%xmm8
  905. vpsrldq $8,%xmm11,%xmm6
  906. vpsrldq $8,%xmm10,%xmm5
  907. vpsrldq $8,%xmm12,%xmm7
  908. vpaddq %xmm8,%xmm13,%xmm13
  909. vpaddq %xmm9,%xmm14,%xmm14
  910. vpaddq %xmm5,%xmm10,%xmm10
  911. vpaddq %xmm6,%xmm11,%xmm11
  912. vpaddq %xmm7,%xmm12,%xmm12
  913. vpsrlq $26,%xmm13,%xmm3
  914. vpand %xmm15,%xmm13,%xmm13
  915. vpaddq %xmm3,%xmm14,%xmm14
  916. vpsrlq $26,%xmm10,%xmm0
  917. vpand %xmm15,%xmm10,%xmm10
  918. vpaddq %xmm0,%xmm11,%xmm11
  919. vpsrlq $26,%xmm14,%xmm4
  920. vpand %xmm15,%xmm14,%xmm14
  921. vpsrlq $26,%xmm11,%xmm1
  922. vpand %xmm15,%xmm11,%xmm11
  923. vpaddq %xmm1,%xmm12,%xmm12
  924. vpaddq %xmm4,%xmm10,%xmm10
  925. vpsllq $2,%xmm4,%xmm4
  926. vpaddq %xmm4,%xmm10,%xmm10
  927. vpsrlq $26,%xmm12,%xmm2
  928. vpand %xmm15,%xmm12,%xmm12
  929. vpaddq %xmm2,%xmm13,%xmm13
  930. vpsrlq $26,%xmm10,%xmm0
  931. vpand %xmm15,%xmm10,%xmm10
  932. vpaddq %xmm0,%xmm11,%xmm11
  933. vpsrlq $26,%xmm13,%xmm3
  934. vpand %xmm15,%xmm13,%xmm13
  935. vpaddq %xmm3,%xmm14,%xmm14
  936. vmovd %xmm10,-112(%rdi)
  937. vmovd %xmm11,-108(%rdi)
  938. vmovd %xmm12,-104(%rdi)
  939. vmovd %xmm13,-100(%rdi)
  940. vmovd %xmm14,-96(%rdi)
  941. leaq 88(%r11),%rsp
  942. vzeroupper
  943. .byte 0xf3,0xc3
  944. .p2align 5
  # poly1305_emit_avx -- compute the final 16-byte tag for a hash that may be
  # stored in base 2^26 (five limbs in 32-bit slots at 0..16(%rdi)).
  # Inputs, as used by the code below:
  #   rdi = context (only read here)
  #   rsi = 16-byte output buffer
  #   rdx = pointer to a 128-bit value added to the reduced hash (presumably the
  #         Poly1305 nonce -- confirm against the C caller)
  # The path below uses rax, rcx, r8-r11 and the flags as scratch.
  945. poly1305_emit_avx:
  # The dword at 20(%rdi) is the flag poly1305_blocks_avx2 sets to 1 when it
  # converts the hash to base 2^26. When it is 0, hand over to L$emit, which is
  # defined outside this part of the file.
  946. cmpl $0,20(%rdi)
  947. je L$emit
  # Load the five limbs h0..h4; movl zero-extends into the full 64-bit registers.
  948. movl 0(%rdi),%eax
  949. movl 4(%rdi),%ecx
  950. movl 8(%rdi),%r8d
  951. movl 12(%rdi),%r11d
  952. movl 16(%rdi),%r10d
  # Recombine into a three-word value r10:r9:r8 =
  #   h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104.
  # Low word: h0 + (h1 << 26) + (h2 << 52); the bits of h2 that do not fit
  # (h2 >> 12) start the middle word, and the carry is propagated with adcq.
  953. shlq $26,%rcx
  954. movq %r8,%r9
  955. shlq $52,%r8
  956. addq %rcx,%rax
  957. shrq $12,%r9
  958. addq %rax,%r8
  959. adcq $0,%r9
  # Middle word: += (h3 << 14) + (h4 << 40); top word: (h4 >> 24) plus carry.
  960. shlq $14,%r11
  961. movq %r10,%rax
  962. shrq $24,%r10
  963. addq %r11,%r9
  964. shlq $40,%rax
  965. addq %rax,%r9
  966. adcq $0,%r10
  # Fold everything at or above bit 130 back into the low words:
  # rax = (r10 >> 2) + (r10 & ~3) = 5 * (r10 >> 2), while r10 keeps only its low
  # two bits (2^130 is congruent to 5 modulo 2^130 - 5).
  967. movq %r10,%rax
  968. movq %r10,%rcx
  969. andq $3,%r10
  970. shrq $2,%rax
  971. andq $-4,%rcx
  972. addq %rcx,%rax
  973. addq %rax,%r8
  974. adcq $0,%r9
  975. adcq $0,%r10
  # Final conditional reduction without a branch: keep h in rcx:rax, compute
  # h + 5 in r10:r9:r8 and shift the top word right by 2. shrq sets ZF from its
  # result, so the cmovnz pair selects the low 128 bits of h + 5 exactly when
  # h + 5 reached bit 130; otherwise h is kept.
  976. movq %r8,%rax
  977. addq $5,%r8
  978. movq %r9,%rcx
  979. adcq $0,%r9
  980. adcq $0,%r10
  981. shrq $2,%r10
  982. cmovnzq %r8,%rax
  983. cmovnzq %r9,%rcx
  # Add the 128-bit value at (%rdx); the carry out of bit 127 is discarded.
  # Then store the 16-byte result at (%rsi).
  984. addq 0(%rdx),%rax
  985. adcq 8(%rdx),%rcx
  986. movq %rax,0(%rsi)
  987. movq %rcx,8(%rsi)
  # Bytes F3 C3 encode "rep ret".
  988. .byte 0xf3,0xc3
  989. .p2align 5
  # poly1305_blocks_avx2 -- absorb input into the Poly1305 hash, processing four
  # 16-byte blocks per iteration with AVX2.
  # Registers on entry, as used by the code below:
  #   rdi = context: hash at 0(%rdi), "hash is in base 2^26" flag at 20(%rdi),
  #         two 64-bit key words at 24(%rdi) and 32(%rdi), and nine 16-byte
  #         table entries at 48..176(%rdi) (presumably key powers written by
  #         __poly1305_init_avx -- that helper is outside this chunk)
  #   rsi = input pointer
  #   rdx = length in bytes (rounded down to a multiple of 16 below)
  #   rcx = value added into the top hash word for every block on the scalar
  #         paths (presumably the Poly1305 pad bit -- confirm against caller)
  # Flow: short inputs whose hash is still in base 2^64 go to L$blocks. Otherwise
  # blocks are fed one at a time to __poly1305_block until the remaining length
  # is a multiple of 64, the hash is converted to five 26-bit limbs, and the rest
  # runs through L$oop_avx2 / L$tail_avx2. Inputs of 512 bytes or more may be
  # handed to L$blocks_avx512.
  # Vector register roles in the AVX2 loop: ymm0..ymm4 = hash limbs h0..h4,
  # ymm7/ymm8/ymm9/ymm10/ymm6 = input limbs 0..4, ymm5 = constant from
  # 64(%rcx) used as the limb mask, ymm11..ymm15 = 64-bit column sums.
  990. poly1305_blocks_avx2:
  # r8d = base 2^26 flag. With fewer than 128 bytes and the flag clear, jump to
  # L$blocks (defined outside this chunk; presumably the scalar routine).
  991. movl 20(%rdi),%r8d
  992. cmpq $128,%rdx
  993. jae L$blocks_avx2
  994. testl %r8d,%r8d
  995. jz L$blocks
  996. L$blocks_avx2:
  # Only whole 16-byte blocks are processed; return at once if none remain.
  997. andq $-16,%rdx
  998. jz L$no_data_avx2
  999. vzeroupper
  # Flag clear: the hash is still in base 2^64, take the L$base2_64_avx2 entry.
  1000. testl %r8d,%r8d
  1001. jz L$base2_64_avx2
  # Flag set and length already a multiple of 64: no scalar pre-processing.
  1002. testq $63,%rdx
  1003. jz L$even_avx2
  # Scalar pre-processing path: spill the six callee-saved registers used below.
  1004. pushq %rbx
  1005. pushq %rbp
  1006. pushq %r12
  1007. pushq %r13
  1008. pushq %r14
  1009. pushq %r15
  1010. L$blocks_avx2_body:
  # r15 = remaining length. The two 64-bit loads each pick up two 32-bit limb
  # slots, the 32-bit load picks up the fifth limb; r11 and r13 receive the two
  # key words.
  1011. movq %rdx,%r15
  1012. movq 0(%rdi),%r8
  1013. movq 8(%rdi),%r9
  1014. movl 16(%rdi),%ebp
  1015. movq 24(%rdi),%r11
  1016. movq 32(%rdi),%r13
  # Convert the hash from base 2^26 to base 2^64 (r14 = low word, rbx = middle,
  # rbp = top). The low limb of each pair is taken with a 32-bit move; the mask
  # -2147483648 clears the low 31 bits so the upper limb can be shifted into
  # place (>> 6 puts it at bit 26, >> 18 puts it at bit 14 of the next word).
  1017. movl %r8d,%r14d
  1018. andq $-2147483648,%r8
  1019. movq %r9,%r12
  1020. movl %r9d,%ebx
  1021. andq $-2147483648,%r9
  1022. shrq $6,%r8
  1023. shlq $52,%r12
  1024. addq %r8,%r14
  1025. shrq $12,%rbx
  1026. shrq $18,%r9
  1027. addq %r12,%r14
  1028. adcq %r9,%rbx
  # Fifth limb: its low part lands at bit 40 of the middle word, the rest (>> 24)
  # forms the top word together with the carry.
  1029. movq %rbp,%r8
  1030. shlq $40,%r8
  1031. shrq $24,%rbp
  1032. addq %r8,%rbx
  1033. adcq $0,%rbp
  # Fold bits at or above 2^130 back in: r8 = (rbp >> 2) + (rbp & ~3)
  # = 5 * (rbp >> 2), added to the low word; rbp keeps its low two bits.
  1034. movq $-4,%r9
  1035. movq %rbp,%r8
  1036. andq %rbp,%r9
  1037. shrq $2,%r8
  1038. andq $3,%rbp
  1039. addq %r9,%r8
  1040. addq %r8,%r14
  1041. adcq $0,%rbx
  1042. adcq $0,%rbp
  # r12 = rax = second key word, r13 = that word + (that word >> 2).
  # Presumably these, with r11, are the operands __poly1305_block expects.
  1043. movq %r13,%r12
  1044. movq %r13,%rax
  1045. shrq $2,%r13
  1046. addq %r12,%r13
  # Scalar loop: add one 16-byte block (plus rcx into the top word, with carry)
  # to the hash, call __poly1305_block, and repeat until the remaining length in
  # r15 is a multiple of 64. rax is reloaded from r12 after each call.
  1047. L$base2_26_pre_avx2:
  1048. addq 0(%rsi),%r14
  1049. adcq 8(%rsi),%rbx
  1050. leaq 16(%rsi),%rsi
  1051. adcq %rcx,%rbp
  1052. subq $16,%r15
  1053. call __poly1305_block
  1054. movq %r12,%rax
  1055. testq $63,%r15
  1056. jnz L$base2_26_pre_avx2
  # When rcx is zero the hash is written back in base 2^64 form instead.
  # (This relies on rcx surviving the helper call -- confirm in the helper.)
  1057. testq %rcx,%rcx
  1058. jz L$store_base2_64_avx2
  # Convert back to base 2^26: rax, rdx, r14, rbx, rbp = limbs 0..4.
  #   limb0 = w0 & mask, limb1 = (w0 >> 26) & mask,
  #   limb2 = ((w0 >> 52) | (w1 << 12)) & mask, limb3 = (w1 >> 14) & mask,
  #   limb4 = (w2 << 24) | (w1 >> 40), with mask = 0x3ffffff.
  1059. movq %r14,%rax
  1060. movq %r14,%rdx
  1061. shrq $52,%r14
  1062. movq %rbx,%r11
  1063. movq %rbx,%r12
  1064. shrq $26,%rdx
  1065. andq $0x3ffffff,%rax
  1066. shlq $12,%r11
  1067. andq $0x3ffffff,%rdx
  1068. shrq $14,%rbx
  1069. orq %r11,%r14
  1070. shlq $24,%rbp
  1071. andq $0x3ffffff,%r14
  1072. shrq $40,%r12
  1073. andq $0x3ffffff,%rbx
  1074. orq %r12,%rbp
  # Nothing left for the vector code: store the limbs and return.
  1075. testq %r15,%r15
  1076. jz L$store_base2_26_avx2
  # Otherwise move the limbs into xmm0..xmm4 and continue with the vector code.
  1077. vmovd %eax,%xmm0
  1078. vmovd %edx,%xmm1
  1079. vmovd %r14d,%xmm2
  1080. vmovd %ebx,%xmm3
  1081. vmovd %ebp,%xmm4
  1082. jmp L$proceed_avx2
  1083. .p2align 5
  # Write the hash back as three 64-bit words.
  1084. L$store_base2_64_avx2:
  1085. movq %r14,0(%rdi)
  1086. movq %rbx,8(%rdi)
  1087. movq %rbp,16(%rdi)
  1088. jmp L$done_avx2
  1089. .p2align 4
  # Write the hash back as five 32-bit limbs.
  1090. L$store_base2_26_avx2:
  1091. movl %eax,0(%rdi)
  1092. movl %edx,4(%rdi)
  1093. movl %r14d,8(%rdi)
  1094. movl %ebx,12(%rdi)
  1095. movl %ebp,16(%rdi)
  1096. .p2align 4
  # Reload the six saved registers (r15 was pushed last, so it sits at
  # 0(%rsp)), release the 48 bytes and return.
  1097. L$done_avx2:
  1098. movq 0(%rsp),%r15
  1099. movq 8(%rsp),%r14
  1100. movq 16(%rsp),%r13
  1101. movq 24(%rsp),%r12
  1102. movq 32(%rsp),%rbp
  1103. movq 40(%rsp),%rbx
  1104. leaq 48(%rsp),%rsp
  1105. L$no_data_avx2:
  1106. L$blocks_avx2_epilogue:
  # Bytes F3 C3 encode "rep ret".
  1107. .byte 0xf3,0xc3
  1108. .p2align 5
  # Entry for a hash still held in base 2^64: same register spill as above.
  1109. L$base2_64_avx2:
  1110. pushq %rbx
  1111. pushq %rbp
  1112. pushq %r12
  1113. pushq %r13
  1114. pushq %r14
  1115. pushq %r15
  1116. L$base2_64_avx2_body:
  # r15 = length; r11/r13 = key words; r14:rbx:rbp = hash words (the third is
  # loaded as 32 bits); r12 = rax = second key word, r13 = it + (it >> 2).
  1117. movq %rdx,%r15
  1118. movq 24(%rdi),%r11
  1119. movq 32(%rdi),%r13
  1120. movq 0(%rdi),%r14
  1121. movq 8(%rdi),%rbx
  1122. movl 16(%rdi),%ebp
  1123. movq %r13,%r12
  1124. movq %r13,%rax
  1125. shrq $2,%r13
  1126. addq %r12,%r13
  # Length already a multiple of 64: skip the scalar loop.
  1127. testq $63,%rdx
  1128. jz L$init_avx2
  # Scalar loop, identical in shape to L$base2_26_pre_avx2 above.
  1129. L$base2_64_pre_avx2:
  1130. addq 0(%rsi),%r14
  1131. adcq 8(%rsi),%rbx
  1132. leaq 16(%rsi),%rsi
  1133. adcq %rcx,%rbp
  1134. subq $16,%r15
  1135. call __poly1305_block
  1136. movq %r12,%rax
  1137. testq $63,%r15
  1138. jnz L$base2_64_pre_avx2
  # Convert the hash to five 26-bit limbs (same formulae as above, with r8/r9 as
  # temporaries) and move them into xmm0..xmm4.
  1139. L$init_avx2:
  1140. movq %r14,%rax
  1141. movq %r14,%rdx
  1142. shrq $52,%r14
  1143. movq %rbx,%r8
  1144. movq %rbx,%r9
  1145. shrq $26,%rdx
  1146. andq $0x3ffffff,%rax
  1147. shlq $12,%r8
  1148. andq $0x3ffffff,%rdx
  1149. shrq $14,%rbx
  1150. orq %r8,%r14
  1151. shlq $24,%rbp
  1152. andq $0x3ffffff,%r14
  1153. shrq $40,%r9
  1154. andq $0x3ffffff,%rbx
  1155. orq %r9,%rbp
  1156. vmovd %eax,%xmm0
  1157. vmovd %edx,%xmm1
  1158. vmovd %r14d,%xmm2
  1159. vmovd %ebx,%xmm3
  1160. vmovd %ebp,%xmm4
  # Record that the hash is now in base 2^26, then call the table set-up helper
  # (outside this chunk; presumably it fills the entries read from 48(%rdi) on).
  1161. movl $1,20(%rdi)
  1162. call __poly1305_init_avx
  # Common continuation of both scalar paths: rdx = remaining length,
  # r10d = third dword of the capability vector, r11d = 3221291008
  # (0xC0010000, i.e. bits 31, 30 and 16). The saved registers are reloaded and
  # the 48 bytes released before entering the vector code. rax = rsp + 48 is
  # not read on the AVX2 path before being overwritten further down.
  1163. L$proceed_avx2:
  1164. movq %r15,%rdx
  1165. movl _OPENSSL_ia32cap_P+8(%rip),%r10d
  1166. movl $3221291008,%r11d
  1167. movq 0(%rsp),%r15
  1168. movq 8(%rsp),%r14
  1169. movq 16(%rsp),%r13
  1170. movq 24(%rsp),%r12
  1171. movq 32(%rsp),%rbp
  1172. movq 40(%rsp),%rbx
  1173. leaq 48(%rsp),%rax
  1174. leaq 48(%rsp),%rsp
  1175. L$base2_64_avx2_epilogue:
  1176. jmp L$do_avx2
  1177. .p2align 5
  # Entry when the hash is already in base 2^26 and the length is a multiple of
  # 64: load the capability dword and the five limbs straight from the context.
  1178. L$even_avx2:
  1179. movl _OPENSSL_ia32cap_P+8(%rip),%r10d
  1180. vmovd 0(%rdi),%xmm0
  1181. vmovd 4(%rdi),%xmm1
  1182. vmovd 8(%rdi),%xmm2
  1183. vmovd 12(%rdi),%xmm3
  1184. vmovd 16(%rdi),%xmm4
  # With at least 512 bytes, branch to the AVX-512 code when bit 16 of
  # (r10d & r11d) is set (presumably the AVX512F capability bit).
  # NOTE(review): r11d is loaded with the mask only on the L$proceed_avx2 path.
  # When this point is reached from L$even_avx2, nothing in this routine has
  # written r11 yet, so bit 16 of the AND depends on whatever the caller left
  # there. The branch is still gated on the capability bit in r10d, so this
  # looks like it affects only which path is chosen, not the result -- confirm
  # before changing.
  1185. L$do_avx2:
  1186. cmpq $512,%rdx
  1187. jb L$skip_avx512
  1188. andl %r11d,%r10d
  1189. testl $65536,%r10d
  1190. jnz L$blocks_avx512
  1191. L$skip_avx512:
  # r11 = rsp - 8 remembers the incoming stack pointer (restored with
  # "leaq 8(%r11),%rsp" at the end). Reserve 0x128 bytes and, a few lines down,
  # align rsp down to a 512-byte boundary for the aligned vector stores.
  # rcx = constant table, rdi is advanced by 48+64 = 112 so the nine table
  # entries sit at -64..64(%rdi); ymm7 = dword permutation indices for vpermd.
  1192. leaq -8(%rsp),%r11
  1193. subq $0x128,%rsp
  1194. leaq L$const(%rip),%rcx
  1195. leaq 48+64(%rdi),%rdi
  1196. vmovdqa 96(%rcx),%ymm7
  1197. vmovdqu -64(%rdi),%xmm9
  1198. andq $-512,%rsp
  1199. vmovdqu -48(%rdi),%xmm10
  1200. vmovdqu -32(%rdi),%xmm6
  1201. vmovdqu -16(%rdi),%xmm11
  1202. vmovdqu 0(%rdi),%xmm12
  1203. vmovdqu 16(%rdi),%xmm13
  # rax = rsp + 144 is a biased base pointer: "N-144(%rax)" addresses N(%rsp).
  1204. leaq 144(%rsp),%rax
  # Each 16-byte table entry is spread over a 32-byte vector with vpermd and
  # spilled to the stack, giving nine slots T0..T8 at 0, 32, ..., 256(%rsp)
  # in the same order as they appear in the context.
  1205. vmovdqu 32(%rdi),%xmm14
  1206. vpermd %ymm9,%ymm7,%ymm9
  1207. vmovdqu 48(%rdi),%xmm15
  1208. vpermd %ymm10,%ymm7,%ymm10
  1209. vmovdqu 64(%rdi),%xmm5
  1210. vpermd %ymm6,%ymm7,%ymm6
  1211. vmovdqa %ymm9,0(%rsp)
  1212. vpermd %ymm11,%ymm7,%ymm11
  1213. vmovdqa %ymm10,32-144(%rax)
  1214. vpermd %ymm12,%ymm7,%ymm12
  1215. vmovdqa %ymm6,64-144(%rax)
  1216. vpermd %ymm13,%ymm7,%ymm13
  1217. vmovdqa %ymm11,96-144(%rax)
  1218. vpermd %ymm14,%ymm7,%ymm14
  1219. vmovdqa %ymm12,128-144(%rax)
  1220. vpermd %ymm15,%ymm7,%ymm15
  1221. vmovdqa %ymm13,160-144(%rax)
  1222. vpermd %ymm5,%ymm7,%ymm5
  1223. vmovdqa %ymm14,192-144(%rax)
  1224. vmovdqa %ymm15,224-144(%rax)
  1225. vmovdqa %ymm5,256-144(%rax)
  # ymm5 = constant from 64(%rcx), used below as the limb mask after each shift.
  1226. vmovdqa 64(%rcx),%ymm5
  # Load the first 64 input bytes: blocks 0 and 1 go to the low 128-bit lanes of
  # ymm7/ymm8, blocks 2 and 3 to the high lanes.
  1227. vmovdqu 0(%rsi),%xmm7
  1228. vmovdqu 16(%rsi),%xmm8
  1229. vinserti128 $1,32(%rsi),%ymm7,%ymm7
  1230. vinserti128 $1,48(%rsi),%ymm8,%ymm8
  1231. leaq 64(%rsi),%rsi
  # Split every block into five 26-bit limbs, one block per 64-bit element:
  #   the byte shift by 6 plus unpack gives bits 48..111 of each block in ymm9,
  #   from which limb2 = (>> 4) & mask and limb3 = (>> 30) & mask;
  #   the low qwords give limb0 = & mask and limb1 = (>> 26) & mask;
  #   the high qwords give limb4 = >> 40, OR-ed with the constant at 32(%rcx)
  #   (presumably the 2^128 pad bit, i.e. bit 24 of limb4 -- confirm in L$const).
  1232. vpsrldq $6,%ymm7,%ymm9
  1233. vpsrldq $6,%ymm8,%ymm10
  1234. vpunpckhqdq %ymm8,%ymm7,%ymm6
  1235. vpunpcklqdq %ymm10,%ymm9,%ymm9
  1236. vpunpcklqdq %ymm8,%ymm7,%ymm7
  1237. vpsrlq $30,%ymm9,%ymm10
  1238. vpsrlq $4,%ymm9,%ymm9
  1239. vpsrlq $26,%ymm7,%ymm8
  1240. vpsrlq $40,%ymm6,%ymm6
  1241. vpand %ymm5,%ymm9,%ymm9
  1242. vpand %ymm5,%ymm7,%ymm7
  1243. vpand %ymm5,%ymm8,%ymm8
  1244. vpand %ymm5,%ymm10,%ymm10
  1245. vpor 32(%rcx),%ymm6,%ymm6
  # h2 += input limb2 is done here; the other four additions happen at the top
  # of the loop/tail. If these were the only 64 bytes, go straight to the tail.
  1246. vpaddq %ymm2,%ymm9,%ymm2
  1247. subq $64,%rdx
  1248. jz L$tail_avx2
  1249. jmp L$oop_avx2
  1250. .p2align 5
  # Main loop: one iteration multiplies the four-lane accumulator (hash plus the
  # input limbs split previously) by the table slots on the stack and reduces
  # the carries. Loading and splitting of the next 64 input bytes is interleaved
  # with the arithmetic.
  1251. L$oop_avx2:
  # h0, h1, h3, h4 += input limbs; load multipliers T0, T1, T3, T6 and T8
  # (48(%rax) is 192(%rsp), 112(%rax) is 256(%rsp)).
  1252. vpaddq %ymm0,%ymm7,%ymm0
  1253. vmovdqa 0(%rsp),%ymm7
  1254. vpaddq %ymm1,%ymm8,%ymm1
  1255. vmovdqa 32(%rsp),%ymm8
  1256. vpaddq %ymm3,%ymm10,%ymm3
  1257. vmovdqa 96(%rsp),%ymm9
  1258. vpaddq %ymm4,%ymm6,%ymm4
  1259. vmovdqa 48(%rax),%ymm10
  1260. vmovdqa 112(%rax),%ymm5
  # Start the five column sums with the h2 products:
  # ymm13 = h2*T0, ymm14 = h2*T1, ymm15 = h2*T3, ymm11 = h2*T6, ymm12 = h2*T8.
  # vpmuludq multiplies the low dword of every 64-bit element.
  1261. vpmuludq %ymm2,%ymm7,%ymm13
  1262. vpmuludq %ymm2,%ymm8,%ymm14
  1263. vpmuludq %ymm2,%ymm9,%ymm15
  1264. vpmuludq %ymm2,%ymm10,%ymm11
  1265. vpmuludq %ymm2,%ymm5,%ymm12
  # Remaining products are accumulated pairwise, with ymm6 and ymm2 as scratch.
  1266. vpmuludq %ymm0,%ymm8,%ymm6
  1267. vpmuludq %ymm1,%ymm8,%ymm2
  1268. vpaddq %ymm6,%ymm12,%ymm12
  1269. vpaddq %ymm2,%ymm13,%ymm13
  1270. vpmuludq %ymm3,%ymm8,%ymm6
  1271. vpmuludq 64(%rsp),%ymm4,%ymm2
  1272. vpaddq %ymm6,%ymm15,%ymm15
  1273. vpaddq %ymm2,%ymm11,%ymm11
  1274. vmovdqa -16(%rax),%ymm8
  1275. vpmuludq %ymm0,%ymm7,%ymm6
  1276. vpmuludq %ymm1,%ymm7,%ymm2
  1277. vpaddq %ymm6,%ymm11,%ymm11
  1278. vpaddq %ymm2,%ymm12,%ymm12
  1279. vpmuludq %ymm3,%ymm7,%ymm6
  1280. vpmuludq %ymm4,%ymm7,%ymm2
  # The previous ymm7 (T0) is no longer needed: start loading the next 64 bytes.
  1281. vmovdqu 0(%rsi),%xmm7
  1282. vpaddq %ymm6,%ymm14,%ymm14
  1283. vpaddq %ymm2,%ymm15,%ymm15
  1284. vinserti128 $1,32(%rsi),%ymm7,%ymm7
  1285. vpmuludq %ymm3,%ymm8,%ymm6
  1286. vpmuludq %ymm4,%ymm8,%ymm2
  1287. vmovdqu 16(%rsi),%xmm8
  1288. vpaddq %ymm6,%ymm11,%ymm11
  1289. vpaddq %ymm2,%ymm12,%ymm12
  1290. vmovdqa 16(%rax),%ymm2
  1291. vpmuludq %ymm1,%ymm9,%ymm6
  1292. vpmuludq %ymm0,%ymm9,%ymm9
  1293. vpaddq %ymm6,%ymm14,%ymm14
  1294. vpaddq %ymm9,%ymm13,%ymm13
  1295. vinserti128 $1,48(%rsi),%ymm8,%ymm8
  1296. leaq 64(%rsi),%rsi
  1297. vpmuludq %ymm1,%ymm2,%ymm6
  1298. vpmuludq %ymm0,%ymm2,%ymm2
  1299. vpsrldq $6,%ymm7,%ymm9
  1300. vpaddq %ymm6,%ymm15,%ymm15
  1301. vpaddq %ymm2,%ymm14,%ymm14
  1302. vpmuludq %ymm3,%ymm10,%ymm6
  1303. vpmuludq %ymm4,%ymm10,%ymm2
  1304. vpsrldq $6,%ymm8,%ymm10
  1305. vpaddq %ymm6,%ymm12,%ymm12
  1306. vpaddq %ymm2,%ymm13,%ymm13
  1307. vpunpckhqdq %ymm8,%ymm7,%ymm6
  1308. vpmuludq %ymm3,%ymm5,%ymm3
  1309. vpmuludq %ymm4,%ymm5,%ymm4
  1310. vpunpcklqdq %ymm8,%ymm7,%ymm7
  # The column sums move back into the hash registers: ymm2 and ymm3 here,
  # ymm4 and ymm0 just below, ymm1 inside the carry chain.
  1311. vpaddq %ymm3,%ymm13,%ymm2
  1312. vpaddq %ymm4,%ymm14,%ymm3
  1313. vpunpcklqdq %ymm10,%ymm9,%ymm10
  1314. vpmuludq 80(%rax),%ymm0,%ymm4
  1315. vpmuludq %ymm1,%ymm5,%ymm0
  # Reload the limb mask (ymm5 held T8 during the multiplication).
  1316. vmovdqa 64(%rcx),%ymm5
  1317. vpaddq %ymm4,%ymm15,%ymm4
  1318. vpaddq %ymm0,%ymm11,%ymm0
  # Carry propagation between limbs, each step being "carry = x >> 26,
  # x &= mask, next += carry": h3->h4, h0->h1, h4->h0 (the carry out of h4 is
  # added five times: once directly and once shifted left by 2), h1->h2,
  # h2->h3, h0->h1, h3->h4. The shifts of ymm10/ymm7/ymm6, the masking of
  # ymm7..ymm10 and the vpor at the end belong to splitting the next input
  # block; "vpaddq %ymm9,%ymm2,%ymm2" adds its limb2 to h2 for the next round.
  1319. vpsrlq $26,%ymm3,%ymm14
  1320. vpand %ymm5,%ymm3,%ymm3
  1321. vpaddq %ymm14,%ymm4,%ymm4
  1322. vpsrlq $26,%ymm0,%ymm11
  1323. vpand %ymm5,%ymm0,%ymm0
  1324. vpaddq %ymm11,%ymm12,%ymm1
  1325. vpsrlq $26,%ymm4,%ymm15
  1326. vpand %ymm5,%ymm4,%ymm4
  1327. vpsrlq $4,%ymm10,%ymm9
  1328. vpsrlq $26,%ymm1,%ymm12
  1329. vpand %ymm5,%ymm1,%ymm1
  1330. vpaddq %ymm12,%ymm2,%ymm2
  1331. vpaddq %ymm15,%ymm0,%ymm0
  1332. vpsllq $2,%ymm15,%ymm15
  1333. vpaddq %ymm15,%ymm0,%ymm0
  1334. vpand %ymm5,%ymm9,%ymm9
  1335. vpsrlq $26,%ymm7,%ymm8
  1336. vpsrlq $26,%ymm2,%ymm13
  1337. vpand %ymm5,%ymm2,%ymm2
  1338. vpaddq %ymm13,%ymm3,%ymm3
  1339. vpaddq %ymm9,%ymm2,%ymm2
  1340. vpsrlq $30,%ymm10,%ymm10
  1341. vpsrlq $26,%ymm0,%ymm11
  1342. vpand %ymm5,%ymm0,%ymm0
  1343. vpaddq %ymm11,%ymm1,%ymm1
  1344. vpsrlq $40,%ymm6,%ymm6
  1345. vpsrlq $26,%ymm3,%ymm14
  1346. vpand %ymm5,%ymm3,%ymm3
  1347. vpaddq %ymm14,%ymm4,%ymm4
  1348. vpand %ymm5,%ymm7,%ymm7
  1349. vpand %ymm5,%ymm8,%ymm8
  1350. vpand %ymm5,%ymm10,%ymm10
  1351. vpor 32(%rcx),%ymm6,%ymm6
  # 64 more bytes consumed; loop while input remains.
  1352. subq $64,%rdx
  1353. jnz L$oop_avx2
  # Bytes 66 90 encode a two-byte nop.
  1354. .byte 0x66,0x90
  # Tail: same multiplication as the loop body for the last four blocks, but
  # every table slot is read 4 bytes further on (unaligned loads at 4(%rsp),
  # 36(%rsp), ...). vpmuludq only uses the low dword of each qword, so this
  # selects the neighbouring dword of every slot (presumably a different key
  # power for each of the four lanes -- confirm against __poly1305_init_avx).
  1355. L$tail_avx2:
  1356. vpaddq %ymm0,%ymm7,%ymm0
  1357. vmovdqu 4(%rsp),%ymm7
  1358. vpaddq %ymm1,%ymm8,%ymm1
  1359. vmovdqu 36(%rsp),%ymm8
  1360. vpaddq %ymm3,%ymm10,%ymm3
  1361. vmovdqu 100(%rsp),%ymm9
  1362. vpaddq %ymm4,%ymm6,%ymm4
  1363. vmovdqu 52(%rax),%ymm10
  1364. vmovdqu 116(%rax),%ymm5
  # Column sums ymm11..ymm15, built exactly as in the loop.
  1365. vpmuludq %ymm2,%ymm7,%ymm13
  1366. vpmuludq %ymm2,%ymm8,%ymm14
  1367. vpmuludq %ymm2,%ymm9,%ymm15
  1368. vpmuludq %ymm2,%ymm10,%ymm11
  1369. vpmuludq %ymm2,%ymm5,%ymm12
  1370. vpmuludq %ymm0,%ymm8,%ymm6
  1371. vpmuludq %ymm1,%ymm8,%ymm2
  1372. vpaddq %ymm6,%ymm12,%ymm12
  1373. vpaddq %ymm2,%ymm13,%ymm13
  1374. vpmuludq %ymm3,%ymm8,%ymm6
  1375. vpmuludq 68(%rsp),%ymm4,%ymm2
  1376. vpaddq %ymm6,%ymm15,%ymm15
  1377. vpaddq %ymm2,%ymm11,%ymm11
  1378. vpmuludq %ymm0,%ymm7,%ymm6
  1379. vpmuludq %ymm1,%ymm7,%ymm2
  1380. vpaddq %ymm6,%ymm11,%ymm11
  1381. vmovdqu -12(%rax),%ymm8
  1382. vpaddq %ymm2,%ymm12,%ymm12
  1383. vpmuludq %ymm3,%ymm7,%ymm6
  1384. vpmuludq %ymm4,%ymm7,%ymm2
  1385. vpaddq %ymm6,%ymm14,%ymm14
  1386. vpaddq %ymm2,%ymm15,%ymm15
  1387. vpmuludq %ymm3,%ymm8,%ymm6
  1388. vpmuludq %ymm4,%ymm8,%ymm2
  1389. vpaddq %ymm6,%ymm11,%ymm11
  1390. vpaddq %ymm2,%ymm12,%ymm12
  1391. vmovdqu 20(%rax),%ymm2
  1392. vpmuludq %ymm1,%ymm9,%ymm6
  1393. vpmuludq %ymm0,%ymm9,%ymm9
  1394. vpaddq %ymm6,%ymm14,%ymm14
  1395. vpaddq %ymm9,%ymm13,%ymm13
  1396. vpmuludq %ymm1,%ymm2,%ymm6
  1397. vpmuludq %ymm0,%ymm2,%ymm2
  1398. vpaddq %ymm6,%ymm15,%ymm15
  1399. vpaddq %ymm2,%ymm14,%ymm14
  1400. vpmuludq %ymm3,%ymm10,%ymm6
  1401. vpmuludq %ymm4,%ymm10,%ymm2
  1402. vpaddq %ymm6,%ymm12,%ymm12
  1403. vpaddq %ymm2,%ymm13,%ymm13
  1404. vpmuludq %ymm3,%ymm5,%ymm3
  1405. vpmuludq %ymm4,%ymm5,%ymm4
  1406. vpaddq %ymm3,%ymm13,%ymm2
  1407. vpaddq %ymm4,%ymm14,%ymm3
  1408. vpmuludq 84(%rax),%ymm0,%ymm4
  1409. vpmuludq %ymm1,%ymm5,%ymm0
  # Reload the limb mask.
  1410. vmovdqa 64(%rcx),%ymm5
  1411. vpaddq %ymm4,%ymm15,%ymm4
  1412. vpaddq %ymm0,%ymm11,%ymm0
  # Horizontal sum of the four 64-bit elements of every column:
  # first add the high qword of each 128-bit lane onto its low qword ...
  1413. vpsrldq $8,%ymm12,%ymm8
  1414. vpsrldq $8,%ymm2,%ymm9
  1415. vpsrldq $8,%ymm3,%ymm10
  1416. vpsrldq $8,%ymm4,%ymm6
  1417. vpsrldq $8,%ymm0,%ymm7
  1418. vpaddq %ymm8,%ymm12,%ymm12
  1419. vpaddq %ymm9,%ymm2,%ymm2
  1420. vpaddq %ymm10,%ymm3,%ymm3
  1421. vpaddq %ymm6,%ymm4,%ymm4
  1422. vpaddq %ymm7,%ymm0,%ymm0
  # ... then bring qword 2 (low qword of the upper lane) down to position 0 with
  # vpermq $0x2 and add, so element 0 of each register holds the full sum.
  1423. vpermq $0x2,%ymm3,%ymm10
  1424. vpermq $0x2,%ymm4,%ymm6
  1425. vpermq $0x2,%ymm0,%ymm7
  1426. vpermq $0x2,%ymm12,%ymm8
  1427. vpermq $0x2,%ymm2,%ymm9
  1428. vpaddq %ymm10,%ymm3,%ymm3
  1429. vpaddq %ymm6,%ymm4,%ymm4
  1430. vpaddq %ymm7,%ymm0,%ymm0
  1431. vpaddq %ymm8,%ymm12,%ymm12
  1432. vpaddq %ymm9,%ymm2,%ymm2
  # Carry propagation in the same order as in the loop: h3->h4, h0->h1,
  # h4->h0 (times 5), h1->h2, h2->h3, h0->h1, h3->h4.
  1433. vpsrlq $26,%ymm3,%ymm14
  1434. vpand %ymm5,%ymm3,%ymm3
  1435. vpaddq %ymm14,%ymm4,%ymm4
  1436. vpsrlq $26,%ymm0,%ymm11
  1437. vpand %ymm5,%ymm0,%ymm0
  1438. vpaddq %ymm11,%ymm12,%ymm1
  1439. vpsrlq $26,%ymm4,%ymm15
  1440. vpand %ymm5,%ymm4,%ymm4
  1441. vpsrlq $26,%ymm1,%ymm12
  1442. vpand %ymm5,%ymm1,%ymm1
  1443. vpaddq %ymm12,%ymm2,%ymm2
  1444. vpaddq %ymm15,%ymm0,%ymm0
  1445. vpsllq $2,%ymm15,%ymm15
  1446. vpaddq %ymm15,%ymm0,%ymm0
  1447. vpsrlq $26,%ymm2,%ymm13
  1448. vpand %ymm5,%ymm2,%ymm2
  1449. vpaddq %ymm13,%ymm3,%ymm3
  1450. vpsrlq $26,%ymm0,%ymm11
  1451. vpand %ymm5,%ymm0,%ymm0
  1452. vpaddq %ymm11,%ymm1,%ymm1
  1453. vpsrlq $26,%ymm3,%ymm14
  1454. vpand %ymm5,%ymm3,%ymm3
  1455. vpaddq %ymm14,%ymm4,%ymm4
  # Store the low dword of each limb register. rdi was advanced by 112 above,
  # so -112..-96(%rdi) are offsets 0..16 of the context.
  1456. vmovd %xmm0,-112(%rdi)
  1457. vmovd %xmm1,-108(%rdi)
  1458. vmovd %xmm2,-104(%rdi)
  1459. vmovd %xmm3,-100(%rdi)
  1460. vmovd %xmm4,-96(%rdi)
  # r11 holds the incoming rsp minus 8: restore the stack pointer, clear the
  # upper vector halves and return (F3 C3 encodes "rep ret").
  1461. leaq 8(%r11),%rsp
  1462. vzeroupper
  1463. .byte 0xf3,0xc3
  1464. .p2align 5
  1465. poly1305_blocks_avx512:
  1466. L$blocks_avx512:
  1467. movl $15,%eax
  1468. kmovw %eax,%k2
  1469. leaq -8(%rsp),%r11
  1470. subq $0x128,%rsp
  1471. leaq L$const(%rip),%rcx
  1472. leaq 48+64(%rdi),%rdi
  1473. vmovdqa 96(%rcx),%ymm9
  1474. vmovdqu -64(%rdi),%xmm11
  1475. andq $-512,%rsp
  1476. vmovdqu -48(%rdi),%xmm12
  1477. movq $0x20,%rax
  1478. vmovdqu -32(%rdi),%xmm7
  1479. vmovdqu -16(%rdi),%xmm13
  1480. vmovdqu 0(%rdi),%xmm8
  1481. vmovdqu 16(%rdi),%xmm14
  1482. vmovdqu 32(%rdi),%xmm10
  1483. vmovdqu 48(%rdi),%xmm15
  1484. vmovdqu 64(%rdi),%xmm6
  1485. vpermd %zmm11,%zmm9,%zmm16
  1486. vpbroadcastq 64(%rcx),%zmm5
  1487. vpermd %zmm12,%zmm9,%zmm17
  1488. vpermd %zmm7,%zmm9,%zmm21
  1489. vpermd %zmm13,%zmm9,%zmm18
  1490. vmovdqa64 %zmm16,0(%rsp){%k2}
  1491. vpsrlq $32,%zmm16,%zmm7
  1492. vpermd %zmm8,%zmm9,%zmm22
  1493. vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
  1494. vpsrlq $32,%zmm17,%zmm8
  1495. vpermd %zmm14,%zmm9,%zmm19
  1496. vmovdqa64 %zmm21,64(%rsp){%k2}
  1497. vpermd %zmm10,%zmm9,%zmm23
  1498. vpermd %zmm15,%zmm9,%zmm20
  1499. vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
  1500. vpermd %zmm6,%zmm9,%zmm24
  1501. vmovdqa64 %zmm22,128(%rsp){%k2}
  1502. vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
  1503. vmovdqa64 %zmm23,192(%rsp){%k2}
  1504. vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
  1505. vmovdqa64 %zmm24,256(%rsp){%k2}
  1506. vpmuludq %zmm7,%zmm16,%zmm11
  1507. vpmuludq %zmm7,%zmm17,%zmm12
  1508. vpmuludq %zmm7,%zmm18,%zmm13
  1509. vpmuludq %zmm7,%zmm19,%zmm14
  1510. vpmuludq %zmm7,%zmm20,%zmm15
  1511. vpsrlq $32,%zmm18,%zmm9
  1512. vpmuludq %zmm8,%zmm24,%zmm25
  1513. vpmuludq %zmm8,%zmm16,%zmm26
  1514. vpmuludq %zmm8,%zmm17,%zmm27
  1515. vpmuludq %zmm8,%zmm18,%zmm28
  1516. vpmuludq %zmm8,%zmm19,%zmm29
  1517. vpsrlq $32,%zmm19,%zmm10
  1518. vpaddq %zmm25,%zmm11,%zmm11
  1519. vpaddq %zmm26,%zmm12,%zmm12
  1520. vpaddq %zmm27,%zmm13,%zmm13
  1521. vpaddq %zmm28,%zmm14,%zmm14
  1522. vpaddq %zmm29,%zmm15,%zmm15
  1523. vpmuludq %zmm9,%zmm23,%zmm25
  1524. vpmuludq %zmm9,%zmm24,%zmm26
  1525. vpmuludq %zmm9,%zmm17,%zmm28
  1526. vpmuludq %zmm9,%zmm18,%zmm29
  1527. vpmuludq %zmm9,%zmm16,%zmm27
  1528. vpsrlq $32,%zmm20,%zmm6
  1529. vpaddq %zmm25,%zmm11,%zmm11
  1530. vpaddq %zmm26,%zmm12,%zmm12
  1531. vpaddq %zmm28,%zmm14,%zmm14
  1532. vpaddq %zmm29,%zmm15,%zmm15
  1533. vpaddq %zmm27,%zmm13,%zmm13
  1534. vpmuludq %zmm10,%zmm22,%zmm25
  1535. vpmuludq %zmm10,%zmm16,%zmm28
  1536. vpmuludq %zmm10,%zmm17,%zmm29
  1537. vpmuludq %zmm10,%zmm23,%zmm26
  1538. vpmuludq %zmm10,%zmm24,%zmm27
  1539. vpaddq %zmm25,%zmm11,%zmm11
  1540. vpaddq %zmm28,%zmm14,%zmm14
  1541. vpaddq %zmm29,%zmm15,%zmm15
  1542. vpaddq %zmm26,%zmm12,%zmm12
  1543. vpaddq %zmm27,%zmm13,%zmm13
  1544. vpmuludq %zmm6,%zmm24,%zmm28
  1545. vpmuludq %zmm6,%zmm16,%zmm29
  1546. vpmuludq %zmm6,%zmm21,%zmm25
  1547. vpmuludq %zmm6,%zmm22,%zmm26
  1548. vpmuludq %zmm6,%zmm23,%zmm27
  1549. vpaddq %zmm28,%zmm14,%zmm14
  1550. vpaddq %zmm29,%zmm15,%zmm15
  1551. vpaddq %zmm25,%zmm11,%zmm11
  1552. vpaddq %zmm26,%zmm12,%zmm12
  1553. vpaddq %zmm27,%zmm13,%zmm13
  1554. vmovdqu64 0(%rsi),%zmm10
  1555. vmovdqu64 64(%rsi),%zmm6
  1556. leaq 128(%rsi),%rsi
  1557. vpsrlq $26,%zmm14,%zmm28
  1558. vpandq %zmm5,%zmm14,%zmm14
  1559. vpaddq %zmm28,%zmm15,%zmm15
  1560. vpsrlq $26,%zmm11,%zmm25
  1561. vpandq %zmm5,%zmm11,%zmm11
  1562. vpaddq %zmm25,%zmm12,%zmm12
  1563. vpsrlq $26,%zmm15,%zmm29
  1564. vpandq %zmm5,%zmm15,%zmm15
  1565. vpsrlq $26,%zmm12,%zmm26
  1566. vpandq %zmm5,%zmm12,%zmm12
  1567. vpaddq %zmm26,%zmm13,%zmm13
  1568. vpaddq %zmm29,%zmm11,%zmm11
  1569. vpsllq $2,%zmm29,%zmm29
  1570. vpaddq %zmm29,%zmm11,%zmm11
  1571. vpsrlq $26,%zmm13,%zmm27
  1572. vpandq %zmm5,%zmm13,%zmm13
  1573. vpaddq %zmm27,%zmm14,%zmm14
  1574. vpsrlq $26,%zmm11,%zmm25
  1575. vpandq %zmm5,%zmm11,%zmm11
  1576. vpaddq %zmm25,%zmm12,%zmm12
  1577. vpsrlq $26,%zmm14,%zmm28
  1578. vpandq %zmm5,%zmm14,%zmm14
  1579. vpaddq %zmm28,%zmm15,%zmm15
  1580. vpunpcklqdq %zmm6,%zmm10,%zmm7
  1581. vpunpckhqdq %zmm6,%zmm10,%zmm6
  1582. vmovdqa32 128(%rcx),%zmm25
  1583. movl $0x7777,%eax
  1584. kmovw %eax,%k1
  1585. vpermd %zmm16,%zmm25,%zmm16
  1586. vpermd %zmm17,%zmm25,%zmm17
  1587. vpermd %zmm18,%zmm25,%zmm18
  1588. vpermd %zmm19,%zmm25,%zmm19
  1589. vpermd %zmm20,%zmm25,%zmm20
  1590. vpermd %zmm11,%zmm25,%zmm16{%k1}
  1591. vpermd %zmm12,%zmm25,%zmm17{%k1}
  1592. vpermd %zmm13,%zmm25,%zmm18{%k1}
  1593. vpermd %zmm14,%zmm25,%zmm19{%k1}
  1594. vpermd %zmm15,%zmm25,%zmm20{%k1}
  1595. vpslld $2,%zmm17,%zmm21
  1596. vpslld $2,%zmm18,%zmm22
  1597. vpslld $2,%zmm19,%zmm23
  1598. vpslld $2,%zmm20,%zmm24
  1599. vpaddd %zmm17,%zmm21,%zmm21
  1600. vpaddd %zmm18,%zmm22,%zmm22
  1601. vpaddd %zmm19,%zmm23,%zmm23
  1602. vpaddd %zmm20,%zmm24,%zmm24
  1603. vpbroadcastq 32(%rcx),%zmm30
  1604. vpsrlq $52,%zmm7,%zmm9
  1605. vpsllq $12,%zmm6,%zmm10
  1606. vporq %zmm10,%zmm9,%zmm9
  1607. vpsrlq $26,%zmm7,%zmm8
  1608. vpsrlq $14,%zmm6,%zmm10
  1609. vpsrlq $40,%zmm6,%zmm6
  1610. vpandq %zmm5,%zmm9,%zmm9
  1611. vpandq %zmm5,%zmm7,%zmm7
  1612. vpaddq %zmm2,%zmm9,%zmm2
  1613. subq $192,%rdx
  1614. jbe L$tail_avx512
  1615. jmp L$oop_avx512
  1616. .p2align 5
  1617. L$oop_avx512:
  1618. vpmuludq %zmm2,%zmm17,%zmm14
  1619. vpaddq %zmm0,%zmm7,%zmm0
  1620. vpmuludq %zmm2,%zmm18,%zmm15
  1621. vpandq %zmm5,%zmm8,%zmm8
  1622. vpmuludq %zmm2,%zmm23,%zmm11
  1623. vpandq %zmm5,%zmm10,%zmm10
  1624. vpmuludq %zmm2,%zmm24,%zmm12
  1625. vporq %zmm30,%zmm6,%zmm6
  1626. vpmuludq %zmm2,%zmm16,%zmm13
  1627. vpaddq %zmm1,%zmm8,%zmm1
  1628. vpaddq %zmm3,%zmm10,%zmm3
  1629. vpaddq %zmm4,%zmm6,%zmm4
  1630. vmovdqu64 0(%rsi),%zmm10
  1631. vmovdqu64 64(%rsi),%zmm6
  1632. leaq 128(%rsi),%rsi
  1633. vpmuludq %zmm0,%zmm19,%zmm28
  1634. vpmuludq %zmm0,%zmm20,%zmm29
  1635. vpmuludq %zmm0,%zmm16,%zmm25
  1636. vpmuludq %zmm0,%zmm17,%zmm26
  1637. vpaddq %zmm28,%zmm14,%zmm14
  1638. vpaddq %zmm29,%zmm15,%zmm15
  1639. vpaddq %zmm25,%zmm11,%zmm11
  1640. vpaddq %zmm26,%zmm12,%zmm12
  1641. vpmuludq %zmm1,%zmm18,%zmm28
  1642. vpmuludq %zmm1,%zmm19,%zmm29
  1643. vpmuludq %zmm1,%zmm24,%zmm25
  1644. vpmuludq %zmm0,%zmm18,%zmm27
  1645. vpaddq %zmm28,%zmm14,%zmm14
  1646. vpaddq %zmm29,%zmm15,%zmm15
  1647. vpaddq %zmm25,%zmm11,%zmm11
  1648. vpaddq %zmm27,%zmm13,%zmm13
  1649. vpunpcklqdq %zmm6,%zmm10,%zmm7
  1650. vpunpckhqdq %zmm6,%zmm10,%zmm6
  1651. vpmuludq %zmm3,%zmm16,%zmm28
  1652. vpmuludq %zmm3,%zmm17,%zmm29
  1653. vpmuludq %zmm1,%zmm16,%zmm26
  1654. vpmuludq %zmm1,%zmm17,%zmm27
  1655. vpaddq %zmm28,%zmm14,%zmm14
  1656. vpaddq %zmm29,%zmm15,%zmm15
  1657. vpaddq %zmm26,%zmm12,%zmm12
  1658. vpaddq %zmm27,%zmm13,%zmm13
  1659. vpmuludq %zmm4,%zmm24,%zmm28
  1660. vpmuludq %zmm4,%zmm16,%zmm29
  1661. vpmuludq %zmm3,%zmm22,%zmm25
  1662. vpmuludq %zmm3,%zmm23,%zmm26
  1663. vpaddq %zmm28,%zmm14,%zmm14
  1664. vpmuludq %zmm3,%zmm24,%zmm27
  1665. vpaddq %zmm29,%zmm15,%zmm15
  1666. vpaddq %zmm25,%zmm11,%zmm11
  1667. vpaddq %zmm26,%zmm12,%zmm12
  1668. vpaddq %zmm27,%zmm13,%zmm13
  1669. vpmuludq %zmm4,%zmm21,%zmm25
  1670. vpmuludq %zmm4,%zmm22,%zmm26
  1671. vpmuludq %zmm4,%zmm23,%zmm27
  1672. vpaddq %zmm25,%zmm11,%zmm0
  1673. vpaddq %zmm26,%zmm12,%zmm1
  1674. vpaddq %zmm27,%zmm13,%zmm2
  1675. vpsrlq $52,%zmm7,%zmm9
  1676. vpsllq $12,%zmm6,%zmm10
  1677. vpsrlq $26,%zmm14,%zmm3
  1678. vpandq %zmm5,%zmm14,%zmm14
  1679. vpaddq %zmm3,%zmm15,%zmm4
  1680. vporq %zmm10,%zmm9,%zmm9
  1681. vpsrlq $26,%zmm0,%zmm11
  1682. vpandq %zmm5,%zmm0,%zmm0
  1683. vpaddq %zmm11,%zmm1,%zmm1
  1684. vpandq %zmm5,%zmm9,%zmm9
  1685. vpsrlq $26,%zmm4,%zmm15
  1686. vpandq %zmm5,%zmm4,%zmm4
  1687. vpsrlq $26,%zmm1,%zmm12
  1688. vpandq %zmm5,%zmm1,%zmm1
  1689. vpaddq %zmm12,%zmm2,%zmm2
  1690. vpaddq %zmm15,%zmm0,%zmm0
  1691. vpsllq $2,%zmm15,%zmm15
  1692. vpaddq %zmm15,%zmm0,%zmm0
  1693. vpaddq %zmm9,%zmm2,%zmm2
  1694. vpsrlq $26,%zmm7,%zmm8
  1695. vpsrlq $26,%zmm2,%zmm13
  1696. vpandq %zmm5,%zmm2,%zmm2
  1697. vpaddq %zmm13,%zmm14,%zmm3
  1698. vpsrlq $14,%zmm6,%zmm10
  1699. vpsrlq $26,%zmm0,%zmm11
  1700. vpandq %zmm5,%zmm0,%zmm0
  1701. vpaddq %zmm11,%zmm1,%zmm1
  1702. vpsrlq $40,%zmm6,%zmm6
  1703. vpsrlq $26,%zmm3,%zmm14
  1704. vpandq %zmm5,%zmm3,%zmm3
  1705. vpaddq %zmm14,%zmm4,%zmm4
  1706. vpandq %zmm5,%zmm7,%zmm7
  1707. subq $128,%rdx
  1708. ja L$oop_avx512
  1709. L$tail_avx512:
  1710. vpsrlq $32,%zmm16,%zmm16
  1711. vpsrlq $32,%zmm17,%zmm17
  1712. vpsrlq $32,%zmm18,%zmm18
  1713. vpsrlq $32,%zmm23,%zmm23
  1714. vpsrlq $32,%zmm24,%zmm24
  1715. vpsrlq $32,%zmm19,%zmm19
  1716. vpsrlq $32,%zmm20,%zmm20
  1717. vpsrlq $32,%zmm21,%zmm21
  1718. vpsrlq $32,%zmm22,%zmm22
  1719. leaq (%rsi,%rdx,1),%rsi
  1720. vpaddq %zmm0,%zmm7,%zmm0
  1721. vpmuludq %zmm2,%zmm17,%zmm14
  1722. vpmuludq %zmm2,%zmm18,%zmm15
  1723. vpmuludq %zmm2,%zmm23,%zmm11
  1724. vpandq %zmm5,%zmm8,%zmm8
  1725. vpmuludq %zmm2,%zmm24,%zmm12
  1726. vpandq %zmm5,%zmm10,%zmm10
  1727. vpmuludq %zmm2,%zmm16,%zmm13
  1728. vporq %zmm30,%zmm6,%zmm6
  1729. vpaddq %zmm1,%zmm8,%zmm1
  1730. vpaddq %zmm3,%zmm10,%zmm3
  1731. vpaddq %zmm4,%zmm6,%zmm4
  1732. vmovdqu 0(%rsi),%xmm7
  1733. vpmuludq %zmm0,%zmm19,%zmm28
  1734. vpmuludq %zmm0,%zmm20,%zmm29
  1735. vpmuludq %zmm0,%zmm16,%zmm25
  1736. vpmuludq %zmm0,%zmm17,%zmm26
  1737. vpaddq %zmm28,%zmm14,%zmm14
  1738. vpaddq %zmm29,%zmm15,%zmm15
  1739. vpaddq %zmm25,%zmm11,%zmm11
  1740. vpaddq %zmm26,%zmm12,%zmm12
  1741. vmovdqu 16(%rsi),%xmm8
  1742. vpmuludq %zmm1,%zmm18,%zmm28
  1743. vpmuludq %zmm1,%zmm19,%zmm29
  1744. vpmuludq %zmm1,%zmm24,%zmm25
  1745. vpmuludq %zmm0,%zmm18,%zmm27
  1746. vpaddq %zmm28,%zmm14,%zmm14
  1747. vpaddq %zmm29,%zmm15,%zmm15
  1748. vpaddq %zmm25,%zmm11,%zmm11
  1749. vpaddq %zmm27,%zmm13,%zmm13
  1750. vinserti128 $1,32(%rsi),%ymm7,%ymm7
  1751. vpmuludq %zmm3,%zmm16,%zmm28
  1752. vpmuludq %zmm3,%zmm17,%zmm29
  1753. vpmuludq %zmm1,%zmm16,%zmm26
  1754. vpmuludq %zmm1,%zmm17,%zmm27
  1755. vpaddq %zmm28,%zmm14,%zmm14
  1756. vpaddq %zmm29,%zmm15,%zmm15
  1757. vpaddq %zmm26,%zmm12,%zmm12
  1758. vpaddq %zmm27,%zmm13,%zmm13
  1759. vinserti128 $1,48(%rsi),%ymm8,%ymm8
  1760. vpmuludq %zmm4,%zmm24,%zmm28
  1761. vpmuludq %zmm4,%zmm16,%zmm29
  1762. vpmuludq %zmm3,%zmm22,%zmm25
  1763. vpmuludq %zmm3,%zmm23,%zmm26
  1764. vpmuludq %zmm3,%zmm24,%zmm27
  1765. vpaddq %zmm28,%zmm14,%zmm3
  1766. vpaddq %zmm29,%zmm15,%zmm15
  1767. vpaddq %zmm25,%zmm11,%zmm11
  1768. vpaddq %zmm26,%zmm12,%zmm12
  1769. vpaddq %zmm27,%zmm13,%zmm13
  1770. vpmuludq %zmm4,%zmm21,%zmm25
  1771. vpmuludq %zmm4,%zmm22,%zmm26
  1772. vpmuludq %zmm4,%zmm23,%zmm27
  1773. vpaddq %zmm25,%zmm11,%zmm0
  1774. vpaddq %zmm26,%zmm12,%zmm1
  1775. vpaddq %zmm27,%zmm13,%zmm2
  1776. movl $1,%eax
  1777. vpermq $0xb1,%zmm3,%zmm14
  1778. vpermq $0xb1,%zmm15,%zmm4
  1779. vpermq $0xb1,%zmm0,%zmm11
  1780. vpermq $0xb1,%zmm1,%zmm12
  1781. vpermq $0xb1,%zmm2,%zmm13
  1782. vpaddq %zmm14,%zmm3,%zmm3
  1783. vpaddq %zmm15,%zmm4,%zmm4
  1784. vpaddq %zmm11,%zmm0,%zmm0
  1785. vpaddq %zmm12,%zmm1,%zmm1
  1786. vpaddq %zmm13,%zmm2,%zmm2
  1787. kmovw %eax,%k3
  1788. vpermq $0x2,%zmm3,%zmm14
  1789. vpermq $0x2,%zmm4,%zmm15
  1790. vpermq $0x2,%zmm0,%zmm11
  1791. vpermq $0x2,%zmm1,%zmm12
  1792. vpermq $0x2,%zmm2,%zmm13
  1793. vpaddq %zmm14,%zmm3,%zmm3
  1794. vpaddq %zmm15,%zmm4,%zmm4
  1795. vpaddq %zmm11,%zmm0,%zmm0
  1796. vpaddq %zmm12,%zmm1,%zmm1
  1797. vpaddq %zmm13,%zmm2,%zmm2
  1798. vextracti64x4 $0x1,%zmm3,%ymm14
  1799. vextracti64x4 $0x1,%zmm4,%ymm15
  1800. vextracti64x4 $0x1,%zmm0,%ymm11
  1801. vextracti64x4 $0x1,%zmm1,%ymm12
  1802. vextracti64x4 $0x1,%zmm2,%ymm13
  1803. vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
  1804. vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
  1805. vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
  1806. vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
  1807. vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
  1808. vpsrlq $26,%ymm3,%ymm14
  1809. vpand %ymm5,%ymm3,%ymm3
  1810. vpsrldq $6,%ymm7,%ymm9
  1811. vpsrldq $6,%ymm8,%ymm10
  1812. vpunpckhqdq %ymm8,%ymm7,%ymm6
  1813. vpaddq %ymm14,%ymm4,%ymm4
  1814. vpsrlq $26,%ymm0,%ymm11
  1815. vpand %ymm5,%ymm0,%ymm0
  1816. vpunpcklqdq %ymm10,%ymm9,%ymm9
  1817. vpunpcklqdq %ymm8,%ymm7,%ymm7
  1818. vpaddq %ymm11,%ymm1,%ymm1
  1819. vpsrlq $26,%ymm4,%ymm15
  1820. vpand %ymm5,%ymm4,%ymm4
  1821. vpsrlq $26,%ymm1,%ymm12
  1822. vpand %ymm5,%ymm1,%ymm1
  1823. vpsrlq $30,%ymm9,%ymm10
  1824. vpsrlq $4,%ymm9,%ymm9
  1825. vpaddq %ymm12,%ymm2,%ymm2
  1826. vpaddq %ymm15,%ymm0,%ymm0
  1827. vpsllq $2,%ymm15,%ymm15
  1828. vpsrlq $26,%ymm7,%ymm8
  1829. vpsrlq $40,%ymm6,%ymm6
  1830. vpaddq %ymm15,%ymm0,%ymm0
  1831. vpsrlq $26,%ymm2,%ymm13
  1832. vpand %ymm5,%ymm2,%ymm2
  1833. vpand %ymm5,%ymm9,%ymm9
  1834. vpand %ymm5,%ymm7,%ymm7
  1835. vpaddq %ymm13,%ymm3,%ymm3
  1836. vpsrlq $26,%ymm0,%ymm11
  1837. vpand %ymm5,%ymm0,%ymm0
  1838. vpaddq %ymm2,%ymm9,%ymm2
  1839. vpand %ymm5,%ymm8,%ymm8
  1840. vpaddq %ymm11,%ymm1,%ymm1
  1841. vpsrlq $26,%ymm3,%ymm14
  1842. vpand %ymm5,%ymm3,%ymm3
  1843. vpand %ymm5,%ymm10,%ymm10
  1844. vpor 32(%rcx),%ymm6,%ymm6
  1845. vpaddq %ymm14,%ymm4,%ymm4
  1846. leaq 144(%rsp),%rax
  1847. addq $64,%rdx
  1848. jnz L$tail_avx2
  1849. vpsubq %ymm9,%ymm2,%ymm2
  1850. vmovd %xmm0,-112(%rdi)
  1851. vmovd %xmm1,-108(%rdi)
  1852. vmovd %xmm2,-104(%rdi)
  1853. vmovd %xmm3,-100(%rdi)
  1854. vmovd %xmm4,-96(%rdi)
  1855. vzeroall
  1856. leaq 8(%r11),%rsp
  1857. .byte 0xf3,0xc3
  # poly1305_init_base2_44 -- prepare a Poly1305 context for the radix 2^44
  # (VPMADD52) code path.
  #
  # In (registers as read below):
  #   %rdi = context
  #   %rsi = 16 bytes of key material
  #   %rdx = two-qword table that receives the block and emit routine addresses
  # Out:  %eax = 1
  # Clobbers: %rax, %rcx, %r8-%r11, flags
  #
  # Context bytes written here (offsets from %rdi):
  #    0, 8, 16   hash limbs h0,h1,h2 (zeroed)
  #    24, 32     s1 = 20*r1, s2 = 20*r2
  #    40, 48, 56 r0,r1,r2 = masked key split into 44/44/40-bit limbs
  #    64         -1, tested by the block routines as "key powers not computed"
  1858. .p2align 5
  1859. poly1305_init_base2_44:
  1860. xorq %rax,%rax
  1861. movq %rax,0(%rdi)          # h0 = 0
  1862. movq %rax,8(%rdi)          # h1 = 0
  1863. movq %rax,16(%rdi)         # h2 = 0
  # Second entry that skips the hash zeroing.
  # NOTE(review): presumably reached from the generic init routine, which is
  # outside this excerpt -- confirm.
  1864. L$init_base2_44:
  1865. leaq poly1305_blocks_vpmadd52(%rip),%r10
  1866. leaq poly1305_emit_base2_44(%rip),%r11
  1867. movq $0x0ffffffc0fffffff,%rax          # mask for key bytes 0..7
  1868. movq $0x0ffffffc0ffffffc,%rcx          # mask for key bytes 8..15
  1869. andq 0(%rsi),%rax
  1870. movq $0x00000fffffffffff,%r8           # 44-bit limb mask
  1871. andq 8(%rsi),%rcx
  1872. movq $0x00000fffffffffff,%r9           # 44-bit limb mask
  1873. andq %rax,%r8                          # r0 = bits 0..43
  1874. shrdq $44,%rcx,%rax                    # rax = bits 44..107
  1875. movq %r8,40(%rdi)                      # store r0
  1876. andq %r9,%rax                          # r1 = bits 44..87
  1877. shrq $24,%rcx                          # r2 = bits 88..127
  1878. movq %rax,48(%rdi)                     # store r1
  1879. leaq (%rax,%rax,4),%rax                # r1*5
  1880. movq %rcx,56(%rdi)                     # store r2
  1881. shlq $2,%rax                           # r1*20
  1882. leaq (%rcx,%rcx,4),%rcx                # r2*5
  1883. shlq $2,%rcx                           # r2*20
  1884. movq %rax,24(%rdi)                     # store s1
  1885. movq %rcx,32(%rdi)                     # store s2
  1886. movq $-1,64(%rdi)                      # negative marker: powers not computed
  1887. movq %r10,0(%rdx)                      # table[0] = block routine
  1888. movq %r11,8(%rdx)                      # table[1] = emit routine
  1889. movl $1,%eax
  1890. .byte 0xf3,0xc3                        # rep ret
  # poly1305_blocks_vpmadd52 -- radix 2^44 block function, front end.
  #
  # In:  %rdi = context, %rsi = input, %rdx = length in bytes,
  #      %rcx = pad bit (shifted left by 40 below and merged into limb 2)
  #
  # Blocks are absorbed one at a time in L$oop_vpmadd52 when
  #   - the key powers are not computed (64(%rdi) negative) and fewer than
  #     four blocks are given: all of them (count & 3), or
  #   - otherwise: a single block if the block count is odd (count & 1).
  # Whatever remains is handed to L$blocks_vpmadd52_4x inside
  # poly1305_blocks_vpmadd52_4x with %rdx = remaining blocks,
  # %rcx = pad bit << 40 and %r8 = 64(%rdi).
  #
  # Register roles in the single-block loop:
  #   ymm16        hash limbs h0,h1,h2 in lanes 0..2
  #   ymm3/4/5     {r0,r1,r2} / {s2,r0,r1} / {s1,s2,r0} (overlapping loads)
  #   ymm19,20,22  input permute indices, per-lane right shifts, limb masks
  #   ymm23,24     per-lane carry shifts (right 44/44/42, left 8/8/10)
  #   ymm21        pad bit in lane 2 only
  #   k7 = 0b0111 (three limb lanes), k1 = 0b0001
  1891. .p2align 5
  1892. poly1305_blocks_vpmadd52:
  1893. shrq $4,%rdx                           # rdx = number of 16-byte blocks
  1894. jz L$no_data_vpmadd52
  1895. shlq $40,%rcx                          # pad bit to bit 40 of limb 2 (88+40 = 128)
  1896. movq 64(%rdi),%r8                      # negative: key powers not computed
  1897. movq $3,%rax
  1898. movq $1,%r10
  1899. cmpq $4,%rdx
  1900. cmovaeq %r10,%rax                      # four or more blocks: mask = 1
  1901. testq %r8,%r8
  1902. cmovnsq %r10,%rax                      # powers available: mask = 1
  1903. andq %rdx,%rax                         # rax = blocks to absorb one by one
  1904. jz L$blocks_vpmadd52_4x
  1905. subq %rax,%rdx                         # rdx = blocks left for the wide code
  1906. movl $7,%r10d
  1907. movl $1,%r11d
  1908. kmovw %r10d,%k7
  1909. leaq L$2_44_inp_permd(%rip),%r10       # base of five consecutive 32-byte tables
  1910. kmovw %r11d,%k1
  1911. vmovq %rcx,%xmm21                      # pad bit in lane 0, other lanes zero
  1912. vmovdqa64 0(%r10),%ymm19
  1913. vmovdqa64 32(%r10),%ymm20
  1914. vpermq $0xcf,%ymm21,%ymm21             # lane 2 <- lane 0, lanes 0,1,3 <- lane 3 (zero)
  1915. vmovdqa64 64(%r10),%ymm22
  1916. vmovdqu64 0(%rdi),%ymm16{%k7}{z}       # h0,h1,h2
  1917. vmovdqu64 40(%rdi),%ymm3{%k7}{z}       # r0,r1,r2
  1918. vmovdqu64 32(%rdi),%ymm4{%k7}{z}       # s2,r0,r1
  1919. vmovdqu64 24(%rdi),%ymm5{%k7}{z}       # s1,s2,r0
  1920. vmovdqa64 96(%r10),%ymm23
  1921. vmovdqa64 128(%r10),%ymm24
  1922. jmp L$oop_vpmadd52
  1923. .p2align 5
  # One block per iteration; %rax counts the iterations.
  1924. L$oop_vpmadd52:
  1925. vmovdqu32 0(%rsi),%xmm18               # 16 input bytes, upper half of ymm18 zeroed
  1926. leaq 16(%rsi),%rsi
  1927. vpermd %ymm18,%ymm19,%ymm18            # dword pairs (0,1),(1,2),(2,3),(7,7) into the lanes
  1928. vpsrlvq %ymm20,%ymm18,%ymm18           # shift right by 0,12,24,64
  1929. vpandq %ymm22,%ymm18,%ymm18            # keep limb bits
  1930. vporq %ymm21,%ymm18,%ymm18             # merge pad bit into limb 2
  1931. vpaddq %ymm18,%ymm16,%ymm16            # h += block
  1932. vpermq $0,%ymm16,%ymm0{%k7}{z}         # h0 in lanes 0..2
  1933. vpermq $85,%ymm16,%ymm1{%k7}{z}        # h1 in lanes 0..2
  1934. vpermq $170,%ymm16,%ymm2{%k7}{z}       # h2 in lanes 0..2
  1935. vpxord %ymm16,%ymm16,%ymm16
  1936. vpxord %ymm17,%ymm17,%ymm17
  # ymm16 / ymm17 collect the low / high 52 bits of
  # h0*{r0,r1,r2} + h1*{s2,r0,r1} + h2*{s1,s2,r0}.
  1937. vpmadd52luq %ymm3,%ymm0,%ymm16
  1938. vpmadd52huq %ymm3,%ymm0,%ymm17
  1939. vpmadd52luq %ymm4,%ymm1,%ymm16
  1940. vpmadd52huq %ymm4,%ymm1,%ymm17
  1941. vpmadd52luq %ymm5,%ymm2,%ymm16
  1942. vpmadd52huq %ymm5,%ymm2,%ymm17
  1943. vpsrlvq %ymm23,%ymm16,%ymm18           # bits above each limb of the low halves
  1944. vpsllvq %ymm24,%ymm17,%ymm17           # align the high halves to limb width
  1945. vpandq %ymm22,%ymm16,%ymm16
  1946. vpaddq %ymm18,%ymm17,%ymm17            # carry out of each limb
  1947. vpermq $147,%ymm17,%ymm17              # rotate lanes up by one: carry i -> lane i+1
  1948. vpaddq %ymm17,%ymm16,%ymm16
  1949. vpsrlvq %ymm23,%ymm16,%ymm18           # second carry pass
  1950. vpandq %ymm22,%ymm16,%ymm16
  1951. vpermq $147,%ymm18,%ymm18
  1952. vpaddq %ymm18,%ymm16,%ymm16
  1953. vpermq $147,%ymm16,%ymm18{%k1}{z}      # lane 0 <- lane 3 (carry out of limb 2)
  1954. vpaddq %ymm18,%ymm16,%ymm16            # h0 += carry
  1955. vpsllq $2,%ymm18,%ymm18
  1956. vpaddq %ymm18,%ymm16,%ymm16            # h0 += 4*carry, i.e. 5*carry in total
  1957. decq %rax
  1958. jnz L$oop_vpmadd52
  1959. vmovdqu64 %ymm16,0(%rdi){%k7}          # store h0,h1,h2 only
  1960. testq %rdx,%rdx
  1961. jnz L$blocks_vpmadd52_4x               # remaining blocks go to the wide code
  1962. L$no_data_vpmadd52:
  1963. .byte 0xf3,0xc3                        # rep ret
  # poly1305_blocks_vpmadd52_4x -- radix 2^44 block function that multiplies
  # two or four blocks at a time with 256-bit VPMADD52 instructions.
  #
  # In:  %rdi = context, %rsi = input, %rdx = length in bytes, %rcx = pad bit
  # L$blocks_vpmadd52_4x is a second entry (used by poly1305_blocks_vpmadd52)
  # that expects %rdx = block count, %rcx = pad bit << 40, %r8 = 64(%rdi).
  #
  # Register roles:
  #   ymm0-ymm2    hash limbs h0,h1,h2
  #   ymm3-ymm5    key-power limbs 0,1,2;  ymm16/ymm17 = 20*limb1 / 20*limb2
  #   ymm18-ymm23  low/high 52-bit product accumulators for d0,d1,d2
  #   ymm24-ymm27  next input blocks being converted to 44-bit limbs
  #   ymm28/ymm29  44-bit / 42-bit masks;  ymm30 carry scratch;  ymm31 pad bit
  #
  # Context use: 0..16 hash; 24..56 s1,s2,r0,r1,r2; 64/96/128/160 hold limb0,
  # limb1, limb2 and 20*limb1 of the key powers in lane order r^4,r^2,r^3,r^1.
  # A negative qword at 64(%rdi) means the powers have not been computed yet.
  #
  # NOTE(review): every path here consumes blocks in pairs, so the block count
  # reaching L$blocks_vpmadd52_4x presumably has to be even; the visible caller
  # poly1305_blocks_vpmadd52 absorbs an odd block first. Confirm for any other
  # caller of this entry point.
  1964. .p2align 5
  1965. poly1305_blocks_vpmadd52_4x:
  1966. shrq $4,%rdx                           # rdx = number of 16-byte blocks
  1967. jz L$no_data_vpmadd52_4x
  1968. shlq $40,%rcx                          # pad bit to bit 40 of limb 2
  1969. movq 64(%rdi),%r8
  1970. L$blocks_vpmadd52_4x:
  1971. vpbroadcastq %rcx,%ymm31               # pad bit in every lane
  1972. vmovdqa64 L$x_mask44(%rip),%ymm28
  1973. movl $5,%eax
  1974. vmovdqa64 L$x_mask42(%rip),%ymm29
  1975. kmovw %eax,%k1                         # k1 = 0b0101: lanes 0 and 2
  1976. testq %r8,%r8
  1977. js L$init_vpmadd52                     # powers not computed yet
  1978. vmovq 0(%rdi),%xmm0                    # h0
  1979. vmovq 8(%rdi),%xmm1                    # h1
  1980. vmovq 16(%rdi),%xmm2                   # h2
  1981. testq $3,%rdx
  1982. jnz L$blocks_vpmadd52_2x_do            # count not a multiple of 4: do one pair first
  1983. L$blocks_vpmadd52_4x_do:
  1984. vpbroadcastq 64(%rdi),%ymm3            # first stored power (r^4), limb 0
  1985. vpbroadcastq 96(%rdi),%ymm4            # limb 1
  1986. vpbroadcastq 128(%rdi),%ymm5           # limb 2
  1987. vpbroadcastq 160(%rdi),%ymm16          # 20*limb 1
  1988. L$blocks_vpmadd52_4x_key_loaded:
  1989. vpsllq $2,%ymm5,%ymm17
  1990. vpaddq %ymm5,%ymm17,%ymm17
  1991. vpsllq $2,%ymm17,%ymm17                # ymm17 = 20*limb 2
  1992. testq $7,%rdx
  1993. jz L$blocks_vpmadd52_8x                # multiple of 8 blocks: use the 512-bit code
  # Load four blocks and split them into 44-bit limbs. After the unpacks the
  # lanes hold blocks 0,2,1,3.
  1994. vmovdqu64 0(%rsi),%ymm26
  1995. vmovdqu64 32(%rsi),%ymm27
  1996. leaq 64(%rsi),%rsi
  1997. vpunpcklqdq %ymm27,%ymm26,%ymm25       # low qwords of the blocks
  1998. vpunpckhqdq %ymm27,%ymm26,%ymm27       # high qwords of the blocks
  1999. vpsrlq $24,%ymm27,%ymm26               # limb 2 = bits 88..127
  2000. vporq %ymm31,%ymm26,%ymm26             # merge pad bit
  2001. vpaddq %ymm26,%ymm2,%ymm2              # h2 += limb 2
  2002. vpandq %ymm28,%ymm25,%ymm24            # limb 0 = bits 0..43
  2003. vpsrlq $44,%ymm25,%ymm25
  2004. vpsllq $20,%ymm27,%ymm27
  2005. vporq %ymm27,%ymm25,%ymm25
  2006. vpandq %ymm28,%ymm25,%ymm25            # limb 1 = bits 44..87
  2007. subq $4,%rdx
  2008. jz L$tail_vpmadd52_4x
  2009. jmp L$oop_vpmadd52_4x
  2010. ud2                                    # not reached; follows an unconditional jump
  2011. .p2align 5
  # Compute the key powers. ymm0-ymm2 start as r and are multiplied by the
  # key in ymm3-ymm5 (with ymm16/ymm17 = s1/s2); two passes (%eax = 2) give
  # r^2 and then {r^4, r^3}.
  2012. L$init_vpmadd52:
  2013. vmovq 24(%rdi),%xmm16                  # s1
  2014. vmovq 56(%rdi),%xmm2                   # r2
  2015. vmovq 32(%rdi),%xmm17                  # s2
  2016. vmovq 40(%rdi),%xmm3                   # r0
  2017. vmovq 48(%rdi),%xmm4                   # r1
  2018. vmovdqa %ymm3,%ymm0
  2019. vmovdqa %ymm4,%ymm1
  2020. vmovdqa %ymm2,%ymm5
  2021. movl $2,%eax
  2022. L$mul_init_vpmadd52:
  2023. vpxorq %ymm18,%ymm18,%ymm18
  2024. vpmadd52luq %ymm2,%ymm16,%ymm18        # d0 = h2*s1 ...
  2025. vpxorq %ymm19,%ymm19,%ymm19
  2026. vpmadd52huq %ymm2,%ymm16,%ymm19
  2027. vpxorq %ymm20,%ymm20,%ymm20
  2028. vpmadd52luq %ymm2,%ymm17,%ymm20        # d1 = h2*s2 ...
  2029. vpxorq %ymm21,%ymm21,%ymm21
  2030. vpmadd52huq %ymm2,%ymm17,%ymm21
  2031. vpxorq %ymm22,%ymm22,%ymm22
  2032. vpmadd52luq %ymm2,%ymm3,%ymm22         # d2 = h2*r0 ...
  2033. vpxorq %ymm23,%ymm23,%ymm23
  2034. vpmadd52huq %ymm2,%ymm3,%ymm23
  2035. vpmadd52luq %ymm0,%ymm3,%ymm18         # d0 += h0*r0
  2036. vpmadd52huq %ymm0,%ymm3,%ymm19
  2037. vpmadd52luq %ymm0,%ymm4,%ymm20         # d1 += h0*r1
  2038. vpmadd52huq %ymm0,%ymm4,%ymm21
  2039. vpmadd52luq %ymm0,%ymm5,%ymm22         # d2 += h0*r2
  2040. vpmadd52huq %ymm0,%ymm5,%ymm23
  2041. vpmadd52luq %ymm1,%ymm17,%ymm18        # d0 += h1*s2
  2042. vpmadd52huq %ymm1,%ymm17,%ymm19
  2043. vpmadd52luq %ymm1,%ymm3,%ymm20         # d1 += h1*r0
  2044. vpmadd52huq %ymm1,%ymm3,%ymm21
  2045. vpmadd52luq %ymm1,%ymm4,%ymm22         # d2 += h1*r1
  2046. vpmadd52huq %ymm1,%ymm4,%ymm23
  # Carry chain d0 -> d1 -> d2 -> (times 5) -> h0 -> h1.
  2047. vpsrlq $44,%ymm18,%ymm30
  2048. vpsllq $8,%ymm19,%ymm19                # high half starts at bit 52 = limb width + 8
  2049. vpandq %ymm28,%ymm18,%ymm0
  2050. vpaddq %ymm30,%ymm19,%ymm19
  2051. vpaddq %ymm19,%ymm20,%ymm20
  2052. vpsrlq $44,%ymm20,%ymm30
  2053. vpsllq $8,%ymm21,%ymm21
  2054. vpandq %ymm28,%ymm20,%ymm1
  2055. vpaddq %ymm30,%ymm21,%ymm21
  2056. vpaddq %ymm21,%ymm22,%ymm22
  2057. vpsrlq $42,%ymm22,%ymm30
  2058. vpsllq $10,%ymm23,%ymm23               # top limb is 42 bits wide: 52 - 42 = 10
  2059. vpandq %ymm29,%ymm22,%ymm2
  2060. vpaddq %ymm30,%ymm23,%ymm23
  2061. vpaddq %ymm23,%ymm0,%ymm0              # h0 += carry
  2062. vpsllq $2,%ymm23,%ymm23
  2063. vpaddq %ymm23,%ymm0,%ymm0              # h0 += 4*carry, 5*carry in total
  2064. vpsrlq $44,%ymm0,%ymm30
  2065. vpandq %ymm28,%ymm0,%ymm0
  2066. vpaddq %ymm30,%ymm1,%ymm1
  2067. decl %eax
  2068. jz L$done_init_vpmadd52
  # First pass done: the key vectors become {r^2, r} and the multiplicand
  # becomes r^2 in both low lanes, so the second pass yields {r^4, r^3}.
  2069. vpunpcklqdq %ymm4,%ymm1,%ymm4
  2070. vpbroadcastq %xmm1,%xmm1
  2071. vpunpcklqdq %ymm5,%ymm2,%ymm5
  2072. vpbroadcastq %xmm2,%xmm2
  2073. vpunpcklqdq %ymm3,%ymm0,%ymm3
  2074. vpbroadcastq %xmm0,%xmm0
  2075. vpsllq $2,%ymm4,%ymm16
  2076. vpsllq $2,%ymm5,%ymm17
  2077. vpaddq %ymm4,%ymm16,%ymm16
  2078. vpaddq %ymm5,%ymm17,%ymm17
  2079. vpsllq $2,%ymm16,%ymm16                # 20*limb 1
  2080. vpsllq $2,%ymm17,%ymm17                # 20*limb 2
  2081. jmp L$mul_init_vpmadd52
  2082. ud2                                    # not reached; follows an unconditional jump
  2083. .p2align 5
  2084. L$done_init_vpmadd52:
  2085. vinserti128 $1,%xmm4,%ymm1,%ymm4       # {r^4, r^3, r^2, r^1}, limb 1
  2086. vinserti128 $1,%xmm5,%ymm2,%ymm5       # limb 2
  2087. vinserti128 $1,%xmm3,%ymm0,%ymm3       # limb 0
  2088. vpermq $216,%ymm4,%ymm4                # swap middle lanes: r^4, r^2, r^3, r^1
  2089. vpermq $216,%ymm5,%ymm5
  2090. vpermq $216,%ymm3,%ymm3
  2091. vpsllq $2,%ymm4,%ymm16
  2092. vpaddq %ymm4,%ymm16,%ymm16
  2093. vpsllq $2,%ymm16,%ymm16                # 20*limb 1 of every power
  2094. vmovq 0(%rdi),%xmm0                    # h0
  2095. vmovq 8(%rdi),%xmm1                    # h1
  2096. vmovq 16(%rdi),%xmm2                   # h2
  2097. testq $3,%rdx
  2098. jnz L$done_init_vpmadd52_2x
  # Save the powers; the store at 64(%rdi) also replaces the negative marker.
  2099. vmovdqu64 %ymm3,64(%rdi)
  2100. vpbroadcastq %xmm3,%ymm3               # r^4 in every lane
  2101. vmovdqu64 %ymm4,96(%rdi)
  2102. vpbroadcastq %xmm4,%ymm4
  2103. vmovdqu64 %ymm5,128(%rdi)
  2104. vpbroadcastq %xmm5,%ymm5
  2105. vmovdqu64 %ymm16,160(%rdi)
  2106. vpbroadcastq %xmm16,%ymm16
  2107. jmp L$blocks_vpmadd52_4x_key_loaded
  2108. ud2                                    # not reached; follows an unconditional jump
  2109. .p2align 5
  2110. L$done_init_vpmadd52_2x:
  2111. vmovdqu64 %ymm3,64(%rdi)
  2112. vpsrldq $8,%ymm3,%ymm3                 # per 128-bit half: lanes become r^2, 0, r^1, 0
  2113. vmovdqu64 %ymm4,96(%rdi)
  2114. vpsrldq $8,%ymm4,%ymm4
  2115. vmovdqu64 %ymm5,128(%rdi)
  2116. vpsrldq $8,%ymm5,%ymm5
  2117. vmovdqu64 %ymm16,160(%rdi)
  2118. vpsrldq $8,%ymm16,%ymm16
  2119. jmp L$blocks_vpmadd52_2x_key_loaded
  2120. ud2                                    # not reached; follows an unconditional jump
  2121. .p2align 5
  # Two-block step: loads start 8 bytes into each stored vector and keep
  # lanes 0 and 2 (k1 = 0b0101), giving r^2, 0, r^1, 0.
  2122. L$blocks_vpmadd52_2x_do:
  2123. vmovdqu64 128+8(%rdi),%ymm5{%k1}{z}
  2124. vmovdqu64 160+8(%rdi),%ymm16{%k1}{z}
  2125. vmovdqu64 64+8(%rdi),%ymm3{%k1}{z}
  2126. vmovdqu64 96+8(%rdi),%ymm4{%k1}{z}
  2127. L$blocks_vpmadd52_2x_key_loaded:
  2128. vmovdqu64 0(%rsi),%ymm26               # two blocks
  2129. vpxorq %ymm27,%ymm27,%ymm27
  2130. leaq 32(%rsi),%rsi
  2131. vpunpcklqdq %ymm27,%ymm26,%ymm25       # low qwords in lanes 0 and 2
  2132. vpunpckhqdq %ymm27,%ymm26,%ymm27       # high qwords in lanes 0 and 2
  2133. vpsrlq $24,%ymm27,%ymm26
  2134. vporq %ymm31,%ymm26,%ymm26
  2135. vpaddq %ymm26,%ymm2,%ymm2
  2136. vpandq %ymm28,%ymm25,%ymm24
  2137. vpsrlq $44,%ymm25,%ymm25
  2138. vpsllq $20,%ymm27,%ymm27
  2139. vporq %ymm27,%ymm25,%ymm25
  2140. vpandq %ymm28,%ymm25,%ymm25
  2141. jmp L$tail_vpmadd52_2x
  2142. ud2                                    # not reached; follows an unconditional jump
  2143. .p2align 5
  # Main loop: h = (h + previous four blocks) * r^4 per lane, while the next
  # four blocks are loaded and split into limbs between the multiplies.
  2144. L$oop_vpmadd52_4x:
  2145. vpaddq %ymm24,%ymm0,%ymm0              # h0 += limb 0 (limb 2 was added when converted)
  2146. vpaddq %ymm25,%ymm1,%ymm1              # h1 += limb 1
  2147. vpxorq %ymm18,%ymm18,%ymm18
  2148. vpmadd52luq %ymm2,%ymm16,%ymm18
  2149. vpxorq %ymm19,%ymm19,%ymm19
  2150. vpmadd52huq %ymm2,%ymm16,%ymm19
  2151. vpxorq %ymm20,%ymm20,%ymm20
  2152. vpmadd52luq %ymm2,%ymm17,%ymm20
  2153. vpxorq %ymm21,%ymm21,%ymm21
  2154. vpmadd52huq %ymm2,%ymm17,%ymm21
  2155. vpxorq %ymm22,%ymm22,%ymm22
  2156. vpmadd52luq %ymm2,%ymm3,%ymm22
  2157. vpxorq %ymm23,%ymm23,%ymm23
  2158. vpmadd52huq %ymm2,%ymm3,%ymm23
  2159. vmovdqu64 0(%rsi),%ymm26               # next four blocks
  2160. vmovdqu64 32(%rsi),%ymm27
  2161. leaq 64(%rsi),%rsi
  2162. vpmadd52luq %ymm0,%ymm3,%ymm18
  2163. vpmadd52huq %ymm0,%ymm3,%ymm19
  2164. vpmadd52luq %ymm0,%ymm4,%ymm20
  2165. vpmadd52huq %ymm0,%ymm4,%ymm21
  2166. vpmadd52luq %ymm0,%ymm5,%ymm22
  2167. vpmadd52huq %ymm0,%ymm5,%ymm23
  2168. vpunpcklqdq %ymm27,%ymm26,%ymm25
  2169. vpunpckhqdq %ymm27,%ymm26,%ymm27
  2170. vpmadd52luq %ymm1,%ymm17,%ymm18
  2171. vpmadd52huq %ymm1,%ymm17,%ymm19
  2172. vpmadd52luq %ymm1,%ymm3,%ymm20
  2173. vpmadd52huq %ymm1,%ymm3,%ymm21
  2174. vpmadd52luq %ymm1,%ymm4,%ymm22
  2175. vpmadd52huq %ymm1,%ymm4,%ymm23
  2176. vpsrlq $44,%ymm18,%ymm30               # carry chain, interleaved with input conversion
  2177. vpsllq $8,%ymm19,%ymm19
  2178. vpandq %ymm28,%ymm18,%ymm0
  2179. vpaddq %ymm30,%ymm19,%ymm19
  2180. vpsrlq $24,%ymm27,%ymm26
  2181. vporq %ymm31,%ymm26,%ymm26
  2182. vpaddq %ymm19,%ymm20,%ymm20
  2183. vpsrlq $44,%ymm20,%ymm30
  2184. vpsllq $8,%ymm21,%ymm21
  2185. vpandq %ymm28,%ymm20,%ymm1
  2186. vpaddq %ymm30,%ymm21,%ymm21
  2187. vpandq %ymm28,%ymm25,%ymm24
  2188. vpsrlq $44,%ymm25,%ymm25
  2189. vpsllq $20,%ymm27,%ymm27
  2190. vpaddq %ymm21,%ymm22,%ymm22
  2191. vpsrlq $42,%ymm22,%ymm30
  2192. vpsllq $10,%ymm23,%ymm23
  2193. vpandq %ymm29,%ymm22,%ymm2
  2194. vpaddq %ymm30,%ymm23,%ymm23
  2195. vpaddq %ymm26,%ymm2,%ymm2              # h2 += limb 2 of the next blocks
  2196. vpaddq %ymm23,%ymm0,%ymm0
  2197. vpsllq $2,%ymm23,%ymm23
  2198. vpaddq %ymm23,%ymm0,%ymm0
  2199. vporq %ymm27,%ymm25,%ymm25
  2200. vpandq %ymm28,%ymm25,%ymm25
  2201. vpsrlq $44,%ymm0,%ymm30
  2202. vpandq %ymm28,%ymm0,%ymm0
  2203. vpaddq %ymm30,%ymm1,%ymm1
  2204. subq $4,%rdx
  2205. jnz L$oop_vpmadd52_4x
  # Last four blocks: use the per-lane powers r^4,r^2,r^3,r^1 instead of the
  # broadcast r^4, then add the lanes together.
  2206. L$tail_vpmadd52_4x:
  2207. vmovdqu64 128(%rdi),%ymm5
  2208. vmovdqu64 160(%rdi),%ymm16
  2209. vmovdqu64 64(%rdi),%ymm3
  2210. vmovdqu64 96(%rdi),%ymm4
  2211. L$tail_vpmadd52_2x:
  2212. vpsllq $2,%ymm5,%ymm17
  2213. vpaddq %ymm5,%ymm17,%ymm17
  2214. vpsllq $2,%ymm17,%ymm17                # 20*limb 2
  2215. vpaddq %ymm24,%ymm0,%ymm0
  2216. vpaddq %ymm25,%ymm1,%ymm1
  2217. vpxorq %ymm18,%ymm18,%ymm18
  2218. vpmadd52luq %ymm2,%ymm16,%ymm18
  2219. vpxorq %ymm19,%ymm19,%ymm19
  2220. vpmadd52huq %ymm2,%ymm16,%ymm19
  2221. vpxorq %ymm20,%ymm20,%ymm20
  2222. vpmadd52luq %ymm2,%ymm17,%ymm20
  2223. vpxorq %ymm21,%ymm21,%ymm21
  2224. vpmadd52huq %ymm2,%ymm17,%ymm21
  2225. vpxorq %ymm22,%ymm22,%ymm22
  2226. vpmadd52luq %ymm2,%ymm3,%ymm22
  2227. vpxorq %ymm23,%ymm23,%ymm23
  2228. vpmadd52huq %ymm2,%ymm3,%ymm23
  2229. vpmadd52luq %ymm0,%ymm3,%ymm18
  2230. vpmadd52huq %ymm0,%ymm3,%ymm19
  2231. vpmadd52luq %ymm0,%ymm4,%ymm20
  2232. vpmadd52huq %ymm0,%ymm4,%ymm21
  2233. vpmadd52luq %ymm0,%ymm5,%ymm22
  2234. vpmadd52huq %ymm0,%ymm5,%ymm23
  2235. vpmadd52luq %ymm1,%ymm17,%ymm18
  2236. vpmadd52huq %ymm1,%ymm17,%ymm19
  2237. vpmadd52luq %ymm1,%ymm3,%ymm20
  2238. vpmadd52huq %ymm1,%ymm3,%ymm21
  2239. vpmadd52luq %ymm1,%ymm4,%ymm22
  2240. vpmadd52huq %ymm1,%ymm4,%ymm23
  # Horizontal sum of the four lanes into lane 0; the masked adds zero the
  # other lanes.
  2241. movl $1,%eax
  2242. kmovw %eax,%k1                         # k1 = lane 0 only
  2243. vpsrldq $8,%ymm18,%ymm24               # odd lane of each 128-bit half
  2244. vpsrldq $8,%ymm19,%ymm0
  2245. vpsrldq $8,%ymm20,%ymm25
  2246. vpsrldq $8,%ymm21,%ymm1
  2247. vpaddq %ymm24,%ymm18,%ymm18
  2248. vpaddq %ymm0,%ymm19,%ymm19
  2249. vpsrldq $8,%ymm22,%ymm26
  2250. vpsrldq $8,%ymm23,%ymm2
  2251. vpaddq %ymm25,%ymm20,%ymm20
  2252. vpaddq %ymm1,%ymm21,%ymm21
  2253. vpermq $0x2,%ymm18,%ymm24              # lane 0 <- lane 2
  2254. vpermq $0x2,%ymm19,%ymm0
  2255. vpaddq %ymm26,%ymm22,%ymm22
  2256. vpaddq %ymm2,%ymm23,%ymm23
  2257. vpermq $0x2,%ymm20,%ymm25
  2258. vpermq $0x2,%ymm21,%ymm1
  2259. vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
  2260. vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
  2261. vpermq $0x2,%ymm22,%ymm26
  2262. vpermq $0x2,%ymm23,%ymm2
  2263. vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
  2264. vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
  2265. vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
  2266. vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
  2267. vpsrlq $44,%ymm18,%ymm30               # carry chain on the summed value
  2268. vpsllq $8,%ymm19,%ymm19
  2269. vpandq %ymm28,%ymm18,%ymm0
  2270. vpaddq %ymm30,%ymm19,%ymm19
  2271. vpaddq %ymm19,%ymm20,%ymm20
  2272. vpsrlq $44,%ymm20,%ymm30
  2273. vpsllq $8,%ymm21,%ymm21
  2274. vpandq %ymm28,%ymm20,%ymm1
  2275. vpaddq %ymm30,%ymm21,%ymm21
  2276. vpaddq %ymm21,%ymm22,%ymm22
  2277. vpsrlq $42,%ymm22,%ymm30
  2278. vpsllq $10,%ymm23,%ymm23
  2279. vpandq %ymm29,%ymm22,%ymm2
  2280. vpaddq %ymm30,%ymm23,%ymm23
  2281. vpaddq %ymm23,%ymm0,%ymm0
  2282. vpsllq $2,%ymm23,%ymm23
  2283. vpaddq %ymm23,%ymm0,%ymm0
  2284. vpsrlq $44,%ymm0,%ymm30
  2285. vpandq %ymm28,%ymm0,%ymm0
  2286. vpaddq %ymm30,%ymm1,%ymm1
  # Coming from the 2x step this accounts for its two blocks; coming from the
  # 4x tail %rdx is already 0, the subtraction borrows and ja falls through.
  2287. subq $2,%rdx
  2288. ja L$blocks_vpmadd52_4x_do             # blocks remain: continue four at a time
  2289. vmovq %xmm0,0(%rdi)                    # store h0
  2290. vmovq %xmm1,8(%rdi)                    # store h1
  2291. vmovq %xmm2,16(%rdi)                   # store h2
  2292. vzeroall
  2293. L$no_data_vpmadd52_4x:
  2294. .byte 0xf3,0xc3                        # rep ret
  # poly1305_blocks_vpmadd52_8x -- radix 2^44 block function that multiplies
  # eight blocks at a time with 512-bit VPMADD52 instructions.
  #
  # In:  %rdi = context, %rsi = input, %rdx = length in bytes, %rcx = pad bit
  # L$blocks_vpmadd52_8x is the entry used by poly1305_blocks_vpmadd52_4x once
  # the block count in %rdx is a multiple of 8. It expects h0,h1,h2 in
  # xmm0-xmm2, the masks in ymm28/ymm29 and %rcx = pad bit << 40.
  #
  # The powers r^4,r^2,r^3,r^1 stored at 64..191(%rdi) are multiplied by r^4
  # to obtain r^8,r^6,r^7,r^5; zmm6-zmm8 then hold limbs 0..2 of
  # r^8,r^4,r^7,r^3,r^6,r^2,r^5,r^1 (matching the lane order of the unpacked
  # input), and zmm9/zmm10 hold 20*limb1 / 20*limb2 of the same powers.
  # The loop multiplies by r^8 broadcast in zmm3-zmm5, zmm16, zmm17; the tail
  # multiplies by the per-lane powers and adds the eight lanes together.
  #
  # NOTE(review): the public entry falls into L$blocks_vpmadd52_8x without
  # checking that the block count is a multiple of 8, and the loop only
  # terminates when %rdx reaches exactly 0 -- confirm callers guarantee this.
  2295. .p2align 5
  2296. poly1305_blocks_vpmadd52_8x:
  2297. shrq $4,%rdx                           # rdx = number of 16-byte blocks
  2298. jz L$no_data_vpmadd52_8x
  2299. shlq $40,%rcx                          # pad bit to bit 40 of limb 2
  # L$init_vpmadd52 continues into the 2x/4x code, which ORs %ymm31 into the
  # input limbs; set it here exactly as poly1305_blocks_vpmadd52_4x does so
  # that path never sees a stale register.
  vpbroadcastq %rcx,%ymm31
  2300. movq 64(%rdi),%r8
  2301. vmovdqa64 L$x_mask44(%rip),%ymm28
  2302. vmovdqa64 L$x_mask42(%rip),%ymm29
  2303. testq %r8,%r8
  2304. js L$init_vpmadd52                     # powers not computed yet
  2305. vmovq 0(%rdi),%xmm0                    # h0
  2306. vmovq 8(%rdi),%xmm1                    # h1
  2307. vmovq 16(%rdi),%xmm2                   # h2
  2308. L$blocks_vpmadd52_8x:
  2309. vmovdqu64 128(%rdi),%ymm5              # limb 2 of r^4,r^2,r^3,r^1
  2310. vmovdqu64 160(%rdi),%ymm16             # 20*limb 1
  2311. vmovdqu64 64(%rdi),%ymm3               # limb 0
  2312. vmovdqu64 96(%rdi),%ymm4               # limb 1
  2313. vpsllq $2,%ymm5,%ymm17
  2314. vpaddq %ymm5,%ymm17,%ymm17
  2315. vpsllq $2,%ymm17,%ymm17                # 20*limb 2
  2316. vpbroadcastq %xmm5,%ymm8               # r^4 limb 2 in every lane
  2317. vpbroadcastq %xmm3,%ymm6               # r^4 limb 0
  2318. vpbroadcastq %xmm4,%ymm7               # r^4 limb 1
  # Multiply the stored powers by r^4.
  2319. vpxorq %ymm18,%ymm18,%ymm18
  2320. vpmadd52luq %ymm8,%ymm16,%ymm18
  2321. vpxorq %ymm19,%ymm19,%ymm19
  2322. vpmadd52huq %ymm8,%ymm16,%ymm19
  2323. vpxorq %ymm20,%ymm20,%ymm20
  2324. vpmadd52luq %ymm8,%ymm17,%ymm20
  2325. vpxorq %ymm21,%ymm21,%ymm21
  2326. vpmadd52huq %ymm8,%ymm17,%ymm21
  2327. vpxorq %ymm22,%ymm22,%ymm22
  2328. vpmadd52luq %ymm8,%ymm3,%ymm22
  2329. vpxorq %ymm23,%ymm23,%ymm23
  2330. vpmadd52huq %ymm8,%ymm3,%ymm23
  2331. vpmadd52luq %ymm6,%ymm3,%ymm18
  2332. vpmadd52huq %ymm6,%ymm3,%ymm19
  2333. vpmadd52luq %ymm6,%ymm4,%ymm20
  2334. vpmadd52huq %ymm6,%ymm4,%ymm21
  2335. vpmadd52luq %ymm6,%ymm5,%ymm22
  2336. vpmadd52huq %ymm6,%ymm5,%ymm23
  2337. vpmadd52luq %ymm7,%ymm17,%ymm18
  2338. vpmadd52huq %ymm7,%ymm17,%ymm19
  2339. vpmadd52luq %ymm7,%ymm3,%ymm20
  2340. vpmadd52huq %ymm7,%ymm3,%ymm21
  2341. vpmadd52luq %ymm7,%ymm4,%ymm22
  2342. vpmadd52huq %ymm7,%ymm4,%ymm23
  2343. vpsrlq $44,%ymm18,%ymm30               # carry chain, results in ymm6,ymm7,ymm8
  2344. vpsllq $8,%ymm19,%ymm19
  2345. vpandq %ymm28,%ymm18,%ymm6
  2346. vpaddq %ymm30,%ymm19,%ymm19
  2347. vpaddq %ymm19,%ymm20,%ymm20
  2348. vpsrlq $44,%ymm20,%ymm30
  2349. vpsllq $8,%ymm21,%ymm21
  2350. vpandq %ymm28,%ymm20,%ymm7
  2351. vpaddq %ymm30,%ymm21,%ymm21
  2352. vpaddq %ymm21,%ymm22,%ymm22
  2353. vpsrlq $42,%ymm22,%ymm30
  2354. vpsllq $10,%ymm23,%ymm23
  2355. vpandq %ymm29,%ymm22,%ymm8
  2356. vpaddq %ymm30,%ymm23,%ymm23
  2357. vpaddq %ymm23,%ymm6,%ymm6
  2358. vpsllq $2,%ymm23,%ymm23
  2359. vpaddq %ymm23,%ymm6,%ymm6
  2360. vpsrlq $44,%ymm6,%ymm30
  2361. vpandq %ymm28,%ymm6,%ymm6
  2362. vpaddq %ymm30,%ymm7,%ymm7
  # Interleave new and old powers into 512-bit vectors:
  # r^8,r^4,r^7,r^3,r^6,r^2,r^5,r^1.
  2363. vpunpcklqdq %ymm5,%ymm8,%ymm26
  2364. vpunpckhqdq %ymm5,%ymm8,%ymm5
  2365. vpunpcklqdq %ymm3,%ymm6,%ymm24
  2366. vpunpckhqdq %ymm3,%ymm6,%ymm3
  2367. vpunpcklqdq %ymm4,%ymm7,%ymm25
  2368. vpunpckhqdq %ymm4,%ymm7,%ymm4
  2369. vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8    # limb 2
  2370. vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6    # limb 0
  2371. vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7    # limb 1
  2372. vmovdqu64 0(%rsi),%zmm26               # first eight blocks
  2373. vmovdqu64 64(%rsi),%zmm27
  2374. leaq 128(%rsi),%rsi
  2375. vpsllq $2,%zmm8,%zmm10
  2376. vpsllq $2,%zmm7,%zmm9
  2377. vpaddq %zmm8,%zmm10,%zmm10
  2378. vpaddq %zmm7,%zmm9,%zmm9
  2379. vpsllq $2,%zmm10,%zmm10                # 20*limb 2 per lane
  2380. vpsllq $2,%zmm9,%zmm9                  # 20*limb 1 per lane
  2381. vpbroadcastq %rcx,%zmm31               # pad bit in all eight lanes
  2382. vpbroadcastq %xmm28,%zmm28             # widen the masks to 512 bits
  2383. vpbroadcastq %xmm29,%zmm29
  2384. vpbroadcastq %xmm9,%zmm16              # r^8: 20*limb 1
  2385. vpbroadcastq %xmm10,%zmm17             # r^8: 20*limb 2
  2386. vpbroadcastq %xmm6,%zmm3               # r^8: limb 0
  2387. vpbroadcastq %xmm7,%zmm4               # r^8: limb 1
  2388. vpbroadcastq %xmm8,%zmm5               # r^8: limb 2
  2389. vpunpcklqdq %zmm27,%zmm26,%zmm25       # low qwords, lanes = blocks 0,4,1,5,2,6,3,7
  2390. vpunpckhqdq %zmm27,%zmm26,%zmm27       # high qwords
  2391. vpsrlq $24,%zmm27,%zmm26
  2392. vporq %zmm31,%zmm26,%zmm26
  2393. vpaddq %zmm26,%zmm2,%zmm2              # h2 += limb 2
  2394. vpandq %zmm28,%zmm25,%zmm24            # limb 0
  2395. vpsrlq $44,%zmm25,%zmm25
  2396. vpsllq $20,%zmm27,%zmm27
  2397. vporq %zmm27,%zmm25,%zmm25
  2398. vpandq %zmm28,%zmm25,%zmm25            # limb 1
  2399. subq $8,%rdx
  2400. jz L$tail_vpmadd52_8x
  2401. jmp L$oop_vpmadd52_8x
  2402. .p2align 5
  # Main loop: h = (h + previous eight blocks) * r^8 per lane, while the next
  # eight blocks are loaded and split into limbs between the multiplies.
  2403. L$oop_vpmadd52_8x:
  2404. vpaddq %zmm24,%zmm0,%zmm0
  2405. vpaddq %zmm25,%zmm1,%zmm1
  2406. vpxorq %zmm18,%zmm18,%zmm18
  2407. vpmadd52luq %zmm2,%zmm16,%zmm18
  2408. vpxorq %zmm19,%zmm19,%zmm19
  2409. vpmadd52huq %zmm2,%zmm16,%zmm19
  2410. vpxorq %zmm20,%zmm20,%zmm20
  2411. vpmadd52luq %zmm2,%zmm17,%zmm20
  2412. vpxorq %zmm21,%zmm21,%zmm21
  2413. vpmadd52huq %zmm2,%zmm17,%zmm21
  2414. vpxorq %zmm22,%zmm22,%zmm22
  2415. vpmadd52luq %zmm2,%zmm3,%zmm22
  2416. vpxorq %zmm23,%zmm23,%zmm23
  2417. vpmadd52huq %zmm2,%zmm3,%zmm23
  2418. vmovdqu64 0(%rsi),%zmm26               # next eight blocks
  2419. vmovdqu64 64(%rsi),%zmm27
  2420. leaq 128(%rsi),%rsi
  2421. vpmadd52luq %zmm0,%zmm3,%zmm18
  2422. vpmadd52huq %zmm0,%zmm3,%zmm19
  2423. vpmadd52luq %zmm0,%zmm4,%zmm20
  2424. vpmadd52huq %zmm0,%zmm4,%zmm21
  2425. vpmadd52luq %zmm0,%zmm5,%zmm22
  2426. vpmadd52huq %zmm0,%zmm5,%zmm23
  2427. vpunpcklqdq %zmm27,%zmm26,%zmm25
  2428. vpunpckhqdq %zmm27,%zmm26,%zmm27
  2429. vpmadd52luq %zmm1,%zmm17,%zmm18
  2430. vpmadd52huq %zmm1,%zmm17,%zmm19
  2431. vpmadd52luq %zmm1,%zmm3,%zmm20
  2432. vpmadd52huq %zmm1,%zmm3,%zmm21
  2433. vpmadd52luq %zmm1,%zmm4,%zmm22
  2434. vpmadd52huq %zmm1,%zmm4,%zmm23
  2435. vpsrlq $44,%zmm18,%zmm30               # carry chain, interleaved with input conversion
  2436. vpsllq $8,%zmm19,%zmm19
  2437. vpandq %zmm28,%zmm18,%zmm0
  2438. vpaddq %zmm30,%zmm19,%zmm19
  2439. vpsrlq $24,%zmm27,%zmm26
  2440. vporq %zmm31,%zmm26,%zmm26
  2441. vpaddq %zmm19,%zmm20,%zmm20
  2442. vpsrlq $44,%zmm20,%zmm30
  2443. vpsllq $8,%zmm21,%zmm21
  2444. vpandq %zmm28,%zmm20,%zmm1
  2445. vpaddq %zmm30,%zmm21,%zmm21
  2446. vpandq %zmm28,%zmm25,%zmm24
  2447. vpsrlq $44,%zmm25,%zmm25
  2448. vpsllq $20,%zmm27,%zmm27
  2449. vpaddq %zmm21,%zmm22,%zmm22
  2450. vpsrlq $42,%zmm22,%zmm30
  2451. vpsllq $10,%zmm23,%zmm23
  2452. vpandq %zmm29,%zmm22,%zmm2
  2453. vpaddq %zmm30,%zmm23,%zmm23
  2454. vpaddq %zmm26,%zmm2,%zmm2              # h2 += limb 2 of the next blocks
  2455. vpaddq %zmm23,%zmm0,%zmm0
  2456. vpsllq $2,%zmm23,%zmm23
  2457. vpaddq %zmm23,%zmm0,%zmm0
  2458. vporq %zmm27,%zmm25,%zmm25
  2459. vpandq %zmm28,%zmm25,%zmm25
  2460. vpsrlq $44,%zmm0,%zmm30
  2461. vpandq %zmm28,%zmm0,%zmm0
  2462. vpaddq %zmm30,%zmm1,%zmm1
  2463. subq $8,%rdx
  2464. jnz L$oop_vpmadd52_8x
  # Last eight blocks: multiply by the per-lane powers in zmm6-zmm10.
  2465. L$tail_vpmadd52_8x:
  2466. vpaddq %zmm24,%zmm0,%zmm0
  2467. vpaddq %zmm25,%zmm1,%zmm1
  2468. vpxorq %zmm18,%zmm18,%zmm18
  2469. vpmadd52luq %zmm2,%zmm9,%zmm18
  2470. vpxorq %zmm19,%zmm19,%zmm19
  2471. vpmadd52huq %zmm2,%zmm9,%zmm19
  2472. vpxorq %zmm20,%zmm20,%zmm20
  2473. vpmadd52luq %zmm2,%zmm10,%zmm20
  2474. vpxorq %zmm21,%zmm21,%zmm21
  2475. vpmadd52huq %zmm2,%zmm10,%zmm21
  2476. vpxorq %zmm22,%zmm22,%zmm22
  2477. vpmadd52luq %zmm2,%zmm6,%zmm22
  2478. vpxorq %zmm23,%zmm23,%zmm23
  2479. vpmadd52huq %zmm2,%zmm6,%zmm23
  2480. vpmadd52luq %zmm0,%zmm6,%zmm18
  2481. vpmadd52huq %zmm0,%zmm6,%zmm19
  2482. vpmadd52luq %zmm0,%zmm7,%zmm20
  2483. vpmadd52huq %zmm0,%zmm7,%zmm21
  2484. vpmadd52luq %zmm0,%zmm8,%zmm22
  2485. vpmadd52huq %zmm0,%zmm8,%zmm23
  2486. vpmadd52luq %zmm1,%zmm10,%zmm18
  2487. vpmadd52huq %zmm1,%zmm10,%zmm19
  2488. vpmadd52luq %zmm1,%zmm6,%zmm20
  2489. vpmadd52huq %zmm1,%zmm6,%zmm21
  2490. vpmadd52luq %zmm1,%zmm7,%zmm22
  2491. vpmadd52huq %zmm1,%zmm7,%zmm23
  # Horizontal sum of the eight lanes: within 128-bit pairs, within 256-bit
  # halves, then across the halves into lane 0 (masked adds zero the rest).
  2492. movl $1,%eax
  2493. kmovw %eax,%k1                         # k1 = lane 0 only
  2494. vpsrldq $8,%zmm18,%zmm24
  2495. vpsrldq $8,%zmm19,%zmm0
  2496. vpsrldq $8,%zmm20,%zmm25
  2497. vpsrldq $8,%zmm21,%zmm1
  2498. vpaddq %zmm24,%zmm18,%zmm18
  2499. vpaddq %zmm0,%zmm19,%zmm19
  2500. vpsrldq $8,%zmm22,%zmm26
  2501. vpsrldq $8,%zmm23,%zmm2
  2502. vpaddq %zmm25,%zmm20,%zmm20
  2503. vpaddq %zmm1,%zmm21,%zmm21
  2504. vpermq $0x2,%zmm18,%zmm24
  2505. vpermq $0x2,%zmm19,%zmm0
  2506. vpaddq %zmm26,%zmm22,%zmm22
  2507. vpaddq %zmm2,%zmm23,%zmm23
  2508. vpermq $0x2,%zmm20,%zmm25
  2509. vpermq $0x2,%zmm21,%zmm1
  2510. vpaddq %zmm24,%zmm18,%zmm18
  2511. vpaddq %zmm0,%zmm19,%zmm19
  2512. vpermq $0x2,%zmm22,%zmm26
  2513. vpermq $0x2,%zmm23,%zmm2
  2514. vpaddq %zmm25,%zmm20,%zmm20
  2515. vpaddq %zmm1,%zmm21,%zmm21
  2516. vextracti64x4 $1,%zmm18,%ymm24         # upper 256-bit half
  2517. vextracti64x4 $1,%zmm19,%ymm0
  2518. vpaddq %zmm26,%zmm22,%zmm22
  2519. vpaddq %zmm2,%zmm23,%zmm23
  2520. vextracti64x4 $1,%zmm20,%ymm25
  2521. vextracti64x4 $1,%zmm21,%ymm1
  2522. vextracti64x4 $1,%zmm22,%ymm26
  2523. vextracti64x4 $1,%zmm23,%ymm2
  2524. vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
  2525. vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
  2526. vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
  2527. vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
  2528. vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
  2529. vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
  2530. vpsrlq $44,%ymm18,%ymm30               # carry chain on the summed value
  2531. vpsllq $8,%ymm19,%ymm19
  2532. vpandq %ymm28,%ymm18,%ymm0
  2533. vpaddq %ymm30,%ymm19,%ymm19
  2534. vpaddq %ymm19,%ymm20,%ymm20
  2535. vpsrlq $44,%ymm20,%ymm30
  2536. vpsllq $8,%ymm21,%ymm21
  2537. vpandq %ymm28,%ymm20,%ymm1
  2538. vpaddq %ymm30,%ymm21,%ymm21
  2539. vpaddq %ymm21,%ymm22,%ymm22
  2540. vpsrlq $42,%ymm22,%ymm30
  2541. vpsllq $10,%ymm23,%ymm23
  2542. vpandq %ymm29,%ymm22,%ymm2
  2543. vpaddq %ymm30,%ymm23,%ymm23
  2544. vpaddq %ymm23,%ymm0,%ymm0
  2545. vpsllq $2,%ymm23,%ymm23
  2546. vpaddq %ymm23,%ymm0,%ymm0
  2547. vpsrlq $44,%ymm0,%ymm30
  2548. vpandq %ymm28,%ymm0,%ymm0
  2549. vpaddq %ymm30,%ymm1,%ymm1
  2550. vmovq %xmm0,0(%rdi)                    # store h0
  2551. vmovq %xmm1,8(%rdi)                    # store h1
  2552. vmovq %xmm2,16(%rdi)                   # store h2
  2553. vzeroall
  2554. L$no_data_vpmadd52_8x:
  2555. .byte 0xf3,0xc3                        # rep ret
  # poly1305_emit_base2_44 -- produce the 16-byte tag from a radix 2^44 hash.
  #
  # In (registers as read below):
  #   %rdi = context (h0,h1,h2 at offsets 0, 8, 16)
  #   %rsi = 16-byte output buffer
  #   %rdx = 16-byte value added to the result (two qwords)
  # Clobbers: %rax, %rcx, %r8, %r9, %r10, flags
  #
  # The limbs are first merged into a binary value h in %r10:%r9:%r8. Then
  # h + 5 is computed; if it reaches bit 130 (h >= 2^130 - 5) its low 128 bits
  # are used, otherwise those of h. The value at (%rdx) is added modulo 2^128.
  2556. .p2align 5
  2557. poly1305_emit_base2_44:
  2558. movq 0(%rdi),%r8                       # h0
  2559. movq 8(%rdi),%r9                       # h1
  2560. movq 16(%rdi),%r10                     # h2
  2561. movq %r9,%rax
  2562. shrq $20,%r9                           # h1 bits that land in the second qword
  2563. shlq $44,%rax                          # h1 bits that land in the first qword
  2564. movq %r10,%rcx
  2565. shrq $40,%r10                          # h2 bits above bit 127
  2566. shlq $24,%rcx                          # h2 bits that land in the second qword
  2567. addq %rax,%r8
  2568. adcq %rcx,%r9
  2569. adcq $0,%r10                           # r10:r9:r8 = h
  2570. movq %r8,%rax                          # keep h (mov leaves the carry flag intact)
  2571. addq $5,%r8
  2572. movq %r9,%rcx
  2573. adcq $0,%r9
  2574. adcq $0,%r10                           # r10:r9:r8 = h + 5
  2575. shrq $2,%r10                           # non-zero iff h + 5 >= 2^130
  2576. cmovnzq %r8,%rax
  2577. cmovnzq %r9,%rcx
  2578. addq 0(%rdx),%rax
  2579. adcq 8(%rdx),%rcx                      # carry out of bit 127 is dropped
  2580. movq %rax,0(%rsi)
  2581. movq %rcx,8(%rsi)
  2582. .byte 0xf3,0xc3                        # rep ret
  # Constant tables, 64-byte aligned. Each .long pair "value,0" forms one
  # 64-bit lane holding "value".
  # NOTE(review): L$mask24, L$129, L$mask26 and the two L$permd_* tables are
  # not referenced by name in the routines of this excerpt; presumably the
  # AVX/AVX2/AVX-512 radix 2^26 code uses them -- confirm.
  2583. .p2align 6
  2584. L$const:
  2585. L$mask24:
  2586. .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0      # 24-bit mask in four lanes
  2587. L$129:
  2588. .long 16777216,0,16777216,0,16777216,0,16777216,0      # 1<<24 in four lanes
  2589. L$mask26:
  2590. .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0      # 26-bit mask in four lanes
  2591. L$permd_avx2:
  2592. .long 2,2,2,3,2,0,2,1
  2593. L$permd_avx512:
  2594. .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
  # The next five 32-byte tables are read by poly1305_blocks_vpmadd52 at
  # offsets 0, 32, 64, 96 and 128 from L$2_44_inp_permd with aligned loads,
  # so their order and alignment must be kept.
  2595. L$2_44_inp_permd:
  2596. .long 0,1,1,2,2,3,7,7                  # vpermd indices: dword pairs per limb lane
  2597. L$2_44_inp_shift:
  2598. .quad 0,12,24,64                       # per-lane right shift of the input
  2599. L$2_44_mask:
  2600. .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff      # 44/44/42-bit masks, lane 3 kept whole
  2601. L$2_44_shift_rgt:
  2602. .quad 44,44,42,64                      # per-lane carry extraction shift
  2603. L$2_44_shift_lft:
  2604. .quad 8,8,10,64                        # per-lane alignment of the high product halves
  2605. .p2align 6
  # 44-bit and 42-bit masks in eight lanes; the vpmadd52 4x/8x code loads the
  # first 32 bytes of each.
  2606. L$x_mask44:
  2607. .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
  2608. .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
  2609. L$x_mask42:
  2610. .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
  2611. .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
  # NUL-terminated ASCII: "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  2612. .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  # xor128_encrypt_n_pad -- XOR len input bytes with the pad at otp, write the
  # result to out AND back over otp, then zero-fill otp up to the next
  # 16-byte boundary.
  #
  # In:   %rdi = out, %rsi = inp, %rdx = otp, %rcx = len (bytes)
  # Out:  %rax = otp advanced past the processed bytes and the zero padding;
  #       for len == 0 nothing is touched and otp itself is returned
  # Clobbers: %rcx, %rdx, %rsi, %rdi, %r10, %xmm0, flags
  # Requires: otp 16-byte aligned when len >= 16 (movdqa/pxor memory operands).
  2613. .p2align 4
  2614. .globl _xor128_encrypt_n_pad
  2615. .p2align 4
  2616. _xor128_encrypt_n_pad:
  # len == 0 must not reach the byte loop: its dec/jnz counter would start at
  # zero, wrap around and run far past the buffers.
  testq %rcx,%rcx
  jz L$done_enc
  2617. subq %rdx,%rsi                         # inp and out become offsets relative to otp,
  2618. subq %rdx,%rdi                         # so %rdx alone indexes all three buffers
  2619. movq %rcx,%r10                         # keep len for the tail
  2620. shrq $4,%rcx                           # number of whole 16-byte chunks
  2621. jz L$tail_enc
  2622. nop
  2623. L$oop_enc_xmm:
  2624. movdqu (%rsi,%rdx,1),%xmm0
  2625. pxor (%rdx),%xmm0
  2626. movdqu %xmm0,(%rdi,%rdx,1)             # out = inp ^ otp
  2627. movdqa %xmm0,(%rdx)                    # otp = result
  2628. leaq 16(%rdx),%rdx
  2629. decq %rcx
  2630. jnz L$oop_enc_xmm
  2631. andq $15,%r10                          # leftover bytes
  2632. jz L$done_enc
  2633. L$tail_enc:
  2634. movq $16,%rcx
  2635. subq %r10,%rcx                         # rcx = zero bytes to append (1..15)
  2636. xorl %eax,%eax
  2637. L$oop_enc_byte:
  2638. movb (%rsi,%rdx,1),%al
  2639. xorb (%rdx),%al
  2640. movb %al,(%rdi,%rdx,1)
  2641. movb %al,(%rdx)
  2642. leaq 1(%rdx),%rdx
  2643. decq %r10
  2644. jnz L$oop_enc_byte
  2645. xorl %eax,%eax
  2646. L$oop_enc_pad:
  2647. movb %al,(%rdx)                        # zero padding
  2648. leaq 1(%rdx),%rdx
  2649. decq %rcx
  2650. jnz L$oop_enc_pad
  2651. L$done_enc:
  2652. movq %rdx,%rax                         # return the advanced otp pointer
  2653. .byte 0xf3,0xc3                        # rep ret
  # xor128_decrypt_n_pad -- XOR len input bytes with the pad at otp, write the
  # result to out, copy the INPUT bytes over otp, then zero-fill otp up to the
  # next 16-byte boundary.
  #
  # In:   %rdi = out, %rsi = inp, %rdx = otp, %rcx = len (bytes)
  # Out:  %rax = otp advanced past the processed bytes and the zero padding;
  #       for len == 0 nothing is touched and otp itself is returned
  # Clobbers: %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0, %xmm1, flags
  # Requires: otp 16-byte aligned when len >= 16 (movdqa accesses).
  2654. .globl _xor128_decrypt_n_pad
  2655. .p2align 4
  2656. _xor128_decrypt_n_pad:
  # len == 0 must not reach the byte loop: its dec/jnz counter would start at
  # zero, wrap around and run far past the buffers.
  testq %rcx,%rcx
  jz L$done_dec
  2657. subq %rdx,%rsi                         # inp and out become offsets relative to otp,
  2658. subq %rdx,%rdi                         # so %rdx alone indexes all three buffers
  2659. movq %rcx,%r10                         # keep len for the tail
  2660. shrq $4,%rcx                           # number of whole 16-byte chunks
  2661. jz L$tail_dec
  2662. nop
  2663. L$oop_dec_xmm:
  2664. movdqu (%rsi,%rdx,1),%xmm0
  2665. movdqa (%rdx),%xmm1
  2666. pxor %xmm0,%xmm1
  2667. movdqu %xmm1,(%rdi,%rdx,1)             # out = inp ^ otp
  2668. movdqa %xmm0,(%rdx)                    # otp = inp
  2669. leaq 16(%rdx),%rdx
  2670. decq %rcx
  2671. jnz L$oop_dec_xmm
  2672. pxor %xmm1,%xmm1                       # clear the last output chunk from the register
  2673. andq $15,%r10                          # leftover bytes
  2674. jz L$done_dec
  2675. L$tail_dec:
  2676. movq $16,%rcx
  2677. subq %r10,%rcx                         # rcx = zero bytes to append (1..15)
  2678. xorl %eax,%eax
  2679. xorq %r11,%r11
  2680. L$oop_dec_byte:
  2681. movb (%rsi,%rdx,1),%r11b
  2682. movb (%rdx),%al
  2683. xorb %r11b,%al
  2684. movb %al,(%rdi,%rdx,1)
  2685. movb %r11b,(%rdx)
  2686. leaq 1(%rdx),%rdx
  2687. decq %r10
  2688. jnz L$oop_dec_byte
  2689. xorl %eax,%eax
  2690. L$oop_dec_pad:
  2691. movb %al,(%rdx)                        # zero padding
  2692. leaq 1(%rdx),%rdx
  2693. decq %rcx
  2694. jnz L$oop_dec_pad
  2695. L$done_dec:
  2696. movq %rdx,%rax                         # return the advanced otp pointer
  2697. .byte 0xf3,0xc3                        # rep ret