# ghash-x86_64.s — GHASH (GCM universal hash) for x86-64, perlasm-generated
# assembly (Mach-O / AT&T syntax). This chunk contains the 4-bit table-driven
# routines (_gcm_gmult_4bit, _gcm_ghash_4bit) and the PCLMULQDQ-based routines
# (_gcm_init_clmul, _gcm_gmult_clmul, and the start of _gcm_ghash_clmul).
  1. .text
  2. .globl _gcm_gmult_4bit
  3. .p2align 4
  4. _gcm_gmult_4bit:
  5. pushq %rbx
  6. pushq %rbp
  7. pushq %r12
  8. pushq %r13
  9. pushq %r14
  10. pushq %r15
  11. subq $280,%rsp
  12. L$gmult_prologue:
  13. movzbq 15(%rdi),%r8
  14. leaq L$rem_4bit(%rip),%r11
  15. xorq %rax,%rax
  16. xorq %rbx,%rbx
  17. movb %r8b,%al
  18. movb %r8b,%bl
  19. shlb $4,%al
  20. movq $14,%rcx
  21. movq 8(%rsi,%rax,1),%r8
  22. movq (%rsi,%rax,1),%r9
  23. andb $0xf0,%bl
  24. movq %r8,%rdx
  25. jmp L$oop1
  26. .p2align 4
  27. L$oop1:
  28. shrq $4,%r8
  29. andq $0xf,%rdx
  30. movq %r9,%r10
  31. movb (%rdi,%rcx,1),%al
  32. shrq $4,%r9
  33. xorq 8(%rsi,%rbx,1),%r8
  34. shlq $60,%r10
  35. xorq (%rsi,%rbx,1),%r9
  36. movb %al,%bl
  37. xorq (%r11,%rdx,8),%r9
  38. movq %r8,%rdx
  39. shlb $4,%al
  40. xorq %r10,%r8
  41. decq %rcx
  42. js L$break1
  43. shrq $4,%r8
  44. andq $0xf,%rdx
  45. movq %r9,%r10
  46. shrq $4,%r9
  47. xorq 8(%rsi,%rax,1),%r8
  48. shlq $60,%r10
  49. xorq (%rsi,%rax,1),%r9
  50. andb $0xf0,%bl
  51. xorq (%r11,%rdx,8),%r9
  52. movq %r8,%rdx
  53. xorq %r10,%r8
  54. jmp L$oop1
  55. .p2align 4
  56. L$break1:
  57. shrq $4,%r8
  58. andq $0xf,%rdx
  59. movq %r9,%r10
  60. shrq $4,%r9
  61. xorq 8(%rsi,%rax,1),%r8
  62. shlq $60,%r10
  63. xorq (%rsi,%rax,1),%r9
  64. andb $0xf0,%bl
  65. xorq (%r11,%rdx,8),%r9
  66. movq %r8,%rdx
  67. xorq %r10,%r8
  68. shrq $4,%r8
  69. andq $0xf,%rdx
  70. movq %r9,%r10
  71. shrq $4,%r9
  72. xorq 8(%rsi,%rbx,1),%r8
  73. shlq $60,%r10
  74. xorq (%rsi,%rbx,1),%r9
  75. xorq %r10,%r8
  76. xorq (%r11,%rdx,8),%r9
  77. bswapq %r8
  78. bswapq %r9
  79. movq %r8,8(%rdi)
  80. movq %r9,(%rdi)
  81. leaq 280+48(%rsp),%rsi
  82. movq -8(%rsi),%rbx
  83. leaq (%rsi),%rsp
  84. L$gmult_epilogue:
  85. .byte 0xf3,0xc3
  86. .globl _gcm_ghash_4bit
  87. .p2align 4
  88. _gcm_ghash_4bit:
  89. pushq %rbx
  90. pushq %rbp
  91. pushq %r12
  92. pushq %r13
  93. pushq %r14
  94. pushq %r15
  95. subq $280,%rsp
  96. L$ghash_prologue:
  97. movq %rdx,%r14
  98. movq %rcx,%r15
  99. subq $-128,%rsi
  100. leaq 16+128(%rsp),%rbp
  101. xorl %edx,%edx
  102. movq 0+0-128(%rsi),%r8
  103. movq 0+8-128(%rsi),%rax
  104. movb %al,%dl
  105. shrq $4,%rax
  106. movq %r8,%r10
  107. shrq $4,%r8
  108. movq 16+0-128(%rsi),%r9
  109. shlb $4,%dl
  110. movq 16+8-128(%rsi),%rbx
  111. shlq $60,%r10
  112. movb %dl,0(%rsp)
  113. orq %r10,%rax
  114. movb %bl,%dl
  115. shrq $4,%rbx
  116. movq %r9,%r10
  117. shrq $4,%r9
  118. movq %r8,0(%rbp)
  119. movq 32+0-128(%rsi),%r8
  120. shlb $4,%dl
  121. movq %rax,0-128(%rbp)
  122. movq 32+8-128(%rsi),%rax
  123. shlq $60,%r10
  124. movb %dl,1(%rsp)
  125. orq %r10,%rbx
  126. movb %al,%dl
  127. shrq $4,%rax
  128. movq %r8,%r10
  129. shrq $4,%r8
  130. movq %r9,8(%rbp)
  131. movq 48+0-128(%rsi),%r9
  132. shlb $4,%dl
  133. movq %rbx,8-128(%rbp)
  134. movq 48+8-128(%rsi),%rbx
  135. shlq $60,%r10
  136. movb %dl,2(%rsp)
  137. orq %r10,%rax
  138. movb %bl,%dl
  139. shrq $4,%rbx
  140. movq %r9,%r10
  141. shrq $4,%r9
  142. movq %r8,16(%rbp)
  143. movq 64+0-128(%rsi),%r8
  144. shlb $4,%dl
  145. movq %rax,16-128(%rbp)
  146. movq 64+8-128(%rsi),%rax
  147. shlq $60,%r10
  148. movb %dl,3(%rsp)
  149. orq %r10,%rbx
  150. movb %al,%dl
  151. shrq $4,%rax
  152. movq %r8,%r10
  153. shrq $4,%r8
  154. movq %r9,24(%rbp)
  155. movq 80+0-128(%rsi),%r9
  156. shlb $4,%dl
  157. movq %rbx,24-128(%rbp)
  158. movq 80+8-128(%rsi),%rbx
  159. shlq $60,%r10
  160. movb %dl,4(%rsp)
  161. orq %r10,%rax
  162. movb %bl,%dl
  163. shrq $4,%rbx
  164. movq %r9,%r10
  165. shrq $4,%r9
  166. movq %r8,32(%rbp)
  167. movq 96+0-128(%rsi),%r8
  168. shlb $4,%dl
  169. movq %rax,32-128(%rbp)
  170. movq 96+8-128(%rsi),%rax
  171. shlq $60,%r10
  172. movb %dl,5(%rsp)
  173. orq %r10,%rbx
  174. movb %al,%dl
  175. shrq $4,%rax
  176. movq %r8,%r10
  177. shrq $4,%r8
  178. movq %r9,40(%rbp)
  179. movq 112+0-128(%rsi),%r9
  180. shlb $4,%dl
  181. movq %rbx,40-128(%rbp)
  182. movq 112+8-128(%rsi),%rbx
  183. shlq $60,%r10
  184. movb %dl,6(%rsp)
  185. orq %r10,%rax
  186. movb %bl,%dl
  187. shrq $4,%rbx
  188. movq %r9,%r10
  189. shrq $4,%r9
  190. movq %r8,48(%rbp)
  191. movq 128+0-128(%rsi),%r8
  192. shlb $4,%dl
  193. movq %rax,48-128(%rbp)
  194. movq 128+8-128(%rsi),%rax
  195. shlq $60,%r10
  196. movb %dl,7(%rsp)
  197. orq %r10,%rbx
  198. movb %al,%dl
  199. shrq $4,%rax
  200. movq %r8,%r10
  201. shrq $4,%r8
  202. movq %r9,56(%rbp)
  203. movq 144+0-128(%rsi),%r9
  204. shlb $4,%dl
  205. movq %rbx,56-128(%rbp)
  206. movq 144+8-128(%rsi),%rbx
  207. shlq $60,%r10
  208. movb %dl,8(%rsp)
  209. orq %r10,%rax
  210. movb %bl,%dl
  211. shrq $4,%rbx
  212. movq %r9,%r10
  213. shrq $4,%r9
  214. movq %r8,64(%rbp)
  215. movq 160+0-128(%rsi),%r8
  216. shlb $4,%dl
  217. movq %rax,64-128(%rbp)
  218. movq 160+8-128(%rsi),%rax
  219. shlq $60,%r10
  220. movb %dl,9(%rsp)
  221. orq %r10,%rbx
  222. movb %al,%dl
  223. shrq $4,%rax
  224. movq %r8,%r10
  225. shrq $4,%r8
  226. movq %r9,72(%rbp)
  227. movq 176+0-128(%rsi),%r9
  228. shlb $4,%dl
  229. movq %rbx,72-128(%rbp)
  230. movq 176+8-128(%rsi),%rbx
  231. shlq $60,%r10
  232. movb %dl,10(%rsp)
  233. orq %r10,%rax
  234. movb %bl,%dl
  235. shrq $4,%rbx
  236. movq %r9,%r10
  237. shrq $4,%r9
  238. movq %r8,80(%rbp)
  239. movq 192+0-128(%rsi),%r8
  240. shlb $4,%dl
  241. movq %rax,80-128(%rbp)
  242. movq 192+8-128(%rsi),%rax
  243. shlq $60,%r10
  244. movb %dl,11(%rsp)
  245. orq %r10,%rbx
  246. movb %al,%dl
  247. shrq $4,%rax
  248. movq %r8,%r10
  249. shrq $4,%r8
  250. movq %r9,88(%rbp)
  251. movq 208+0-128(%rsi),%r9
  252. shlb $4,%dl
  253. movq %rbx,88-128(%rbp)
  254. movq 208+8-128(%rsi),%rbx
  255. shlq $60,%r10
  256. movb %dl,12(%rsp)
  257. orq %r10,%rax
  258. movb %bl,%dl
  259. shrq $4,%rbx
  260. movq %r9,%r10
  261. shrq $4,%r9
  262. movq %r8,96(%rbp)
  263. movq 224+0-128(%rsi),%r8
  264. shlb $4,%dl
  265. movq %rax,96-128(%rbp)
  266. movq 224+8-128(%rsi),%rax
  267. shlq $60,%r10
  268. movb %dl,13(%rsp)
  269. orq %r10,%rbx
  270. movb %al,%dl
  271. shrq $4,%rax
  272. movq %r8,%r10
  273. shrq $4,%r8
  274. movq %r9,104(%rbp)
  275. movq 240+0-128(%rsi),%r9
  276. shlb $4,%dl
  277. movq %rbx,104-128(%rbp)
  278. movq 240+8-128(%rsi),%rbx
  279. shlq $60,%r10
  280. movb %dl,14(%rsp)
  281. orq %r10,%rax
  282. movb %bl,%dl
  283. shrq $4,%rbx
  284. movq %r9,%r10
  285. shrq $4,%r9
  286. movq %r8,112(%rbp)
  287. shlb $4,%dl
  288. movq %rax,112-128(%rbp)
  289. shlq $60,%r10
  290. movb %dl,15(%rsp)
  291. orq %r10,%rbx
  292. movq %r9,120(%rbp)
  293. movq %rbx,120-128(%rbp)
  294. addq $-128,%rsi
  295. movq 8(%rdi),%r8
  296. movq 0(%rdi),%r9
  297. addq %r14,%r15
  298. leaq L$rem_8bit(%rip),%r11
  299. jmp L$outer_loop
  300. .p2align 4
  301. L$outer_loop:
  302. xorq (%r14),%r9
  303. movq 8(%r14),%rdx
  304. leaq 16(%r14),%r14
  305. xorq %r8,%rdx
  306. movq %r9,(%rdi)
  307. movq %rdx,8(%rdi)
  308. shrq $32,%rdx
  309. xorq %rax,%rax
  310. roll $8,%edx
  311. movb %dl,%al
  312. movzbl %dl,%ebx
  313. shlb $4,%al
  314. shrl $4,%ebx
  315. roll $8,%edx
  316. movq 8(%rsi,%rax,1),%r8
  317. movq (%rsi,%rax,1),%r9
  318. movb %dl,%al
  319. movzbl %dl,%ecx
  320. shlb $4,%al
  321. movzbq (%rsp,%rbx,1),%r12
  322. shrl $4,%ecx
  323. xorq %r8,%r12
  324. movq %r9,%r10
  325. shrq $8,%r8
  326. movzbq %r12b,%r12
  327. shrq $8,%r9
  328. xorq -128(%rbp,%rbx,8),%r8
  329. shlq $56,%r10
  330. xorq (%rbp,%rbx,8),%r9
  331. roll $8,%edx
  332. xorq 8(%rsi,%rax,1),%r8
  333. xorq (%rsi,%rax,1),%r9
  334. movb %dl,%al
  335. xorq %r10,%r8
  336. movzwq (%r11,%r12,2),%r12
  337. movzbl %dl,%ebx
  338. shlb $4,%al
  339. movzbq (%rsp,%rcx,1),%r13
  340. shrl $4,%ebx
  341. shlq $48,%r12
  342. xorq %r8,%r13
  343. movq %r9,%r10
  344. xorq %r12,%r9
  345. shrq $8,%r8
  346. movzbq %r13b,%r13
  347. shrq $8,%r9
  348. xorq -128(%rbp,%rcx,8),%r8
  349. shlq $56,%r10
  350. xorq (%rbp,%rcx,8),%r9
  351. roll $8,%edx
  352. xorq 8(%rsi,%rax,1),%r8
  353. xorq (%rsi,%rax,1),%r9
  354. movb %dl,%al
  355. xorq %r10,%r8
  356. movzwq (%r11,%r13,2),%r13
  357. movzbl %dl,%ecx
  358. shlb $4,%al
  359. movzbq (%rsp,%rbx,1),%r12
  360. shrl $4,%ecx
  361. shlq $48,%r13
  362. xorq %r8,%r12
  363. movq %r9,%r10
  364. xorq %r13,%r9
  365. shrq $8,%r8
  366. movzbq %r12b,%r12
  367. movl 8(%rdi),%edx
  368. shrq $8,%r9
  369. xorq -128(%rbp,%rbx,8),%r8
  370. shlq $56,%r10
  371. xorq (%rbp,%rbx,8),%r9
  372. roll $8,%edx
  373. xorq 8(%rsi,%rax,1),%r8
  374. xorq (%rsi,%rax,1),%r9
  375. movb %dl,%al
  376. xorq %r10,%r8
  377. movzwq (%r11,%r12,2),%r12
  378. movzbl %dl,%ebx
  379. shlb $4,%al
  380. movzbq (%rsp,%rcx,1),%r13
  381. shrl $4,%ebx
  382. shlq $48,%r12
  383. xorq %r8,%r13
  384. movq %r9,%r10
  385. xorq %r12,%r9
  386. shrq $8,%r8
  387. movzbq %r13b,%r13
  388. shrq $8,%r9
  389. xorq -128(%rbp,%rcx,8),%r8
  390. shlq $56,%r10
  391. xorq (%rbp,%rcx,8),%r9
  392. roll $8,%edx
  393. xorq 8(%rsi,%rax,1),%r8
  394. xorq (%rsi,%rax,1),%r9
  395. movb %dl,%al
  396. xorq %r10,%r8
  397. movzwq (%r11,%r13,2),%r13
  398. movzbl %dl,%ecx
  399. shlb $4,%al
  400. movzbq (%rsp,%rbx,1),%r12
  401. shrl $4,%ecx
  402. shlq $48,%r13
  403. xorq %r8,%r12
  404. movq %r9,%r10
  405. xorq %r13,%r9
  406. shrq $8,%r8
  407. movzbq %r12b,%r12
  408. shrq $8,%r9
  409. xorq -128(%rbp,%rbx,8),%r8
  410. shlq $56,%r10
  411. xorq (%rbp,%rbx,8),%r9
  412. roll $8,%edx
  413. xorq 8(%rsi,%rax,1),%r8
  414. xorq (%rsi,%rax,1),%r9
  415. movb %dl,%al
  416. xorq %r10,%r8
  417. movzwq (%r11,%r12,2),%r12
  418. movzbl %dl,%ebx
  419. shlb $4,%al
  420. movzbq (%rsp,%rcx,1),%r13
  421. shrl $4,%ebx
  422. shlq $48,%r12
  423. xorq %r8,%r13
  424. movq %r9,%r10
  425. xorq %r12,%r9
  426. shrq $8,%r8
  427. movzbq %r13b,%r13
  428. shrq $8,%r9
  429. xorq -128(%rbp,%rcx,8),%r8
  430. shlq $56,%r10
  431. xorq (%rbp,%rcx,8),%r9
  432. roll $8,%edx
  433. xorq 8(%rsi,%rax,1),%r8
  434. xorq (%rsi,%rax,1),%r9
  435. movb %dl,%al
  436. xorq %r10,%r8
  437. movzwq (%r11,%r13,2),%r13
  438. movzbl %dl,%ecx
  439. shlb $4,%al
  440. movzbq (%rsp,%rbx,1),%r12
  441. shrl $4,%ecx
  442. shlq $48,%r13
  443. xorq %r8,%r12
  444. movq %r9,%r10
  445. xorq %r13,%r9
  446. shrq $8,%r8
  447. movzbq %r12b,%r12
  448. movl 4(%rdi),%edx
  449. shrq $8,%r9
  450. xorq -128(%rbp,%rbx,8),%r8
  451. shlq $56,%r10
  452. xorq (%rbp,%rbx,8),%r9
  453. roll $8,%edx
  454. xorq 8(%rsi,%rax,1),%r8
  455. xorq (%rsi,%rax,1),%r9
  456. movb %dl,%al
  457. xorq %r10,%r8
  458. movzwq (%r11,%r12,2),%r12
  459. movzbl %dl,%ebx
  460. shlb $4,%al
  461. movzbq (%rsp,%rcx,1),%r13
  462. shrl $4,%ebx
  463. shlq $48,%r12
  464. xorq %r8,%r13
  465. movq %r9,%r10
  466. xorq %r12,%r9
  467. shrq $8,%r8
  468. movzbq %r13b,%r13
  469. shrq $8,%r9
  470. xorq -128(%rbp,%rcx,8),%r8
  471. shlq $56,%r10
  472. xorq (%rbp,%rcx,8),%r9
  473. roll $8,%edx
  474. xorq 8(%rsi,%rax,1),%r8
  475. xorq (%rsi,%rax,1),%r9
  476. movb %dl,%al
  477. xorq %r10,%r8
  478. movzwq (%r11,%r13,2),%r13
  479. movzbl %dl,%ecx
  480. shlb $4,%al
  481. movzbq (%rsp,%rbx,1),%r12
  482. shrl $4,%ecx
  483. shlq $48,%r13
  484. xorq %r8,%r12
  485. movq %r9,%r10
  486. xorq %r13,%r9
  487. shrq $8,%r8
  488. movzbq %r12b,%r12
  489. shrq $8,%r9
  490. xorq -128(%rbp,%rbx,8),%r8
  491. shlq $56,%r10
  492. xorq (%rbp,%rbx,8),%r9
  493. roll $8,%edx
  494. xorq 8(%rsi,%rax,1),%r8
  495. xorq (%rsi,%rax,1),%r9
  496. movb %dl,%al
  497. xorq %r10,%r8
  498. movzwq (%r11,%r12,2),%r12
  499. movzbl %dl,%ebx
  500. shlb $4,%al
  501. movzbq (%rsp,%rcx,1),%r13
  502. shrl $4,%ebx
  503. shlq $48,%r12
  504. xorq %r8,%r13
  505. movq %r9,%r10
  506. xorq %r12,%r9
  507. shrq $8,%r8
  508. movzbq %r13b,%r13
  509. shrq $8,%r9
  510. xorq -128(%rbp,%rcx,8),%r8
  511. shlq $56,%r10
  512. xorq (%rbp,%rcx,8),%r9
  513. roll $8,%edx
  514. xorq 8(%rsi,%rax,1),%r8
  515. xorq (%rsi,%rax,1),%r9
  516. movb %dl,%al
  517. xorq %r10,%r8
  518. movzwq (%r11,%r13,2),%r13
  519. movzbl %dl,%ecx
  520. shlb $4,%al
  521. movzbq (%rsp,%rbx,1),%r12
  522. shrl $4,%ecx
  523. shlq $48,%r13
  524. xorq %r8,%r12
  525. movq %r9,%r10
  526. xorq %r13,%r9
  527. shrq $8,%r8
  528. movzbq %r12b,%r12
  529. movl 0(%rdi),%edx
  530. shrq $8,%r9
  531. xorq -128(%rbp,%rbx,8),%r8
  532. shlq $56,%r10
  533. xorq (%rbp,%rbx,8),%r9
  534. roll $8,%edx
  535. xorq 8(%rsi,%rax,1),%r8
  536. xorq (%rsi,%rax,1),%r9
  537. movb %dl,%al
  538. xorq %r10,%r8
  539. movzwq (%r11,%r12,2),%r12
  540. movzbl %dl,%ebx
  541. shlb $4,%al
  542. movzbq (%rsp,%rcx,1),%r13
  543. shrl $4,%ebx
  544. shlq $48,%r12
  545. xorq %r8,%r13
  546. movq %r9,%r10
  547. xorq %r12,%r9
  548. shrq $8,%r8
  549. movzbq %r13b,%r13
  550. shrq $8,%r9
  551. xorq -128(%rbp,%rcx,8),%r8
  552. shlq $56,%r10
  553. xorq (%rbp,%rcx,8),%r9
  554. roll $8,%edx
  555. xorq 8(%rsi,%rax,1),%r8
  556. xorq (%rsi,%rax,1),%r9
  557. movb %dl,%al
  558. xorq %r10,%r8
  559. movzwq (%r11,%r13,2),%r13
  560. movzbl %dl,%ecx
  561. shlb $4,%al
  562. movzbq (%rsp,%rbx,1),%r12
  563. shrl $4,%ecx
  564. shlq $48,%r13
  565. xorq %r8,%r12
  566. movq %r9,%r10
  567. xorq %r13,%r9
  568. shrq $8,%r8
  569. movzbq %r12b,%r12
  570. shrq $8,%r9
  571. xorq -128(%rbp,%rbx,8),%r8
  572. shlq $56,%r10
  573. xorq (%rbp,%rbx,8),%r9
  574. roll $8,%edx
  575. xorq 8(%rsi,%rax,1),%r8
  576. xorq (%rsi,%rax,1),%r9
  577. movb %dl,%al
  578. xorq %r10,%r8
  579. movzwq (%r11,%r12,2),%r12
  580. movzbl %dl,%ebx
  581. shlb $4,%al
  582. movzbq (%rsp,%rcx,1),%r13
  583. shrl $4,%ebx
  584. shlq $48,%r12
  585. xorq %r8,%r13
  586. movq %r9,%r10
  587. xorq %r12,%r9
  588. shrq $8,%r8
  589. movzbq %r13b,%r13
  590. shrq $8,%r9
  591. xorq -128(%rbp,%rcx,8),%r8
  592. shlq $56,%r10
  593. xorq (%rbp,%rcx,8),%r9
  594. roll $8,%edx
  595. xorq 8(%rsi,%rax,1),%r8
  596. xorq (%rsi,%rax,1),%r9
  597. movb %dl,%al
  598. xorq %r10,%r8
  599. movzwq (%r11,%r13,2),%r13
  600. movzbl %dl,%ecx
  601. shlb $4,%al
  602. movzbq (%rsp,%rbx,1),%r12
  603. andl $240,%ecx
  604. shlq $48,%r13
  605. xorq %r8,%r12
  606. movq %r9,%r10
  607. xorq %r13,%r9
  608. shrq $8,%r8
  609. movzbq %r12b,%r12
  610. movl -4(%rdi),%edx
  611. shrq $8,%r9
  612. xorq -128(%rbp,%rbx,8),%r8
  613. shlq $56,%r10
  614. xorq (%rbp,%rbx,8),%r9
  615. movzwq (%r11,%r12,2),%r12
  616. xorq 8(%rsi,%rax,1),%r8
  617. xorq (%rsi,%rax,1),%r9
  618. shlq $48,%r12
  619. xorq %r10,%r8
  620. xorq %r12,%r9
  621. movzbq %r8b,%r13
  622. shrq $4,%r8
  623. movq %r9,%r10
  624. shlb $4,%r13b
  625. shrq $4,%r9
  626. xorq 8(%rsi,%rcx,1),%r8
  627. movzwq (%r11,%r13,2),%r13
  628. shlq $60,%r10
  629. xorq (%rsi,%rcx,1),%r9
  630. xorq %r10,%r8
  631. shlq $48,%r13
  632. bswapq %r8
  633. xorq %r13,%r9
  634. bswapq %r9
  635. cmpq %r15,%r14
  636. jb L$outer_loop
  637. movq %r8,8(%rdi)
  638. movq %r9,(%rdi)
  639. leaq 280+48(%rsp),%rsi
  640. movq -48(%rsi),%r15
  641. movq -40(%rsi),%r14
  642. movq -32(%rsi),%r13
  643. movq -24(%rsi),%r12
  644. movq -16(%rsi),%rbp
  645. movq -8(%rsi),%rbx
  646. leaq 0(%rsi),%rsp
  647. L$ghash_epilogue:
  648. .byte 0xf3,0xc3
  649. .globl _gcm_init_clmul
  650. .p2align 4
  651. _gcm_init_clmul:
  652. L$_init_clmul:
  653. movdqu (%rsi),%xmm2
  654. pshufd $78,%xmm2,%xmm2
  655. pshufd $255,%xmm2,%xmm4
  656. movdqa %xmm2,%xmm3
  657. psllq $1,%xmm2
  658. pxor %xmm5,%xmm5
  659. psrlq $63,%xmm3
  660. pcmpgtd %xmm4,%xmm5
  661. pslldq $8,%xmm3
  662. por %xmm3,%xmm2
  663. pand L$0x1c2_polynomial(%rip),%xmm5
  664. pxor %xmm5,%xmm2
  665. pshufd $78,%xmm2,%xmm6
  666. movdqa %xmm2,%xmm0
  667. pxor %xmm2,%xmm6
  668. movdqa %xmm0,%xmm1
  669. pshufd $78,%xmm0,%xmm3
  670. pxor %xmm0,%xmm3
  671. .byte 102,15,58,68,194,0
  672. .byte 102,15,58,68,202,17
  673. .byte 102,15,58,68,222,0
  674. pxor %xmm0,%xmm3
  675. pxor %xmm1,%xmm3
  676. movdqa %xmm3,%xmm4
  677. psrldq $8,%xmm3
  678. pslldq $8,%xmm4
  679. pxor %xmm3,%xmm1
  680. pxor %xmm4,%xmm0
  681. movdqa %xmm0,%xmm4
  682. movdqa %xmm0,%xmm3
  683. psllq $5,%xmm0
  684. pxor %xmm0,%xmm3
  685. psllq $1,%xmm0
  686. pxor %xmm3,%xmm0
  687. psllq $57,%xmm0
  688. movdqa %xmm0,%xmm3
  689. pslldq $8,%xmm0
  690. psrldq $8,%xmm3
  691. pxor %xmm4,%xmm0
  692. pxor %xmm3,%xmm1
  693. movdqa %xmm0,%xmm4
  694. psrlq $1,%xmm0
  695. pxor %xmm4,%xmm1
  696. pxor %xmm0,%xmm4
  697. psrlq $5,%xmm0
  698. pxor %xmm4,%xmm0
  699. psrlq $1,%xmm0
  700. pxor %xmm1,%xmm0
  701. pshufd $78,%xmm2,%xmm3
  702. pshufd $78,%xmm0,%xmm4
  703. pxor %xmm2,%xmm3
  704. movdqu %xmm2,0(%rdi)
  705. pxor %xmm0,%xmm4
  706. movdqu %xmm0,16(%rdi)
  707. .byte 102,15,58,15,227,8
  708. movdqu %xmm4,32(%rdi)
  709. movdqa %xmm0,%xmm1
  710. pshufd $78,%xmm0,%xmm3
  711. pxor %xmm0,%xmm3
  712. .byte 102,15,58,68,194,0
  713. .byte 102,15,58,68,202,17
  714. .byte 102,15,58,68,222,0
  715. pxor %xmm0,%xmm3
  716. pxor %xmm1,%xmm3
  717. movdqa %xmm3,%xmm4
  718. psrldq $8,%xmm3
  719. pslldq $8,%xmm4
  720. pxor %xmm3,%xmm1
  721. pxor %xmm4,%xmm0
  722. movdqa %xmm0,%xmm4
  723. movdqa %xmm0,%xmm3
  724. psllq $5,%xmm0
  725. pxor %xmm0,%xmm3
  726. psllq $1,%xmm0
  727. pxor %xmm3,%xmm0
  728. psllq $57,%xmm0
  729. movdqa %xmm0,%xmm3
  730. pslldq $8,%xmm0
  731. psrldq $8,%xmm3
  732. pxor %xmm4,%xmm0
  733. pxor %xmm3,%xmm1
  734. movdqa %xmm0,%xmm4
  735. psrlq $1,%xmm0
  736. pxor %xmm4,%xmm1
  737. pxor %xmm0,%xmm4
  738. psrlq $5,%xmm0
  739. pxor %xmm4,%xmm0
  740. psrlq $1,%xmm0
  741. pxor %xmm1,%xmm0
  742. movdqa %xmm0,%xmm5
  743. movdqa %xmm0,%xmm1
  744. pshufd $78,%xmm0,%xmm3
  745. pxor %xmm0,%xmm3
  746. .byte 102,15,58,68,194,0
  747. .byte 102,15,58,68,202,17
  748. .byte 102,15,58,68,222,0
  749. pxor %xmm0,%xmm3
  750. pxor %xmm1,%xmm3
  751. movdqa %xmm3,%xmm4
  752. psrldq $8,%xmm3
  753. pslldq $8,%xmm4
  754. pxor %xmm3,%xmm1
  755. pxor %xmm4,%xmm0
  756. movdqa %xmm0,%xmm4
  757. movdqa %xmm0,%xmm3
  758. psllq $5,%xmm0
  759. pxor %xmm0,%xmm3
  760. psllq $1,%xmm0
  761. pxor %xmm3,%xmm0
  762. psllq $57,%xmm0
  763. movdqa %xmm0,%xmm3
  764. pslldq $8,%xmm0
  765. psrldq $8,%xmm3
  766. pxor %xmm4,%xmm0
  767. pxor %xmm3,%xmm1
  768. movdqa %xmm0,%xmm4
  769. psrlq $1,%xmm0
  770. pxor %xmm4,%xmm1
  771. pxor %xmm0,%xmm4
  772. psrlq $5,%xmm0
  773. pxor %xmm4,%xmm0
  774. psrlq $1,%xmm0
  775. pxor %xmm1,%xmm0
  776. pshufd $78,%xmm5,%xmm3
  777. pshufd $78,%xmm0,%xmm4
  778. pxor %xmm5,%xmm3
  779. movdqu %xmm5,48(%rdi)
  780. pxor %xmm0,%xmm4
  781. movdqu %xmm0,64(%rdi)
  782. .byte 102,15,58,15,227,8
  783. movdqu %xmm4,80(%rdi)
  784. .byte 0xf3,0xc3
  785. .globl _gcm_gmult_clmul
  786. .p2align 4
  787. _gcm_gmult_clmul:
  788. L$_gmult_clmul:
  789. movdqu (%rdi),%xmm0
  790. movdqa L$bswap_mask(%rip),%xmm5
  791. movdqu (%rsi),%xmm2
  792. movdqu 32(%rsi),%xmm4
  793. .byte 102,15,56,0,197
  794. movdqa %xmm0,%xmm1
  795. pshufd $78,%xmm0,%xmm3
  796. pxor %xmm0,%xmm3
  797. .byte 102,15,58,68,194,0
  798. .byte 102,15,58,68,202,17
  799. .byte 102,15,58,68,220,0
  800. pxor %xmm0,%xmm3
  801. pxor %xmm1,%xmm3
  802. movdqa %xmm3,%xmm4
  803. psrldq $8,%xmm3
  804. pslldq $8,%xmm4
  805. pxor %xmm3,%xmm1
  806. pxor %xmm4,%xmm0
  807. movdqa %xmm0,%xmm4
  808. movdqa %xmm0,%xmm3
  809. psllq $5,%xmm0
  810. pxor %xmm0,%xmm3
  811. psllq $1,%xmm0
  812. pxor %xmm3,%xmm0
  813. psllq $57,%xmm0
  814. movdqa %xmm0,%xmm3
  815. pslldq $8,%xmm0
  816. psrldq $8,%xmm3
  817. pxor %xmm4,%xmm0
  818. pxor %xmm3,%xmm1
  819. movdqa %xmm0,%xmm4
  820. psrlq $1,%xmm0
  821. pxor %xmm4,%xmm1
  822. pxor %xmm0,%xmm4
  823. psrlq $5,%xmm0
  824. pxor %xmm4,%xmm0
  825. psrlq $1,%xmm0
  826. pxor %xmm1,%xmm0
  827. .byte 102,15,56,0,197
  828. movdqu %xmm0,(%rdi)
  829. .byte 0xf3,0xc3
  830. .globl _gcm_ghash_clmul
  831. .p2align 5
  832. _gcm_ghash_clmul:
  833. L$_ghash_clmul:
  834. movdqa L$bswap_mask(%rip),%xmm10
  835. movdqu (%rdi),%xmm0
  836. movdqu (%rsi),%xmm2
  837. movdqu 32(%rsi),%xmm7
  838. .byte 102,65,15,56,0,194
  839. subq $0x10,%rcx
  840. jz L$odd_tail
  841. movdqu 16(%rsi),%xmm6
  842. movl _OPENSSL_ia32cap_P+4(%rip),%eax
  843. cmpq $0x30,%rcx
  844. jb L$skip4x
  845. andl $71303168,%eax
  846. cmpl $4194304,%eax
  847. je L$skip4x
  848. subq $0x30,%rcx
  849. movq $0xA040608020C0E000,%rax
  850. movdqu 48(%rsi),%xmm14
  851. movdqu 64(%rsi),%xmm15
  852. movdqu 48(%rdx),%xmm3
  853. movdqu 32(%rdx),%xmm11
  854. .byte 102,65,15,56,0,218
  855. .byte 102,69,15,56,0,218
  856. movdqa %xmm3,%xmm5
  857. pshufd $78,%xmm3,%xmm4
  858. pxor %xmm3,%xmm4
  859. .byte 102,15,58,68,218,0
  860. .byte 102,15,58,68,234,17
  861. .byte 102,15,58,68,231,0
  862. movdqa %xmm11,%xmm13
  863. pshufd $78,%xmm11,%xmm12
  864. pxor %xmm11,%xmm12
  865. .byte 102,68,15,58,68,222,0
  866. .byte 102,68,15,58,68,238,17
  867. .byte 102,68,15,58,68,231,16
  868. xorps %xmm11,%xmm3
  869. xorps %xmm13,%xmm5
  870. movups 80(%rsi),%xmm7
  871. xorps %xmm12,%xmm4
  872. movdqu 16(%rdx),%xmm11
  873. movdqu 0(%rdx),%xmm8
  874. .byte 102,69,15,56,0,218
  875. .byte 102,69,15,56,0,194
  876. movdqa %xmm11,%xmm13
  877. pshufd $78,%xmm11,%xmm12
  878. pxor %xmm8,%xmm0
  879. pxor %xmm11,%xmm12
  880. .byte 102,69,15,58,68,222,0
  881. movdqa %xmm0,%xmm1
  882. pshufd $78,%xmm0,%xmm8
  883. pxor %xmm0,%xmm8
  884. .byte 102,69,15,58,68,238,17
  885. .byte 102,68,15,58,68,231,0
  886. xorps %xmm11,%xmm3
  887. xorps %xmm13,%xmm5
  888. leaq 64(%rdx),%rdx
  889. subq $0x40,%rcx
  890. jc L$tail4x
  891. jmp L$mod4_loop
  892. .p2align 5
  893. L$mod4_loop:
  894. .byte 102,65,15,58,68,199,0
  895. xorps %xmm12,%xmm4
  896. movdqu 48(%rdx),%xmm11
  897. .byte 102,69,15,56,0,218
  898. .byte 102,65,15,58,68,207,17
  899. xorps %xmm3,%xmm0
  900. movdqu 32(%rdx),%xmm3
  901. movdqa %xmm11,%xmm13
  902. .byte 102,68,15,58,68,199,16
  903. pshufd $78,%xmm11,%xmm12
  904. xorps %xmm5,%xmm1
  905. pxor %xmm11,%xmm12
  906. .byte 102,65,15,56,0,218
  907. movups 32(%rsi),%xmm7
  908. xorps %xmm4,%xmm8
  909. .byte 102,68,15,58,68,218,0
  910. pshufd $78,%xmm3,%xmm4
  911. pxor %xmm0,%xmm8
  912. movdqa %xmm3,%xmm5
  913. pxor %xmm1,%xmm8
  914. pxor %xmm3,%xmm4
  915. movdqa %xmm8,%xmm9
  916. .byte 102,68,15,58,68,234,17
  917. pslldq $8,%xmm8
  918. psrldq $8,%xmm9
  919. pxor %xmm8,%xmm0
  920. movdqa L$7_mask(%rip),%xmm8
  921. pxor %xmm9,%xmm1
  922. .byte 102,76,15,110,200
  923. pand %xmm0,%xmm8
  924. .byte 102,69,15,56,0,200
  925. pxor %xmm0,%xmm9
  926. .byte 102,68,15,58,68,231,0
  927. psllq $57,%xmm9
  928. movdqa %xmm9,%xmm8
  929. pslldq $8,%xmm9
  930. .byte 102,15,58,68,222,0
  931. psrldq $8,%xmm8
  932. pxor %xmm9,%xmm0
  933. pxor %xmm8,%xmm1
  934. movdqu 0(%rdx),%xmm8
  935. movdqa %xmm0,%xmm9
  936. psrlq $1,%xmm0
  937. .byte 102,15,58,68,238,17
  938. xorps %xmm11,%xmm3
  939. movdqu 16(%rdx),%xmm11
  940. .byte 102,69,15,56,0,218
  941. .byte 102,15,58,68,231,16
  942. xorps %xmm13,%xmm5
  943. movups 80(%rsi),%xmm7
  944. .byte 102,69,15,56,0,194
  945. pxor %xmm9,%xmm1
  946. pxor %xmm0,%xmm9
  947. psrlq $5,%xmm0
  948. movdqa %xmm11,%xmm13
  949. pxor %xmm12,%xmm4
  950. pshufd $78,%xmm11,%xmm12
  951. pxor %xmm9,%xmm0
  952. pxor %xmm8,%xmm1
  953. pxor %xmm11,%xmm12
  954. .byte 102,69,15,58,68,222,0
  955. psrlq $1,%xmm0
  956. pxor %xmm1,%xmm0
  957. movdqa %xmm0,%xmm1
  958. .byte 102,69,15,58,68,238,17
  959. xorps %xmm11,%xmm3
  960. pshufd $78,%xmm0,%xmm8
  961. pxor %xmm0,%xmm8
  962. .byte 102,68,15,58,68,231,0
  963. xorps %xmm13,%xmm5
  964. leaq 64(%rdx),%rdx
  965. subq $0x40,%rcx
  966. jnc L$mod4_loop
  967. L$tail4x:
  968. .byte 102,65,15,58,68,199,0
  969. .byte 102,65,15,58,68,207,17
  970. .byte 102,68,15,58,68,199,16
  971. xorps %xmm12,%xmm4
  972. xorps %xmm3,%xmm0
  973. xorps %xmm5,%xmm1
  974. pxor %xmm0,%xmm1
  975. pxor %xmm4,%xmm8
  976. pxor %xmm1,%xmm8
  977. pxor %xmm0,%xmm1
  978. movdqa %xmm8,%xmm9
  979. psrldq $8,%xmm8
  980. pslldq $8,%xmm9
  981. pxor %xmm8,%xmm1
  982. pxor %xmm9,%xmm0
  983. movdqa %xmm0,%xmm4
  984. movdqa %xmm0,%xmm3
  985. psllq $5,%xmm0
  986. pxor %xmm0,%xmm3
  987. psllq $1,%xmm0
  988. pxor %xmm3,%xmm0
  989. psllq $57,%xmm0
  990. movdqa %xmm0,%xmm3
  991. pslldq $8,%xmm0
  992. psrldq $8,%xmm3
  993. pxor %xmm4,%xmm0
  994. pxor %xmm3,%xmm1
  995. movdqa %xmm0,%xmm4
  996. psrlq $1,%xmm0
  997. pxor %xmm4,%xmm1
  998. pxor %xmm0,%xmm4
  999. psrlq $5,%xmm0
  1000. pxor %xmm4,%xmm0
  1001. psrlq $1,%xmm0
  1002. pxor %xmm1,%xmm0
  1003. addq $0x40,%rcx
  1004. jz L$done
  1005. movdqu 32(%rsi),%xmm7
  1006. subq $0x10,%rcx
  1007. jz L$odd_tail
  1008. L$skip4x:
  1009. movdqu (%rdx),%xmm8
  1010. movdqu 16(%rdx),%xmm3
  1011. .byte 102,69,15,56,0,194
  1012. .byte 102,65,15,56,0,218
  1013. pxor %xmm8,%xmm0
  1014. movdqa %xmm3,%xmm5
  1015. pshufd $78,%xmm3,%xmm4
  1016. pxor %xmm3,%xmm4
  1017. .byte 102,15,58,68,218,0
  1018. .byte 102,15,58,68,234,17
  1019. .byte 102,15,58,68,231,0
  1020. leaq 32(%rdx),%rdx
  1021. nop
  1022. subq $0x20,%rcx
  1023. jbe L$even_tail
  1024. nop
  1025. jmp L$mod_loop
  1026. .p2align 5
  1027. L$mod_loop:
  1028. movdqa %xmm0,%xmm1
  1029. movdqa %xmm4,%xmm8
  1030. pshufd $78,%xmm0,%xmm4
  1031. pxor %xmm0,%xmm4
  1032. .byte 102,15,58,68,198,0
  1033. .byte 102,15,58,68,206,17
  1034. .byte 102,15,58,68,231,16
  1035. pxor %xmm3,%xmm0
  1036. pxor %xmm5,%xmm1
  1037. movdqu (%rdx),%xmm9
  1038. pxor %xmm0,%xmm8
  1039. .byte 102,69,15,56,0,202
  1040. movdqu 16(%rdx),%xmm3
  1041. pxor %xmm1,%xmm8
  1042. pxor %xmm9,%xmm1
  1043. pxor %xmm8,%xmm4
  1044. .byte 102,65,15,56,0,218
  1045. movdqa %xmm4,%xmm8
  1046. psrldq $8,%xmm8
  1047. pslldq $8,%xmm4
  1048. pxor %xmm8,%xmm1
  1049. pxor %xmm4,%xmm0
  1050. movdqa %xmm3,%xmm5
  1051. movdqa %xmm0,%xmm9
  1052. movdqa %xmm0,%xmm8
  1053. psllq $5,%xmm0
  1054. pxor %xmm0,%xmm8
  1055. .byte 102,15,58,68,218,0
  1056. psllq $1,%xmm0
  1057. pxor %xmm8,%xmm0
  1058. psllq $57,%xmm0
  1059. movdqa %xmm0,%xmm8
  1060. pslldq $8,%xmm0
  1061. psrldq $8,%xmm8
  1062. pxor %xmm9,%xmm0
  1063. pshufd $78,%xmm5,%xmm4
  1064. pxor %xmm8,%xmm1
  1065. pxor %xmm5,%xmm4
  1066. movdqa %xmm0,%xmm9
  1067. psrlq $1,%xmm0
  1068. .byte 102,15,58,68,234,17
  1069. pxor %xmm9,%xmm1
  1070. pxor %xmm0,%xmm9
  1071. psrlq $5,%xmm0
  1072. pxor %xmm9,%xmm0
  1073. leaq 32(%rdx),%rdx
  1074. psrlq $1,%xmm0
  1075. .byte 102,15,58,68,231,0
  1076. pxor %xmm1,%xmm0
  1077. subq $0x20,%rcx
  1078. ja L$mod_loop
  1079. L$even_tail:
  1080. movdqa %xmm0,%xmm1
  1081. movdqa %xmm4,%xmm8
  1082. pshufd $78,%xmm0,%xmm4
  1083. pxor %xmm0,%xmm4
  1084. .byte 102,15,58,68,198,0
  1085. .byte 102,15,58,68,206,17
  1086. .byte 102,15,58,68,231,16
  1087. pxor %xmm3,%xmm0
  1088. pxor %xmm5,%xmm1
  1089. pxor %xmm0,%xmm8
  1090. pxor %xmm1,%xmm8
  1091. pxor %xmm8,%xmm4
  1092. movdqa %xmm4,%xmm8
  1093. psrldq $8,%xmm8
  1094. pslldq $8,%xmm4
  1095. pxor %xmm8,%xmm1
  1096. pxor %xmm4,%xmm0
  1097. movdqa %xmm0,%xmm4
  1098. movdqa %xmm0,%xmm3
  1099. psllq $5,%xmm0
  1100. pxor %xmm0,%xmm3
  1101. psllq $1,%xmm0
  1102. pxor %xmm3,%xmm0
  1103. psllq $57,%xmm0
  1104. movdqa %xmm0,%xmm3
  1105. pslldq $8,%xmm0
  1106. psrldq $8,%xmm3
  1107. pxor %xmm4,%xmm0
  1108. pxor %xmm3,%xmm1
  1109. movdqa %xmm0,%xmm4
  1110. psrlq $1,%xmm0
  1111. pxor %xmm4,%xmm1
  1112. pxor %xmm0,%xmm4
  1113. psrlq $5,%xmm0
  1114. pxor %xmm4,%xmm0
  1115. psrlq $1,%xmm0
  1116. pxor %xmm1,%xmm0
  1117. testq %rcx,%rcx
  1118. jnz L$done
  1119. L$odd_tail:
  1120. movdqu (%rdx),%xmm8
  1121. .byte 102,69,15,56,0,194
  1122. pxor %xmm8,%xmm0
  1123. movdqa %xmm0,%xmm1
  1124. pshufd $78,%xmm0,%xmm3
  1125. pxor %xmm0,%xmm3
  1126. .byte 102,15,58,68,194,0
  1127. .byte 102,15,58,68,202,17
  1128. .byte 102,15,58,68,223,0
  1129. pxor %xmm0,%xmm3
  1130. pxor %xmm1,%xmm3
  1131. movdqa %xmm3,%xmm4
  1132. psrldq $8,%xmm3
  1133. pslldq $8,%xmm4
  1134. pxor %xmm3,%xmm1
  1135. pxor %xmm4,%xmm0
  1136. movdqa %xmm0,%xmm4
  1137. movdqa %xmm0,%xmm3
  1138. psllq $5,%xmm0
  1139. pxor %xmm0,%xmm3
  1140. psllq $1,%xmm0
  1141. pxor %xmm3,%xmm0
  1142. psllq $57,%xmm0
  1143. movdqa %xmm0,%xmm3
  1144. pslldq $8,%xmm0
  1145. psrldq $8,%xmm3
  1146. pxor %xmm4,%xmm0
  1147. pxor %xmm3,%xmm1
  1148. movdqa %xmm0,%xmm4
  1149. psrlq $1,%xmm0
  1150. pxor %xmm4,%xmm1
  1151. pxor %xmm0,%xmm4
  1152. psrlq $5,%xmm0
  1153. pxor %xmm4,%xmm0
  1154. psrlq $1,%xmm0
  1155. pxor %xmm1,%xmm0
  1156. L$done:
  1157. .byte 102,65,15,56,0,194
  1158. movdqu %xmm0,(%rdi)
  1159. .byte 0xf3,0xc3
.globl _gcm_init_avx
.p2align 5
# gcm_init_avx: precompute a table of powers of the GHASH key H.
#   In:  %rsi = pointer to raw 128-bit hash subkey H
#        %rdi = output table (written in 48-byte strides: H^i, H^(i+1),
#               then the "Karatsuba" hi^lo halves packed at -16)
# NOTE(review): register roles inferred from the visible loads/stores;
# matches the usual gcm_init(Htable, H) calling pattern — confirm against caller.
_gcm_init_avx:
vzeroupper
vmovdqu (%rsi),%xmm2
vpshufd $78,%xmm2,%xmm2          # swap the two 64-bit halves of H
# Compute H<<1 mod the GHASH polynomial (the 0x1c2 reduction constant):
vpshufd $255,%xmm2,%xmm4         # broadcast top dword to test the MSB
vpsrlq $63,%xmm2,%xmm3           # bits carried between the 64-bit lanes
vpsllq $1,%xmm2,%xmm2
vpxor %xmm5,%xmm5,%xmm5
vpcmpgtd %xmm4,%xmm5,%xmm5       # all-ones mask iff H's MSB was set
vpslldq $8,%xmm3,%xmm3
vpor %xmm3,%xmm2,%xmm2           # H<<1 with the cross-lane carry folded in
vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
vpxor %xmm5,%xmm2,%xmm2          # conditional reduction: xmm2 = H<<1 mod P
vpunpckhqdq %xmm2,%xmm2,%xmm6
vmovdqa %xmm2,%xmm0
vpxor %xmm2,%xmm6,%xmm6          # xmm6 = (H.hi ^ H.lo), reused for Karatsuba
movq $4,%r10                     # 4 iterations x 2 powers per pass = 8 powers
jmp L$init_start_avx
.p2align 5
L$init_loop_avx:
# Store the packed hi^lo halves of the previous pair of powers.
vpalignr $8,%xmm3,%xmm4,%xmm5
vmovdqu %xmm5,-16(%rdi)
# One GF(2^128) multiply by H (Karatsuba: 0x11 = hi*hi, 0x00 = lo*lo,
# third product on the xor-ed halves), producing the next power in xmm0.
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
# Fold the middle Karatsuba term into the 256-bit product (xmm1:xmm0).
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
# Reduce 256 -> 128 bits modulo the GHASH polynomial (shift-and-xor form).
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0          # xmm0 = previous power * H mod P
L$init_start_avx:
vmovdqa %xmm0,%xmm5              # keep H^(2k-1) while we square to get H^(2k)
# Second multiply by H, identical Karatsuba + reduction sequence as above.
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
# Emit this pair of powers plus their xor-ed halves for the next -16 store.
vpshufd $78,%xmm5,%xmm3
vpshufd $78,%xmm0,%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqu %xmm5,0(%rdi)
vpxor %xmm0,%xmm4,%xmm4
vmovdqu %xmm0,16(%rdi)
leaq 48(%rdi),%rdi               # 2 powers + 1 packed-halves slot per pass
subq $1,%r10
jnz L$init_loop_avx
# Flush the final packed hi^lo halves.
vpalignr $8,%xmm4,%xmm3,%xmm5
vmovdqu %xmm5,-16(%rdi)
vzeroupper                       # required before returning to SSE/C code
.byte 0xf3,0xc3                  # ret (emitted as bytes by the generator)
.globl _gcm_gmult_avx
.p2align 5
# gcm_gmult_avx: single-block GHASH multiply. There is no AVX win for one
# block, so this is a pure tail-jump alias for the CLMUL implementation
# (L$_gmult_clmul is defined earlier in this file, outside this view).
_gcm_gmult_avx:
jmp L$_gmult_clmul
.globl _gcm_ghash_avx
.p2align 5
# gcm_ghash_avx: hash %rcx bytes of input into the 128-bit accumulator.
#   %rdi = Xi (accumulator, read at entry, written back at exit)
#   %rsi = Htable of precomputed key powers (accessed at offsets 0-64..184-64
#          after the leaq 64(%rsi) bias below)
#   %rdx = input data, %rcx = byte length
# NOTE(review): roles inferred from the visible loads/stores; matches the
# usual gcm_ghash(Xi, Htable, inp, len) pattern — confirm against caller.
# Strategy: process 8 blocks per iteration, accumulating eight Karatsuba
# products before a single deferred reduction (aggregated GHASH); a short
# path handles < 8 remaining blocks one at a time.
_gcm_ghash_avx:
vzeroupper
vmovdqu (%rdi),%xmm10            # xmm10 = current hash value Xi
leaq L$0x1c2_polynomial(%rip),%r10
leaq 64(%rsi),%rsi               # bias table pointer; offsets below are N-64
vmovdqu L$bswap_mask(%rip),%xmm13
vpshufb %xmm13,%xmm10,%xmm10     # byte-swap Xi into computation order
cmpq $0x80,%rcx
jb L$short_avx                   # fewer than 8 blocks: scalar-ish path
subq $0x80,%rcx
# ---- Pipeline priming: multiply blocks 7..0 by H^1..H^8 respectively,
# starting three partial products (lo in xmm0/3, hi in xmm1/4, mid in xmm2/5)
# without reducing yet. xmm14/xmm15 alternate as the current input block.
vmovdqu 112(%rdx),%xmm14
vmovdqu 0-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vmovdqu 32-64(%rsi),%xmm7
vpunpckhqdq %xmm14,%xmm14,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm14,%xmm9,%xmm9
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 80(%rdx),%xmm14
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 48-64(%rsi),%xmm6
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 64(%rdx),%xmm15
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 48(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 32(%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 16(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu (%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
leaq 128(%rdx),%rdx
cmpq $0x80,%rcx
jb L$tail_avx                    # only one 8-block group: finish in the tail
vpxor %xmm10,%xmm15,%xmm15       # fold Xi into block 0 before the main loop
subq $0x80,%rcx
jmp L$oop8x_avx
.p2align 5
L$oop8x_avx:
# Main software-pipelined loop: while multiplying the next 8 input blocks
# by H^1..H^8, the previous group's 256-bit total (xmm11:xmm10) is folded
# and reduced in two interleaved vpclmulqdq-by-(%r10) steps.
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 112(%rdx),%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpxor %xmm15,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
vmovdqu 0-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
vmovdqu 32-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm3,%xmm10,%xmm10
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vxorps %xmm4,%xmm11,%xmm11
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm5,%xmm12,%xmm12
vxorps %xmm15,%xmm8,%xmm8
vmovdqu 80(%rdx),%xmm14
vpxor %xmm10,%xmm12,%xmm12       # assemble Karatsuba middle term
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm11,%xmm12,%xmm12
vpslldq $8,%xmm12,%xmm9
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vpsrldq $8,%xmm12,%xmm12
vpxor %xmm9,%xmm10,%xmm10        # xmm11:xmm10 = full 256-bit product
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vxorps %xmm12,%xmm11,%xmm11
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 64(%rdx),%xmm15
vpalignr $8,%xmm10,%xmm10,%xmm12 # rotate for reduction phase 1
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vxorps %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vmovdqu 48(%rdx),%xmm14
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10   # reduction step 1 (x poly)
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 32(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vxorps %xmm12,%xmm10,%xmm10
vmovdqu 16(%rdx),%xmm14
vpalignr $8,%xmm10,%xmm10,%xmm12 # rotate for reduction phase 2
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10   # reduction step 2
vxorps %xmm11,%xmm12,%xmm12
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu (%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm12,%xmm15,%xmm15       # fold reduced Xi into next group's block 0
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
vpxor %xmm10,%xmm15,%xmm15
leaq 128(%rdx),%rdx
subq $0x80,%rcx
jnc L$oop8x_avx
addq $0x80,%rcx                  # undo borrow: rcx = remaining bytes < 0x80
jmp L$tail_no_xor_avx            # Xi already folded into xmm15 above
.p2align 5
L$short_avx:
# Tail path for < 8 blocks: walk backwards from the end of the data,
# multiplying block i by H^i, accumulating unreduced lo/hi/mid products.
vmovdqu -16(%rdx,%rcx,1),%xmm14  # last block
leaq (%rdx,%rcx,1),%rdx          # rdx = one-past-end; negative offsets below
vmovdqu 0-64(%rsi),%xmm6         # H^1
vmovdqu 32-64(%rsi),%xmm7
vpshufb %xmm13,%xmm14,%xmm15
vmovdqa %xmm0,%xmm3
vmovdqa %xmm1,%xmm4
vmovdqa %xmm2,%xmm5
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -32(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 16-64(%rsi),%xmm6        # H^2
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7           # advance to next packed hi^lo half
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -48(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 48-64(%rsi),%xmm6        # H^3
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 80-64(%rsi),%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -64(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 64-64(%rsi),%xmm6        # H^4
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -80(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 96-64(%rsi),%xmm6        # H^5
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 128-64(%rsi),%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -96(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 112-64(%rsi),%xmm6       # H^6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -112(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 144-64(%rsi),%xmm6       # H^7
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovq 184-64(%rsi),%xmm7         # only the low half is needed at depth 7
subq $0x10,%rcx
jmp L$tail_avx
.p2align 5
L$tail_avx:
vpxor %xmm10,%xmm15,%xmm15       # fold Xi into the highest-power block
L$tail_no_xor_avx:
# Final multiply-accumulate for the last block, then combine the three
# Karatsuba accumulators and reduce 256 -> 128 bits.
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu (%r10),%xmm12            # reduction polynomial constant
vpxor %xmm0,%xmm3,%xmm10
vpxor %xmm1,%xmm4,%xmm11
vpxor %xmm2,%xmm5,%xmm5
vpxor %xmm10,%xmm5,%xmm5
vpxor %xmm11,%xmm5,%xmm5
vpslldq $8,%xmm5,%xmm9
vpsrldq $8,%xmm5,%xmm5
vpxor %xmm9,%xmm10,%xmm10        # xmm11:xmm10 = 256-bit product
vpxor %xmm5,%xmm11,%xmm11
# Two-step folding reduction modulo the GHASH polynomial.
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm11,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10        # xmm10 = updated Xi
cmpq $0,%rcx
jne L$short_avx                  # loop re-entry after an 8x group's leftovers
vpshufb %xmm13,%xmm10,%xmm10     # back to memory byte order
vmovdqu %xmm10,(%rdi)            # store Xi
vzeroupper                       # required before returning to SSE/C code
.byte 0xf3,0xc3                  # ret (emitted as bytes by the generator)
.p2align 6
# Shuffle control for vpshufb/pshufb: reverses all 16 bytes of an xmm
# register (big-endian <-> little-endian conversion of the GHASH state).
L$bswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
# Reduction constant for GF(2^128): encodes the GHASH polynomial term
# 0xc2...01 used by the vpclmulqdq folding reductions above.
L$0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
# Masks used by the 4-bit table-driven path (defined earlier in the file).
L$7_mask:
.long 7,0,7,0
L$7_mask_poly:
.long 7,0,450,0
.p2align 6
# Precomputed remainder table for the 4-bit ghash implementation:
# reduction values for each 4-bit nibble (stored as 64-bit pairs).
L$rem_4bit:
.long 0,0,0,471859200,0,943718400,0,610271232
.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
# Precomputed remainder table for the 8-bit ghash implementation:
# one 16-bit reduction value per possible byte.
L$rem_8bit:
.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
# Human-readable version/attribution string ("GHASH for x86_64, ...").
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6