ghash-x86_64.s
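# GHASH (carry-less multiplication in GF(2^128) for AES-GCM) for x86_64,
# ELF/System V ABI (arguments in %rdi, %rsi, %rdx, %rcx).  The listing is
# assembler output apparently produced from OpenSSL's CRYPTOGAMS perlasm
# module ghash-x86_64.pl; see the identification string at the end of the
# file.  Three implementation tiers are provided:
#   gcm_gmult_4bit / gcm_ghash_4bit   4-bit table-driven code
#   gcm_init/gmult/ghash_clmul        SSE + PCLMULQDQ
#   gcm_init/gmult/ghash_avx          AVX + PCLMULQDQ, 8-block aggregation
# For reference (paraphrased, not part of this file), OpenSSL's GCM code
# declares the entry points roughly as:
#   void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#   void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#                       const u8 *inp, size_t len);
#   void gcm_init_clmul(u128 Htable[16], const u64 H[2]);
# with the clmul/avx gmult and ghash variants taking the same arguments
# as their 4-bit counterparts.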

  1. .text
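# gcm_gmult_4bit(Xi, Htable): multiply the 16-byte accumulator Xi (%rdi)
# by the hash key, one 4-bit nibble per iteration, using the 16-entry
# table Htable (%rsi); .Lrem_4bit folds the bits shifted out of the low
# end back into the result.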
  2. .globl gcm_gmult_4bit
  3. .type gcm_gmult_4bit,@function
  4. .align 16
  5. gcm_gmult_4bit:
  6. .cfi_startproc
  7. pushq %rbx
  8. .cfi_adjust_cfa_offset 8
  9. .cfi_offset %rbx,-16
  10. pushq %rbp
  11. .cfi_adjust_cfa_offset 8
  12. .cfi_offset %rbp,-24
  13. pushq %r12
  14. .cfi_adjust_cfa_offset 8
  15. .cfi_offset %r12,-32
  16. pushq %r13
  17. .cfi_adjust_cfa_offset 8
  18. .cfi_offset %r13,-40
  19. pushq %r14
  20. .cfi_adjust_cfa_offset 8
  21. .cfi_offset %r14,-48
  22. pushq %r15
  23. .cfi_adjust_cfa_offset 8
  24. .cfi_offset %r15,-56
  25. subq $280,%rsp
  26. .cfi_adjust_cfa_offset 280
  27. .Lgmult_prologue:
  28. movzbq 15(%rdi),%r8
  29. leaq .Lrem_4bit(%rip),%r11
  30. xorq %rax,%rax
  31. xorq %rbx,%rbx
  32. movb %r8b,%al
  33. movb %r8b,%bl
  34. shlb $4,%al
  35. movq $14,%rcx
  36. movq 8(%rsi,%rax,1),%r8
  37. movq (%rsi,%rax,1),%r9
  38. andb $0xf0,%bl
  39. movq %r8,%rdx
  40. jmp .Loop1
  41. .align 16
  42. .Loop1:
  43. shrq $4,%r8
  44. andq $0xf,%rdx
  45. movq %r9,%r10
  46. movb (%rdi,%rcx,1),%al
  47. shrq $4,%r9
  48. xorq 8(%rsi,%rbx,1),%r8
  49. shlq $60,%r10
  50. xorq (%rsi,%rbx,1),%r9
  51. movb %al,%bl
  52. xorq (%r11,%rdx,8),%r9
  53. movq %r8,%rdx
  54. shlb $4,%al
  55. xorq %r10,%r8
  56. decq %rcx
  57. js .Lbreak1
  58. shrq $4,%r8
  59. andq $0xf,%rdx
  60. movq %r9,%r10
  61. shrq $4,%r9
  62. xorq 8(%rsi,%rax,1),%r8
  63. shlq $60,%r10
  64. xorq (%rsi,%rax,1),%r9
  65. andb $0xf0,%bl
  66. xorq (%r11,%rdx,8),%r9
  67. movq %r8,%rdx
  68. xorq %r10,%r8
  69. jmp .Loop1
  70. .align 16
  71. .Lbreak1:
  72. shrq $4,%r8
  73. andq $0xf,%rdx
  74. movq %r9,%r10
  75. shrq $4,%r9
  76. xorq 8(%rsi,%rax,1),%r8
  77. shlq $60,%r10
  78. xorq (%rsi,%rax,1),%r9
  79. andb $0xf0,%bl
  80. xorq (%r11,%rdx,8),%r9
  81. movq %r8,%rdx
  82. xorq %r10,%r8
  83. shrq $4,%r8
  84. andq $0xf,%rdx
  85. movq %r9,%r10
  86. shrq $4,%r9
  87. xorq 8(%rsi,%rbx,1),%r8
  88. shlq $60,%r10
  89. xorq (%rsi,%rbx,1),%r9
  90. xorq %r10,%r8
  91. xorq (%r11,%rdx,8),%r9
  92. bswapq %r8
  93. bswapq %r9
  94. movq %r8,8(%rdi)
  95. movq %r9,(%rdi)
  96. leaq 280+48(%rsp),%rsi
  97. .cfi_def_cfa %rsi,8
  98. movq -8(%rsi),%rbx
  99. .cfi_restore %rbx
  100. leaq (%rsi),%rsp
  101. .cfi_def_cfa_register %rsp
  102. .Lgmult_epilogue:
  103. .byte 0xf3,0xc3
  104. .cfi_endproc
  105. .size gcm_gmult_4bit,.-gcm_gmult_4bit
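# gcm_ghash_4bit(Xi, Htable, inp, len): hash len bytes at inp (%rdx,
# %rcx) into Xi (%rdi).  The long prologue rearranges Htable (%rsi) into
# a pre-shifted copy on the stack so the main loop can process input a
# byte at a time, using .Lrem_8bit for the reduction.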
  106. .globl gcm_ghash_4bit
  107. .type gcm_ghash_4bit,@function
  108. .align 16
  109. gcm_ghash_4bit:
  110. .cfi_startproc
  111. pushq %rbx
  112. .cfi_adjust_cfa_offset 8
  113. .cfi_offset %rbx,-16
  114. pushq %rbp
  115. .cfi_adjust_cfa_offset 8
  116. .cfi_offset %rbp,-24
  117. pushq %r12
  118. .cfi_adjust_cfa_offset 8
  119. .cfi_offset %r12,-32
  120. pushq %r13
  121. .cfi_adjust_cfa_offset 8
  122. .cfi_offset %r13,-40
  123. pushq %r14
  124. .cfi_adjust_cfa_offset 8
  125. .cfi_offset %r14,-48
  126. pushq %r15
  127. .cfi_adjust_cfa_offset 8
  128. .cfi_offset %r15,-56
  129. subq $280,%rsp
  130. .cfi_adjust_cfa_offset 280
  131. .Lghash_prologue:
  132. movq %rdx,%r14
  133. movq %rcx,%r15
  134. subq $-128,%rsi
  135. leaq 16+128(%rsp),%rbp
  136. xorl %edx,%edx
  137. movq 0+0-128(%rsi),%r8
  138. movq 0+8-128(%rsi),%rax
  139. movb %al,%dl
  140. shrq $4,%rax
  141. movq %r8,%r10
  142. shrq $4,%r8
  143. movq 16+0-128(%rsi),%r9
  144. shlb $4,%dl
  145. movq 16+8-128(%rsi),%rbx
  146. shlq $60,%r10
  147. movb %dl,0(%rsp)
  148. orq %r10,%rax
  149. movb %bl,%dl
  150. shrq $4,%rbx
  151. movq %r9,%r10
  152. shrq $4,%r9
  153. movq %r8,0(%rbp)
  154. movq 32+0-128(%rsi),%r8
  155. shlb $4,%dl
  156. movq %rax,0-128(%rbp)
  157. movq 32+8-128(%rsi),%rax
  158. shlq $60,%r10
  159. movb %dl,1(%rsp)
  160. orq %r10,%rbx
  161. movb %al,%dl
  162. shrq $4,%rax
  163. movq %r8,%r10
  164. shrq $4,%r8
  165. movq %r9,8(%rbp)
  166. movq 48+0-128(%rsi),%r9
  167. shlb $4,%dl
  168. movq %rbx,8-128(%rbp)
  169. movq 48+8-128(%rsi),%rbx
  170. shlq $60,%r10
  171. movb %dl,2(%rsp)
  172. orq %r10,%rax
  173. movb %bl,%dl
  174. shrq $4,%rbx
  175. movq %r9,%r10
  176. shrq $4,%r9
  177. movq %r8,16(%rbp)
  178. movq 64+0-128(%rsi),%r8
  179. shlb $4,%dl
  180. movq %rax,16-128(%rbp)
  181. movq 64+8-128(%rsi),%rax
  182. shlq $60,%r10
  183. movb %dl,3(%rsp)
  184. orq %r10,%rbx
  185. movb %al,%dl
  186. shrq $4,%rax
  187. movq %r8,%r10
  188. shrq $4,%r8
  189. movq %r9,24(%rbp)
  190. movq 80+0-128(%rsi),%r9
  191. shlb $4,%dl
  192. movq %rbx,24-128(%rbp)
  193. movq 80+8-128(%rsi),%rbx
  194. shlq $60,%r10
  195. movb %dl,4(%rsp)
  196. orq %r10,%rax
  197. movb %bl,%dl
  198. shrq $4,%rbx
  199. movq %r9,%r10
  200. shrq $4,%r9
  201. movq %r8,32(%rbp)
  202. movq 96+0-128(%rsi),%r8
  203. shlb $4,%dl
  204. movq %rax,32-128(%rbp)
  205. movq 96+8-128(%rsi),%rax
  206. shlq $60,%r10
  207. movb %dl,5(%rsp)
  208. orq %r10,%rbx
  209. movb %al,%dl
  210. shrq $4,%rax
  211. movq %r8,%r10
  212. shrq $4,%r8
  213. movq %r9,40(%rbp)
  214. movq 112+0-128(%rsi),%r9
  215. shlb $4,%dl
  216. movq %rbx,40-128(%rbp)
  217. movq 112+8-128(%rsi),%rbx
  218. shlq $60,%r10
  219. movb %dl,6(%rsp)
  220. orq %r10,%rax
  221. movb %bl,%dl
  222. shrq $4,%rbx
  223. movq %r9,%r10
  224. shrq $4,%r9
  225. movq %r8,48(%rbp)
  226. movq 128+0-128(%rsi),%r8
  227. shlb $4,%dl
  228. movq %rax,48-128(%rbp)
  229. movq 128+8-128(%rsi),%rax
  230. shlq $60,%r10
  231. movb %dl,7(%rsp)
  232. orq %r10,%rbx
  233. movb %al,%dl
  234. shrq $4,%rax
  235. movq %r8,%r10
  236. shrq $4,%r8
  237. movq %r9,56(%rbp)
  238. movq 144+0-128(%rsi),%r9
  239. shlb $4,%dl
  240. movq %rbx,56-128(%rbp)
  241. movq 144+8-128(%rsi),%rbx
  242. shlq $60,%r10
  243. movb %dl,8(%rsp)
  244. orq %r10,%rax
  245. movb %bl,%dl
  246. shrq $4,%rbx
  247. movq %r9,%r10
  248. shrq $4,%r9
  249. movq %r8,64(%rbp)
  250. movq 160+0-128(%rsi),%r8
  251. shlb $4,%dl
  252. movq %rax,64-128(%rbp)
  253. movq 160+8-128(%rsi),%rax
  254. shlq $60,%r10
  255. movb %dl,9(%rsp)
  256. orq %r10,%rbx
  257. movb %al,%dl
  258. shrq $4,%rax
  259. movq %r8,%r10
  260. shrq $4,%r8
  261. movq %r9,72(%rbp)
  262. movq 176+0-128(%rsi),%r9
  263. shlb $4,%dl
  264. movq %rbx,72-128(%rbp)
  265. movq 176+8-128(%rsi),%rbx
  266. shlq $60,%r10
  267. movb %dl,10(%rsp)
  268. orq %r10,%rax
  269. movb %bl,%dl
  270. shrq $4,%rbx
  271. movq %r9,%r10
  272. shrq $4,%r9
  273. movq %r8,80(%rbp)
  274. movq 192+0-128(%rsi),%r8
  275. shlb $4,%dl
  276. movq %rax,80-128(%rbp)
  277. movq 192+8-128(%rsi),%rax
  278. shlq $60,%r10
  279. movb %dl,11(%rsp)
  280. orq %r10,%rbx
  281. movb %al,%dl
  282. shrq $4,%rax
  283. movq %r8,%r10
  284. shrq $4,%r8
  285. movq %r9,88(%rbp)
  286. movq 208+0-128(%rsi),%r9
  287. shlb $4,%dl
  288. movq %rbx,88-128(%rbp)
  289. movq 208+8-128(%rsi),%rbx
  290. shlq $60,%r10
  291. movb %dl,12(%rsp)
  292. orq %r10,%rax
  293. movb %bl,%dl
  294. shrq $4,%rbx
  295. movq %r9,%r10
  296. shrq $4,%r9
  297. movq %r8,96(%rbp)
  298. movq 224+0-128(%rsi),%r8
  299. shlb $4,%dl
  300. movq %rax,96-128(%rbp)
  301. movq 224+8-128(%rsi),%rax
  302. shlq $60,%r10
  303. movb %dl,13(%rsp)
  304. orq %r10,%rbx
  305. movb %al,%dl
  306. shrq $4,%rax
  307. movq %r8,%r10
  308. shrq $4,%r8
  309. movq %r9,104(%rbp)
  310. movq 240+0-128(%rsi),%r9
  311. shlb $4,%dl
  312. movq %rbx,104-128(%rbp)
  313. movq 240+8-128(%rsi),%rbx
  314. shlq $60,%r10
  315. movb %dl,14(%rsp)
  316. orq %r10,%rax
  317. movb %bl,%dl
  318. shrq $4,%rbx
  319. movq %r9,%r10
  320. shrq $4,%r9
  321. movq %r8,112(%rbp)
  322. shlb $4,%dl
  323. movq %rax,112-128(%rbp)
  324. shlq $60,%r10
  325. movb %dl,15(%rsp)
  326. orq %r10,%rbx
  327. movq %r9,120(%rbp)
  328. movq %rbx,120-128(%rbp)
  329. addq $-128,%rsi
  330. movq 8(%rdi),%r8
  331. movq 0(%rdi),%r9
  332. addq %r14,%r15
  333. leaq .Lrem_8bit(%rip),%r11
  334. jmp .Louter_loop
  335. .align 16
  336. .Louter_loop:
  337. xorq (%r14),%r9
  338. movq 8(%r14),%rdx
  339. leaq 16(%r14),%r14
  340. xorq %r8,%rdx
  341. movq %r9,(%rdi)
  342. movq %rdx,8(%rdi)
  343. shrq $32,%rdx
  344. xorq %rax,%rax
  345. roll $8,%edx
  346. movb %dl,%al
  347. movzbl %dl,%ebx
  348. shlb $4,%al
  349. shrl $4,%ebx
  350. roll $8,%edx
  351. movq 8(%rsi,%rax,1),%r8
  352. movq (%rsi,%rax,1),%r9
  353. movb %dl,%al
  354. movzbl %dl,%ecx
  355. shlb $4,%al
  356. movzbq (%rsp,%rbx,1),%r12
  357. shrl $4,%ecx
  358. xorq %r8,%r12
  359. movq %r9,%r10
  360. shrq $8,%r8
  361. movzbq %r12b,%r12
  362. shrq $8,%r9
  363. xorq -128(%rbp,%rbx,8),%r8
  364. shlq $56,%r10
  365. xorq (%rbp,%rbx,8),%r9
  366. roll $8,%edx
  367. xorq 8(%rsi,%rax,1),%r8
  368. xorq (%rsi,%rax,1),%r9
  369. movb %dl,%al
  370. xorq %r10,%r8
  371. movzwq (%r11,%r12,2),%r12
  372. movzbl %dl,%ebx
  373. shlb $4,%al
  374. movzbq (%rsp,%rcx,1),%r13
  375. shrl $4,%ebx
  376. shlq $48,%r12
  377. xorq %r8,%r13
  378. movq %r9,%r10
  379. xorq %r12,%r9
  380. shrq $8,%r8
  381. movzbq %r13b,%r13
  382. shrq $8,%r9
  383. xorq -128(%rbp,%rcx,8),%r8
  384. shlq $56,%r10
  385. xorq (%rbp,%rcx,8),%r9
  386. roll $8,%edx
  387. xorq 8(%rsi,%rax,1),%r8
  388. xorq (%rsi,%rax,1),%r9
  389. movb %dl,%al
  390. xorq %r10,%r8
  391. movzwq (%r11,%r13,2),%r13
  392. movzbl %dl,%ecx
  393. shlb $4,%al
  394. movzbq (%rsp,%rbx,1),%r12
  395. shrl $4,%ecx
  396. shlq $48,%r13
  397. xorq %r8,%r12
  398. movq %r9,%r10
  399. xorq %r13,%r9
  400. shrq $8,%r8
  401. movzbq %r12b,%r12
  402. movl 8(%rdi),%edx
  403. shrq $8,%r9
  404. xorq -128(%rbp,%rbx,8),%r8
  405. shlq $56,%r10
  406. xorq (%rbp,%rbx,8),%r9
  407. roll $8,%edx
  408. xorq 8(%rsi,%rax,1),%r8
  409. xorq (%rsi,%rax,1),%r9
  410. movb %dl,%al
  411. xorq %r10,%r8
  412. movzwq (%r11,%r12,2),%r12
  413. movzbl %dl,%ebx
  414. shlb $4,%al
  415. movzbq (%rsp,%rcx,1),%r13
  416. shrl $4,%ebx
  417. shlq $48,%r12
  418. xorq %r8,%r13
  419. movq %r9,%r10
  420. xorq %r12,%r9
  421. shrq $8,%r8
  422. movzbq %r13b,%r13
  423. shrq $8,%r9
  424. xorq -128(%rbp,%rcx,8),%r8
  425. shlq $56,%r10
  426. xorq (%rbp,%rcx,8),%r9
  427. roll $8,%edx
  428. xorq 8(%rsi,%rax,1),%r8
  429. xorq (%rsi,%rax,1),%r9
  430. movb %dl,%al
  431. xorq %r10,%r8
  432. movzwq (%r11,%r13,2),%r13
  433. movzbl %dl,%ecx
  434. shlb $4,%al
  435. movzbq (%rsp,%rbx,1),%r12
  436. shrl $4,%ecx
  437. shlq $48,%r13
  438. xorq %r8,%r12
  439. movq %r9,%r10
  440. xorq %r13,%r9
  441. shrq $8,%r8
  442. movzbq %r12b,%r12
  443. shrq $8,%r9
  444. xorq -128(%rbp,%rbx,8),%r8
  445. shlq $56,%r10
  446. xorq (%rbp,%rbx,8),%r9
  447. roll $8,%edx
  448. xorq 8(%rsi,%rax,1),%r8
  449. xorq (%rsi,%rax,1),%r9
  450. movb %dl,%al
  451. xorq %r10,%r8
  452. movzwq (%r11,%r12,2),%r12
  453. movzbl %dl,%ebx
  454. shlb $4,%al
  455. movzbq (%rsp,%rcx,1),%r13
  456. shrl $4,%ebx
  457. shlq $48,%r12
  458. xorq %r8,%r13
  459. movq %r9,%r10
  460. xorq %r12,%r9
  461. shrq $8,%r8
  462. movzbq %r13b,%r13
  463. shrq $8,%r9
  464. xorq -128(%rbp,%rcx,8),%r8
  465. shlq $56,%r10
  466. xorq (%rbp,%rcx,8),%r9
  467. roll $8,%edx
  468. xorq 8(%rsi,%rax,1),%r8
  469. xorq (%rsi,%rax,1),%r9
  470. movb %dl,%al
  471. xorq %r10,%r8
  472. movzwq (%r11,%r13,2),%r13
  473. movzbl %dl,%ecx
  474. shlb $4,%al
  475. movzbq (%rsp,%rbx,1),%r12
  476. shrl $4,%ecx
  477. shlq $48,%r13
  478. xorq %r8,%r12
  479. movq %r9,%r10
  480. xorq %r13,%r9
  481. shrq $8,%r8
  482. movzbq %r12b,%r12
  483. movl 4(%rdi),%edx
  484. shrq $8,%r9
  485. xorq -128(%rbp,%rbx,8),%r8
  486. shlq $56,%r10
  487. xorq (%rbp,%rbx,8),%r9
  488. roll $8,%edx
  489. xorq 8(%rsi,%rax,1),%r8
  490. xorq (%rsi,%rax,1),%r9
  491. movb %dl,%al
  492. xorq %r10,%r8
  493. movzwq (%r11,%r12,2),%r12
  494. movzbl %dl,%ebx
  495. shlb $4,%al
  496. movzbq (%rsp,%rcx,1),%r13
  497. shrl $4,%ebx
  498. shlq $48,%r12
  499. xorq %r8,%r13
  500. movq %r9,%r10
  501. xorq %r12,%r9
  502. shrq $8,%r8
  503. movzbq %r13b,%r13
  504. shrq $8,%r9
  505. xorq -128(%rbp,%rcx,8),%r8
  506. shlq $56,%r10
  507. xorq (%rbp,%rcx,8),%r9
  508. roll $8,%edx
  509. xorq 8(%rsi,%rax,1),%r8
  510. xorq (%rsi,%rax,1),%r9
  511. movb %dl,%al
  512. xorq %r10,%r8
  513. movzwq (%r11,%r13,2),%r13
  514. movzbl %dl,%ecx
  515. shlb $4,%al
  516. movzbq (%rsp,%rbx,1),%r12
  517. shrl $4,%ecx
  518. shlq $48,%r13
  519. xorq %r8,%r12
  520. movq %r9,%r10
  521. xorq %r13,%r9
  522. shrq $8,%r8
  523. movzbq %r12b,%r12
  524. shrq $8,%r9
  525. xorq -128(%rbp,%rbx,8),%r8
  526. shlq $56,%r10
  527. xorq (%rbp,%rbx,8),%r9
  528. roll $8,%edx
  529. xorq 8(%rsi,%rax,1),%r8
  530. xorq (%rsi,%rax,1),%r9
  531. movb %dl,%al
  532. xorq %r10,%r8
  533. movzwq (%r11,%r12,2),%r12
  534. movzbl %dl,%ebx
  535. shlb $4,%al
  536. movzbq (%rsp,%rcx,1),%r13
  537. shrl $4,%ebx
  538. shlq $48,%r12
  539. xorq %r8,%r13
  540. movq %r9,%r10
  541. xorq %r12,%r9
  542. shrq $8,%r8
  543. movzbq %r13b,%r13
  544. shrq $8,%r9
  545. xorq -128(%rbp,%rcx,8),%r8
  546. shlq $56,%r10
  547. xorq (%rbp,%rcx,8),%r9
  548. roll $8,%edx
  549. xorq 8(%rsi,%rax,1),%r8
  550. xorq (%rsi,%rax,1),%r9
  551. movb %dl,%al
  552. xorq %r10,%r8
  553. movzwq (%r11,%r13,2),%r13
  554. movzbl %dl,%ecx
  555. shlb $4,%al
  556. movzbq (%rsp,%rbx,1),%r12
  557. shrl $4,%ecx
  558. shlq $48,%r13
  559. xorq %r8,%r12
  560. movq %r9,%r10
  561. xorq %r13,%r9
  562. shrq $8,%r8
  563. movzbq %r12b,%r12
  564. movl 0(%rdi),%edx
  565. shrq $8,%r9
  566. xorq -128(%rbp,%rbx,8),%r8
  567. shlq $56,%r10
  568. xorq (%rbp,%rbx,8),%r9
  569. roll $8,%edx
  570. xorq 8(%rsi,%rax,1),%r8
  571. xorq (%rsi,%rax,1),%r9
  572. movb %dl,%al
  573. xorq %r10,%r8
  574. movzwq (%r11,%r12,2),%r12
  575. movzbl %dl,%ebx
  576. shlb $4,%al
  577. movzbq (%rsp,%rcx,1),%r13
  578. shrl $4,%ebx
  579. shlq $48,%r12
  580. xorq %r8,%r13
  581. movq %r9,%r10
  582. xorq %r12,%r9
  583. shrq $8,%r8
  584. movzbq %r13b,%r13
  585. shrq $8,%r9
  586. xorq -128(%rbp,%rcx,8),%r8
  587. shlq $56,%r10
  588. xorq (%rbp,%rcx,8),%r9
  589. roll $8,%edx
  590. xorq 8(%rsi,%rax,1),%r8
  591. xorq (%rsi,%rax,1),%r9
  592. movb %dl,%al
  593. xorq %r10,%r8
  594. movzwq (%r11,%r13,2),%r13
  595. movzbl %dl,%ecx
  596. shlb $4,%al
  597. movzbq (%rsp,%rbx,1),%r12
  598. shrl $4,%ecx
  599. shlq $48,%r13
  600. xorq %r8,%r12
  601. movq %r9,%r10
  602. xorq %r13,%r9
  603. shrq $8,%r8
  604. movzbq %r12b,%r12
  605. shrq $8,%r9
  606. xorq -128(%rbp,%rbx,8),%r8
  607. shlq $56,%r10
  608. xorq (%rbp,%rbx,8),%r9
  609. roll $8,%edx
  610. xorq 8(%rsi,%rax,1),%r8
  611. xorq (%rsi,%rax,1),%r9
  612. movb %dl,%al
  613. xorq %r10,%r8
  614. movzwq (%r11,%r12,2),%r12
  615. movzbl %dl,%ebx
  616. shlb $4,%al
  617. movzbq (%rsp,%rcx,1),%r13
  618. shrl $4,%ebx
  619. shlq $48,%r12
  620. xorq %r8,%r13
  621. movq %r9,%r10
  622. xorq %r12,%r9
  623. shrq $8,%r8
  624. movzbq %r13b,%r13
  625. shrq $8,%r9
  626. xorq -128(%rbp,%rcx,8),%r8
  627. shlq $56,%r10
  628. xorq (%rbp,%rcx,8),%r9
  629. roll $8,%edx
  630. xorq 8(%rsi,%rax,1),%r8
  631. xorq (%rsi,%rax,1),%r9
  632. movb %dl,%al
  633. xorq %r10,%r8
  634. movzwq (%r11,%r13,2),%r13
  635. movzbl %dl,%ecx
  636. shlb $4,%al
  637. movzbq (%rsp,%rbx,1),%r12
  638. andl $240,%ecx
  639. shlq $48,%r13
  640. xorq %r8,%r12
  641. movq %r9,%r10
  642. xorq %r13,%r9
  643. shrq $8,%r8
  644. movzbq %r12b,%r12
  645. movl -4(%rdi),%edx
  646. shrq $8,%r9
  647. xorq -128(%rbp,%rbx,8),%r8
  648. shlq $56,%r10
  649. xorq (%rbp,%rbx,8),%r9
  650. movzwq (%r11,%r12,2),%r12
  651. xorq 8(%rsi,%rax,1),%r8
  652. xorq (%rsi,%rax,1),%r9
  653. shlq $48,%r12
  654. xorq %r10,%r8
  655. xorq %r12,%r9
  656. movzbq %r8b,%r13
  657. shrq $4,%r8
  658. movq %r9,%r10
  659. shlb $4,%r13b
  660. shrq $4,%r9
  661. xorq 8(%rsi,%rcx,1),%r8
  662. movzwq (%r11,%r13,2),%r13
  663. shlq $60,%r10
  664. xorq (%rsi,%rcx,1),%r9
  665. xorq %r10,%r8
  666. shlq $48,%r13
  667. bswapq %r8
  668. xorq %r13,%r9
  669. bswapq %r9
  670. cmpq %r15,%r14
  671. jb .Louter_loop
  672. movq %r8,8(%rdi)
  673. movq %r9,(%rdi)
  674. leaq 280+48(%rsp),%rsi
  675. .cfi_def_cfa %rsi,8
  676. movq -48(%rsi),%r15
  677. .cfi_restore %r15
  678. movq -40(%rsi),%r14
  679. .cfi_restore %r14
  680. movq -32(%rsi),%r13
  681. .cfi_restore %r13
  682. movq -24(%rsi),%r12
  683. .cfi_restore %r12
  684. movq -16(%rsi),%rbp
  685. .cfi_restore %rbp
  686. movq -8(%rsi),%rbx
  687. .cfi_restore %rbx
  688. leaq 0(%rsi),%rsp
  689. .cfi_def_cfa_register %rsp
  690. .Lghash_epilogue:
  691. .byte 0xf3,0xc3
  692. .cfi_endproc
  693. .size gcm_ghash_4bit,.-gcm_ghash_4bit
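# gcm_init_clmul(Htable, H): convert the raw hash key at %rsi to the
# reflected form used by the carry-less-multiply code and store H, H^2,
# H^3 and H^4, each pair followed by its Karatsuba (hi^lo) helper, at
# %rdi.  The ".byte 102,15,58,68,..." and ".byte 102,15,56,0,..." runs
# in this file are hand-encoded PCLMULQDQ and PSHUFB instructions, kept
# as raw bytes so older assemblers can still build it; ".byte 0xf3,0xc3"
# is "rep ret".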
  694. .globl gcm_init_clmul
  695. .type gcm_init_clmul,@function
  696. .align 16
  697. gcm_init_clmul:
  698. .cfi_startproc
  699. .L_init_clmul:
  700. movdqu (%rsi),%xmm2
  701. pshufd $78,%xmm2,%xmm2
  702. pshufd $255,%xmm2,%xmm4
  703. movdqa %xmm2,%xmm3
  704. psllq $1,%xmm2
  705. pxor %xmm5,%xmm5
  706. psrlq $63,%xmm3
  707. pcmpgtd %xmm4,%xmm5
  708. pslldq $8,%xmm3
  709. por %xmm3,%xmm2
  710. pand .L0x1c2_polynomial(%rip),%xmm5
  711. pxor %xmm5,%xmm2
  712. pshufd $78,%xmm2,%xmm6
  713. movdqa %xmm2,%xmm0
  714. pxor %xmm2,%xmm6
  715. movdqa %xmm0,%xmm1
  716. pshufd $78,%xmm0,%xmm3
  717. pxor %xmm0,%xmm3
  718. .byte 102,15,58,68,194,0
  719. .byte 102,15,58,68,202,17
  720. .byte 102,15,58,68,222,0
  721. pxor %xmm0,%xmm3
  722. pxor %xmm1,%xmm3
  723. movdqa %xmm3,%xmm4
  724. psrldq $8,%xmm3
  725. pslldq $8,%xmm4
  726. pxor %xmm3,%xmm1
  727. pxor %xmm4,%xmm0
  728. movdqa %xmm0,%xmm4
  729. movdqa %xmm0,%xmm3
  730. psllq $5,%xmm0
  731. pxor %xmm0,%xmm3
  732. psllq $1,%xmm0
  733. pxor %xmm3,%xmm0
  734. psllq $57,%xmm0
  735. movdqa %xmm0,%xmm3
  736. pslldq $8,%xmm0
  737. psrldq $8,%xmm3
  738. pxor %xmm4,%xmm0
  739. pxor %xmm3,%xmm1
  740. movdqa %xmm0,%xmm4
  741. psrlq $1,%xmm0
  742. pxor %xmm4,%xmm1
  743. pxor %xmm0,%xmm4
  744. psrlq $5,%xmm0
  745. pxor %xmm4,%xmm0
  746. psrlq $1,%xmm0
  747. pxor %xmm1,%xmm0
  748. pshufd $78,%xmm2,%xmm3
  749. pshufd $78,%xmm0,%xmm4
  750. pxor %xmm2,%xmm3
  751. movdqu %xmm2,0(%rdi)
  752. pxor %xmm0,%xmm4
  753. movdqu %xmm0,16(%rdi)
  754. .byte 102,15,58,15,227,8
  755. movdqu %xmm4,32(%rdi)
  756. movdqa %xmm0,%xmm1
  757. pshufd $78,%xmm0,%xmm3
  758. pxor %xmm0,%xmm3
  759. .byte 102,15,58,68,194,0
  760. .byte 102,15,58,68,202,17
  761. .byte 102,15,58,68,222,0
  762. pxor %xmm0,%xmm3
  763. pxor %xmm1,%xmm3
  764. movdqa %xmm3,%xmm4
  765. psrldq $8,%xmm3
  766. pslldq $8,%xmm4
  767. pxor %xmm3,%xmm1
  768. pxor %xmm4,%xmm0
  769. movdqa %xmm0,%xmm4
  770. movdqa %xmm0,%xmm3
  771. psllq $5,%xmm0
  772. pxor %xmm0,%xmm3
  773. psllq $1,%xmm0
  774. pxor %xmm3,%xmm0
  775. psllq $57,%xmm0
  776. movdqa %xmm0,%xmm3
  777. pslldq $8,%xmm0
  778. psrldq $8,%xmm3
  779. pxor %xmm4,%xmm0
  780. pxor %xmm3,%xmm1
  781. movdqa %xmm0,%xmm4
  782. psrlq $1,%xmm0
  783. pxor %xmm4,%xmm1
  784. pxor %xmm0,%xmm4
  785. psrlq $5,%xmm0
  786. pxor %xmm4,%xmm0
  787. psrlq $1,%xmm0
  788. pxor %xmm1,%xmm0
  789. movdqa %xmm0,%xmm5
  790. movdqa %xmm0,%xmm1
  791. pshufd $78,%xmm0,%xmm3
  792. pxor %xmm0,%xmm3
  793. .byte 102,15,58,68,194,0
  794. .byte 102,15,58,68,202,17
  795. .byte 102,15,58,68,222,0
  796. pxor %xmm0,%xmm3
  797. pxor %xmm1,%xmm3
  798. movdqa %xmm3,%xmm4
  799. psrldq $8,%xmm3
  800. pslldq $8,%xmm4
  801. pxor %xmm3,%xmm1
  802. pxor %xmm4,%xmm0
  803. movdqa %xmm0,%xmm4
  804. movdqa %xmm0,%xmm3
  805. psllq $5,%xmm0
  806. pxor %xmm0,%xmm3
  807. psllq $1,%xmm0
  808. pxor %xmm3,%xmm0
  809. psllq $57,%xmm0
  810. movdqa %xmm0,%xmm3
  811. pslldq $8,%xmm0
  812. psrldq $8,%xmm3
  813. pxor %xmm4,%xmm0
  814. pxor %xmm3,%xmm1
  815. movdqa %xmm0,%xmm4
  816. psrlq $1,%xmm0
  817. pxor %xmm4,%xmm1
  818. pxor %xmm0,%xmm4
  819. psrlq $5,%xmm0
  820. pxor %xmm4,%xmm0
  821. psrlq $1,%xmm0
  822. pxor %xmm1,%xmm0
  823. pshufd $78,%xmm5,%xmm3
  824. pshufd $78,%xmm0,%xmm4
  825. pxor %xmm5,%xmm3
  826. movdqu %xmm5,48(%rdi)
  827. pxor %xmm0,%xmm4
  828. movdqu %xmm0,64(%rdi)
  829. .byte 102,15,58,15,227,8
  830. movdqu %xmm4,80(%rdi)
  831. .byte 0xf3,0xc3
  832. .cfi_endproc
  833. .size gcm_init_clmul,.-gcm_init_clmul
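# gcm_gmult_clmul(Xi, Htable): single-block GHASH multiply with
# PCLMULQDQ; Xi is byte-swapped via .Lbswap_mask on entry and exit.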
  834. .globl gcm_gmult_clmul
  835. .type gcm_gmult_clmul,@function
  836. .align 16
  837. gcm_gmult_clmul:
  838. .cfi_startproc
  839. .L_gmult_clmul:
  840. movdqu (%rdi),%xmm0
  841. movdqa .Lbswap_mask(%rip),%xmm5
  842. movdqu (%rsi),%xmm2
  843. movdqu 32(%rsi),%xmm4
  844. .byte 102,15,56,0,197
  845. movdqa %xmm0,%xmm1
  846. pshufd $78,%xmm0,%xmm3
  847. pxor %xmm0,%xmm3
  848. .byte 102,15,58,68,194,0
  849. .byte 102,15,58,68,202,17
  850. .byte 102,15,58,68,220,0
  851. pxor %xmm0,%xmm3
  852. pxor %xmm1,%xmm3
  853. movdqa %xmm3,%xmm4
  854. psrldq $8,%xmm3
  855. pslldq $8,%xmm4
  856. pxor %xmm3,%xmm1
  857. pxor %xmm4,%xmm0
  858. movdqa %xmm0,%xmm4
  859. movdqa %xmm0,%xmm3
  860. psllq $5,%xmm0
  861. pxor %xmm0,%xmm3
  862. psllq $1,%xmm0
  863. pxor %xmm3,%xmm0
  864. psllq $57,%xmm0
  865. movdqa %xmm0,%xmm3
  866. pslldq $8,%xmm0
  867. psrldq $8,%xmm3
  868. pxor %xmm4,%xmm0
  869. pxor %xmm3,%xmm1
  870. movdqa %xmm0,%xmm4
  871. psrlq $1,%xmm0
  872. pxor %xmm4,%xmm1
  873. pxor %xmm0,%xmm4
  874. psrlq $5,%xmm0
  875. pxor %xmm4,%xmm0
  876. psrlq $1,%xmm0
  877. pxor %xmm1,%xmm0
  878. .byte 102,15,56,0,197
  879. movdqu %xmm0,(%rdi)
  880. .byte 0xf3,0xc3
  881. .cfi_endproc
  882. .size gcm_gmult_clmul,.-gcm_gmult_clmul
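# gcm_ghash_clmul(Xi, Htable, inp, len): bulk GHASH with PCLMULQDQ.
# When four or more blocks remain it aggregates four multiplications per
# reduction (.Lmod4_loop); the OPENSSL_ia32cap_P test below appears to
# exclude processors on which that wider path is not worthwhile.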
  883. .globl gcm_ghash_clmul
  884. .type gcm_ghash_clmul,@function
  885. .align 32
  886. gcm_ghash_clmul:
  887. .cfi_startproc
  888. .L_ghash_clmul:
  889. movdqa .Lbswap_mask(%rip),%xmm10
  890. movdqu (%rdi),%xmm0
  891. movdqu (%rsi),%xmm2
  892. movdqu 32(%rsi),%xmm7
  893. .byte 102,65,15,56,0,194
  894. subq $0x10,%rcx
  895. jz .Lodd_tail
  896. movdqu 16(%rsi),%xmm6
  897. movl OPENSSL_ia32cap_P+4(%rip),%eax
  898. cmpq $0x30,%rcx
  899. jb .Lskip4x
  900. andl $71303168,%eax
  901. cmpl $4194304,%eax
  902. je .Lskip4x
  903. subq $0x30,%rcx
  904. movq $0xA040608020C0E000,%rax
  905. movdqu 48(%rsi),%xmm14
  906. movdqu 64(%rsi),%xmm15
  907. movdqu 48(%rdx),%xmm3
  908. movdqu 32(%rdx),%xmm11
  909. .byte 102,65,15,56,0,218
  910. .byte 102,69,15,56,0,218
  911. movdqa %xmm3,%xmm5
  912. pshufd $78,%xmm3,%xmm4
  913. pxor %xmm3,%xmm4
  914. .byte 102,15,58,68,218,0
  915. .byte 102,15,58,68,234,17
  916. .byte 102,15,58,68,231,0
  917. movdqa %xmm11,%xmm13
  918. pshufd $78,%xmm11,%xmm12
  919. pxor %xmm11,%xmm12
  920. .byte 102,68,15,58,68,222,0
  921. .byte 102,68,15,58,68,238,17
  922. .byte 102,68,15,58,68,231,16
  923. xorps %xmm11,%xmm3
  924. xorps %xmm13,%xmm5
  925. movups 80(%rsi),%xmm7
  926. xorps %xmm12,%xmm4
  927. movdqu 16(%rdx),%xmm11
  928. movdqu 0(%rdx),%xmm8
  929. .byte 102,69,15,56,0,218
  930. .byte 102,69,15,56,0,194
  931. movdqa %xmm11,%xmm13
  932. pshufd $78,%xmm11,%xmm12
  933. pxor %xmm8,%xmm0
  934. pxor %xmm11,%xmm12
  935. .byte 102,69,15,58,68,222,0
  936. movdqa %xmm0,%xmm1
  937. pshufd $78,%xmm0,%xmm8
  938. pxor %xmm0,%xmm8
  939. .byte 102,69,15,58,68,238,17
  940. .byte 102,68,15,58,68,231,0
  941. xorps %xmm11,%xmm3
  942. xorps %xmm13,%xmm5
  943. leaq 64(%rdx),%rdx
  944. subq $0x40,%rcx
  945. jc .Ltail4x
  946. jmp .Lmod4_loop
  947. .align 32
  948. .Lmod4_loop:
  949. .byte 102,65,15,58,68,199,0
  950. xorps %xmm12,%xmm4
  951. movdqu 48(%rdx),%xmm11
  952. .byte 102,69,15,56,0,218
  953. .byte 102,65,15,58,68,207,17
  954. xorps %xmm3,%xmm0
  955. movdqu 32(%rdx),%xmm3
  956. movdqa %xmm11,%xmm13
  957. .byte 102,68,15,58,68,199,16
  958. pshufd $78,%xmm11,%xmm12
  959. xorps %xmm5,%xmm1
  960. pxor %xmm11,%xmm12
  961. .byte 102,65,15,56,0,218
  962. movups 32(%rsi),%xmm7
  963. xorps %xmm4,%xmm8
  964. .byte 102,68,15,58,68,218,0
  965. pshufd $78,%xmm3,%xmm4
  966. pxor %xmm0,%xmm8
  967. movdqa %xmm3,%xmm5
  968. pxor %xmm1,%xmm8
  969. pxor %xmm3,%xmm4
  970. movdqa %xmm8,%xmm9
  971. .byte 102,68,15,58,68,234,17
  972. pslldq $8,%xmm8
  973. psrldq $8,%xmm9
  974. pxor %xmm8,%xmm0
  975. movdqa .L7_mask(%rip),%xmm8
  976. pxor %xmm9,%xmm1
  977. .byte 102,76,15,110,200
  978. pand %xmm0,%xmm8
  979. .byte 102,69,15,56,0,200
  980. pxor %xmm0,%xmm9
  981. .byte 102,68,15,58,68,231,0
  982. psllq $57,%xmm9
  983. movdqa %xmm9,%xmm8
  984. pslldq $8,%xmm9
  985. .byte 102,15,58,68,222,0
  986. psrldq $8,%xmm8
  987. pxor %xmm9,%xmm0
  988. pxor %xmm8,%xmm1
  989. movdqu 0(%rdx),%xmm8
  990. movdqa %xmm0,%xmm9
  991. psrlq $1,%xmm0
  992. .byte 102,15,58,68,238,17
  993. xorps %xmm11,%xmm3
  994. movdqu 16(%rdx),%xmm11
  995. .byte 102,69,15,56,0,218
  996. .byte 102,15,58,68,231,16
  997. xorps %xmm13,%xmm5
  998. movups 80(%rsi),%xmm7
  999. .byte 102,69,15,56,0,194
  1000. pxor %xmm9,%xmm1
  1001. pxor %xmm0,%xmm9
  1002. psrlq $5,%xmm0
  1003. movdqa %xmm11,%xmm13
  1004. pxor %xmm12,%xmm4
  1005. pshufd $78,%xmm11,%xmm12
  1006. pxor %xmm9,%xmm0
  1007. pxor %xmm8,%xmm1
  1008. pxor %xmm11,%xmm12
  1009. .byte 102,69,15,58,68,222,0
  1010. psrlq $1,%xmm0
  1011. pxor %xmm1,%xmm0
  1012. movdqa %xmm0,%xmm1
  1013. .byte 102,69,15,58,68,238,17
  1014. xorps %xmm11,%xmm3
  1015. pshufd $78,%xmm0,%xmm8
  1016. pxor %xmm0,%xmm8
  1017. .byte 102,68,15,58,68,231,0
  1018. xorps %xmm13,%xmm5
  1019. leaq 64(%rdx),%rdx
  1020. subq $0x40,%rcx
  1021. jnc .Lmod4_loop
  1022. .Ltail4x:
  1023. .byte 102,65,15,58,68,199,0
  1024. .byte 102,65,15,58,68,207,17
  1025. .byte 102,68,15,58,68,199,16
  1026. xorps %xmm12,%xmm4
  1027. xorps %xmm3,%xmm0
  1028. xorps %xmm5,%xmm1
  1029. pxor %xmm0,%xmm1
  1030. pxor %xmm4,%xmm8
  1031. pxor %xmm1,%xmm8
  1032. pxor %xmm0,%xmm1
  1033. movdqa %xmm8,%xmm9
  1034. psrldq $8,%xmm8
  1035. pslldq $8,%xmm9
  1036. pxor %xmm8,%xmm1
  1037. pxor %xmm9,%xmm0
  1038. movdqa %xmm0,%xmm4
  1039. movdqa %xmm0,%xmm3
  1040. psllq $5,%xmm0
  1041. pxor %xmm0,%xmm3
  1042. psllq $1,%xmm0
  1043. pxor %xmm3,%xmm0
  1044. psllq $57,%xmm0
  1045. movdqa %xmm0,%xmm3
  1046. pslldq $8,%xmm0
  1047. psrldq $8,%xmm3
  1048. pxor %xmm4,%xmm0
  1049. pxor %xmm3,%xmm1
  1050. movdqa %xmm0,%xmm4
  1051. psrlq $1,%xmm0
  1052. pxor %xmm4,%xmm1
  1053. pxor %xmm0,%xmm4
  1054. psrlq $5,%xmm0
  1055. pxor %xmm4,%xmm0
  1056. psrlq $1,%xmm0
  1057. pxor %xmm1,%xmm0
  1058. addq $0x40,%rcx
  1059. jz .Ldone
  1060. movdqu 32(%rsi),%xmm7
  1061. subq $0x10,%rcx
  1062. jz .Lodd_tail
  1063. .Lskip4x:
  1064. movdqu (%rdx),%xmm8
  1065. movdqu 16(%rdx),%xmm3
  1066. .byte 102,69,15,56,0,194
  1067. .byte 102,65,15,56,0,218
  1068. pxor %xmm8,%xmm0
  1069. movdqa %xmm3,%xmm5
  1070. pshufd $78,%xmm3,%xmm4
  1071. pxor %xmm3,%xmm4
  1072. .byte 102,15,58,68,218,0
  1073. .byte 102,15,58,68,234,17
  1074. .byte 102,15,58,68,231,0
  1075. leaq 32(%rdx),%rdx
  1076. nop
  1077. subq $0x20,%rcx
  1078. jbe .Leven_tail
  1079. nop
  1080. jmp .Lmod_loop
  1081. .align 32
  1082. .Lmod_loop:
  1083. movdqa %xmm0,%xmm1
  1084. movdqa %xmm4,%xmm8
  1085. pshufd $78,%xmm0,%xmm4
  1086. pxor %xmm0,%xmm4
  1087. .byte 102,15,58,68,198,0
  1088. .byte 102,15,58,68,206,17
  1089. .byte 102,15,58,68,231,16
  1090. pxor %xmm3,%xmm0
  1091. pxor %xmm5,%xmm1
  1092. movdqu (%rdx),%xmm9
  1093. pxor %xmm0,%xmm8
  1094. .byte 102,69,15,56,0,202
  1095. movdqu 16(%rdx),%xmm3
  1096. pxor %xmm1,%xmm8
  1097. pxor %xmm9,%xmm1
  1098. pxor %xmm8,%xmm4
  1099. .byte 102,65,15,56,0,218
  1100. movdqa %xmm4,%xmm8
  1101. psrldq $8,%xmm8
  1102. pslldq $8,%xmm4
  1103. pxor %xmm8,%xmm1
  1104. pxor %xmm4,%xmm0
  1105. movdqa %xmm3,%xmm5
  1106. movdqa %xmm0,%xmm9
  1107. movdqa %xmm0,%xmm8
  1108. psllq $5,%xmm0
  1109. pxor %xmm0,%xmm8
  1110. .byte 102,15,58,68,218,0
  1111. psllq $1,%xmm0
  1112. pxor %xmm8,%xmm0
  1113. psllq $57,%xmm0
  1114. movdqa %xmm0,%xmm8
  1115. pslldq $8,%xmm0
  1116. psrldq $8,%xmm8
  1117. pxor %xmm9,%xmm0
  1118. pshufd $78,%xmm5,%xmm4
  1119. pxor %xmm8,%xmm1
  1120. pxor %xmm5,%xmm4
  1121. movdqa %xmm0,%xmm9
  1122. psrlq $1,%xmm0
  1123. .byte 102,15,58,68,234,17
  1124. pxor %xmm9,%xmm1
  1125. pxor %xmm0,%xmm9
  1126. psrlq $5,%xmm0
  1127. pxor %xmm9,%xmm0
  1128. leaq 32(%rdx),%rdx
  1129. psrlq $1,%xmm0
  1130. .byte 102,15,58,68,231,0
  1131. pxor %xmm1,%xmm0
  1132. subq $0x20,%rcx
  1133. ja .Lmod_loop
  1134. .Leven_tail:
  1135. movdqa %xmm0,%xmm1
  1136. movdqa %xmm4,%xmm8
  1137. pshufd $78,%xmm0,%xmm4
  1138. pxor %xmm0,%xmm4
  1139. .byte 102,15,58,68,198,0
  1140. .byte 102,15,58,68,206,17
  1141. .byte 102,15,58,68,231,16
  1142. pxor %xmm3,%xmm0
  1143. pxor %xmm5,%xmm1
  1144. pxor %xmm0,%xmm8
  1145. pxor %xmm1,%xmm8
  1146. pxor %xmm8,%xmm4
  1147. movdqa %xmm4,%xmm8
  1148. psrldq $8,%xmm8
  1149. pslldq $8,%xmm4
  1150. pxor %xmm8,%xmm1
  1151. pxor %xmm4,%xmm0
  1152. movdqa %xmm0,%xmm4
  1153. movdqa %xmm0,%xmm3
  1154. psllq $5,%xmm0
  1155. pxor %xmm0,%xmm3
  1156. psllq $1,%xmm0
  1157. pxor %xmm3,%xmm0
  1158. psllq $57,%xmm0
  1159. movdqa %xmm0,%xmm3
  1160. pslldq $8,%xmm0
  1161. psrldq $8,%xmm3
  1162. pxor %xmm4,%xmm0
  1163. pxor %xmm3,%xmm1
  1164. movdqa %xmm0,%xmm4
  1165. psrlq $1,%xmm0
  1166. pxor %xmm4,%xmm1
  1167. pxor %xmm0,%xmm4
  1168. psrlq $5,%xmm0
  1169. pxor %xmm4,%xmm0
  1170. psrlq $1,%xmm0
  1171. pxor %xmm1,%xmm0
  1172. testq %rcx,%rcx
  1173. jnz .Ldone
  1174. .Lodd_tail:
  1175. movdqu (%rdx),%xmm8
  1176. .byte 102,69,15,56,0,194
  1177. pxor %xmm8,%xmm0
  1178. movdqa %xmm0,%xmm1
  1179. pshufd $78,%xmm0,%xmm3
  1180. pxor %xmm0,%xmm3
  1181. .byte 102,15,58,68,194,0
  1182. .byte 102,15,58,68,202,17
  1183. .byte 102,15,58,68,223,0
  1184. pxor %xmm0,%xmm3
  1185. pxor %xmm1,%xmm3
  1186. movdqa %xmm3,%xmm4
  1187. psrldq $8,%xmm3
  1188. pslldq $8,%xmm4
  1189. pxor %xmm3,%xmm1
  1190. pxor %xmm4,%xmm0
  1191. movdqa %xmm0,%xmm4
  1192. movdqa %xmm0,%xmm3
  1193. psllq $5,%xmm0
  1194. pxor %xmm0,%xmm3
  1195. psllq $1,%xmm0
  1196. pxor %xmm3,%xmm0
  1197. psllq $57,%xmm0
  1198. movdqa %xmm0,%xmm3
  1199. pslldq $8,%xmm0
  1200. psrldq $8,%xmm3
  1201. pxor %xmm4,%xmm0
  1202. pxor %xmm3,%xmm1
  1203. movdqa %xmm0,%xmm4
  1204. psrlq $1,%xmm0
  1205. pxor %xmm4,%xmm1
  1206. pxor %xmm0,%xmm4
  1207. psrlq $5,%xmm0
  1208. pxor %xmm4,%xmm0
  1209. psrlq $1,%xmm0
  1210. pxor %xmm1,%xmm0
  1211. .Ldone:
  1212. .byte 102,65,15,56,0,194
  1213. movdqu %xmm0,(%rdi)
  1214. .byte 0xf3,0xc3
  1215. .cfi_endproc
  1216. .size gcm_ghash_clmul,.-gcm_ghash_clmul
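# gcm_init_avx(Htable, H): AVX counterpart of gcm_init_clmul; it
# precomputes powers of H through H^8 (plus Karatsuba helpers) for the
# eight-block loop in gcm_ghash_avx.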
  1217. .globl gcm_init_avx
  1218. .type gcm_init_avx,@function
  1219. .align 32
  1220. gcm_init_avx:
  1221. .cfi_startproc
  1222. vzeroupper
  1223. vmovdqu (%rsi),%xmm2
  1224. vpshufd $78,%xmm2,%xmm2
  1225. vpshufd $255,%xmm2,%xmm4
  1226. vpsrlq $63,%xmm2,%xmm3
  1227. vpsllq $1,%xmm2,%xmm2
  1228. vpxor %xmm5,%xmm5,%xmm5
  1229. vpcmpgtd %xmm4,%xmm5,%xmm5
  1230. vpslldq $8,%xmm3,%xmm3
  1231. vpor %xmm3,%xmm2,%xmm2
  1232. vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
  1233. vpxor %xmm5,%xmm2,%xmm2
  1234. vpunpckhqdq %xmm2,%xmm2,%xmm6
  1235. vmovdqa %xmm2,%xmm0
  1236. vpxor %xmm2,%xmm6,%xmm6
  1237. movq $4,%r10
  1238. jmp .Linit_start_avx
  1239. .align 32
  1240. .Linit_loop_avx:
  1241. vpalignr $8,%xmm3,%xmm4,%xmm5
  1242. vmovdqu %xmm5,-16(%rdi)
  1243. vpunpckhqdq %xmm0,%xmm0,%xmm3
  1244. vpxor %xmm0,%xmm3,%xmm3
  1245. vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
  1246. vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
  1247. vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
  1248. vpxor %xmm0,%xmm1,%xmm4
  1249. vpxor %xmm4,%xmm3,%xmm3
  1250. vpslldq $8,%xmm3,%xmm4
  1251. vpsrldq $8,%xmm3,%xmm3
  1252. vpxor %xmm4,%xmm0,%xmm0
  1253. vpxor %xmm3,%xmm1,%xmm1
  1254. vpsllq $57,%xmm0,%xmm3
  1255. vpsllq $62,%xmm0,%xmm4
  1256. vpxor %xmm3,%xmm4,%xmm4
  1257. vpsllq $63,%xmm0,%xmm3
  1258. vpxor %xmm3,%xmm4,%xmm4
  1259. vpslldq $8,%xmm4,%xmm3
  1260. vpsrldq $8,%xmm4,%xmm4
  1261. vpxor %xmm3,%xmm0,%xmm0
  1262. vpxor %xmm4,%xmm1,%xmm1
  1263. vpsrlq $1,%xmm0,%xmm4
  1264. vpxor %xmm0,%xmm1,%xmm1
  1265. vpxor %xmm4,%xmm0,%xmm0
  1266. vpsrlq $5,%xmm4,%xmm4
  1267. vpxor %xmm4,%xmm0,%xmm0
  1268. vpsrlq $1,%xmm0,%xmm0
  1269. vpxor %xmm1,%xmm0,%xmm0
  1270. .Linit_start_avx:
  1271. vmovdqa %xmm0,%xmm5
  1272. vpunpckhqdq %xmm0,%xmm0,%xmm3
  1273. vpxor %xmm0,%xmm3,%xmm3
  1274. vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
  1275. vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
  1276. vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
  1277. vpxor %xmm0,%xmm1,%xmm4
  1278. vpxor %xmm4,%xmm3,%xmm3
  1279. vpslldq $8,%xmm3,%xmm4
  1280. vpsrldq $8,%xmm3,%xmm3
  1281. vpxor %xmm4,%xmm0,%xmm0
  1282. vpxor %xmm3,%xmm1,%xmm1
  1283. vpsllq $57,%xmm0,%xmm3
  1284. vpsllq $62,%xmm0,%xmm4
  1285. vpxor %xmm3,%xmm4,%xmm4
  1286. vpsllq $63,%xmm0,%xmm3
  1287. vpxor %xmm3,%xmm4,%xmm4
  1288. vpslldq $8,%xmm4,%xmm3
  1289. vpsrldq $8,%xmm4,%xmm4
  1290. vpxor %xmm3,%xmm0,%xmm0
  1291. vpxor %xmm4,%xmm1,%xmm1
  1292. vpsrlq $1,%xmm0,%xmm4
  1293. vpxor %xmm0,%xmm1,%xmm1
  1294. vpxor %xmm4,%xmm0,%xmm0
  1295. vpsrlq $5,%xmm4,%xmm4
  1296. vpxor %xmm4,%xmm0,%xmm0
  1297. vpsrlq $1,%xmm0,%xmm0
  1298. vpxor %xmm1,%xmm0,%xmm0
  1299. vpshufd $78,%xmm5,%xmm3
  1300. vpshufd $78,%xmm0,%xmm4
  1301. vpxor %xmm5,%xmm3,%xmm3
  1302. vmovdqu %xmm5,0(%rdi)
  1303. vpxor %xmm0,%xmm4,%xmm4
  1304. vmovdqu %xmm0,16(%rdi)
  1305. leaq 48(%rdi),%rdi
  1306. subq $1,%r10
  1307. jnz .Linit_loop_avx
  1308. vpalignr $8,%xmm4,%xmm3,%xmm5
  1309. vmovdqu %xmm5,-16(%rdi)
  1310. vzeroupper
  1311. .byte 0xf3,0xc3
  1312. .cfi_endproc
  1313. .size gcm_init_avx,.-gcm_init_avx
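# gcm_gmult_avx: a single-block multiply gains nothing from AVX here, so
# this entry point simply tail-jumps to .L_gmult_clmul.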
  1314. .globl gcm_gmult_avx
  1315. .type gcm_gmult_avx,@function
  1316. .align 32
  1317. gcm_gmult_avx:
  1318. .cfi_startproc
  1319. jmp .L_gmult_clmul
  1320. .cfi_endproc
  1321. .size gcm_gmult_avx,.-gcm_gmult_avx
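# gcm_ghash_avx(Xi, Htable, inp, len): VEX-encoded PCLMULQDQ version
# aggregating eight blocks (128 bytes) per reduction in .Loop8x_avx;
# .Lshort_avx/.Ltail_avx handle inputs shorter than 128 bytes and the
# final partial chunk.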
  1322. .globl gcm_ghash_avx
  1323. .type gcm_ghash_avx,@function
  1324. .align 32
  1325. gcm_ghash_avx:
  1326. .cfi_startproc
  1327. vzeroupper
  1328. vmovdqu (%rdi),%xmm10
  1329. leaq .L0x1c2_polynomial(%rip),%r10
  1330. leaq 64(%rsi),%rsi
  1331. vmovdqu .Lbswap_mask(%rip),%xmm13
  1332. vpshufb %xmm13,%xmm10,%xmm10
  1333. cmpq $0x80,%rcx
  1334. jb .Lshort_avx
  1335. subq $0x80,%rcx
  1336. vmovdqu 112(%rdx),%xmm14
  1337. vmovdqu 0-64(%rsi),%xmm6
  1338. vpshufb %xmm13,%xmm14,%xmm14
  1339. vmovdqu 32-64(%rsi),%xmm7
  1340. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1341. vmovdqu 96(%rdx),%xmm15
  1342. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1343. vpxor %xmm14,%xmm9,%xmm9
  1344. vpshufb %xmm13,%xmm15,%xmm15
  1345. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1346. vmovdqu 16-64(%rsi),%xmm6
  1347. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1348. vmovdqu 80(%rdx),%xmm14
  1349. vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
  1350. vpxor %xmm15,%xmm8,%xmm8
  1351. vpshufb %xmm13,%xmm14,%xmm14
  1352. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
  1353. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1354. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
  1355. vmovdqu 48-64(%rsi),%xmm6
  1356. vpxor %xmm14,%xmm9,%xmm9
  1357. vmovdqu 64(%rdx),%xmm15
  1358. vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
  1359. vmovdqu 80-64(%rsi),%xmm7
  1360. vpshufb %xmm13,%xmm15,%xmm15
  1361. vpxor %xmm0,%xmm3,%xmm3
  1362. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1363. vpxor %xmm1,%xmm4,%xmm4
  1364. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1365. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1366. vmovdqu 64-64(%rsi),%xmm6
  1367. vpxor %xmm2,%xmm5,%xmm5
  1368. vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
  1369. vpxor %xmm15,%xmm8,%xmm8
  1370. vmovdqu 48(%rdx),%xmm14
  1371. vpxor %xmm3,%xmm0,%xmm0
  1372. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
  1373. vpxor %xmm4,%xmm1,%xmm1
  1374. vpshufb %xmm13,%xmm14,%xmm14
  1375. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
  1376. vmovdqu 96-64(%rsi),%xmm6
  1377. vpxor %xmm5,%xmm2,%xmm2
  1378. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1379. vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
  1380. vmovdqu 128-64(%rsi),%xmm7
  1381. vpxor %xmm14,%xmm9,%xmm9
  1382. vmovdqu 32(%rdx),%xmm15
  1383. vpxor %xmm0,%xmm3,%xmm3
  1384. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1385. vpxor %xmm1,%xmm4,%xmm4
  1386. vpshufb %xmm13,%xmm15,%xmm15
  1387. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1388. vmovdqu 112-64(%rsi),%xmm6
  1389. vpxor %xmm2,%xmm5,%xmm5
  1390. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1391. vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
  1392. vpxor %xmm15,%xmm8,%xmm8
  1393. vmovdqu 16(%rdx),%xmm14
  1394. vpxor %xmm3,%xmm0,%xmm0
  1395. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
  1396. vpxor %xmm4,%xmm1,%xmm1
  1397. vpshufb %xmm13,%xmm14,%xmm14
  1398. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
  1399. vmovdqu 144-64(%rsi),%xmm6
  1400. vpxor %xmm5,%xmm2,%xmm2
  1401. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1402. vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
  1403. vmovdqu 176-64(%rsi),%xmm7
  1404. vpxor %xmm14,%xmm9,%xmm9
  1405. vmovdqu (%rdx),%xmm15
  1406. vpxor %xmm0,%xmm3,%xmm3
  1407. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1408. vpxor %xmm1,%xmm4,%xmm4
  1409. vpshufb %xmm13,%xmm15,%xmm15
  1410. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1411. vmovdqu 160-64(%rsi),%xmm6
  1412. vpxor %xmm2,%xmm5,%xmm5
  1413. vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
  1414. leaq 128(%rdx),%rdx
  1415. cmpq $0x80,%rcx
  1416. jb .Ltail_avx
  1417. vpxor %xmm10,%xmm15,%xmm15
  1418. subq $0x80,%rcx
  1419. jmp .Loop8x_avx
  1420. .align 32
  1421. .Loop8x_avx:
  1422. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1423. vmovdqu 112(%rdx),%xmm14
  1424. vpxor %xmm0,%xmm3,%xmm3
  1425. vpxor %xmm15,%xmm8,%xmm8
  1426. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
  1427. vpshufb %xmm13,%xmm14,%xmm14
  1428. vpxor %xmm1,%xmm4,%xmm4
  1429. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
  1430. vmovdqu 0-64(%rsi),%xmm6
  1431. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1432. vpxor %xmm2,%xmm5,%xmm5
  1433. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
  1434. vmovdqu 32-64(%rsi),%xmm7
  1435. vpxor %xmm14,%xmm9,%xmm9
  1436. vmovdqu 96(%rdx),%xmm15
  1437. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1438. vpxor %xmm3,%xmm10,%xmm10
  1439. vpshufb %xmm13,%xmm15,%xmm15
  1440. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1441. vxorps %xmm4,%xmm11,%xmm11
  1442. vmovdqu 16-64(%rsi),%xmm6
  1443. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1444. vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
  1445. vpxor %xmm5,%xmm12,%xmm12
  1446. vxorps %xmm15,%xmm8,%xmm8
  1447. vmovdqu 80(%rdx),%xmm14
  1448. vpxor %xmm10,%xmm12,%xmm12
  1449. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
  1450. vpxor %xmm11,%xmm12,%xmm12
  1451. vpslldq $8,%xmm12,%xmm9
  1452. vpxor %xmm0,%xmm3,%xmm3
  1453. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
  1454. vpsrldq $8,%xmm12,%xmm12
  1455. vpxor %xmm9,%xmm10,%xmm10
  1456. vmovdqu 48-64(%rsi),%xmm6
  1457. vpshufb %xmm13,%xmm14,%xmm14
  1458. vxorps %xmm12,%xmm11,%xmm11
  1459. vpxor %xmm1,%xmm4,%xmm4
  1460. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1461. vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
  1462. vmovdqu 80-64(%rsi),%xmm7
  1463. vpxor %xmm14,%xmm9,%xmm9
  1464. vpxor %xmm2,%xmm5,%xmm5
  1465. vmovdqu 64(%rdx),%xmm15
  1466. vpalignr $8,%xmm10,%xmm10,%xmm12
  1467. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1468. vpshufb %xmm13,%xmm15,%xmm15
  1469. vpxor %xmm3,%xmm0,%xmm0
  1470. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1471. vmovdqu 64-64(%rsi),%xmm6
  1472. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1473. vpxor %xmm4,%xmm1,%xmm1
  1474. vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
  1475. vxorps %xmm15,%xmm8,%xmm8
  1476. vpxor %xmm5,%xmm2,%xmm2
  1477. vmovdqu 48(%rdx),%xmm14
  1478. vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
  1479. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
  1480. vpshufb %xmm13,%xmm14,%xmm14
  1481. vpxor %xmm0,%xmm3,%xmm3
  1482. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
  1483. vmovdqu 96-64(%rsi),%xmm6
  1484. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1485. vpxor %xmm1,%xmm4,%xmm4
  1486. vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
  1487. vmovdqu 128-64(%rsi),%xmm7
  1488. vpxor %xmm14,%xmm9,%xmm9
  1489. vpxor %xmm2,%xmm5,%xmm5
  1490. vmovdqu 32(%rdx),%xmm15
  1491. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1492. vpshufb %xmm13,%xmm15,%xmm15
  1493. vpxor %xmm3,%xmm0,%xmm0
  1494. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1495. vmovdqu 112-64(%rsi),%xmm6
  1496. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1497. vpxor %xmm4,%xmm1,%xmm1
  1498. vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
  1499. vpxor %xmm15,%xmm8,%xmm8
  1500. vpxor %xmm5,%xmm2,%xmm2
  1501. vxorps %xmm12,%xmm10,%xmm10
  1502. vmovdqu 16(%rdx),%xmm14
  1503. vpalignr $8,%xmm10,%xmm10,%xmm12
  1504. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
  1505. vpshufb %xmm13,%xmm14,%xmm14
  1506. vpxor %xmm0,%xmm3,%xmm3
  1507. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
  1508. vmovdqu 144-64(%rsi),%xmm6
  1509. vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
  1510. vxorps %xmm11,%xmm12,%xmm12
  1511. vpunpckhqdq %xmm14,%xmm14,%xmm9
  1512. vpxor %xmm1,%xmm4,%xmm4
  1513. vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
  1514. vmovdqu 176-64(%rsi),%xmm7
  1515. vpxor %xmm14,%xmm9,%xmm9
  1516. vpxor %xmm2,%xmm5,%xmm5
  1517. vmovdqu (%rdx),%xmm15
  1518. vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
  1519. vpshufb %xmm13,%xmm15,%xmm15
  1520. vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
  1521. vmovdqu 160-64(%rsi),%xmm6
  1522. vpxor %xmm12,%xmm15,%xmm15
  1523. vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
  1524. vpxor %xmm10,%xmm15,%xmm15
  1525. leaq 128(%rdx),%rdx
  1526. subq $0x80,%rcx
  1527. jnc .Loop8x_avx
  1528. addq $0x80,%rcx
  1529. jmp .Ltail_no_xor_avx
  1530. .align 32
  1531. .Lshort_avx:
  1532. vmovdqu -16(%rdx,%rcx,1),%xmm14
  1533. leaq (%rdx,%rcx,1),%rdx
  1534. vmovdqu 0-64(%rsi),%xmm6
  1535. vmovdqu 32-64(%rsi),%xmm7
  1536. vpshufb %xmm13,%xmm14,%xmm15
  1537. vmovdqa %xmm0,%xmm3
  1538. vmovdqa %xmm1,%xmm4
  1539. vmovdqa %xmm2,%xmm5
  1540. subq $0x10,%rcx
  1541. jz .Ltail_avx
  1542. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1543. vpxor %xmm0,%xmm3,%xmm3
  1544. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1545. vpxor %xmm15,%xmm8,%xmm8
  1546. vmovdqu -32(%rdx),%xmm14
  1547. vpxor %xmm1,%xmm4,%xmm4
  1548. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1549. vmovdqu 16-64(%rsi),%xmm6
  1550. vpshufb %xmm13,%xmm14,%xmm15
  1551. vpxor %xmm2,%xmm5,%xmm5
  1552. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1553. vpsrldq $8,%xmm7,%xmm7
  1554. subq $0x10,%rcx
  1555. jz .Ltail_avx
  1556. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1557. vpxor %xmm0,%xmm3,%xmm3
  1558. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1559. vpxor %xmm15,%xmm8,%xmm8
  1560. vmovdqu -48(%rdx),%xmm14
  1561. vpxor %xmm1,%xmm4,%xmm4
  1562. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1563. vmovdqu 48-64(%rsi),%xmm6
  1564. vpshufb %xmm13,%xmm14,%xmm15
  1565. vpxor %xmm2,%xmm5,%xmm5
  1566. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1567. vmovdqu 80-64(%rsi),%xmm7
  1568. subq $0x10,%rcx
  1569. jz .Ltail_avx
  1570. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1571. vpxor %xmm0,%xmm3,%xmm3
  1572. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1573. vpxor %xmm15,%xmm8,%xmm8
  1574. vmovdqu -64(%rdx),%xmm14
  1575. vpxor %xmm1,%xmm4,%xmm4
  1576. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1577. vmovdqu 64-64(%rsi),%xmm6
  1578. vpshufb %xmm13,%xmm14,%xmm15
  1579. vpxor %xmm2,%xmm5,%xmm5
  1580. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1581. vpsrldq $8,%xmm7,%xmm7
  1582. subq $0x10,%rcx
  1583. jz .Ltail_avx
  1584. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1585. vpxor %xmm0,%xmm3,%xmm3
  1586. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1587. vpxor %xmm15,%xmm8,%xmm8
  1588. vmovdqu -80(%rdx),%xmm14
  1589. vpxor %xmm1,%xmm4,%xmm4
  1590. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1591. vmovdqu 96-64(%rsi),%xmm6
  1592. vpshufb %xmm13,%xmm14,%xmm15
  1593. vpxor %xmm2,%xmm5,%xmm5
  1594. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1595. vmovdqu 128-64(%rsi),%xmm7
  1596. subq $0x10,%rcx
  1597. jz .Ltail_avx
  1598. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1599. vpxor %xmm0,%xmm3,%xmm3
  1600. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1601. vpxor %xmm15,%xmm8,%xmm8
  1602. vmovdqu -96(%rdx),%xmm14
  1603. vpxor %xmm1,%xmm4,%xmm4
  1604. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1605. vmovdqu 112-64(%rsi),%xmm6
  1606. vpshufb %xmm13,%xmm14,%xmm15
  1607. vpxor %xmm2,%xmm5,%xmm5
  1608. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1609. vpsrldq $8,%xmm7,%xmm7
  1610. subq $0x10,%rcx
  1611. jz .Ltail_avx
  1612. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1613. vpxor %xmm0,%xmm3,%xmm3
  1614. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1615. vpxor %xmm15,%xmm8,%xmm8
  1616. vmovdqu -112(%rdx),%xmm14
  1617. vpxor %xmm1,%xmm4,%xmm4
  1618. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1619. vmovdqu 144-64(%rsi),%xmm6
  1620. vpshufb %xmm13,%xmm14,%xmm15
  1621. vpxor %xmm2,%xmm5,%xmm5
  1622. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1623. vmovq 184-64(%rsi),%xmm7
  1624. subq $0x10,%rcx
  1625. jmp .Ltail_avx
  1626. .align 32
  1627. .Ltail_avx:
  1628. vpxor %xmm10,%xmm15,%xmm15
  1629. .Ltail_no_xor_avx:
  1630. vpunpckhqdq %xmm15,%xmm15,%xmm8
  1631. vpxor %xmm0,%xmm3,%xmm3
  1632. vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
  1633. vpxor %xmm15,%xmm8,%xmm8
  1634. vpxor %xmm1,%xmm4,%xmm4
  1635. vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
  1636. vpxor %xmm2,%xmm5,%xmm5
  1637. vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
  1638. vmovdqu (%r10),%xmm12
  1639. vpxor %xmm0,%xmm3,%xmm10
  1640. vpxor %xmm1,%xmm4,%xmm11
  1641. vpxor %xmm2,%xmm5,%xmm5
  1642. vpxor %xmm10,%xmm5,%xmm5
  1643. vpxor %xmm11,%xmm5,%xmm5
  1644. vpslldq $8,%xmm5,%xmm9
  1645. vpsrldq $8,%xmm5,%xmm5
  1646. vpxor %xmm9,%xmm10,%xmm10
  1647. vpxor %xmm5,%xmm11,%xmm11
  1648. vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
  1649. vpalignr $8,%xmm10,%xmm10,%xmm10
  1650. vpxor %xmm9,%xmm10,%xmm10
  1651. vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
  1652. vpalignr $8,%xmm10,%xmm10,%xmm10
  1653. vpxor %xmm11,%xmm10,%xmm10
  1654. vpxor %xmm9,%xmm10,%xmm10
  1655. cmpq $0,%rcx
  1656. jne .Lshort_avx
  1657. vpshufb %xmm13,%xmm10,%xmm10
  1658. vmovdqu %xmm10,(%rdi)
  1659. vzeroupper
  1660. .byte 0xf3,0xc3
  1661. .cfi_endproc
  1662. .size gcm_ghash_avx,.-gcm_ghash_avx
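# Constant data: .Lbswap_mask is the PSHUFB byte-reversal mask,
# .L0x1c2_polynomial is the reflected GCM reduction constant,
# .L7_mask/.L7_mask_poly are small masks for the clmul reduction path,
# and .Lrem_4bit/.Lrem_8bit are the remainder tables used by the
# table-driven code.  The trailing .byte string is ASCII for
# "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>".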
  1663. .align 64
  1664. .Lbswap_mask:
  1665. .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  1666. .L0x1c2_polynomial:
  1667. .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  1668. .L7_mask:
  1669. .long 7,0,7,0
  1670. .L7_mask_poly:
  1671. .long 7,0,450,0
  1672. .align 64
  1673. .type .Lrem_4bit,@object
  1674. .Lrem_4bit:
  1675. .long 0,0,0,471859200,0,943718400,0,610271232
  1676. .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
  1677. .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
  1678. .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
  1679. .type .Lrem_8bit,@object
  1680. .Lrem_8bit:
  1681. .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
  1682. .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
  1683. .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
  1684. .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
  1685. .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
  1686. .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
  1687. .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
  1688. .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
  1689. .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
  1690. .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
  1691. .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
  1692. .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
  1693. .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
  1694. .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
  1695. .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
  1696. .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
  1697. .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
  1698. .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
  1699. .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
  1700. .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
  1701. .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
  1702. .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
  1703. .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
  1704. .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
  1705. .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
  1706. .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
  1707. .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
  1708. .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
  1709. .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
  1710. .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
  1711. .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
  1712. .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
  1713. .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  1714. .align 64