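# rsaz-x86_64.s
#
# This listing matches the ELF assembler output of OpenSSL's
# crypto/bn/asm/rsaz-x86_64.pl (RSAZ: 512-bit modular arithmetic for
# 1024-bit RSA, originally contributed by Intel).  Each public entry
# point comes in two flavors: a baseline mulq/adcq path and a faster
# path using the BMI2 MULX and ADX ADCX/ADOX instructions.  The
# dispatch test
#	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
#	cmpl	$0x80100,%r11d
# checks CPUID.7:EBX bits 8 (BMI2) and 19 (ADX).  The raw .byte
# sequences throughout are real instructions (mostly movq between
# general-purpose and %xmm registers, plus a few mulx forms) that the
# perlasm generator emits as bytes for old-assembler compatibility.
#
# rsaz_512_sqr(ret, a, mod, n0, cnt) squares a 512-bit value cnt
# times: rdi=ret, rsi=a, rdx=mod (parked in %xmm1), rcx=n0 (saved at
# 128(%rsp)), r8d=cnt (saved at 128+8(%rsp)).  After each reduction
# and conditional subtraction, %rsi is pointed at the result so the
# next iteration squares it.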
.text
.globl rsaz_512_sqr
.type rsaz_512_sqr,@function
.align 32
rsaz_512_sqr:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lsqr_body:
.byte 102,72,15,110,202
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Loop_sqrx
jmp .Loop_sqr
.align 32
.Loop_sqr:
movl %r8d,128+8(%rsp)
movq %rdx,%rbx
movq %rax,%rbp
mulq %rdx
movq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
mulq %rbx
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
adcq $0,%r12
mulq %rbx
addq %rax,%r12
movq 48(%rsi),%rax
movq %rdx,%r13
adcq $0,%r13
mulq %rbx
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
addq %rax,%r14
movq %rbx,%rax
adcq $0,%rdx
xorq %rcx,%rcx
addq %r8,%r8
movq %rdx,%r15
adcq $0,%rcx
mulq %rax
addq %r8,%rdx
adcq $0,%rcx
movq %rax,(%rsp)
movq %rdx,8(%rsp)
movq 16(%rsi),%rax
mulq %rbp
addq %rax,%r10
movq 24(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx
mulq %rbp
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r11
movq %rdx,%rbx
adcq $0,%rbx
mulq %rbp
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r12
movq %rdx,%rbx
adcq $0,%rbx
mulq %rbp
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r13
movq %rdx,%rbx
adcq $0,%rbx
mulq %rbp
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r14
movq %rdx,%rbx
adcq $0,%rbx
mulq %rbp
addq %rax,%r15
movq %rbp,%rax
adcq $0,%rdx
addq %rbx,%r15
adcq $0,%rdx
xorq %rbx,%rbx
addq %r9,%r9
movq %rdx,%r8
adcq %r10,%r10
adcq $0,%rbx
mulq %rax
addq %rcx,%rax
movq 16(%rsi),%rbp
addq %rax,%r9
movq 24(%rsi),%rax
adcq %rdx,%r10
adcq $0,%rbx
movq %r9,16(%rsp)
movq %r10,24(%rsp)
mulq %rbp
addq %rax,%r12
movq 32(%rsi),%rax
movq %rdx,%rcx
adcq $0,%rcx
mulq %rbp
addq %rax,%r13
movq 40(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r13
movq %rdx,%rcx
adcq $0,%rcx
mulq %rbp
addq %rax,%r14
movq 48(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r14
movq %rdx,%rcx
adcq $0,%rcx
mulq %rbp
addq %rax,%r15
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r15
movq %rdx,%rcx
adcq $0,%rcx
mulq %rbp
addq %rax,%r8
movq %rbp,%rax
adcq $0,%rdx
addq %rcx,%r8
adcq $0,%rdx
xorq %rcx,%rcx
addq %r11,%r11
movq %rdx,%r9
adcq %r12,%r12
adcq $0,%rcx
mulq %rax
addq %rbx,%rax
movq 24(%rsi),%r10
addq %rax,%r11
movq 32(%rsi),%rax
adcq %rdx,%r12
adcq $0,%rcx
movq %r11,32(%rsp)
movq %r12,40(%rsp)
movq %rax,%r11
mulq %r10
addq %rax,%r14
movq 40(%rsi),%rax
movq %rdx,%rbx
adcq $0,%rbx
movq %rax,%r12
mulq %r10
addq %rax,%r15
movq 48(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r15
movq %rdx,%rbx
adcq $0,%rbx
movq %rax,%rbp
mulq %r10
addq %rax,%r8
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rbx,%r8
movq %rdx,%rbx
adcq $0,%rbx
mulq %r10
addq %rax,%r9
movq %r10,%rax
adcq $0,%rdx
addq %rbx,%r9
adcq $0,%rdx
xorq %rbx,%rbx
addq %r13,%r13
movq %rdx,%r10
adcq %r14,%r14
adcq $0,%rbx
mulq %rax
addq %rcx,%rax
addq %rax,%r13
movq %r12,%rax
adcq %rdx,%r14
adcq $0,%rbx
movq %r13,48(%rsp)
movq %r14,56(%rsp)
mulq %r11
addq %rax,%r8
movq %rbp,%rax
movq %rdx,%rcx
adcq $0,%rcx
mulq %r11
addq %rax,%r9
movq 56(%rsi),%rax
adcq $0,%rdx
addq %rcx,%r9
movq %rdx,%rcx
adcq $0,%rcx
movq %rax,%r14
mulq %r11
addq %rax,%r10
movq %r11,%rax
adcq $0,%rdx
addq %rcx,%r10
adcq $0,%rdx
xorq %rcx,%rcx
addq %r15,%r15
movq %rdx,%r11
adcq %r8,%r8
adcq $0,%rcx
mulq %rax
addq %rbx,%rax
addq %rax,%r15
movq %rbp,%rax
adcq %rdx,%r8
adcq $0,%rcx
movq %r15,64(%rsp)
movq %r8,72(%rsp)
mulq %r12
addq %rax,%r10
movq %r14,%rax
movq %rdx,%rbx
adcq $0,%rbx
mulq %r12
addq %rax,%r11
movq %r12,%rax
adcq $0,%rdx
addq %rbx,%r11
adcq $0,%rdx
xorq %rbx,%rbx
addq %r9,%r9
movq %rdx,%r12
adcq %r10,%r10
adcq $0,%rbx
mulq %rax
addq %rcx,%rax
addq %rax,%r9
movq %r14,%rax
adcq %rdx,%r10
adcq $0,%rbx
movq %r9,80(%rsp)
movq %r10,88(%rsp)
mulq %rbp
addq %rax,%r12
movq %rbp,%rax
adcq $0,%rdx
xorq %rcx,%rcx
addq %r11,%r11
movq %rdx,%r13
adcq %r12,%r12
adcq $0,%rcx
mulq %rax
addq %rbx,%rax
addq %rax,%r11
movq %r14,%rax
adcq %rdx,%r12
adcq $0,%rcx
movq %r11,96(%rsp)
movq %r12,104(%rsp)
xorq %rbx,%rbx
addq %r13,%r13
adcq $0,%rbx
mulq %rax
addq %rcx,%rax
addq %r13,%rax
adcq %rbx,%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
.byte 102,72,15,126,205
movq %rax,112(%rsp)
movq %rdx,120(%rsp)
call __rsaz_512_reduce
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx
call __rsaz_512_subtract
movq %r8,%rdx
movq %r9,%rax
movl 128+8(%rsp),%r8d
movq %rdi,%rsi
decl %r8d
jnz .Loop_sqr
jmp .Lsqr_tail
.align 32
.Loop_sqrx:
movl %r8d,128+8(%rsp)
.byte 102,72,15,110,199
mulxq %rax,%r8,%r9
movq %rax,%rbx
mulxq 16(%rsi),%rcx,%r10
xorq %rbp,%rbp
mulxq 24(%rsi),%rax,%r11
adcxq %rcx,%r9
.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
adcxq %rax,%r10
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
adcxq %rcx,%r11
mulxq 48(%rsi),%rcx,%r14
adcxq %rax,%r12
adcxq %rcx,%r13
mulxq 56(%rsi),%rax,%r15
adcxq %rax,%r14
adcxq %rbp,%r15
mulxq %rdx,%rax,%rdi
movq %rbx,%rdx
xorq %rcx,%rcx
adoxq %r8,%r8
adcxq %rdi,%r8
adoxq %rbp,%rcx
adcxq %rbp,%rcx
movq %rax,(%rsp)
movq %r8,8(%rsp)
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
adoxq %rax,%r10
adcxq %rbx,%r11
mulxq 24(%rsi),%rdi,%r8
adoxq %rdi,%r11
.byte 0x66
adcxq %r8,%r12
mulxq 32(%rsi),%rax,%rbx
adoxq %rax,%r12
adcxq %rbx,%r13
mulxq 40(%rsi),%rdi,%r8
adoxq %rdi,%r13
adcxq %r8,%r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
adoxq %rax,%r14
adcxq %rbx,%r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
adoxq %rdi,%r15
adcxq %rbp,%r8
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r8
.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
xorq %rbx,%rbx
adoxq %r9,%r9
adcxq %rcx,%rax
adoxq %r10,%r10
adcxq %rax,%r9
adoxq %rbp,%rbx
adcxq %rdi,%r10
adcxq %rbp,%rbx
movq %r9,16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
mulxq 24(%rsi),%rdi,%r9
adoxq %rdi,%r12
adcxq %r9,%r13
mulxq 32(%rsi),%rax,%rcx
adoxq %rax,%r13
adcxq %rcx,%r14
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
adoxq %rdi,%r14
adcxq %r9,%r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
adoxq %rax,%r15
adcxq %rcx,%r8
mulxq 56(%rsi),%rdi,%r9
adoxq %rdi,%r8
adcxq %rbp,%r9
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r9
movq 24(%rsi),%rdx
xorq %rcx,%rcx
adoxq %r11,%r11
adcxq %rbx,%rax
adoxq %r12,%r12
adcxq %rax,%r11
adoxq %rbp,%rcx
adcxq %rdi,%r12
adcxq %rbp,%rcx
movq %r11,32(%rsp)
movq %r12,40(%rsp)
mulxq 32(%rsi),%rax,%rbx
adoxq %rax,%r14
adcxq %rbx,%r15
mulxq 40(%rsi),%rdi,%r10
adoxq %rdi,%r15
adcxq %r10,%r8
mulxq 48(%rsi),%rax,%rbx
adoxq %rax,%r8
adcxq %rbx,%r9
mulxq 56(%rsi),%rdi,%r10
adoxq %rdi,%r9
adcxq %rbp,%r10
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r10
movq 32(%rsi),%rdx
xorq %rbx,%rbx
adoxq %r13,%r13
adcxq %rcx,%rax
adoxq %r14,%r14
adcxq %rax,%r13
adoxq %rbp,%rbx
adcxq %rdi,%r14
adcxq %rbp,%rbx
movq %r13,48(%rsp)
movq %r14,56(%rsp)
mulxq 40(%rsi),%rdi,%r11
adoxq %rdi,%r8
adcxq %r11,%r9
mulxq 48(%rsi),%rax,%rcx
adoxq %rax,%r9
adcxq %rcx,%r10
mulxq 56(%rsi),%rdi,%r11
adoxq %rdi,%r10
adcxq %rbp,%r11
mulxq %rdx,%rax,%rdi
movq 40(%rsi),%rdx
adoxq %rbp,%r11
xorq %rcx,%rcx
adoxq %r15,%r15
adcxq %rbx,%rax
adoxq %r8,%r8
adcxq %rax,%r15
adoxq %rbp,%rcx
adcxq %rdi,%r8
adcxq %rbp,%rcx
movq %r15,64(%rsp)
movq %r8,72(%rsp)
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
adoxq %rax,%r10
adcxq %rbx,%r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
adoxq %rdi,%r11
adcxq %rbp,%r12
mulxq %rdx,%rax,%rdi
adoxq %rbp,%r12
movq 48(%rsi),%rdx
xorq %rbx,%rbx
adoxq %r9,%r9
adcxq %rcx,%rax
adoxq %r10,%r10
adcxq %rax,%r9
adcxq %rdi,%r10
adoxq %rbp,%rbx
adcxq %rbp,%rbx
movq %r9,80(%rsp)
movq %r10,88(%rsp)
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
adoxq %rax,%r12
adoxq %rbp,%r13
mulxq %rdx,%rax,%rdi
xorq %rcx,%rcx
movq 56(%rsi),%rdx
adoxq %r11,%r11
adcxq %rbx,%rax
adoxq %r12,%r12
adcxq %rax,%r11
adoxq %rbp,%rcx
adcxq %rdi,%r12
adcxq %rbp,%rcx
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
mulxq %rdx,%rax,%rdx
xorq %rbx,%rbx
adoxq %r13,%r13
adcxq %rcx,%rax
adoxq %rbp,%rbx
adcxq %r13,%rax
adcxq %rdx,%rbx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
movq %rax,112(%rsp)
movq %rbx,120(%rsp)
call __rsaz_512_reducex
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx
call __rsaz_512_subtract
movq %r8,%rdx
movq %r9,%rax
movl 128+8(%rsp),%r8d
movq %rdi,%rsi
decl %r8d
jnz .Loop_sqrx
.Lsqr_tail:
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lsqr_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size rsaz_512_sqr,.-rsaz_512_sqr
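# rsaz_512_mul(ret, a, b, mod, n0): one 512-bit Montgomery
# multiplication.  rdi=ret and rcx=mod are parked in %xmm0/%xmm1 (the
# .byte movq forms) across the multiply, n0 (r8) goes to 128(%rsp),
# and __rsaz_512_mul / __rsaz_512_mulx leaves the 1024-bit product at
# (%rsp) for the reduction below.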
.globl rsaz_512_mul
.type rsaz_512_mul,@function
.align 32
rsaz_512_mul:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_body:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reduce
jmp .Lmul_tail
.align 32
.Lmulx:
movq %rdx,%rbp
movq (%rdx),%rdx
call __rsaz_512_mulx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size rsaz_512_mul,.-rsaz_512_mul
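# rsaz_512_mul_gather4(ret, a, table, mod, n0, power): multiply a by
# the table entry selected by power (r9d).  The pshufd/pcmpeqd ladder
# builds eight xmm masks that pick one of 16 interleaved entries per
# 128-byte table row; every row is read in full and masked, so the
# memory access pattern is independent of power (cache-timing
# defense).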
.globl rsaz_512_mul_gather4
.type rsaz_512_mul_gather4,@function
.align 32
rsaz_512_mul_gather4:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $152,%rsp
.cfi_adjust_cfa_offset 152
.Lmul_gather4_body:
movd %r9d,%xmm8
movdqa .Linc+16(%rip),%xmm1
movdqa .Linc(%rip),%xmm0
pshufd $0,%xmm8,%xmm8
movdqa %xmm1,%xmm7
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm8,%xmm0
movdqa %xmm7,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm8,%xmm1
movdqa %xmm7,%xmm4
paddd %xmm2,%xmm3
pcmpeqd %xmm8,%xmm2
movdqa %xmm7,%xmm5
paddd %xmm3,%xmm4
pcmpeqd %xmm8,%xmm3
movdqa %xmm7,%xmm6
paddd %xmm4,%xmm5
pcmpeqd %xmm8,%xmm4
paddd %xmm5,%xmm6
pcmpeqd %xmm8,%xmm5
paddd %xmm6,%xmm7
pcmpeqd %xmm8,%xmm6
pcmpeqd %xmm8,%xmm7
movdqa 0(%rdx),%xmm8
movdqa 16(%rdx),%xmm9
movdqa 32(%rdx),%xmm10
movdqa 48(%rdx),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rdx),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rdx),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rdx),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rdx),%xmm15
leaq 128(%rdx),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx_gather
.byte 102,76,15,126,195
movq %r8,128(%rsp)
movq %rdi,128+8(%rsp)
movq %rcx,128+16(%rsp)
movq (%rsi),%rax
movq 8(%rsi),%rcx
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
adcq $0,%r12
mulq %rbx
addq %rax,%r12
movq 48(%rsi),%rax
movq %rdx,%r13
adcq $0,%r13
mulq %rbx
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
adcq $0,%r15
leaq 8(%rsp),%rdi
movl $7,%ecx
jmp .Loop_mul_gather
.align 32
.Loop_mul_gather:
movdqa 0(%rbp),%xmm8
movdqa 16(%rbp),%xmm9
movdqa 32(%rbp),%xmm10
movdqa 48(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rbp),%xmm15
leaq 128(%rbp),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
.byte 102,76,15,126,195
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12
mulq %rbx
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13
mulq %rbx
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15
leaq 8(%rdi),%rdi
decl %ecx
jnz .Loop_mul_gather
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
movq 128+8(%rsp),%rdi
movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reduce
jmp .Lmul_gather_tail
.align 32
.Lmulx_gather:
.byte 102,76,15,126,194
movq %r8,128(%rsp)
movq %rdi,128+8(%rsp)
movq %rcx,128+16(%rsp)
mulxq (%rsi),%rbx,%r8
movq %rbx,(%rsp)
xorl %edi,%edi
mulxq 8(%rsi),%rax,%r9
mulxq 16(%rsi),%rbx,%r10
adcxq %rax,%r8
mulxq 24(%rsi),%rax,%r11
adcxq %rbx,%r9
mulxq 32(%rsi),%rbx,%r12
adcxq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcxq %rbx,%r11
mulxq 48(%rsi),%rbx,%r14
adcxq %rax,%r12
mulxq 56(%rsi),%rax,%r15
adcxq %rbx,%r13
adcxq %rax,%r14
.byte 0x67
movq %r8,%rbx
adcxq %rdi,%r15
movq $-7,%rcx
jmp .Loop_mulx_gather
.align 32
.Loop_mulx_gather:
movdqa 0(%rbp),%xmm8
movdqa 16(%rbp),%xmm9
movdqa 32(%rbp),%xmm10
movdqa 48(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rbp),%xmm15
leaq 128(%rbp),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
.byte 102,76,15,126,194
.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rsi),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rsi),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 32(%rsi),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
adcxq %rax,%r13
.byte 0x67
adoxq %r15,%r14
mulxq 56(%rsi),%rax,%r15
movq %rbx,64(%rsp,%rcx,8)
adcxq %rax,%r14
adoxq %rdi,%r15
movq %r8,%rbx
adcxq %rdi,%r15
incq %rcx
jnz .Loop_mulx_gather
movq %r8,64(%rsp)
movq %r9,64+8(%rsp)
movq %r10,64+16(%rsp)
movq %r11,64+24(%rsp)
movq %r12,64+32(%rsp)
movq %r13,64+40(%rsp)
movq %r14,64+48(%rsp)
movq %r15,64+56(%rsp)
movq 128(%rsp),%rdx
movq 128+8(%rsp),%rdi
movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lmul_gather4_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
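# rsaz_512_mul_scatter4: multiplies the 512-bit value at (%rdi) by
# the one at (%rsi), Montgomery-reduces with the modulus in %rdx and
# n0 in %rcx, then scatters the eight result limbs into table slot
# r9d of the 16-entry table at %r8, one limb every 128 bytes (the
# layout rsaz_512_gather4 reads back).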
.globl rsaz_512_mul_scatter4
.type rsaz_512_mul_scatter4,@function
.align 32
rsaz_512_mul_scatter4:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
movl %r9d,%r9d
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_scatter4_body:
leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
movq %rcx,128(%rsp)
movq %rdi,%rbp
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reduce
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq (%rdi),%rdx
call __rsaz_512_mulx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
.byte 102,72,15,126,214
sbbq %rcx,%rcx
call __rsaz_512_subtract
movq %r8,0(%rsi)
movq %r9,128(%rsi)
movq %r10,256(%rsi)
movq %r11,384(%rsi)
movq %r12,512(%rsi)
movq %r13,640(%rsi)
movq %r14,768(%rsi)
movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lmul_scatter4_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
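# rsaz_512_mul_by_one(ret, a, mod, n0): Montgomery reduction of a
# single value, i.e. multiplication by 1, converting out of
# Montgomery form.  The scratch frame is zeroed first and only the
# reduce step runs; there is no upper product half to add back.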
.globl rsaz_512_mul_by_one
.type rsaz_512_mul_by_one,@function
.align 32
rsaz_512_mul_by_one:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
movq (%rsi),%r8
pxor %xmm0,%xmm0
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
movq 32(%rsi),%r12
movq 40(%rsi),%r13
movq 48(%rsi),%r14
movq 56(%rsi),%r15
movdqa %xmm0,(%rsp)
movdqa %xmm0,16(%rsp)
movdqa %xmm0,32(%rsp)
movdqa %xmm0,48(%rsp)
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
andl $0x80100,%eax
cmpl $0x80100,%eax
je .Lby_one_callx
call __rsaz_512_reduce
jmp .Lby_one_tail
.align 32
.Lby_one_callx:
movq 128(%rsp),%rdx
call __rsaz_512_reducex
.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lmul_by_one_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
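# __rsaz_512_reduce: Montgomery reduction, baseline flavor.  Inputs:
# low product half in r8..r15, modulus pointer in %rbp, n0 read as
# 128+8(%rsp), which is the caller's 128(%rsp) once the call has
# pushed a return address and shifted all frame offsets by 8.  Eight
# iterations each fold one limb: m = r8*n0, add m*mod, drop the now
# zero low limb.  Callers then add the high product half from
# 64(%rsp) and conditionally subtract the modulus.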
.type __rsaz_512_reduce,@function
.align 32
__rsaz_512_reduce:
.cfi_startproc
movq %r8,%rbx
imulq 128+8(%rsp),%rbx
movq 0(%rbp),%rax
movl $8,%ecx
jmp .Lreduction_loop
.align 32
.Lreduction_loop:
mulq %rbx
movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
addq %rax,%r10
movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
addq %rax,%r11
movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq 128+8(%rsp),%rsi
adcq $0,%rdx
movq %rdx,%r11
mulq %rbx
addq %rax,%r12
movq 40(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12
mulq %rbx
addq %rax,%r13
movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13
mulq %rbx
addq %rax,%r14
movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
movq %rsi,%rbx
addq %rax,%r15
movq 0(%rbp),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15
decl %ecx
jne .Lreduction_loop
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
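# __rsaz_512_reducex: the same reduction using MULX/ADCX/ADOX, which
# lets two carry chains (CF and OF) run in parallel.  Expects n0 in
# %rdx on entry; callers load it from their 128(%rsp).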
.type __rsaz_512_reducex,@function
.align 32
__rsaz_512_reducex:
.cfi_startproc
imulq %r8,%rdx
xorq %rsi,%rsi
movl $8,%ecx
jmp .Lreduction_loopx
.align 32
.Lreduction_loopx:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
movq %rdx,%rax
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12
mulxq 128+8(%rsp),%rbx,%rdx
movq %rax,%rdx
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
movq %rbx,%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
adcxq %rsi,%r15
decl %ecx
jne .Lreduction_loopx
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_reducex,.-__rsaz_512_reducex
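# __rsaz_512_subtract: constant-time conditional subtraction of the
# modulus.  Stores r8..r15 to (%rdi), forms the two's complement of
# the modulus at (%rbp), masks it with %rcx (all-ones or zero, set by
# the caller's sbbq), adds it in, and writes the result back -- so
# the modulus is subtracted exactly when the mask is all-ones, with a
# memory access pattern independent of the data.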
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
.cfi_startproc
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
movq 0(%rbp),%r8
movq 8(%rbp),%r9
negq %r8
notq %r9
andq %rcx,%r8
movq 16(%rbp),%r10
andq %rcx,%r9
notq %r10
movq 24(%rbp),%r11
andq %rcx,%r10
notq %r11
movq 32(%rbp),%r12
andq %rcx,%r11
notq %r12
movq 40(%rbp),%r13
andq %rcx,%r12
notq %r13
movq 48(%rbp),%r14
andq %rcx,%r13
notq %r14
movq 56(%rbp),%r15
andq %rcx,%r14
notq %r15
andq %rcx,%r15
addq (%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_subtract,.-__rsaz_512_subtract
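# __rsaz_512_mul: schoolbook 512x512 -> 1024-bit multiply.  One
# operand's limbs stream from (%rsi); the other's pointer is in %rbp
# with its first limb preloaded in %rbx.  The 16-limb product lands
# at the caller's (%rsp) (8(%rsp) here, past the return address).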
.type __rsaz_512_mul,@function
.align 32
__rsaz_512_mul:
.cfi_startproc
leaq 8(%rsp),%rdi
movq (%rsi),%rax
mulq %rbx
movq %rax,(%rdi)
movq 8(%rsi),%rax
movq %rdx,%r8
mulq %rbx
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
adcq $0,%r12
mulq %rbx
addq %rax,%r12
movq 48(%rsi),%rax
movq %rdx,%r13
adcq $0,%r13
mulq %rbx
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
adcq $0,%r15
leaq 8(%rbp),%rbp
leaq 8(%rdi),%rdi
movl $7,%ecx
jmp .Loop_mul
.align 32
.Loop_mul:
movq (%rbp),%rbx
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
adcq $0,%r12
mulq %rbx
addq %rax,%r13
movq 48(%rsi),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
adcq $0,%r13
mulq %rbx
addq %rax,%r14
movq 56(%rsi),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
leaq 8(%rbp),%rbp
adcq $0,%r14
mulq %rbx
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
addq %r15,%r14
movq %rdx,%r15
adcq $0,%r15
leaq 8(%rdi),%rdi
decl %ecx
jnz .Loop_mul
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
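# __rsaz_512_mulx: MULX/ADCX/ADOX flavor of the multiply above; the
# current multiplier limb lives in %rdx as mulx requires, and %rdi is
# zeroed to serve as a carry sink for the two flag chains.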
.type __rsaz_512_mulx,@function
.align 32
__rsaz_512_mulx:
.cfi_startproc
mulxq (%rsi),%rbx,%r8
movq $-6,%rcx
mulxq 8(%rsi),%rax,%r9
movq %rbx,8(%rsp)
mulxq 16(%rsi),%rbx,%r10
adcq %rax,%r8
mulxq 24(%rsi),%rax,%r11
adcq %rbx,%r9
mulxq 32(%rsi),%rbx,%r12
adcq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcq %rbx,%r11
mulxq 48(%rsi),%rbx,%r14
adcq %rax,%r12
mulxq 56(%rsi),%rax,%r15
movq 8(%rbp),%rdx
adcq %rbx,%r13
adcq %rax,%r14
adcq $0,%r15
xorq %rdi,%rdi
jmp .Loop_mulx
.align 32
.Loop_mulx:
movq %r8,%rbx
mulxq (%rsi),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rsi),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rsi),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rsi),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rsi),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rsi),%rax,%r15
movq 64(%rbp,%rcx,8),%rdx
movq %rbx,8+64-8(%rsp,%rcx,8)
adcxq %rax,%r14
adoxq %rdi,%r15
adcxq %rdi,%r15
incq %rcx
jnz .Loop_mulx
movq %r8,%rbx
mulxq (%rsi),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
adcxq %rax,%r8
adoxq %r10,%r9
.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rsi),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 32(%rsi),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
adcxq %rax,%r13
adoxq %r15,%r14
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
adcxq %rax,%r14
adoxq %rdi,%r15
adcxq %rdi,%r15
movq %rbx,8+64-8(%rsp)
movq %r8,8+64(%rsp)
movq %r9,8+64+8(%rsp)
movq %r10,8+64+16(%rsp)
movq %r11,8+64+24(%rsp)
movq %r12,8+64+32(%rsp)
movq %r13,8+64+40(%rsp)
movq %r14,8+64+48(%rsp)
movq %r15,8+64+56(%rsp)
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_mulx,.-__rsaz_512_mulx
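# rsaz_512_scatter4(table, val, power): store the eight limbs of val
# into table slot power (%rdx), one limb per 128-byte row.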
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
.cfi_startproc
leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
.byte 0xf3,0xc3
.cfi_endproc
.size rsaz_512_scatter4,.-rsaz_512_scatter4
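# rsaz_512_gather4(out, table, power): the constant-time inverse of
# the scatter above.  Every 128-byte row is read in full and masked
# with the pcmpeqd-derived selectors, so the slot index in %edx never
# influences the memory access pattern.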
.globl rsaz_512_gather4
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
.cfi_startproc
movd %edx,%xmm8
movdqa .Linc+16(%rip),%xmm1
movdqa .Linc(%rip),%xmm0
pshufd $0,%xmm8,%xmm8
movdqa %xmm1,%xmm7
movdqa %xmm1,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm8,%xmm0
movdqa %xmm7,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm8,%xmm1
movdqa %xmm7,%xmm4
paddd %xmm2,%xmm3
pcmpeqd %xmm8,%xmm2
movdqa %xmm7,%xmm5
paddd %xmm3,%xmm4
pcmpeqd %xmm8,%xmm3
movdqa %xmm7,%xmm6
paddd %xmm4,%xmm5
pcmpeqd %xmm8,%xmm4
paddd %xmm5,%xmm6
pcmpeqd %xmm8,%xmm5
paddd %xmm6,%xmm7
pcmpeqd %xmm8,%xmm6
pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
movdqa 0(%rsi),%xmm8
movdqa 16(%rsi),%xmm9
movdqa 32(%rsi),%xmm10
movdqa 48(%rsi),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rsi),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rsi),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rsi),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size rsaz_512_gather4,.-rsaz_512_gather4
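# .Linc: lane increments for the mask ladder above: start {0,0,1,1},
# step {2,2,2,2}, generating comparands 0..15 across xmm0..xmm7.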
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2