rsaz-x86_64.s 28 KB


  1. .text
  # _rsaz_512_sqr -- repeated squaring of an 8-limb (8 x 64-bit) operand, each
  # square followed by a call to a reduce helper and __rsaz_512_subtract.
  # Register use on entry, as read by the code below:
  #   %rdi = output buffer (8 limbs); also becomes the input of the next iteration
  #   %rsi = input operand (8 limbs)
  #   %rdx = pointer passed to the reduce/subtract helpers in %rbp
  #          (presumably the modulus -- confirm against the C prototype)
  #   %rcx = 64-bit constant kept at 128(%rsp) for the reduce helpers
  #          (presumably the Montgomery n0 -- confirm)
  #   %r8d = iteration count; each loop body ends with decl/jnz
  # Frame: 0..120(%rsp) holds the 16-limb square, 128(%rsp) the %rcx constant,
  # 128+8(%rsp) the loop counter.
  2. .globl _rsaz_512_sqr
  3. .p2align 5
  4. _rsaz_512_sqr:
  5. pushq %rbx
  6. pushq %rbp
  7. pushq %r12
  8. pushq %r13
  9. pushq %r14
  10. pushq %r15
  11. subq $128+24,%rsp
  12. L$sqr_body:
  13. .byte 102,72,15,110,202    # movq %rdx,%xmm1 -- park the helper pointer; %rbp is scratch below
  14. movq (%rsi),%rdx
  15. movq 8(%rsi),%rax
  16. movq %rcx,128(%rsp)
  # Take the MULX/ADCX/ADOX path only when both bits of 0x80100 are set in the
  # capability word (presumably the BMI2 and ADX feature bits -- see OPENSSL_ia32cap).
  17. movl $0x80100,%r11d
  18. andl _OPENSSL_ia32cap_P+8(%rip),%r11d
  19. cmpl $0x80100,%r11d
  20. je L$oop_sqrx
  21. jmp L$oop_sqr
  22. .p2align 5
  # ---- mulq path.  Loop entry: %rdx = a[0], %rax = a[1], %rsi = a.
  23. L$oop_sqr:
  24. movl %r8d,128+8(%rsp)
  # Row 0: a[0]*a[1..7] accumulated into %r8..%r15.
  25. movq %rdx,%rbx
  26. movq %rax,%rbp
  27. mulq %rdx
  28. movq %rax,%r8
  29. movq 16(%rsi),%rax
  30. movq %rdx,%r9
  31. mulq %rbx
  32. addq %rax,%r9
  33. movq 24(%rsi),%rax
  34. movq %rdx,%r10
  35. adcq $0,%r10
  36. mulq %rbx
  37. addq %rax,%r10
  38. movq 32(%rsi),%rax
  39. movq %rdx,%r11
  40. adcq $0,%r11
  41. mulq %rbx
  42. addq %rax,%r11
  43. movq 40(%rsi),%rax
  44. movq %rdx,%r12
  45. adcq $0,%r12
  46. mulq %rbx
  47. addq %rax,%r12
  48. movq 48(%rsi),%rax
  49. movq %rdx,%r13
  50. adcq $0,%r13
  51. mulq %rbx
  52. addq %rax,%r13
  53. movq 56(%rsi),%rax
  54. movq %rdx,%r14
  55. adcq $0,%r14
  56. mulq %rbx
  57. addq %rax,%r14
  58. movq %rbx,%rax
  59. adcq $0,%rdx
  # Double the lowest cross-product limb, add a[0]^2, and store result limbs 0 and 1.
  # %rcx collects the carry that is fed into the next double-and-square step.
  60. xorq %rcx,%rcx
  61. addq %r8,%r8
  62. movq %rdx,%r15
  63. adcq $0,%rcx
  64. mulq %rax
  65. addq %r8,%rdx
  66. adcq $0,%rcx
  67. movq %rax,(%rsp)
  68. movq %rdx,8(%rsp)
  # Row 1: a[1]*a[2..7] (%rbp = a[1]).
  69. movq 16(%rsi),%rax
  70. mulq %rbp
  71. addq %rax,%r10
  72. movq 24(%rsi),%rax
  73. movq %rdx,%rbx
  74. adcq $0,%rbx
  75. mulq %rbp
  76. addq %rax,%r11
  77. movq 32(%rsi),%rax
  78. adcq $0,%rdx
  79. addq %rbx,%r11
  80. movq %rdx,%rbx
  81. adcq $0,%rbx
  82. mulq %rbp
  83. addq %rax,%r12
  84. movq 40(%rsi),%rax
  85. adcq $0,%rdx
  86. addq %rbx,%r12
  87. movq %rdx,%rbx
  88. adcq $0,%rbx
  89. mulq %rbp
  90. addq %rax,%r13
  91. movq 48(%rsi),%rax
  92. adcq $0,%rdx
  93. addq %rbx,%r13
  94. movq %rdx,%rbx
  95. adcq $0,%rbx
  96. mulq %rbp
  97. addq %rax,%r14
  98. movq 56(%rsi),%rax
  99. adcq $0,%rdx
  100. addq %rbx,%r14
  101. movq %rdx,%rbx
  102. adcq $0,%rbx
  103. mulq %rbp
  104. addq %rax,%r15
  105. movq %rbp,%rax
  106. adcq $0,%rdx
  107. addq %rbx,%r15
  108. adcq $0,%rdx
  # Double %r9:%r10, add a[1]^2 plus the carry in %rcx, store limbs 2 and 3.
  # %rbx receives the carry for the next step; %rbp is reloaded with a[2].
  109. xorq %rbx,%rbx
  110. addq %r9,%r9
  111. movq %rdx,%r8
  112. adcq %r10,%r10
  113. adcq $0,%rbx
  114. mulq %rax
  115. addq %rcx,%rax
  116. movq 16(%rsi),%rbp
  117. addq %rax,%r9
  118. movq 24(%rsi),%rax
  119. adcq %rdx,%r10
  120. adcq $0,%rbx
  121. movq %r9,16(%rsp)
  122. movq %r10,24(%rsp)
  # Row 2: a[2]*a[3..7] (%rbp = a[2]).
  123. mulq %rbp
  124. addq %rax,%r12
  125. movq 32(%rsi),%rax
  126. movq %rdx,%rcx
  127. adcq $0,%rcx
  128. mulq %rbp
  129. addq %rax,%r13
  130. movq 40(%rsi),%rax
  131. adcq $0,%rdx
  132. addq %rcx,%r13
  133. movq %rdx,%rcx
  134. adcq $0,%rcx
  135. mulq %rbp
  136. addq %rax,%r14
  137. movq 48(%rsi),%rax
  138. adcq $0,%rdx
  139. addq %rcx,%r14
  140. movq %rdx,%rcx
  141. adcq $0,%rcx
  142. mulq %rbp
  143. addq %rax,%r15
  144. movq 56(%rsi),%rax
  145. adcq $0,%rdx
  146. addq %rcx,%r15
  147. movq %rdx,%rcx
  148. adcq $0,%rcx
  149. mulq %rbp
  150. addq %rax,%r8
  151. movq %rbp,%rax
  152. adcq $0,%rdx
  153. addq %rcx,%r8
  154. adcq $0,%rdx
  # Double %r11:%r12, add a[2]^2 plus the carry in %rbx, store limbs 4 and 5.
  155. xorq %rcx,%rcx
  156. addq %r11,%r11
  157. movq %rdx,%r9
  158. adcq %r12,%r12
  159. adcq $0,%rcx
  160. mulq %rax
  161. addq %rbx,%rax
  162. movq 24(%rsi),%r10
  163. addq %rax,%r11
  164. movq 32(%rsi),%rax
  165. adcq %rdx,%r12
  166. adcq $0,%rcx
  167. movq %r11,32(%rsp)
  168. movq %r12,40(%rsp)
  # Row 3: a[3]*a[4..7] (%r10 = a[3]).  a[4], a[5], a[6] are cached in
  # %r11, %r12, %rbp for the remaining rows.
  169. movq %rax,%r11
  170. mulq %r10
  171. addq %rax,%r14
  172. movq 40(%rsi),%rax
  173. movq %rdx,%rbx
  174. adcq $0,%rbx
  175. movq %rax,%r12
  176. mulq %r10
  177. addq %rax,%r15
  178. movq 48(%rsi),%rax
  179. adcq $0,%rdx
  180. addq %rbx,%r15
  181. movq %rdx,%rbx
  182. adcq $0,%rbx
  183. movq %rax,%rbp
  184. mulq %r10
  185. addq %rax,%r8
  186. movq 56(%rsi),%rax
  187. adcq $0,%rdx
  188. addq %rbx,%r8
  189. movq %rdx,%rbx
  190. adcq $0,%rbx
  191. mulq %r10
  192. addq %rax,%r9
  193. movq %r10,%rax
  194. adcq $0,%rdx
  195. addq %rbx,%r9
  196. adcq $0,%rdx
  # Double %r13:%r14, add a[3]^2 plus carry, store limbs 6 and 7.
  197. xorq %rbx,%rbx
  198. addq %r13,%r13
  199. movq %rdx,%r10
  200. adcq %r14,%r14
  201. adcq $0,%rbx
  202. mulq %rax
  203. addq %rcx,%rax
  204. addq %rax,%r13
  205. movq %r12,%rax
  206. adcq %rdx,%r14
  207. adcq $0,%rbx
  208. movq %r13,48(%rsp)
  209. movq %r14,56(%rsp)
  # Row 4: a[4]*a[5..7] (%r11 = a[4]); a[7] is cached in %r14.
  210. mulq %r11
  211. addq %rax,%r8
  212. movq %rbp,%rax
  213. movq %rdx,%rcx
  214. adcq $0,%rcx
  215. mulq %r11
  216. addq %rax,%r9
  217. movq 56(%rsi),%rax
  218. adcq $0,%rdx
  219. addq %rcx,%r9
  220. movq %rdx,%rcx
  221. adcq $0,%rcx
  222. movq %rax,%r14
  223. mulq %r11
  224. addq %rax,%r10
  225. movq %r11,%rax
  226. adcq $0,%rdx
  227. addq %rcx,%r10
  228. adcq $0,%rdx
  # Double %r15:%r8, add a[4]^2 plus carry, store limbs 8 and 9.
  229. xorq %rcx,%rcx
  230. addq %r15,%r15
  231. movq %rdx,%r11
  232. adcq %r8,%r8
  233. adcq $0,%rcx
  234. mulq %rax
  235. addq %rbx,%rax
  236. addq %rax,%r15
  237. movq %rbp,%rax
  238. adcq %rdx,%r8
  239. adcq $0,%rcx
  240. movq %r15,64(%rsp)
  241. movq %r8,72(%rsp)
  # Row 5: a[5]*a[6..7] (%r12 = a[5]).
  242. mulq %r12
  243. addq %rax,%r10
  244. movq %r14,%rax
  245. movq %rdx,%rbx
  246. adcq $0,%rbx
  247. mulq %r12
  248. addq %rax,%r11
  249. movq %r12,%rax
  250. adcq $0,%rdx
  251. addq %rbx,%r11
  252. adcq $0,%rdx
  # Double %r9:%r10, add a[5]^2 plus carry, store limbs 10 and 11.
  253. xorq %rbx,%rbx
  254. addq %r9,%r9
  255. movq %rdx,%r12
  256. adcq %r10,%r10
  257. adcq $0,%rbx
  258. mulq %rax
  259. addq %rcx,%rax
  260. addq %rax,%r9
  261. movq %r14,%rax
  262. adcq %rdx,%r10
  263. adcq $0,%rbx
  264. movq %r9,80(%rsp)
  265. movq %r10,88(%rsp)
  # Row 6: a[6]*a[7] (%rbp = a[6], %r14 = a[7]), then a[6]^2; store limbs 12 and 13.
  266. mulq %rbp
  267. addq %rax,%r12
  268. movq %rbp,%rax
  269. adcq $0,%rdx
  270. xorq %rcx,%rcx
  271. addq %r11,%r11
  272. movq %rdx,%r13
  273. adcq %r12,%r12
  274. adcq $0,%rcx
  275. mulq %rax
  276. addq %rbx,%rax
  277. addq %rax,%r11
  278. movq %r14,%rax
  279. adcq %rdx,%r12
  280. adcq $0,%rcx
  281. movq %r11,96(%rsp)
  282. movq %r12,104(%rsp)
  # a[7]^2 plus the doubled top cross-product limb: limbs 14 and 15 in %rax:%rdx.
  283. xorq %rbx,%rbx
  284. addq %r13,%r13
  285. adcq $0,%rbx
  286. mulq %rax
  287. addq %rcx,%rax
  288. addq %r13,%rax
  289. adcq %rbx,%rdx
  # Load the low 8 limbs for the reduce helper, restore its pointer in %rbp.
  290. movq (%rsp),%r8
  291. movq 8(%rsp),%r9
  292. movq 16(%rsp),%r10
  293. movq 24(%rsp),%r11
  294. movq 32(%rsp),%r12
  295. movq 40(%rsp),%r13
  296. movq 48(%rsp),%r14
  297. movq 56(%rsp),%r15
  298. .byte 102,72,15,126,205    # movq %xmm1,%rbp
  299. movq %rax,112(%rsp)
  300. movq %rdx,120(%rsp)
  301. call __rsaz_512_reduce
  # Add the upper 8 limbs; %rcx = 0 or all-ones from the carry-out, used as the
  # mask by __rsaz_512_subtract.
  302. addq 64(%rsp),%r8
  303. adcq 72(%rsp),%r9
  304. adcq 80(%rsp),%r10
  305. adcq 88(%rsp),%r11
  306. adcq 96(%rsp),%r12
  307. adcq 104(%rsp),%r13
  308. adcq 112(%rsp),%r14
  309. adcq 120(%rsp),%r15
  310. sbbq %rcx,%rcx
  311. call __rsaz_512_subtract
  # Prepare the next iteration: result limbs 0 and 1 become a[0]/a[1], and the
  # output buffer becomes the input.
  312. movq %r8,%rdx
  313. movq %r9,%rax
  314. movl 128+8(%rsp),%r8d
  315. movq %rdi,%rsi
  316. decl %r8d
  317. jnz L$oop_sqr
  318. jmp L$sqr_tail
  319. .p2align 5
  # ---- MULX/ADCX/ADOX path.  Loop entry: %rdx = a[0], %rax = a[1], %rsi = a.
  # %rdi is used as scratch here, so it is parked in %xmm0; %rbp is kept at zero
  # and used as the zero addend that terminates the carry chains.
  320. L$oop_sqrx:
  321. movl %r8d,128+8(%rsp)
  322. .byte 102,72,15,110,199    # movq %rdi,%xmm0
  # Row 0: a[0]*a[1..7] (%rdx = a[0]).
  323. mulxq %rax,%r8,%r9
  324. movq %rax,%rbx
  325. mulxq 16(%rsi),%rcx,%r10
  326. xorq %rbp,%rbp
  327. mulxq 24(%rsi),%rax,%r11
  328. adcxq %rcx,%r9
  329. .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00    # mulxq 32(%rsi),%rcx,%r12
  330. adcxq %rax,%r10
  331. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00    # mulxq 40(%rsi),%rax,%r13
  332. adcxq %rcx,%r11
  333. mulxq 48(%rsi),%rcx,%r14
  334. adcxq %rax,%r12
  335. adcxq %rcx,%r13
  336. mulxq 56(%rsi),%rax,%r15
  337. adcxq %rax,%r14
  338. adcxq %rbp,%r15
  # a[0]^2, doubled limb 1, store limbs 0 and 1; %rdx <- a[1].
  339. mulxq %rdx,%rax,%rdi
  340. movq %rbx,%rdx
  341. xorq %rcx,%rcx
  342. adoxq %r8,%r8
  343. adcxq %rdi,%r8
  344. adoxq %rbp,%rcx
  345. adcxq %rbp,%rcx
  346. movq %rax,(%rsp)
  347. movq %r8,8(%rsp)
  # Row 1: a[1]*a[2..7].
  348. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00    # mulxq 16(%rsi),%rax,%rbx
  349. adoxq %rax,%r10
  350. adcxq %rbx,%r11
  351. mulxq 24(%rsi),%rdi,%r8
  352. adoxq %rdi,%r11
  353. .byte 0x66    # bare 0x66 prefix byte glued onto the next instruction (looks like padding -- confirm)
  354. adcxq %r8,%r12
  355. mulxq 32(%rsi),%rax,%rbx
  356. adoxq %rax,%r12
  357. adcxq %rbx,%r13
  358. mulxq 40(%rsi),%rdi,%r8
  359. adoxq %rdi,%r13
  360. adcxq %r8,%r14
  361. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00    # mulxq 48(%rsi),%rax,%rbx
  362. adoxq %rax,%r14
  363. adcxq %rbx,%r15
  364. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00    # mulxq 56(%rsi),%rdi,%r8
  365. adoxq %rdi,%r15
  366. adcxq %rbp,%r8
  # a[1]^2 plus doubled %r9:%r10, store limbs 2 and 3; %rdx <- a[2].
  367. mulxq %rdx,%rax,%rdi
  368. adoxq %rbp,%r8
  369. .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00    # movq 16(%rsi),%rdx
  370. xorq %rbx,%rbx
  371. adoxq %r9,%r9
  372. adcxq %rcx,%rax
  373. adoxq %r10,%r10
  374. adcxq %rax,%r9
  375. adoxq %rbp,%rbx
  376. adcxq %rdi,%r10
  377. adcxq %rbp,%rbx
  378. movq %r9,16(%rsp)
  379. .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00    # movq %r10,24(%rsp)
  # Row 2: a[2]*a[3..7].
  380. mulxq 24(%rsi),%rdi,%r9
  381. adoxq %rdi,%r12
  382. adcxq %r9,%r13
  383. mulxq 32(%rsi),%rax,%rcx
  384. adoxq %rax,%r13
  385. adcxq %rcx,%r14
  386. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00    # mulxq 40(%rsi),%rdi,%r9
  387. adoxq %rdi,%r14
  388. adcxq %r9,%r15
  389. .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00    # mulxq 48(%rsi),%rax,%rcx
  390. adoxq %rax,%r15
  391. adcxq %rcx,%r8
  392. mulxq 56(%rsi),%rdi,%r9
  393. adoxq %rdi,%r8
  394. adcxq %rbp,%r9
  # a[2]^2 plus doubled %r11:%r12, store limbs 4 and 5; %rdx <- a[3].
  395. mulxq %rdx,%rax,%rdi
  396. adoxq %rbp,%r9
  397. movq 24(%rsi),%rdx
  398. xorq %rcx,%rcx
  399. adoxq %r11,%r11
  400. adcxq %rbx,%rax
  401. adoxq %r12,%r12
  402. adcxq %rax,%r11
  403. adoxq %rbp,%rcx
  404. adcxq %rdi,%r12
  405. adcxq %rbp,%rcx
  406. movq %r11,32(%rsp)
  407. movq %r12,40(%rsp)
  # Row 3: a[3]*a[4..7].
  408. mulxq 32(%rsi),%rax,%rbx
  409. adoxq %rax,%r14
  410. adcxq %rbx,%r15
  411. mulxq 40(%rsi),%rdi,%r10
  412. adoxq %rdi,%r15
  413. adcxq %r10,%r8
  414. mulxq 48(%rsi),%rax,%rbx
  415. adoxq %rax,%r8
  416. adcxq %rbx,%r9
  417. mulxq 56(%rsi),%rdi,%r10
  418. adoxq %rdi,%r9
  419. adcxq %rbp,%r10
  # a[3]^2 plus doubled %r13:%r14, store limbs 6 and 7; %rdx <- a[4].
  420. mulxq %rdx,%rax,%rdi
  421. adoxq %rbp,%r10
  422. movq 32(%rsi),%rdx
  423. xorq %rbx,%rbx
  424. adoxq %r13,%r13
  425. adcxq %rcx,%rax
  426. adoxq %r14,%r14
  427. adcxq %rax,%r13
  428. adoxq %rbp,%rbx
  429. adcxq %rdi,%r14
  430. adcxq %rbp,%rbx
  431. movq %r13,48(%rsp)
  432. movq %r14,56(%rsp)
  # Row 4: a[4]*a[5..7].
  433. mulxq 40(%rsi),%rdi,%r11
  434. adoxq %rdi,%r8
  435. adcxq %r11,%r9
  436. mulxq 48(%rsi),%rax,%rcx
  437. adoxq %rax,%r9
  438. adcxq %rcx,%r10
  439. mulxq 56(%rsi),%rdi,%r11
  440. adoxq %rdi,%r10
  441. adcxq %rbp,%r11
  # a[4]^2 plus doubled %r15:%r8, store limbs 8 and 9; %rdx <- a[5].
  442. mulxq %rdx,%rax,%rdi
  443. movq 40(%rsi),%rdx
  444. adoxq %rbp,%r11
  445. xorq %rcx,%rcx
  446. adoxq %r15,%r15
  447. adcxq %rbx,%rax
  448. adoxq %r8,%r8
  449. adcxq %rax,%r15
  450. adoxq %rbp,%rcx
  451. adcxq %rdi,%r8
  452. adcxq %rbp,%rcx
  453. movq %r15,64(%rsp)
  454. movq %r8,72(%rsp)
  # Row 5: a[5]*a[6..7].
  455. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00    # mulxq 48(%rsi),%rax,%rbx
  456. adoxq %rax,%r10
  457. adcxq %rbx,%r11
  458. .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00    # mulxq 56(%rsi),%rdi,%r12
  459. adoxq %rdi,%r11
  460. adcxq %rbp,%r12
  # a[5]^2 plus doubled %r9:%r10, store limbs 10 and 11; %rdx <- a[6].
  461. mulxq %rdx,%rax,%rdi
  462. adoxq %rbp,%r12
  463. movq 48(%rsi),%rdx
  464. xorq %rbx,%rbx
  465. adoxq %r9,%r9
  466. adcxq %rcx,%rax
  467. adoxq %r10,%r10
  468. adcxq %rax,%r9
  469. adcxq %rdi,%r10
  470. adoxq %rbp,%rbx
  471. adcxq %rbp,%rbx
  472. movq %r9,80(%rsp)
  473. movq %r10,88(%rsp)
  # Row 6: a[6]*a[7], then a[6]^2 plus doubled %r11:%r12 (limbs 12 and 13); %rdx <- a[7].
  474. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00    # mulxq 56(%rsi),%rax,%r13
  475. adoxq %rax,%r12
  476. adoxq %rbp,%r13
  477. mulxq %rdx,%rax,%rdi
  478. xorq %rcx,%rcx
  479. movq 56(%rsi),%rdx
  480. adoxq %r11,%r11
  481. adcxq %rbx,%rax
  482. adoxq %r12,%r12
  483. adcxq %rax,%r11
  484. adoxq %rbp,%rcx
  485. adcxq %rdi,%r12
  486. adcxq %rbp,%rcx
  487. .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00    # movq %r11,96(%rsp)
  488. .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00    # movq %r12,104(%rsp)
  # a[7]^2 plus doubled %r13: limbs 14 and 15 end up in %rax:%rbx.
  489. mulxq %rdx,%rax,%rdx
  490. xorq %rbx,%rbx
  491. adoxq %r13,%r13
  492. adcxq %rcx,%rax
  493. adoxq %rbp,%rbx
  494. adcxq %r13,%rax
  495. adcxq %rdx,%rbx
  # Restore the output pointer and the helper pointer, load the saved constant
  # into %rdx and the low 8 limbs into %r8..%r15 for __rsaz_512_reducex.
  496. .byte 102,72,15,126,199    # movq %xmm0,%rdi
  497. .byte 102,72,15,126,205    # movq %xmm1,%rbp
  498. movq 128(%rsp),%rdx
  499. movq (%rsp),%r8
  500. movq 8(%rsp),%r9
  501. movq 16(%rsp),%r10
  502. movq 24(%rsp),%r11
  503. movq 32(%rsp),%r12
  504. movq 40(%rsp),%r13
  505. movq 48(%rsp),%r14
  506. movq 56(%rsp),%r15
  507. movq %rax,112(%rsp)
  508. movq %rbx,120(%rsp)
  509. call __rsaz_512_reducex
  # Add the upper half, derive the 0/all-ones mask from the carry, conditional subtract.
  510. addq 64(%rsp),%r8
  511. adcq 72(%rsp),%r9
  512. adcq 80(%rsp),%r10
  513. adcq 88(%rsp),%r11
  514. adcq 96(%rsp),%r12
  515. adcq 104(%rsp),%r13
  516. adcq 112(%rsp),%r14
  517. adcq 120(%rsp),%r15
  518. sbbq %rcx,%rcx
  519. call __rsaz_512_subtract
  520. movq %r8,%rdx
  521. movq %r9,%rax
  522. movl 128+8(%rsp),%r8d
  523. movq %rdi,%rsi
  524. decl %r8d
  525. jnz L$oop_sqrx
  # Common exit: reload the six registers pushed in the prologue and drop the frame.
  526. L$sqr_tail:
  527. leaq 128+24+48(%rsp),%rax
  528. movq -48(%rax),%r15
  529. movq -40(%rax),%r14
  530. movq -32(%rax),%r13
  531. movq -24(%rax),%r12
  532. movq -16(%rax),%rbp
  533. movq -8(%rax),%rbx
  534. leaq (%rax),%rsp
  535. L$sqr_epilogue:
  536. .byte 0xf3,0xc3    # rep ret
  # _rsaz_512_mul -- one 8-limb x 8-limb multiplication followed by a reduce
  # helper and __rsaz_512_subtract.
  # Register use on entry, as read by the code below:
  #   %rdi = output buffer (8 limbs), parked in %xmm0 because the multiply helper clobbers %rdi
  #   %rsi = first operand
  #   %rdx = second operand (walked through %rbp by the multiply helper)
  #   %rcx = pointer given to the reduce/subtract helpers in %rbp (presumably the modulus -- confirm)
  #   %r8  = 64-bit constant stored at 128(%rsp) for the reduce helpers (presumably n0 -- confirm)
  # The 16-limb product lives at 0..120(%rsp).
  537. .globl _rsaz_512_mul
  538. .p2align 5
  539. _rsaz_512_mul:
  540. pushq %rbx
  541. pushq %rbp
  542. pushq %r12
  543. pushq %r13
  544. pushq %r14
  545. pushq %r15
  546. subq $128+24,%rsp
  547. L$mul_body:
  548. .byte 102,72,15,110,199    # movq %rdi,%xmm0
  549. .byte 102,72,15,110,201    # movq %rcx,%xmm1
  550. movq %r8,128(%rsp)
  # Same capability test as the other entry points: both bits of 0x80100 set
  # selects the MULX/ADCX/ADOX helpers.
  551. movl $0x80100,%r11d
  552. andl _OPENSSL_ia32cap_P+8(%rip),%r11d
  553. cmpl $0x80100,%r11d
  554. je L$mulx
  # mulq path: %rbx = b[0], %rbp = b.
  555. movq (%rdx),%rbx
  556. movq %rdx,%rbp
  557. call __rsaz_512_mul
  558. .byte 102,72,15,126,199    # movq %xmm0,%rdi
  559. .byte 102,72,15,126,205    # movq %xmm1,%rbp
  560. movq (%rsp),%r8
  561. movq 8(%rsp),%r9
  562. movq 16(%rsp),%r10
  563. movq 24(%rsp),%r11
  564. movq 32(%rsp),%r12
  565. movq 40(%rsp),%r13
  566. movq 48(%rsp),%r14
  567. movq 56(%rsp),%r15
  568. call __rsaz_512_reduce
  569. jmp L$mul_tail
  570. .p2align 5
  # MULX path: %rbp = b, %rdx = b[0]; the saved constant is reloaded into %rdx
  # before __rsaz_512_reducex.
  571. L$mulx:
  572. movq %rdx,%rbp
  573. movq (%rdx),%rdx
  574. call __rsaz_512_mulx
  575. .byte 102,72,15,126,199    # movq %xmm0,%rdi
  576. .byte 102,72,15,126,205    # movq %xmm1,%rbp
  577. movq 128(%rsp),%rdx
  578. movq (%rsp),%r8
  579. movq 8(%rsp),%r9
  580. movq 16(%rsp),%r10
  581. movq 24(%rsp),%r11
  582. movq 32(%rsp),%r12
  583. movq 40(%rsp),%r13
  584. movq 48(%rsp),%r14
  585. movq 56(%rsp),%r15
  586. call __rsaz_512_reducex
  # Add the upper 8 limbs of the product; the carry-out becomes the 0/all-ones
  # mask in %rcx for the conditional subtract.
  587. L$mul_tail:
  588. addq 64(%rsp),%r8
  589. adcq 72(%rsp),%r9
  590. adcq 80(%rsp),%r10
  591. adcq 88(%rsp),%r11
  592. adcq 96(%rsp),%r12
  593. adcq 104(%rsp),%r13
  594. adcq 112(%rsp),%r14
  595. adcq 120(%rsp),%r15
  596. sbbq %rcx,%rcx
  597. call __rsaz_512_subtract
  # Reload the registers pushed in the prologue and drop the frame.
  598. leaq 128+24+48(%rsp),%rax
  599. movq -48(%rax),%r15
  600. movq -40(%rax),%r14
  601. movq -32(%rax),%r13
  602. movq -24(%rax),%r12
  603. movq -16(%rax),%rbp
  604. movq -8(%rax),%rbx
  605. leaq (%rax),%rsp
  606. L$mul_epilogue:
  607. .byte 0xf3,0xc3    # rep ret
  # _rsaz_512_mul_gather4 -- multiply the operand at %rsi by an 8-limb value that
  # is gathered from a table, then reduce and conditionally subtract.
  # Register use on entry, as read by the code below:
  #   %rdi = output buffer, saved at 128+8(%rsp)
  #   %rsi = first operand (8 limbs)
  #   %rdx = table; each limb of the second operand occupies one 128-byte row
  #   %rcx = pointer for the reduce/subtract helpers, saved at 128+16(%rsp)
  #          (presumably the modulus -- confirm)
  #   %r8  = 64-bit constant saved at 128(%rsp) (presumably n0 -- confirm)
  #   %r9d = index of the table entry to select
  # Every row is loaded in full and combined with compare masks, so the addresses
  # touched do not depend on the index.
  608. .globl _rsaz_512_mul_gather4
  609. .p2align 5
  610. _rsaz_512_mul_gather4:
  611. pushq %rbx
  612. pushq %rbp
  613. pushq %r12
  614. pushq %r13
  615. pushq %r14
  616. pushq %r15
  617. subq $152,%rsp
  618. L$mul_gather4_body:
  # Broadcast the index into all lanes of %xmm8 and build eight compare masks in
  # %xmm0..%xmm7.  L$inc is defined outside this view; presumably it holds the
  # starting lane counters and the per-register increment -- confirm.
  619. movd %r9d,%xmm8
  620. movdqa L$inc+16(%rip),%xmm1
  621. movdqa L$inc(%rip),%xmm0
  622. pshufd $0,%xmm8,%xmm8
  623. movdqa %xmm1,%xmm7
  624. movdqa %xmm1,%xmm2
  625. paddd %xmm0,%xmm1
  626. pcmpeqd %xmm8,%xmm0
  627. movdqa %xmm7,%xmm3
  628. paddd %xmm1,%xmm2
  629. pcmpeqd %xmm8,%xmm1
  630. movdqa %xmm7,%xmm4
  631. paddd %xmm2,%xmm3
  632. pcmpeqd %xmm8,%xmm2
  633. movdqa %xmm7,%xmm5
  634. paddd %xmm3,%xmm4
  635. pcmpeqd %xmm8,%xmm3
  636. movdqa %xmm7,%xmm6
  637. paddd %xmm4,%xmm5
  638. pcmpeqd %xmm8,%xmm4
  639. paddd %xmm5,%xmm6
  640. pcmpeqd %xmm8,%xmm5
  641. paddd %xmm6,%xmm7
  642. pcmpeqd %xmm8,%xmm6
  643. pcmpeqd %xmm8,%xmm7
  # Gather b[0]: mask the whole first row, OR everything together, then fold the
  # two 64-bit halves so the selected value sits in the low qword of %xmm8.
  # %rbp is left pointing at the next row.
  644. movdqa 0(%rdx),%xmm8
  645. movdqa 16(%rdx),%xmm9
  646. movdqa 32(%rdx),%xmm10
  647. movdqa 48(%rdx),%xmm11
  648. pand %xmm0,%xmm8
  649. movdqa 64(%rdx),%xmm12
  650. pand %xmm1,%xmm9
  651. movdqa 80(%rdx),%xmm13
  652. pand %xmm2,%xmm10
  653. movdqa 96(%rdx),%xmm14
  654. pand %xmm3,%xmm11
  655. movdqa 112(%rdx),%xmm15
  656. leaq 128(%rdx),%rbp
  657. pand %xmm4,%xmm12
  658. pand %xmm5,%xmm13
  659. pand %xmm6,%xmm14
  660. pand %xmm7,%xmm15
  661. por %xmm10,%xmm8
  662. por %xmm11,%xmm9
  663. por %xmm12,%xmm8
  664. por %xmm13,%xmm9
  665. por %xmm14,%xmm8
  666. por %xmm15,%xmm9
  667. por %xmm9,%xmm8
  668. pshufd $0x4e,%xmm8,%xmm9
  669. por %xmm9,%xmm8
  # Both bits of 0x80100 set in the capability word selects the MULX path.
  670. movl $0x80100,%r11d
  671. andl _OPENSSL_ia32cap_P+8(%rip),%r11d
  672. cmpl $0x80100,%r11d
  673. je L$mulx_gather
  # ---- mulq path.  %rbx = gathered b[0].
  674. .byte 102,76,15,126,195    # movq %xmm8,%rbx
  675. movq %r8,128(%rsp)
  676. movq %rdi,128+8(%rsp)
  677. movq %rcx,128+16(%rsp)
  # First row: a[0..7]*b[0]; limb 0 goes to (%rsp), the rest stay in %r8..%r15.
  678. movq (%rsi),%rax
  679. movq 8(%rsi),%rcx
  680. mulq %rbx
  681. movq %rax,(%rsp)
  682. movq %rcx,%rax
  683. movq %rdx,%r8
  684. mulq %rbx
  685. addq %rax,%r8
  686. movq 16(%rsi),%rax
  687. movq %rdx,%r9
  688. adcq $0,%r9
  689. mulq %rbx
  690. addq %rax,%r9
  691. movq 24(%rsi),%rax
  692. movq %rdx,%r10
  693. adcq $0,%r10
  694. mulq %rbx
  695. addq %rax,%r10
  696. movq 32(%rsi),%rax
  697. movq %rdx,%r11
  698. adcq $0,%r11
  699. mulq %rbx
  700. addq %rax,%r11
  701. movq 40(%rsi),%rax
  702. movq %rdx,%r12
  703. adcq $0,%r12
  704. mulq %rbx
  705. addq %rax,%r12
  706. movq 48(%rsi),%rax
  707. movq %rdx,%r13
  708. adcq $0,%r13
  709. mulq %rbx
  710. addq %rax,%r13
  711. movq 56(%rsi),%rax
  712. movq %rdx,%r14
  713. adcq $0,%r14
  714. mulq %rbx
  715. addq %rax,%r14
  716. movq (%rsi),%rax
  717. movq %rdx,%r15
  718. adcq $0,%r15
  # Seven more rows; %rdi walks the product buffer starting at 8(%rsp).
  719. leaq 8(%rsp),%rdi
  720. movl $7,%ecx
  721. jmp L$oop_mul_gather
  722. .p2align 5
  # Per iteration: gather the next b[i] from the row at %rbp (same masked read as
  # above), multiply a[0..7] by it, store the finished low limb at (%rdi).
  723. L$oop_mul_gather:
  724. movdqa 0(%rbp),%xmm8
  725. movdqa 16(%rbp),%xmm9
  726. movdqa 32(%rbp),%xmm10
  727. movdqa 48(%rbp),%xmm11
  728. pand %xmm0,%xmm8
  729. movdqa 64(%rbp),%xmm12
  730. pand %xmm1,%xmm9
  731. movdqa 80(%rbp),%xmm13
  732. pand %xmm2,%xmm10
  733. movdqa 96(%rbp),%xmm14
  734. pand %xmm3,%xmm11
  735. movdqa 112(%rbp),%xmm15
  736. leaq 128(%rbp),%rbp
  737. pand %xmm4,%xmm12
  738. pand %xmm5,%xmm13
  739. pand %xmm6,%xmm14
  740. pand %xmm7,%xmm15
  741. por %xmm10,%xmm8
  742. por %xmm11,%xmm9
  743. por %xmm12,%xmm8
  744. por %xmm13,%xmm9
  745. por %xmm14,%xmm8
  746. por %xmm15,%xmm9
  747. por %xmm9,%xmm8
  748. pshufd $0x4e,%xmm8,%xmm9
  749. por %xmm9,%xmm8
  750. .byte 102,76,15,126,195    # movq %xmm8,%rbx
  751. mulq %rbx
  752. addq %rax,%r8
  753. movq 8(%rsi),%rax
  754. movq %r8,(%rdi)
  755. movq %rdx,%r8
  756. adcq $0,%r8
  757. mulq %rbx
  758. addq %rax,%r9
  759. movq 16(%rsi),%rax
  760. adcq $0,%rdx
  761. addq %r9,%r8
  762. movq %rdx,%r9
  763. adcq $0,%r9
  764. mulq %rbx
  765. addq %rax,%r10
  766. movq 24(%rsi),%rax
  767. adcq $0,%rdx
  768. addq %r10,%r9
  769. movq %rdx,%r10
  770. adcq $0,%r10
  771. mulq %rbx
  772. addq %rax,%r11
  773. movq 32(%rsi),%rax
  774. adcq $0,%rdx
  775. addq %r11,%r10
  776. movq %rdx,%r11
  777. adcq $0,%r11
  778. mulq %rbx
  779. addq %rax,%r12
  780. movq 40(%rsi),%rax
  781. adcq $0,%rdx
  782. addq %r12,%r11
  783. movq %rdx,%r12
  784. adcq $0,%r12
  785. mulq %rbx
  786. addq %rax,%r13
  787. movq 48(%rsi),%rax
  788. adcq $0,%rdx
  789. addq %r13,%r12
  790. movq %rdx,%r13
  791. adcq $0,%r13
  792. mulq %rbx
  793. addq %rax,%r14
  794. movq 56(%rsi),%rax
  795. adcq $0,%rdx
  796. addq %r14,%r13
  797. movq %rdx,%r14
  798. adcq $0,%r14
  799. mulq %rbx
  800. addq %rax,%r15
  801. movq (%rsi),%rax
  802. adcq $0,%rdx
  803. addq %r15,%r14
  804. movq %rdx,%r15
  805. adcq $0,%r15
  806. leaq 8(%rdi),%rdi
  807. decl %ecx
  808. jnz L$oop_mul_gather
  # Flush the top 8 limbs, restore the output pointer and the helper pointer,
  # load the low 8 limbs and reduce.
  809. movq %r8,(%rdi)
  810. movq %r9,8(%rdi)
  811. movq %r10,16(%rdi)
  812. movq %r11,24(%rdi)
  813. movq %r12,32(%rdi)
  814. movq %r13,40(%rdi)
  815. movq %r14,48(%rdi)
  816. movq %r15,56(%rdi)
  817. movq 128+8(%rsp),%rdi
  818. movq 128+16(%rsp),%rbp
  819. movq (%rsp),%r8
  820. movq 8(%rsp),%r9
  821. movq 16(%rsp),%r10
  822. movq 24(%rsp),%r11
  823. movq 32(%rsp),%r12
  824. movq 40(%rsp),%r13
  825. movq 48(%rsp),%r14
  826. movq 56(%rsp),%r15
  827. call __rsaz_512_reduce
  828. jmp L$mul_gather_tail
  829. .p2align 5
  # ---- MULX path.  %rdx = gathered b[0]; %rdi is zeroed and used as the zero
  # addend that terminates the carry chains.
  830. L$mulx_gather:
  831. .byte 102,76,15,126,194    # movq %xmm8,%rdx
  832. movq %r8,128(%rsp)
  833. movq %rdi,128+8(%rsp)
  834. movq %rcx,128+16(%rsp)
  835. mulxq (%rsi),%rbx,%r8
  836. movq %rbx,(%rsp)
  837. xorl %edi,%edi
  838. mulxq 8(%rsi),%rax,%r9
  839. mulxq 16(%rsi),%rbx,%r10
  840. adcxq %rax,%r8
  841. mulxq 24(%rsi),%rax,%r11
  842. adcxq %rbx,%r9
  843. mulxq 32(%rsi),%rbx,%r12
  844. adcxq %rax,%r10
  845. mulxq 40(%rsi),%rax,%r13
  846. adcxq %rbx,%r11
  847. mulxq 48(%rsi),%rbx,%r14
  848. adcxq %rax,%r12
  849. mulxq 56(%rsi),%rax,%r15
  850. adcxq %rbx,%r13
  851. adcxq %rax,%r14
  852. .byte 0x67    # bare 0x67 prefix byte glued onto the next instruction (looks like padding -- confirm)
  853. movq %r8,%rbx
  854. adcxq %rdi,%r15
  # %rcx counts from -7 up to 0; each pass stores one finished limb at
  # 64(%rsp,%rcx,8), i.e. 8(%rsp) through 56(%rsp).
  855. movq $-7,%rcx
  856. jmp L$oop_mulx_gather
  857. .p2align 5
  858. L$oop_mulx_gather:
  859. movdqa 0(%rbp),%xmm8
  860. movdqa 16(%rbp),%xmm9
  861. movdqa 32(%rbp),%xmm10
  862. movdqa 48(%rbp),%xmm11
  863. pand %xmm0,%xmm8
  864. movdqa 64(%rbp),%xmm12
  865. pand %xmm1,%xmm9
  866. movdqa 80(%rbp),%xmm13
  867. pand %xmm2,%xmm10
  868. movdqa 96(%rbp),%xmm14
  869. pand %xmm3,%xmm11
  870. movdqa 112(%rbp),%xmm15
  871. leaq 128(%rbp),%rbp
  872. pand %xmm4,%xmm12
  873. pand %xmm5,%xmm13
  874. pand %xmm6,%xmm14
  875. pand %xmm7,%xmm15
  876. por %xmm10,%xmm8
  877. por %xmm11,%xmm9
  878. por %xmm12,%xmm8
  879. por %xmm13,%xmm9
  880. por %xmm14,%xmm8
  881. por %xmm15,%xmm9
  882. por %xmm9,%xmm8
  883. pshufd $0x4e,%xmm8,%xmm9
  884. por %xmm9,%xmm8
  885. .byte 102,76,15,126,194    # movq %xmm8,%rdx
  886. .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00    # mulxq 0(%rsi),%rax,%r8
  887. adcxq %rax,%rbx
  888. adoxq %r9,%r8
  889. mulxq 8(%rsi),%rax,%r9
  890. adcxq %rax,%r8
  891. adoxq %r10,%r9
  892. mulxq 16(%rsi),%rax,%r10
  893. adcxq %rax,%r9
  894. adoxq %r11,%r10
  895. .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00    # mulxq 24(%rsi),%rax,%r11
  896. adcxq %rax,%r10
  897. adoxq %r12,%r11
  898. mulxq 32(%rsi),%rax,%r12
  899. adcxq %rax,%r11
  900. adoxq %r13,%r12
  901. mulxq 40(%rsi),%rax,%r13
  902. adcxq %rax,%r12
  903. adoxq %r14,%r13
  904. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00    # mulxq 48(%rsi),%rax,%r14
  905. adcxq %rax,%r13
  906. .byte 0x67    # bare 0x67 prefix byte glued onto the next instruction (looks like padding -- confirm)
  907. adoxq %r15,%r14
  908. mulxq 56(%rsi),%rax,%r15
  909. movq %rbx,64(%rsp,%rcx,8)
  910. adcxq %rax,%r14
  911. adoxq %rdi,%r15
  912. movq %r8,%rbx
  913. adcxq %rdi,%r15
  914. incq %rcx
  915. jnz L$oop_mulx_gather
  # Flush the top 8 limbs, reload the constant, the output pointer and the helper
  # pointer, load the low 8 limbs and reduce.
  916. movq %r8,64(%rsp)
  917. movq %r9,64+8(%rsp)
  918. movq %r10,64+16(%rsp)
  919. movq %r11,64+24(%rsp)
  920. movq %r12,64+32(%rsp)
  921. movq %r13,64+40(%rsp)
  922. movq %r14,64+48(%rsp)
  923. movq %r15,64+56(%rsp)
  924. movq 128(%rsp),%rdx
  925. movq 128+8(%rsp),%rdi
  926. movq 128+16(%rsp),%rbp
  927. movq (%rsp),%r8
  928. movq 8(%rsp),%r9
  929. movq 16(%rsp),%r10
  930. movq 24(%rsp),%r11
  931. movq 32(%rsp),%r12
  932. movq 40(%rsp),%r13
  933. movq 48(%rsp),%r14
  934. movq 56(%rsp),%r15
  935. call __rsaz_512_reducex
  # Add the upper half, derive the 0/all-ones mask from the carry, conditional subtract.
  936. L$mul_gather_tail:
  937. addq 64(%rsp),%r8
  938. adcq 72(%rsp),%r9
  939. adcq 80(%rsp),%r10
  940. adcq 88(%rsp),%r11
  941. adcq 96(%rsp),%r12
  942. adcq 104(%rsp),%r13
  943. adcq 112(%rsp),%r14
  944. adcq 120(%rsp),%r15
  945. sbbq %rcx,%rcx
  946. call __rsaz_512_subtract
  # Reload the registers pushed in the prologue and drop the frame.
  947. leaq 128+24+48(%rsp),%rax
  948. movq -48(%rax),%r15
  949. movq -40(%rax),%r14
  950. movq -32(%rax),%r13
  951. movq -24(%rax),%r12
  952. movq -16(%rax),%rbp
  953. movq -8(%rax),%rbx
  954. leaq (%rax),%rsp
  955. L$mul_gather4_epilogue:
  956. .byte 0xf3,0xc3    # rep ret
  # _rsaz_512_mul_scatter4 -- multiply the operand at %rsi by the 8 limbs already
  # held in the output buffer, reduce, conditionally subtract, and additionally
  # scatter the result into a table.
  # Register use on entry, as read by the code below:
  #   %rdi = output buffer; it is also the second multiplicand (%rbp = %rdi)
  #   %rsi = first operand
  #   %rdx = pointer for the reduce/subtract helpers, parked in %xmm1
  #          (presumably the modulus -- confirm)
  #   %rcx = 64-bit constant stored at 128(%rsp) (presumably n0 -- confirm)
  #   %r8  = table base, %r9d = entry index; limb i is written to
  #          table + index*8 + i*128
  957. .globl _rsaz_512_mul_scatter4
  958. .p2align 5
  959. _rsaz_512_mul_scatter4:
  960. pushq %rbx
  961. pushq %rbp
  962. pushq %r12
  963. pushq %r13
  964. pushq %r14
  965. pushq %r15
  # Writing %r9d zero-extends the index into %r9 before it is used for addressing.
  966. movl %r9d,%r9d
  967. subq $128+24,%rsp
  968. L$mul_scatter4_body:
  969. leaq (%r8,%r9,8),%r8
  970. .byte 102,72,15,110,199    # movq %rdi,%xmm0
  971. .byte 102,72,15,110,202    # movq %rdx,%xmm1
  972. .byte 102,73,15,110,208    # movq %r8,%xmm2 -- scatter destination
  973. movq %rcx,128(%rsp)
  974. movq %rdi,%rbp
  # Both bits of 0x80100 set in the capability word selects the MULX helpers.
  975. movl $0x80100,%r11d
  976. andl _OPENSSL_ia32cap_P+8(%rip),%r11d
  977. cmpl $0x80100,%r11d
  978. je L$mulx_scatter
  979. movq (%rdi),%rbx
  980. call __rsaz_512_mul
  981. .byte 102,72,15,126,199    # movq %xmm0,%rdi
  982. .byte 102,72,15,126,205    # movq %xmm1,%rbp
  983. movq (%rsp),%r8
  984. movq 8(%rsp),%r9
  985. movq 16(%rsp),%r10
  986. movq 24(%rsp),%r11
  987. movq 32(%rsp),%r12
  988. movq 40(%rsp),%r13
  989. movq 48(%rsp),%r14
  990. movq 56(%rsp),%r15
  991. call __rsaz_512_reduce
  992. jmp L$mul_scatter_tail
  993. .p2align 5
  994. L$mulx_scatter:
  995. movq (%rdi),%rdx
  996. call __rsaz_512_mulx
  997. .byte 102,72,15,126,199    # movq %xmm0,%rdi
  998. .byte 102,72,15,126,205    # movq %xmm1,%rbp
  999. movq 128(%rsp),%rdx
  1000. movq (%rsp),%r8
  1001. movq 8(%rsp),%r9
  1002. movq 16(%rsp),%r10
  1003. movq 24(%rsp),%r11
  1004. movq 32(%rsp),%r12
  1005. movq 40(%rsp),%r13
  1006. movq 48(%rsp),%r14
  1007. movq 56(%rsp),%r15
  1008. call __rsaz_512_reducex
  # Add the upper half of the product.  The movq from %xmm2 sits between the adcq
  # chain and sbbq; it does not modify flags, so the carry still reaches sbbq.
  1009. L$mul_scatter_tail:
  1010. addq 64(%rsp),%r8
  1011. adcq 72(%rsp),%r9
  1012. adcq 80(%rsp),%r10
  1013. adcq 88(%rsp),%r11
  1014. adcq 96(%rsp),%r12
  1015. adcq 104(%rsp),%r13
  1016. adcq 112(%rsp),%r14
  1017. adcq 120(%rsp),%r15
  1018. .byte 102,72,15,126,214    # movq %xmm2,%rsi
  1019. sbbq %rcx,%rcx
  1020. call __rsaz_512_subtract
  # Scatter the 8 result limbs with a 128-byte stride.
  1021. movq %r8,0(%rsi)
  1022. movq %r9,128(%rsi)
  1023. movq %r10,256(%rsi)
  1024. movq %r11,384(%rsi)
  1025. movq %r12,512(%rsi)
  1026. movq %r13,640(%rsi)
  1027. movq %r14,768(%rsi)
  1028. movq %r15,896(%rsi)
  # Reload the registers pushed in the prologue and drop the frame.
  1029. leaq 128+24+48(%rsp),%rax
  1030. movq -48(%rax),%r15
  1031. movq -40(%rax),%r14
  1032. movq -32(%rax),%r13
  1033. movq -24(%rax),%r12
  1034. movq -16(%rax),%rbp
  1035. movq -8(%rax),%rbx
  1036. leaq (%rax),%rsp
  1037. L$mul_scatter4_epilogue:
  1038. .byte 0xf3,0xc3    # rep ret
  # _rsaz_512_mul_by_one -- run the 8-limb input through one of the reduce helpers
  # and store the result; no multiplication and no final conditional subtract is
  # performed here.
  # Register use on entry, as read by the code below:
  #   %rdi = output buffer (8 limbs)
  #   %rsi = input (8 limbs, loaded into %r8..%r15)
  #   %rdx = pointer for the reduce helper, moved to %rbp (presumably the modulus -- confirm)
  #   %rcx = 64-bit constant stored at 128(%rsp) (presumably n0 -- confirm)
  1039. .globl _rsaz_512_mul_by_one
  1040. .p2align 5
  1041. _rsaz_512_mul_by_one:
  1042. pushq %rbx
  1043. pushq %rbp
  1044. pushq %r12
  1045. pushq %r13
  1046. pushq %r14
  1047. pushq %r15
  1048. subq $128+24,%rsp
  1049. L$mul_by_one_body:
  1050. movl _OPENSSL_ia32cap_P+8(%rip),%eax
  1051. movq %rdx,%rbp
  1052. movq %rcx,128(%rsp)
  1053. movq (%rsi),%r8
  1054. pxor %xmm0,%xmm0
  1055. movq 8(%rsi),%r9
  1056. movq 16(%rsi),%r10
  1057. movq 24(%rsi),%r11
  1058. movq 32(%rsi),%r12
  1059. movq 40(%rsi),%r13
  1060. movq 48(%rsi),%r14
  1061. movq 56(%rsi),%r15
  # Zero bytes 0..111 of the frame.  movdqa needs %rsp 16-byte aligned here,
  # which relies on the caller honouring the ABI stack alignment.
  1062. movdqa %xmm0,(%rsp)
  1063. movdqa %xmm0,16(%rsp)
  1064. movdqa %xmm0,32(%rsp)
  1065. movdqa %xmm0,48(%rsp)
  1066. movdqa %xmm0,64(%rsp)
  1067. movdqa %xmm0,80(%rsp)
  1068. movdqa %xmm0,96(%rsp)
  # Both bits of 0x80100 set in the capability word selects __rsaz_512_reducex.
  1069. andl $0x80100,%eax
  1070. cmpl $0x80100,%eax
  1071. je L$by_one_callx
  1072. call __rsaz_512_reduce
  1073. jmp L$by_one_tail
  1074. .p2align 5
  1075. L$by_one_callx:
  1076. movq 128(%rsp),%rdx
  1077. call __rsaz_512_reducex
  # Store the reduced limbs, then reload the pushed registers and drop the frame.
  1078. L$by_one_tail:
  1079. movq %r8,(%rdi)
  1080. movq %r9,8(%rdi)
  1081. movq %r10,16(%rdi)
  1082. movq %r11,24(%rdi)
  1083. movq %r12,32(%rdi)
  1084. movq %r13,40(%rdi)
  1085. movq %r14,48(%rdi)
  1086. movq %r15,56(%rdi)
  1087. leaq 128+24+48(%rsp),%rax
  1088. movq -48(%rax),%r15
  1089. movq -40(%rax),%r14
  1090. movq -32(%rax),%r13
  1091. movq -24(%rax),%r12
  1092. movq -16(%rax),%rbp
  1093. movq -8(%rax),%rbx
  1094. leaq (%rax),%rsp
  1095. L$mul_by_one_epilogue:
  1096. .byte 0xf3,0xc3    # rep ret
  # __rsaz_512_reduce -- internal helper (mulq flavour): 8 rounds of
  # multiply-by-a-per-round-factor, accumulate and shift by one limb.
  # In:   %r8..%r15 = 8 limbs to reduce
  #       %rbp      = pointer to 8 limbs m[0..7] (presumably the modulus -- confirm)
  #       128+8(%rsp) as seen after the call = the caller's 128(%rsp) slot, the
  #       constant each round's factor is derived from (presumably n0 -- confirm)
  # Out:  %r8..%r15 = reduced limbs
  # Clobbers: %rax, %rbx, %rcx, %rdx, %rsi, flags
  1097. .p2align 5
  1098. __rsaz_512_reduce:
  # First factor: %rbx = %r8 * constant (low 64 bits).
  1099. movq %r8,%rbx
  1100. imulq 128+8(%rsp),%rbx
  1101. movq 0(%rbp),%rax
  1102. movl $8,%ecx
  1103. jmp L$reduction_loop
  1104. .p2align 5
  1105. L$reduction_loop:
  # m[0]*factor: only the high half is kept.  negq sets CF exactly when %r8 was
  # non-zero, and that flag is folded into the new %r8 in place of an explicit
  # add of the low half (presumably valid because the low limb cancels by
  # construction of the constant -- confirm).
  1106. mulq %rbx
  1107. movq 8(%rbp),%rax
  1108. negq %r8
  1109. movq %rdx,%r8
  1110. adcq $0,%r8
  1111. mulq %rbx
  1112. addq %rax,%r9
  1113. movq 16(%rbp),%rax
  1114. adcq $0,%rdx
  1115. addq %r9,%r8
  1116. movq %rdx,%r9
  1117. adcq $0,%r9
  1118. mulq %rbx
  1119. addq %rax,%r10
  1120. movq 24(%rbp),%rax
  1121. adcq $0,%rdx
  1122. addq %r10,%r9
  1123. movq %rdx,%r10
  1124. adcq $0,%r10
  1125. mulq %rbx
  1126. addq %rax,%r11
  1127. movq 32(%rbp),%rax
  1128. adcq $0,%rdx
  1129. addq %r11,%r10
  # Reload the constant; the movq leaves the pending carry for the adcq below intact.
  1130. movq 128+8(%rsp),%rsi
  1131. adcq $0,%rdx
  1132. movq %rdx,%r11
  1133. mulq %rbx
  1134. addq %rax,%r12
  1135. movq 40(%rbp),%rax
  1136. adcq $0,%rdx
  # Next round's factor: %rsi = new %r8 * constant, computed early.
  1137. imulq %r8,%rsi
  1138. addq %r12,%r11
  1139. movq %rdx,%r12
  1140. adcq $0,%r12
  1141. mulq %rbx
  1142. addq %rax,%r13
  1143. movq 48(%rbp),%rax
  1144. adcq $0,%rdx
  1145. addq %r13,%r12
  1146. movq %rdx,%r13
  1147. adcq $0,%r13
  1148. mulq %rbx
  1149. addq %rax,%r14
  1150. movq 56(%rbp),%rax
  1151. adcq $0,%rdx
  1152. addq %r14,%r13
  1153. movq %rdx,%r14
  1154. adcq $0,%r14
  1155. mulq %rbx
  1156. movq %rsi,%rbx
  1157. addq %rax,%r15
  1158. movq 0(%rbp),%rax
  1159. adcq $0,%rdx
  1160. addq %r15,%r14
  1161. movq %rdx,%r15
  1162. adcq $0,%r15
  1163. decl %ecx
  1164. jne L$reduction_loop
  1165. .byte 0xf3,0xc3    # rep ret
  # __rsaz_512_reducex -- internal helper, MULX/ADCX/ADOX flavour of the 8-round
  # reduction.
  # In:   %r8..%r15 = 8 limbs to reduce
  #       %rdx      = constant (presumably n0 -- confirm); it is immediately
  #                   multiplied by %r8 to form the first round's factor
  #       %rbp      = pointer to 8 limbs m[0..7] (presumably the modulus -- confirm)
  #       128+8(%rsp) as seen after the call = the caller's 128(%rsp) slot, the same constant
  # Out:  %r8..%r15 = reduced limbs
  # Clobbers: %rax, %rbx, %rcx, %rdx, %rsi, flags
  1166. .p2align 5
  1167. __rsaz_512_reducex:
  1168. imulq %r8,%rdx
  # %rsi = 0 serves as the zero addend; xorq also clears CF and OF for the two carry chains.
  1169. xorq %rsi,%rsi
  1170. movl $8,%ecx
  1171. jmp L$reduction_loopx
  1172. .p2align 5
  1173. L$reduction_loopx:
  # adcx chain adds the low halves, adox chain adds the high halves shifted by one limb.
  1174. movq %r8,%rbx
  1175. mulxq 0(%rbp),%rax,%r8
  1176. adcxq %rbx,%rax
  1177. adoxq %r9,%r8
  1178. mulxq 8(%rbp),%rax,%r9
  1179. adcxq %rax,%r8
  1180. adoxq %r10,%r9
  1181. mulxq 16(%rbp),%rbx,%r10
  1182. adcxq %rbx,%r9
  1183. adoxq %r11,%r10
  1184. mulxq 24(%rbp),%rbx,%r11
  1185. adcxq %rbx,%r10
  1186. adoxq %r12,%r11
  1187. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulxq 32(%rbp),%rbx,%r12
  # Compute the next round's factor without touching flags: temporarily put the
  # new %r8 in %rdx, take the low half of %r8 * constant into %rbx, then restore %rdx.
  1188. movq %rdx,%rax
  1189. movq %r8,%rdx
  1190. adcxq %rbx,%r11
  1191. adoxq %r13,%r12
  1192. mulxq 128+8(%rsp),%rbx,%rdx
  1193. movq %rax,%rdx
  1194. mulxq 40(%rbp),%rax,%r13
  1195. adcxq %rax,%r12
  1196. adoxq %r14,%r13
  1197. .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00    # mulxq 48(%rbp),%rax,%r14
  1198. adcxq %rax,%r13
  1199. adoxq %r15,%r14
  1200. mulxq 56(%rbp),%rax,%r15
  1201. movq %rbx,%rdx
  1202. adcxq %rax,%r14
  1203. adoxq %rsi,%r15
  1204. adcxq %rsi,%r15
  1205. decl %ecx
  1206. jne L$reduction_loopx
  1207. .byte 0xf3,0xc3    # rep ret
  # __rsaz_512_subtract -- internal helper: store the 8-limb value and
  # conditionally subtract the 8 limbs at %rbp from it, without branching.
  # In:   %r8..%r15 = value, %rdi = output buffer,
  #       %rbp = pointer to m[0..7], %rcx = mask (callers produce 0 or all-ones via sbbq)
  # Out:  8 limbs at (%rdi) and in %r8..%r15 = value + ((negated m) & mask)
  # The negation is built limb-wise as neg(m[0]), not(m[1..7]); that equals -m
  # only when m[0] is non-zero (presumably guaranteed by the caller -- confirm).
  1208. .p2align 5
  1209. __rsaz_512_subtract:
  1210. movq %r8,(%rdi)
  1211. movq %r9,8(%rdi)
  1212. movq %r10,16(%rdi)
  1213. movq %r11,24(%rdi)
  1214. movq %r12,32(%rdi)
  1215. movq %r13,40(%rdi)
  1216. movq %r14,48(%rdi)
  1217. movq %r15,56(%rdi)
  # Build (negated m) & mask in %r8..%r15.
  1218. movq 0(%rbp),%r8
  1219. movq 8(%rbp),%r9
  1220. negq %r8
  1221. notq %r9
  1222. andq %rcx,%r8
  1223. movq 16(%rbp),%r10
  1224. andq %rcx,%r9
  1225. notq %r10
  1226. movq 24(%rbp),%r11
  1227. andq %rcx,%r10
  1228. notq %r11
  1229. movq 32(%rbp),%r12
  1230. andq %rcx,%r11
  1231. notq %r12
  1232. movq 40(%rbp),%r13
  1233. andq %rcx,%r12
  1234. notq %r13
  1235. movq 48(%rbp),%r14
  1236. andq %rcx,%r13
  1237. notq %r14
  1238. movq 56(%rbp),%r15
  1239. andq %rcx,%r14
  1240. notq %r15
  1241. andq %rcx,%r15
  # Add the stored value back in and write the final result.
  1242. addq (%rdi),%r8
  1243. adcq 8(%rdi),%r9
  1244. adcq 16(%rdi),%r10
  1245. adcq 24(%rdi),%r11
  1246. adcq 32(%rdi),%r12
  1247. adcq 40(%rdi),%r13
  1248. adcq 48(%rdi),%r14
  1249. adcq 56(%rdi),%r15
  1250. movq %r8,(%rdi)
  1251. movq %r9,8(%rdi)
  1252. movq %r10,16(%rdi)
  1253. movq %r11,24(%rdi)
  1254. movq %r12,32(%rdi)
  1255. movq %r13,40(%rdi)
  1256. movq %r14,48(%rdi)
  1257. movq %r15,56(%rdi)
  1258. .byte 0xf3,0xc3    # rep ret
  # __rsaz_512_mul -- internal helper (mulq flavour): 8x8-limb schoolbook multiply.
  # In:   %rsi = a[0..7], %rbp = b[0..7], %rbx = b[0]
  # Out:  16-limb product written to the caller's 0..120(%rsp)
  #       (8(%rsp) here, because the return address sits on top of the stack)
  # Clobbers: %rax, %rbx, %rcx, %rdx, %rdi, %rbp, %r8..%r15, flags
  1259. .p2align 5
  1260. __rsaz_512_mul:
  1261. leaq 8(%rsp),%rdi
  # First row: a[0..7]*b[0]; limb 0 is stored, limbs 1..8 stay in %r8..%r15.
  1262. movq (%rsi),%rax
  1263. mulq %rbx
  1264. movq %rax,(%rdi)
  1265. movq 8(%rsi),%rax
  1266. movq %rdx,%r8
  1267. mulq %rbx
  1268. addq %rax,%r8
  1269. movq 16(%rsi),%rax
  1270. movq %rdx,%r9
  1271. adcq $0,%r9
  1272. mulq %rbx
  1273. addq %rax,%r9
  1274. movq 24(%rsi),%rax
  1275. movq %rdx,%r10
  1276. adcq $0,%r10
  1277. mulq %rbx
  1278. addq %rax,%r10
  1279. movq 32(%rsi),%rax
  1280. movq %rdx,%r11
  1281. adcq $0,%r11
  1282. mulq %rbx
  1283. addq %rax,%r11
  1284. movq 40(%rsi),%rax
  1285. movq %rdx,%r12
  1286. adcq $0,%r12
  1287. mulq %rbx
  1288. addq %rax,%r12
  1289. movq 48(%rsi),%rax
  1290. movq %rdx,%r13
  1291. adcq $0,%r13
  1292. mulq %rbx
  1293. addq %rax,%r13
  1294. movq 56(%rsi),%rax
  1295. movq %rdx,%r14
  1296. adcq $0,%r14
  1297. mulq %rbx
  1298. addq %rax,%r14
  1299. movq (%rsi),%rax
  1300. movq %rdx,%r15
  1301. adcq $0,%r15
  # Seven more rows: advance to b[1] and to the second output limb.
  1302. leaq 8(%rbp),%rbp
  1303. leaq 8(%rdi),%rdi
  1304. movl $7,%ecx
  1305. jmp L$oop_mul
  1306. .p2align 5
  # Per iteration: a[0..7]*b[i] is added to the running window %r8..%r15; the
  # finished low limb is stored at (%rdi) and the window slides up by one limb.
  1307. L$oop_mul:
  1308. movq (%rbp),%rbx
  1309. mulq %rbx
  1310. addq %rax,%r8
  1311. movq 8(%rsi),%rax
  1312. movq %r8,(%rdi)
  1313. movq %rdx,%r8
  1314. adcq $0,%r8
  1315. mulq %rbx
  1316. addq %rax,%r9
  1317. movq 16(%rsi),%rax
  1318. adcq $0,%rdx
  1319. addq %r9,%r8
  1320. movq %rdx,%r9
  1321. adcq $0,%r9
  1322. mulq %rbx
  1323. addq %rax,%r10
  1324. movq 24(%rsi),%rax
  1325. adcq $0,%rdx
  1326. addq %r10,%r9
  1327. movq %rdx,%r10
  1328. adcq $0,%r10
  1329. mulq %rbx
  1330. addq %rax,%r11
  1331. movq 32(%rsi),%rax
  1332. adcq $0,%rdx
  1333. addq %r11,%r10
  1334. movq %rdx,%r11
  1335. adcq $0,%r11
  1336. mulq %rbx
  1337. addq %rax,%r12
  1338. movq 40(%rsi),%rax
  1339. adcq $0,%rdx
  1340. addq %r12,%r11
  1341. movq %rdx,%r12
  1342. adcq $0,%r12
  1343. mulq %rbx
  1344. addq %rax,%r13
  1345. movq 48(%rsi),%rax
  1346. adcq $0,%rdx
  1347. addq %r13,%r12
  1348. movq %rdx,%r13
  1349. adcq $0,%r13
  1350. mulq %rbx
  1351. addq %rax,%r14
  1352. movq 56(%rsi),%rax
  1353. adcq $0,%rdx
  1354. addq %r14,%r13
  1355. movq %rdx,%r14
  # leaq advances the b pointer without disturbing the carry consumed by the next adcq.
  1356. leaq 8(%rbp),%rbp
  1357. adcq $0,%r14
  1358. mulq %rbx
  1359. addq %rax,%r15
  1360. movq (%rsi),%rax
  1361. adcq $0,%rdx
  1362. addq %r15,%r14
  1363. movq %rdx,%r15
  1364. adcq $0,%r15
  1365. leaq 8(%rdi),%rdi
  1366. decl %ecx
  1367. jnz L$oop_mul
  # Flush the top 8 limbs of the product.
  1368. movq %r8,(%rdi)
  1369. movq %r9,8(%rdi)
  1370. movq %r10,16(%rdi)
  1371. movq %r11,24(%rdi)
  1372. movq %r12,32(%rdi)
  1373. movq %r13,40(%rdi)
  1374. movq %r14,48(%rdi)
  1375. movq %r15,56(%rdi)
  1376. .byte 0xf3,0xc3    # rep ret
  # __rsaz_512_mulx: 8x8-limb (512-bit by 512-bit) multiply written with
  # mulx and the adcx/adox instruction pair.
  #
  # Operand order used below: "mulxq mem,lo,hi" computes hi:lo = %rdx * mem
  # and leaves the flags untouched. adcx adds with carry through CF only and
  # adox through OF only, so two carry chains can be interleaved.
  #
  # Inputs, as read by the code below:
  #   %rdx =  multiplier limb for the first pass (implicit mulx source; it is
  #           read before anything here writes it -- presumably b[0], loaded
  #           by the caller; confirm at the call sites)
  #   %rsi -> operand a, eight 64-bit limbs at 0..56(%rsi), lowest limb first
  #   %rbp -> operand b; limbs 8(%rbp)..56(%rbp) are loaded here
  # Output:
  #   sixteen limbs stored at 8(%rsp)..135(%rsp), offsets taken from %rsp as
  #   it stands inside this routine
  # Clobbers: %rax %rbx %rcx %rdx %rdi %r8-%r15 and the flags.
  #   %rsi and %rbp are left unchanged.
  #
  # NOTE(review): nothing before the first adcq below writes the flags (mulx
  # and movq leave them alone), so that adcq consumes CF exactly as it was on
  # entry. The routine appears to rely on callers arriving with CF clear --
  # confirm at every call site.
  1377. .p2align 5
  1378. __rsaz_512_mulx:
  # First pass: a[0..7] * %rdx. Low halves alternate between %rbx and %rax so
  # each can be added into the previous high half while the next mulx issues.
  1379. mulxq (%rsi),%rbx,%r8
  # %rcx counts -6..-1: six loop passes follow.
  1380. movq $-6,%rcx
  1381. mulxq 8(%rsi),%rax,%r9
  # Lowest limb of the result.
  1382. movq %rbx,8(%rsp)
  1383. mulxq 16(%rsi),%rbx,%r10
  1384. adcq %rax,%r8
  1385. mulxq 24(%rsi),%rax,%r11
  1386. adcq %rbx,%r9
  1387. mulxq 32(%rsi),%rbx,%r12
  1388. adcq %rax,%r10
  1389. mulxq 40(%rsi),%rax,%r13
  1390. adcq %rbx,%r11
  1391. mulxq 48(%rsi),%rbx,%r14
  1392. adcq %rax,%r12
  1393. mulxq 56(%rsi),%rax,%r15
  # Multiplier for the first loop pass; movq does not disturb the adcq chain.
  1394. movq 8(%rbp),%rdx
  1395. adcq %rbx,%r13
  1396. adcq %rax,%r14
  1397. adcq $0,%r15
  # %rdi = 0 for the rest of the routine (the zero addend that closes each
  # carry chain). The xor also clears CF and OF, so both chains start empty.
  1398. xorq %rdi,%rdi
  1399. jmp L$oop_mulx
  1400. .p2align 5
  # One pass per multiplier limb. On entry %r8..%r15 hold the running window
  # and %rdx the current limb of b. The CF chain (adcx) adds the low halves of
  # the new products; the OF chain (adox) adds the old window into the new
  # high halves, which also slides the window down one register. %rbx
  # collects the finished low limb.
  1401. L$oop_mulx:
  1402. movq %r8,%rbx
  1403. mulxq (%rsi),%rax,%r8
  1404. adcxq %rax,%rbx
  1405. adoxq %r9,%r8
  1406. mulxq 8(%rsi),%rax,%r9
  1407. adcxq %rax,%r8
  1408. adoxq %r10,%r9
  1409. mulxq 16(%rsi),%rax,%r10
  1410. adcxq %rax,%r9
  1411. adoxq %r11,%r10
  1412. mulxq 24(%rsi),%rax,%r11
  1413. adcxq %rax,%r10
  1414. adoxq %r12,%r11
  # Hand-encoded instruction: mulxq 32(%rsi),%rax,%r12 carrying a 0x3e prefix
  # and a 32-bit displacement (presumably chosen for code layout -- the reason
  # is not visible here).
  1415. .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
  1416. adcxq %rax,%r11
  1417. adoxq %r13,%r12
  1418. mulxq 40(%rsi),%rax,%r13
  1419. adcxq %rax,%r12
  1420. adoxq %r14,%r13
  1421. mulxq 48(%rsi),%rax,%r14
  1422. adcxq %rax,%r13
  1423. adoxq %r15,%r14
  1424. mulxq 56(%rsi),%rax,%r15
  # Fetch the multiplier for the next pass: with %rcx = -6..-1 this reads
  # 16(%rbp)..56(%rbp).
  1425. movq 64(%rbp,%rcx,8),%rdx
  # Store the finished limb: with %rcx = -6..-1 this writes 16(%rsp)..56(%rsp).
  1426. movq %rbx,8+64-8(%rsp,%rcx,8)
  1427. adcxq %rax,%r14
  # Close both chains into the top limb using the zero in %rdi.
  1428. adoxq %rdi,%r15
  1429. adcxq %rdi,%r15
  # incq sets ZF for the branch and does not modify CF.
  1430. incq %rcx
  1431. jnz L$oop_mulx
  # Final pass, using the multiplier loaded by the last loop iteration
  # (56(%rbp)). Same steps as a loop pass, but no further multiplier is
  # fetched and the low limb is stored after the chains are closed.
  1432. movq %r8,%rbx
  1433. mulxq (%rsi),%rax,%r8
  1434. adcxq %rax,%rbx
  1435. adoxq %r9,%r8
  # Hand-encoded instruction: mulxq 8(%rsi),%rax,%r9 with a 32-bit displacement.
  1436. .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
  1437. adcxq %rax,%r8
  1438. adoxq %r10,%r9
  # Hand-encoded instruction: mulxq 16(%rsi),%rax,%r10 with a 32-bit displacement.
  1439. .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
  1440. adcxq %rax,%r9
  1441. adoxq %r11,%r10
  1442. mulxq 24(%rsi),%rax,%r11
  1443. adcxq %rax,%r10
  1444. adoxq %r12,%r11
  1445. mulxq 32(%rsi),%rax,%r12
  1446. adcxq %rax,%r11
  1447. adoxq %r13,%r12
  1448. mulxq 40(%rsi),%rax,%r13
  1449. adcxq %rax,%r12
  1450. adoxq %r14,%r13
  # Hand-encoded instruction: mulxq 48(%rsi),%rax,%r14 with a 32-bit displacement.
  1451. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
  1452. adcxq %rax,%r13
  1453. adoxq %r15,%r14
  # Hand-encoded instruction: mulxq 56(%rsi),%rax,%r15 with a 32-bit displacement.
  1454. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
  1455. adcxq %rax,%r14
  1456. adoxq %rdi,%r15
  1457. adcxq %rdi,%r15
  # Flush the last low limb (64(%rsp)) and the upper eight limbs of the
  # product (72(%rsp)..128(%rsp)).
  1458. movq %rbx,8+64-8(%rsp)
  1459. movq %r8,8+64(%rsp)
  1460. movq %r9,8+64+8(%rsp)
  1461. movq %r10,8+64+16(%rsp)
  1462. movq %r11,8+64+24(%rsp)
  1463. movq %r12,8+64+32(%rsp)
  1464. movq %r13,8+64+40(%rsp)
  1465. movq %r14,8+64+48(%rsp)
  1466. movq %r15,8+64+56(%rsp)
  # Return, emitted as raw bytes: 0xf3 (rep prefix) followed by 0xc3 (ret).
  1467. .byte 0xf3,0xc3
  # _rsaz_512_scatter4: copy an 8-limb value into one column of an
  # interleaved table.
  #
  # Inputs, as read by the code below:
  #   %rdi -> base of the table
  #   %rsi -> source value, eight consecutive 64-bit limbs
  #   %rdx =  column index; scaled by 8 and added to the table base
  # Effect: limb j of the source is written to table + 8*index + 128*j for
  #   j = 0..7, so consecutive limbs of one value lie 128 bytes apart.
  # Clobbers: %rax %rsi %rdi %r9 and the flags.
  # NOTE(review): presumably called from C as (table, value, index) under the
  #   System V AMD64 convention, with index in 0..15 to match the 128-byte
  #   row -- confirm against the prototype; no range check is done here.
  1468. .globl _rsaz_512_scatter4
  1469. .p2align 4
  1470. _rsaz_512_scatter4:
  # Point %rdi at the chosen column inside the first row.
  1471. leaq (%rdi,%rdx,8),%rdi
  # Eight limbs to copy.
  1472. movl $8,%r9d
  1473. jmp L$oop_scatter
  1474. .p2align 4
  1475. L$oop_scatter:
  1476. movq (%rsi),%rax
  1477. leaq 8(%rsi),%rsi
  1478. movq %rax,(%rdi)
  # Same column, next 128-byte row.
  1479. leaq 128(%rdi),%rdi
  1480. decl %r9d
  1481. jnz L$oop_scatter
  # Return, emitted as raw bytes: 0xf3 (rep prefix) followed by 0xc3 (ret).
  1482. .byte 0xf3,0xc3
  # _rsaz_512_gather4: read one 8-limb value back out of an interleaved table
  # made of 128-byte rows (sixteen 64-bit slots per row, one row per limb).
  #
  # Inputs, as read by the code below:
  #   %rdi -> destination, eight consecutive 64-bit limbs
  #   %rsi -> table; rows are loaded with movdqa, so the address must be
  #           16-byte aligned
  #   %edx =  index of the wanted slot; masks are built for slots 0..15
  # Method: for every row all 128 bytes are loaded, each 64-bit slot is ANDed
  #   with a mask that is all-ones only where the slot number equals the
  #   index, and the results are ORed together. No load address depends on
  #   the index.
  # Clobbers: %rsi %rdi %r9 %xmm0-%xmm15 and the flags.
  # NOTE(review): presumably called from C as (dest, table, index) under the
  #   System V AMD64 convention -- confirm against the prototype.
  1483. .globl _rsaz_512_gather4
  1484. .p2align 4
  1485. _rsaz_512_gather4:
  # %xmm8 = index, broadcast to all four dwords by the pshufd below.
  1486. movd %edx,%xmm8
  # %xmm1 = step {2,2,2,2}; %xmm0 = slot numbers {0,0,1,1} (two dwords per
  # 64-bit slot, so a dword compare yields a full 64-bit mask).
  1487. movdqa L$inc+16(%rip),%xmm1
  1488. movdqa L$inc(%rip),%xmm0
  1489. pshufd $0,%xmm8,%xmm8
  # Build slot numbers {2,2,3,3} .. {14,14,15,15} in %xmm1..%xmm7 by repeated
  # addition of the step, and turn each register into a mask with pcmpeqd as
  # soon as its value has been used to derive the next one. %xmm7 keeps a
  # copy of the step until the last paddd.
  1490. movdqa %xmm1,%xmm7
  1491. movdqa %xmm1,%xmm2
  1492. paddd %xmm0,%xmm1
  1493. pcmpeqd %xmm8,%xmm0
  1494. movdqa %xmm7,%xmm3
  1495. paddd %xmm1,%xmm2
  1496. pcmpeqd %xmm8,%xmm1
  1497. movdqa %xmm7,%xmm4
  1498. paddd %xmm2,%xmm3
  1499. pcmpeqd %xmm8,%xmm2
  1500. movdqa %xmm7,%xmm5
  1501. paddd %xmm3,%xmm4
  1502. pcmpeqd %xmm8,%xmm3
  1503. movdqa %xmm7,%xmm6
  1504. paddd %xmm4,%xmm5
  1505. pcmpeqd %xmm8,%xmm4
  1506. paddd %xmm5,%xmm6
  1507. pcmpeqd %xmm8,%xmm5
  1508. paddd %xmm6,%xmm7
  1509. pcmpeqd %xmm8,%xmm6
  1510. pcmpeqd %xmm8,%xmm7
  # Eight rows, one output limb per row. %xmm8 is free again from here on.
  1511. movl $8,%r9d
  1512. jmp L$oop_gather
  1513. .p2align 4
  1514. L$oop_gather:
  # Load the whole 128-byte row and mask every slot.
  1515. movdqa 0(%rsi),%xmm8
  1516. movdqa 16(%rsi),%xmm9
  1517. movdqa 32(%rsi),%xmm10
  1518. movdqa 48(%rsi),%xmm11
  1519. pand %xmm0,%xmm8
  1520. movdqa 64(%rsi),%xmm12
  1521. pand %xmm1,%xmm9
  1522. movdqa 80(%rsi),%xmm13
  1523. pand %xmm2,%xmm10
  1524. movdqa 96(%rsi),%xmm14
  1525. pand %xmm3,%xmm11
  1526. movdqa 112(%rsi),%xmm15
  1527. leaq 128(%rsi),%rsi
  1528. pand %xmm4,%xmm12
  1529. pand %xmm5,%xmm13
  1530. pand %xmm6,%xmm14
  1531. pand %xmm7,%xmm15
  # OR the sixteen masked slots down to a single register.
  1532. por %xmm10,%xmm8
  1533. por %xmm11,%xmm9
  1534. por %xmm12,%xmm8
  1535. por %xmm13,%xmm9
  1536. por %xmm14,%xmm8
  1537. por %xmm15,%xmm9
  1538. por %xmm9,%xmm8
  # pshufd $0x4e swaps the two 64-bit halves; ORing folds the upper half onto
  # the lower one so the selected limb ends up in the low 64 bits.
  1539. pshufd $0x4e,%xmm8,%xmm9
  1540. por %xmm9,%xmm8
  1541. movq %xmm8,(%rdi)
  1542. leaq 8(%rdi),%rdi
  1543. decl %r9d
  1544. jnz L$oop_gather
  # Return, emitted as raw bytes: 0xf3 (rep prefix) followed by 0xc3 (ret).
  1545. .byte 0xf3,0xc3
  # End-of-function marker label.
  1546. L$SEH_end_rsaz_512_gather4:
  # L$inc: 64-byte-aligned dword constants loaded by _rsaz_512_gather4.
  #   L$inc    = {0,0,1,1}: slot numbers of the first two 64-bit slots, each
  #              repeated for both dwords of its slot
  #   L$inc+16 = {2,2,2,2}: step added to get the slot numbers of the next
  #              pair of slots
  1547. .p2align 6
  1548. L$inc:
  1549. .long 0,0, 1,1
  1550. .long 2,2, 2,2