x86_64-mont.s 20 KB


  1. .text
  # bn_mul_mont -- global entry point. The ident string at the end of this
  # file labels the code "Montgomery Multiplication for x86_64".
  #
  # Register use on entry, as read by the code below:
  #   %rdi  destination word array (written in .Lsub / .Lcopy)
  #   %rsi  first operand word array
  #   %rdx  second operand word array (copied to %r12 on this path)
  #   %rcx  word array multiplied by the per-row factor and subtracted at
  #         the end (presumably the modulus -- confirm against the C caller)
  #   %r8   pointer to one word that is loaded and multiplied into the low
  #         word of every row (presumably the Montgomery n0 constant)
  #   %r9d  word count (zero-extended to 64 bits first)
  # Returns with %rax = 1. %rbx, %rbp and %r12-%r15 are saved and restored.
  2. .globl bn_mul_mont
  3. .type bn_mul_mont,@function
  4. .align 16
  5. bn_mul_mont:
  6. .cfi_startproc
  # Zero-extend the count and keep the entry %rsp in %rax; the CFA is
  # tracked through %rax until the frame has been built.
  7. movl %r9d,%r9d
  8. movq %rsp,%rax
  9. .cfi_def_cfa_register %rax
  # Dispatch on the word count and operands:
  #   count not a multiple of 4, or count < 8  -> .Lmul_enter (below)
  #   %rdx != %rsi                             -> .Lmul4x_enter
  #   %rdx == %rsi and count a multiple of 8   -> .Lsqr8x_enter
  #   otherwise                                -> .Lmul4x_enter
  # %r11d carries the word at OPENSSL_ia32cap_P+8 to .Lmul4x_enter.
  10. testl $3,%r9d
  11. jnz .Lmul_enter
  12. cmpl $8,%r9d
  13. jb .Lmul_enter
  14. movl OPENSSL_ia32cap_P+8(%rip),%r11d
  15. cmpq %rsi,%rdx
  16. jne .Lmul4x_enter
  17. testl $7,%r9d
  18. jz .Lsqr8x_enter
  19. jmp .Lmul4x_enter
  20. .align 16
  # One-word-at-a-time path. Save the callee-saved registers; the CFI
  # offsets are relative to the entry %rsp still held in %rax.
  21. .Lmul_enter:
  22. pushq %rbx
  23. .cfi_offset %rbx,-16
  24. pushq %rbp
  25. .cfi_offset %rbp,-24
  26. pushq %r12
  27. .cfi_offset %r12,-32
  28. pushq %r13
  29. .cfi_offset %r13,-40
  30. pushq %r14
  31. .cfi_offset %r14,-48
  32. pushq %r15
  33. .cfi_offset %r15,-56
  # Reserve the scratch vector: %r10 = (%rsp - 8*count - 16) rounded down
  # to a 1024-byte boundary. %r9 is negated only for the address
  # computation and restored straight away.
  34. negq %r9
  35. movq %rsp,%r11
  36. leaq -16(%rsp,%r9,8),%r10
  37. negq %r9
  38. andq $-1024,%r10
  # Move %rsp down to %r10 in 4096-byte steps, reading one word at each
  # step (presumably so every intervening stack page is touched in order).
  39. subq %r10,%r11
  40. andq $-4096,%r11
  41. leaq (%r10,%r11,1),%rsp
  42. movq (%rsp),%r11
  43. cmpq %r10,%rsp
  44. ja .Lmul_page_walk
  45. jmp .Lmul_page_walk_done
  46. .align 16
  47. .Lmul_page_walk:
  48. leaq -4096(%rsp),%rsp
  49. movq (%rsp),%r11
  50. cmpq %r10,%rsp
  51. ja .Lmul_page_walk
  52. .Lmul_page_walk_done:
  # Store the entry %rsp in scratch slot count+1. The escape below is a
  # DW_CFA_def_cfa_expression that reads this slot back:
  # CFA = *(%rsp + 8 + %r9*8) + 8.
  53. movq %rax,8(%rsp,%r9,8)
  54. .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
  55. .Lmul_body:
  # First row. %r12 = second operand, %r8 = the word behind the %r8
  # pointer, %rbx = word 0 of the second operand, %r14 = row index,
  # %r15 = column index.
  56. movq %rdx,%r12
  57. movq (%r8),%r8
  58. movq (%r12),%rbx
  59. movq (%rsi),%rax
  60. xorq %r14,%r14
  61. xorq %r15,%r15
  # Row factor %rbp = low64((%rsi)[0] * %rbx) * %r8. Adding (%rcx)[0] * %rbp
  # to %r10 leaves only a carry, which is moved to %r13.
  62. movq %r8,%rbp
  63. mulq %rbx
  64. movq %rax,%r10
  65. movq (%rcx),%rax
  66. imulq %r10,%rbp
  67. movq %rdx,%r11
  68. mulq %rbp
  69. addq %rax,%r10
  70. movq 8(%rsi),%rax
  71. adcq $0,%rdx
  72. movq %rdx,%r13
  73. leaq 1(%r15),%r15
  74. jmp .L1st_enter
  75. .align 16
  # .L1st: for each column, add (%rsi)[j]*%rbx and (%rcx)[j]*%rbp with
  # carries in %r11/%r10 and %r13, storing the sum at -16(%rsp,%r15,8).
  76. .L1st:
  77. addq %rax,%r13
  78. movq (%rsi,%r15,8),%rax
  79. adcq $0,%rdx
  80. addq %r11,%r13
  81. movq %r10,%r11
  82. adcq $0,%rdx
  83. movq %r13,-16(%rsp,%r15,8)
  84. movq %rdx,%r13
  85. .L1st_enter:
  86. mulq %rbx
  87. addq %rax,%r11
  88. movq (%rcx,%r15,8),%rax
  89. adcq $0,%rdx
  90. leaq 1(%r15),%r15
  91. movq %rdx,%r10
  92. mulq %rbp
  93. cmpq %r9,%r15
  94. jne .L1st
  # Row tail: flush the last column, then store the two top words of the
  # row at -8(%rsp,%r9,8) and (%rsp,%r9,8). %rax is reloaded with
  # (%rsi)[0] for the next row.
  95. addq %rax,%r13
  96. movq (%rsi),%rax
  97. adcq $0,%rdx
  98. addq %r11,%r13
  99. adcq $0,%rdx
  100. movq %r13,-16(%rsp,%r15,8)
  101. movq %rdx,%r13
  102. movq %r10,%r11
  103. xorq %rdx,%rdx
  104. addq %r11,%r13
  105. adcq $0,%rdx
  106. movq %r13,-8(%rsp,%r9,8)
  107. movq %rdx,(%rsp,%r9,8)
  108. leaq 1(%r14),%r14
  109. jmp .Louter
  110. .align 16
  # .Louter: rows 1..count-1. Same shape as the first row, except that
  # the scratch vector written by the previous row is added in as well.
  111. .Louter:
  112. movq (%r12,%r14,8),%rbx
  113. xorq %r15,%r15
  114. movq %r8,%rbp
  115. movq (%rsp),%r10
  116. mulq %rbx
  117. addq %rax,%r10
  118. movq (%rcx),%rax
  119. adcq $0,%rdx
  120. imulq %r10,%rbp
  121. movq %rdx,%r11
  122. mulq %rbp
  123. addq %rax,%r10
  124. movq 8(%rsi),%rax
  125. adcq $0,%rdx
  126. movq 8(%rsp),%r10
  127. movq %rdx,%r13
  128. leaq 1(%r15),%r15
  129. jmp .Linner_enter
  130. .align 16
  # .Linner: %r10 holds the scratch word fetched ahead for this column.
  131. .Linner:
  132. addq %rax,%r13
  133. movq (%rsi,%r15,8),%rax
  134. adcq $0,%rdx
  135. addq %r10,%r13
  136. movq (%rsp,%r15,8),%r10
  137. adcq $0,%rdx
  138. movq %r13,-16(%rsp,%r15,8)
  139. movq %rdx,%r13
  140. .Linner_enter:
  141. mulq %rbx
  142. addq %rax,%r11
  143. movq (%rcx,%r15,8),%rax
  144. adcq $0,%rdx
  145. addq %r11,%r10
  146. movq %rdx,%r11
  147. adcq $0,%r11
  148. leaq 1(%r15),%r15
  149. mulq %rbp
  150. cmpq %r9,%r15
  151. jne .Linner
  # Row tail: also folds in the previous row's top word, (%rsp,%r15,8).
  152. addq %rax,%r13
  153. movq (%rsi),%rax
  154. adcq $0,%rdx
  155. addq %r10,%r13
  156. movq (%rsp,%r15,8),%r10
  157. adcq $0,%rdx
  158. movq %r13,-16(%rsp,%r15,8)
  159. movq %rdx,%r13
  160. xorq %rdx,%rdx
  161. addq %r11,%r13
  162. adcq $0,%rdx
  163. addq %r10,%r13
  164. adcq $0,%rdx
  165. movq %r13,-8(%rsp,%r9,8)
  166. movq %rdx,(%rsp,%r9,8)
  167. leaq 1(%r14),%r14
  168. cmpq %r9,%r14
  169. jb .Louter
  # Final subtraction: destination = scratch - (%rcx) array, word by word.
  # The xorq clears CF for the first sbbq; movq, leaq and decq leave CF
  # untouched, so the borrow propagates through the loop.
  170. xorq %r14,%r14
  171. movq (%rsp),%rax
  172. movq %r9,%r15
  173. .align 16
  174. .Lsub: sbbq (%rcx,%r14,8),%rax
  175. movq %rax,(%rdi,%r14,8)
  176. movq 8(%rsp,%r14,8),%rax
  177. leaq 1(%r14),%r14
  178. decq %r15
  179. jnz .Lsub
  # %rax = top scratch word minus the final borrow (expected to be 0 or
  # all-ones); %rbx is its complement.
  180. sbbq $0,%rax
  181. movq $-1,%rbx
  182. xorq %rax,%rbx
  183. xorq %r14,%r14
  184. movq %r9,%r15
  # .Lcopy: branch-free select per word. The destination keeps the
  # difference when %rax is 0 and receives the unsubtracted scratch word
  # when %rax is all-ones. Each scratch word is overwritten with the
  # value of %r9 as it is consumed.
  185. .Lcopy:
  186. movq (%rdi,%r14,8),%rcx
  187. movq (%rsp,%r14,8),%rdx
  188. andq %rbx,%rcx
  189. andq %rax,%rdx
  190. movq %r9,(%rsp,%r14,8)
  191. orq %rcx,%rdx
  192. movq %rdx,(%rdi,%r14,8)
  193. leaq 1(%r14),%r14
  194. subq $1,%r15
  195. jnz .Lcopy
  # Epilogue: fetch the entry %rsp saved above, restore the callee-saved
  # registers relative to it, and return 1.
  196. movq 8(%rsp,%r9,8),%rsi
  197. .cfi_def_cfa %rsi,8
  198. movq $1,%rax
  199. movq -48(%rsi),%r15
  200. .cfi_restore %r15
  201. movq -40(%rsi),%r14
  202. .cfi_restore %r14
  203. movq -32(%rsi),%r13
  204. .cfi_restore %r13
  205. movq -24(%rsi),%r12
  206. .cfi_restore %r12
  207. movq -16(%rsi),%rbp
  208. .cfi_restore %rbp
  209. movq -8(%rsi),%rbx
  210. .cfi_restore %rbx
  211. leaq (%rsi),%rsp
  212. .cfi_def_cfa_register %rsp
  213. .Lmul_epilogue:
  # Bytes 0xf3,0xc3 encode "rep ret".
  214. .byte 0xf3,0xc3
  215. .cfi_endproc
  216. .size bn_mul_mont,.-bn_mul_mont
  # bn_mul4x_mont -- multiply path that handles four words per loop
  # iteration. No .globl directive for this symbol is visible here;
  # bn_mul_mont branches directly to .Lmul4x_enter with the same argument
  # registers and with %r11d holding the word at OPENSSL_ia32cap_P+8.
  # NOTE(review): code entering at the bn_mul4x_mont symbol itself would
  # reach the %r11d test without that load.
  217. .type bn_mul4x_mont,@function
  218. .align 16
  219. bn_mul4x_mont:
  220. .cfi_startproc
  221. movl %r9d,%r9d
  222. movq %rsp,%rax
  223. .cfi_def_cfa_register %rax
  224. .Lmul4x_enter:
  # If both bits of mask 0x80100 are set in the capability word, hand
  # over to .Lmulx4x_enter. That path uses mulx and adcx/adox, so these
  # are presumably the BMI2 and ADX feature bits -- confirm.
  225. andl $0x80100,%r11d
  226. cmpl $0x80100,%r11d
  227. je .Lmulx4x_enter
  228. pushq %rbx
  229. .cfi_offset %rbx,-16
  230. pushq %rbp
  231. .cfi_offset %rbp,-24
  232. pushq %r12
  233. .cfi_offset %r12,-32
  234. pushq %r13
  235. .cfi_offset %r13,-40
  236. pushq %r14
  237. .cfi_offset %r14,-48
  238. pushq %r15
  239. .cfi_offset %r15,-56
  # Scratch vector: %r10 = (%rsp - 8*count - 32) rounded down to 1024.
  240. negq %r9
  241. movq %rsp,%r11
  242. leaq -32(%rsp,%r9,8),%r10
  243. negq %r9
  244. andq $-1024,%r10
  # Move %rsp down to %r10 in 4096-byte steps, reading a word at each step.
  245. subq %r10,%r11
  246. andq $-4096,%r11
  247. leaq (%r10,%r11,1),%rsp
  248. movq (%rsp),%r11
  249. cmpq %r10,%rsp
  250. ja .Lmul4x_page_walk
  251. jmp .Lmul4x_page_walk_done
  252. .Lmul4x_page_walk:
  253. leaq -4096(%rsp),%rsp
  254. movq (%rsp),%r11
  255. cmpq %r10,%rsp
  256. ja .Lmul4x_page_walk
  257. .Lmul4x_page_walk_done:
  # Entry %rsp goes to slot count+1; the escape encodes
  # CFA = *(%rsp + 8 + %r9*8) + 8.
  258. movq %rax,8(%rsp,%r9,8)
  259. .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
  260. .Lmul4x_body:
  # %rdi is reused as an accumulator below, so the destination pointer is
  # parked in slot count+2 and reloaded before .Lsub4x.
  261. movq %rdi,16(%rsp,%r9,8)
  262. movq %rdx,%r12
  263. movq (%r8),%r8
  264. movq (%r12),%rbx
  265. movq (%rsi),%rax
  266. xorq %r14,%r14
  267. xorq %r15,%r15
  # First row: %rbx = word 0 of the second operand,
  # %rbp = low64((%rsi)[0] * %rbx) * %r8. Words 0 and 1 are handled here
  # before the unrolled loop starts.
  268. movq %r8,%rbp
  269. mulq %rbx
  270. movq %rax,%r10
  271. movq (%rcx),%rax
  272. imulq %r10,%rbp
  273. movq %rdx,%r11
  274. mulq %rbp
  275. addq %rax,%r10
  276. movq 8(%rsi),%rax
  277. adcq $0,%rdx
  278. movq %rdx,%rdi
  279. mulq %rbx
  280. addq %rax,%r11
  281. movq 8(%rcx),%rax
  282. adcq $0,%rdx
  283. movq %rdx,%r10
  284. mulq %rbp
  285. addq %rax,%rdi
  286. movq 16(%rsi),%rax
  287. adcq $0,%rdx
  288. addq %r11,%rdi
  289. leaq 4(%r15),%r15
  290. adcq $0,%rdx
  291. movq %rdi,(%rsp)
  292. movq %rdx,%r13
  293. jmp .L1st4x
  294. .align 16
  # .L1st4x: four columns per iteration. Carries alternate between
  # %r10/%r11 for the %rbx products and %r13/%rdi for the %rbp products.
  # %r15 advances by 4 in mid-iteration, which is why the displacements
  # change after the leaq.
  295. .L1st4x:
  296. mulq %rbx
  297. addq %rax,%r10
  298. movq -16(%rcx,%r15,8),%rax
  299. adcq $0,%rdx
  300. movq %rdx,%r11
  301. mulq %rbp
  302. addq %rax,%r13
  303. movq -8(%rsi,%r15,8),%rax
  304. adcq $0,%rdx
  305. addq %r10,%r13
  306. adcq $0,%rdx
  307. movq %r13,-24(%rsp,%r15,8)
  308. movq %rdx,%rdi
  309. mulq %rbx
  310. addq %rax,%r11
  311. movq -8(%rcx,%r15,8),%rax
  312. adcq $0,%rdx
  313. movq %rdx,%r10
  314. mulq %rbp
  315. addq %rax,%rdi
  316. movq (%rsi,%r15,8),%rax
  317. adcq $0,%rdx
  318. addq %r11,%rdi
  319. adcq $0,%rdx
  320. movq %rdi,-16(%rsp,%r15,8)
  321. movq %rdx,%r13
  322. mulq %rbx
  323. addq %rax,%r10
  324. movq (%rcx,%r15,8),%rax
  325. adcq $0,%rdx
  326. movq %rdx,%r11
  327. mulq %rbp
  328. addq %rax,%r13
  329. movq 8(%rsi,%r15,8),%rax
  330. adcq $0,%rdx
  331. addq %r10,%r13
  332. adcq $0,%rdx
  333. movq %r13,-8(%rsp,%r15,8)
  334. movq %rdx,%rdi
  335. mulq %rbx
  336. addq %rax,%r11
  337. movq 8(%rcx,%r15,8),%rax
  338. adcq $0,%rdx
  339. leaq 4(%r15),%r15
  340. movq %rdx,%r10
  341. mulq %rbp
  342. addq %rax,%rdi
  343. movq -16(%rsi,%r15,8),%rax
  344. adcq $0,%rdx
  345. addq %r11,%rdi
  346. adcq $0,%rdx
  347. movq %rdi,-32(%rsp,%r15,8)
  348. movq %rdx,%r13
  349. cmpq %r9,%r15
  350. jb .L1st4x
  # Row tail: last two columns, then the row's two top words. %rax is
  # reloaded with (%rsi)[0] for the next row.
  351. mulq %rbx
  352. addq %rax,%r10
  353. movq -16(%rcx,%r15,8),%rax
  354. adcq $0,%rdx
  355. movq %rdx,%r11
  356. mulq %rbp
  357. addq %rax,%r13
  358. movq -8(%rsi,%r15,8),%rax
  359. adcq $0,%rdx
  360. addq %r10,%r13
  361. adcq $0,%rdx
  362. movq %r13,-24(%rsp,%r15,8)
  363. movq %rdx,%rdi
  364. mulq %rbx
  365. addq %rax,%r11
  366. movq -8(%rcx,%r15,8),%rax
  367. adcq $0,%rdx
  368. movq %rdx,%r10
  369. mulq %rbp
  370. addq %rax,%rdi
  371. movq (%rsi),%rax
  372. adcq $0,%rdx
  373. addq %r11,%rdi
  374. adcq $0,%rdx
  375. movq %rdi,-16(%rsp,%r15,8)
  376. movq %rdx,%r13
  377. xorq %rdi,%rdi
  378. addq %r10,%r13
  379. adcq $0,%rdi
  380. movq %r13,-8(%rsp,%r15,8)
  381. movq %rdi,(%rsp,%r15,8)
  382. leaq 1(%r14),%r14
  383. .align 4
  # .Louter4x: rows 1..count-1; the previous row's scratch words are
  # added into the running sums.
  384. .Louter4x:
  385. movq (%r12,%r14,8),%rbx
  386. xorq %r15,%r15
  387. movq (%rsp),%r10
  388. movq %r8,%rbp
  389. mulq %rbx
  390. addq %rax,%r10
  391. movq (%rcx),%rax
  392. adcq $0,%rdx
  393. imulq %r10,%rbp
  394. movq %rdx,%r11
  395. mulq %rbp
  396. addq %rax,%r10
  397. movq 8(%rsi),%rax
  398. adcq $0,%rdx
  399. movq %rdx,%rdi
  400. mulq %rbx
  401. addq %rax,%r11
  402. movq 8(%rcx),%rax
  403. adcq $0,%rdx
  404. addq 8(%rsp),%r11
  405. adcq $0,%rdx
  406. movq %rdx,%r10
  407. mulq %rbp
  408. addq %rax,%rdi
  409. movq 16(%rsi),%rax
  410. adcq $0,%rdx
  411. addq %r11,%rdi
  412. leaq 4(%r15),%r15
  413. adcq $0,%rdx
  414. movq %rdi,(%rsp)
  415. movq %rdx,%r13
  416. jmp .Linner4x
  417. .align 16
  # .Linner4x: same four-column pattern as .L1st4x, with an extra add of
  # the matching scratch word after each %rbx product.
  418. .Linner4x:
  419. mulq %rbx
  420. addq %rax,%r10
  421. movq -16(%rcx,%r15,8),%rax
  422. adcq $0,%rdx
  423. addq -16(%rsp,%r15,8),%r10
  424. adcq $0,%rdx
  425. movq %rdx,%r11
  426. mulq %rbp
  427. addq %rax,%r13
  428. movq -8(%rsi,%r15,8),%rax
  429. adcq $0,%rdx
  430. addq %r10,%r13
  431. adcq $0,%rdx
  432. movq %r13,-24(%rsp,%r15,8)
  433. movq %rdx,%rdi
  434. mulq %rbx
  435. addq %rax,%r11
  436. movq -8(%rcx,%r15,8),%rax
  437. adcq $0,%rdx
  438. addq -8(%rsp,%r15,8),%r11
  439. adcq $0,%rdx
  440. movq %rdx,%r10
  441. mulq %rbp
  442. addq %rax,%rdi
  443. movq (%rsi,%r15,8),%rax
  444. adcq $0,%rdx
  445. addq %r11,%rdi
  446. adcq $0,%rdx
  447. movq %rdi,-16(%rsp,%r15,8)
  448. movq %rdx,%r13
  449. mulq %rbx
  450. addq %rax,%r10
  451. movq (%rcx,%r15,8),%rax
  452. adcq $0,%rdx
  453. addq (%rsp,%r15,8),%r10
  454. adcq $0,%rdx
  455. movq %rdx,%r11
  456. mulq %rbp
  457. addq %rax,%r13
  458. movq 8(%rsi,%r15,8),%rax
  459. adcq $0,%rdx
  460. addq %r10,%r13
  461. adcq $0,%rdx
  462. movq %r13,-8(%rsp,%r15,8)
  463. movq %rdx,%rdi
  464. mulq %rbx
  465. addq %rax,%r11
  466. movq 8(%rcx,%r15,8),%rax
  467. adcq $0,%rdx
  468. addq 8(%rsp,%r15,8),%r11
  469. adcq $0,%rdx
  470. leaq 4(%r15),%r15
  471. movq %rdx,%r10
  472. mulq %rbp
  473. addq %rax,%rdi
  474. movq -16(%rsi,%r15,8),%rax
  475. adcq $0,%rdx
  476. addq %r11,%rdi
  477. adcq $0,%rdx
  478. movq %rdi,-32(%rsp,%r15,8)
  479. movq %rdx,%r13
  480. cmpq %r9,%r15
  481. jb .Linner4x
  # Row tail: last two columns, advance the row index, then fold the
  # previous row's top word (%rsp,%r9,8) into the new top words.
  482. mulq %rbx
  483. addq %rax,%r10
  484. movq -16(%rcx,%r15,8),%rax
  485. adcq $0,%rdx
  486. addq -16(%rsp,%r15,8),%r10
  487. adcq $0,%rdx
  488. movq %rdx,%r11
  489. mulq %rbp
  490. addq %rax,%r13
  491. movq -8(%rsi,%r15,8),%rax
  492. adcq $0,%rdx
  493. addq %r10,%r13
  494. adcq $0,%rdx
  495. movq %r13,-24(%rsp,%r15,8)
  496. movq %rdx,%rdi
  497. mulq %rbx
  498. addq %rax,%r11
  499. movq -8(%rcx,%r15,8),%rax
  500. adcq $0,%rdx
  501. addq -8(%rsp,%r15,8),%r11
  502. adcq $0,%rdx
  503. leaq 1(%r14),%r14
  504. movq %rdx,%r10
  505. mulq %rbp
  506. addq %rax,%rdi
  507. movq (%rsi),%rax
  508. adcq $0,%rdx
  509. addq %r11,%rdi
  510. adcq $0,%rdx
  511. movq %rdi,-16(%rsp,%r15,8)
  512. movq %rdx,%r13
  513. xorq %rdi,%rdi
  514. addq %r10,%r13
  515. adcq $0,%rdi
  516. addq (%rsp,%r9,8),%r13
  517. adcq $0,%rdi
  518. movq %r13,-8(%rsp,%r15,8)
  519. movq %rdi,(%rsp,%r15,8)
  520. cmpq %r9,%r14
  521. jb .Louter4x
  # Final subtraction, four words per iteration: destination = scratch
  # minus the (%rcx) array. %rdi is reloaded from its parking slot and
  # %r15 = (count-4)/4 loop iterations, followed by a four-word tail. The
  # borrow lives in CF, which movq, leaq and decq do not modify.
  522. movq 16(%rsp,%r9,8),%rdi
  523. leaq -4(%r9),%r15
  524. movq 0(%rsp),%rax
  525. movq 8(%rsp),%rdx
  526. shrq $2,%r15
  527. leaq (%rsp),%rsi
  528. xorq %r14,%r14
  529. subq 0(%rcx),%rax
  530. movq 16(%rsi),%rbx
  531. movq 24(%rsi),%rbp
  532. sbbq 8(%rcx),%rdx
  533. .Lsub4x:
  534. movq %rax,0(%rdi,%r14,8)
  535. movq %rdx,8(%rdi,%r14,8)
  536. sbbq 16(%rcx,%r14,8),%rbx
  537. movq 32(%rsi,%r14,8),%rax
  538. movq 40(%rsi,%r14,8),%rdx
  539. sbbq 24(%rcx,%r14,8),%rbp
  540. movq %rbx,16(%rdi,%r14,8)
  541. movq %rbp,24(%rdi,%r14,8)
  542. sbbq 32(%rcx,%r14,8),%rax
  543. movq 48(%rsi,%r14,8),%rbx
  544. movq 56(%rsi,%r14,8),%rbp
  545. sbbq 40(%rcx,%r14,8),%rdx
  546. leaq 4(%r14),%r14
  547. decq %r15
  548. jnz .Lsub4x
  # Tail: %rax is reloaded with the top scratch word and the final borrow
  # is subtracted from it (expected result: 0 or all-ones).
  549. movq %rax,0(%rdi,%r14,8)
  550. movq 32(%rsi,%r14,8),%rax
  551. sbbq 16(%rcx,%r14,8),%rbx
  552. movq %rdx,8(%rdi,%r14,8)
  553. sbbq 24(%rcx,%r14,8),%rbp
  554. movq %rbx,16(%rdi,%r14,8)
  555. sbbq $0,%rax
  556. movq %rbp,24(%rdi,%r14,8)
  # Build the select masks: the .byte sequence encodes "movq %rax,%xmm4",
  # pshufd broadcasts its low dword, and %xmm5 becomes the complement.
  # %xmm0 = 0 is used to wipe the scratch vector.
  557. pxor %xmm0,%xmm0
  558. .byte 102,72,15,110,224
  559. pcmpeqd %xmm5,%xmm5
  560. pshufd $0,%xmm4,%xmm4
  561. movq %r9,%r15
  562. pxor %xmm4,%xmm5
  563. shrq $2,%r15
  564. xorl %eax,%eax
  565. jmp .Lcopy4x
  566. .align 16
  # .Lcopy4x: branch-free select, 32 bytes per iteration (%rax is a byte
  # offset). Destination = (scratch AND %xmm4) OR (destination AND %xmm5);
  # the scratch words are zeroed as they are read. The scratch side uses
  # aligned movdqa, the destination side unaligned movdqu.
  567. .Lcopy4x:
  568. movdqa (%rsp,%rax,1),%xmm1
  569. movdqu (%rdi,%rax,1),%xmm2
  570. pand %xmm4,%xmm1
  571. pand %xmm5,%xmm2
  572. movdqa 16(%rsp,%rax,1),%xmm3
  573. movdqa %xmm0,(%rsp,%rax,1)
  574. por %xmm2,%xmm1
  575. movdqu 16(%rdi,%rax,1),%xmm2
  576. movdqu %xmm1,(%rdi,%rax,1)
  577. pand %xmm4,%xmm3
  578. pand %xmm5,%xmm2
  579. movdqa %xmm0,16(%rsp,%rax,1)
  580. por %xmm2,%xmm3
  581. movdqu %xmm3,16(%rdi,%rax,1)
  582. leaq 32(%rax),%rax
  583. decq %r15
  584. jnz .Lcopy4x
  # Epilogue: reload the entry %rsp, restore callee-saved registers,
  # return 1.
  585. movq 8(%rsp,%r9,8),%rsi
  586. .cfi_def_cfa %rsi, 8
  587. movq $1,%rax
  588. movq -48(%rsi),%r15
  589. .cfi_restore %r15
  590. movq -40(%rsi),%r14
  591. .cfi_restore %r14
  592. movq -32(%rsi),%r13
  593. .cfi_restore %r13
  594. movq -24(%rsi),%r12
  595. .cfi_restore %r12
  596. movq -16(%rsi),%rbp
  597. .cfi_restore %rbp
  598. movq -8(%rsi),%rbx
  599. .cfi_restore %rbx
  600. leaq (%rsi),%rsp
  601. .cfi_def_cfa_register %rsp
  602. .Lmul4x_epilogue:
  # Bytes 0xf3,0xc3 encode "rep ret".
  603. .byte 0xf3,0xc3
  604. .cfi_endproc
  605. .size bn_mul4x_mont,.-bn_mul4x_mont
  # bn_sqr8x_mont -- squaring path. bn_mul_mont branches to .Lsqr8x_enter
  # when %rsi == %rdx and the word count is a multiple of 8; no .globl
  # directive for this symbol is visible here. The arithmetic itself is
  # done by bn_sqr8x_internal / bn_sqrx8x_internal, which are defined
  # outside this view. This wrapper builds the frame, calls one of them,
  # then performs the final subtraction and masked copy.
  606. .type bn_sqr8x_mont,@function
  607. .align 32
  608. bn_sqr8x_mont:
  609. .cfi_startproc
  610. movq %rsp,%rax
  611. .cfi_def_cfa_register %rax
  612. .Lsqr8x_enter:
  613. pushq %rbx
  614. .cfi_offset %rbx,-16
  615. pushq %rbp
  616. .cfi_offset %rbp,-24
  617. pushq %r12
  618. .cfi_offset %r12,-32
  619. pushq %r13
  620. .cfi_offset %r13,-40
  621. pushq %r14
  622. .cfi_offset %r14,-48
  623. pushq %r15
  624. .cfi_offset %r15,-56
  625. .Lsqr8x_prologue:
  # %r9 = -(8*count) and %r10 = 32*count. %r11 becomes
  # ((%rsp - 64 - 16*count) - %rsi) mod 4096, and the frame base in %rbp
  # is shifted by an amount derived from it. Presumably this controls how
  # the frame aliases with the input at %rsi modulo the page size --
  # confirm against the generator script.
  626. movl %r9d,%r10d
  627. shll $3,%r9d
  628. shlq $3+2,%r10
  629. negq %r9
  630. leaq -64(%rsp,%r9,2),%r11
  631. movq %rsp,%rbp
  632. movq (%r8),%r8
  633. subq %rsi,%r11
  634. andq $4095,%r11
  635. cmpq %r11,%r10
  636. jb .Lsqr8x_sp_alt
  637. subq %r11,%rbp
  638. leaq -64(%rbp,%r9,2),%rbp
  639. jmp .Lsqr8x_sp_done
  640. .align 32
  641. .Lsqr8x_sp_alt:
  # Alternative placement: the adjustment is %r11 - (4096 - 64 - 16*count),
  # clamped to zero by the cmovc when that subtraction borrows.
  642. leaq 4096-64(,%r9,2),%r10
  643. leaq -64(%rbp,%r9,2),%rbp
  644. subq %r10,%r11
  645. movq $0,%r10
  646. cmovcq %r10,%r11
  647. subq %r11,%rbp
  648. .Lsqr8x_sp_done:
  # Align the frame base to 64 bytes, then move %rsp down to it in
  # 4096-byte steps, reading one word at each step.
  649. andq $-64,%rbp
  650. movq %rsp,%r11
  651. subq %rbp,%r11
  652. andq $-4096,%r11
  653. leaq (%r11,%rbp,1),%rsp
  654. movq (%rsp),%r10
  655. cmpq %rbp,%rsp
  656. ja .Lsqr8x_page_walk
  657. jmp .Lsqr8x_page_walk_done
  658. .align 16
  659. .Lsqr8x_page_walk:
  660. leaq -4096(%rsp),%rsp
  661. movq (%rsp),%r10
  662. cmpq %rbp,%rsp
  663. ja .Lsqr8x_page_walk
  664. .Lsqr8x_page_walk_done:
  # %r10 = -(8*count), %r9 = 8*count. Frame slots: 32(%rsp) holds the word
  # loaded from (%r8), 40(%rsp) holds the entry %rsp. The escape encodes
  # CFA = *(%rsp + 40) + 8.
  665. movq %r9,%r10
  666. negq %r9
  667. movq %r8,32(%rsp)
  668. movq %rax,40(%rsp)
  669. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
  670. .Lsqr8x_body:
  # The .byte sequences below encode, in order: "movq %rcx,%xmm2",
  # "movq %rdi,%xmm1" and "movq %r10,%xmm3". They stash the (%rcx) array
  # pointer, the destination pointer and -(8*count) in XMM registers
  # across the call.
  671. .byte 102,72,15,110,209
  672. pxor %xmm0,%xmm0
  673. .byte 102,72,15,110,207
  674. .byte 102,73,15,110,218
  # Select the helper: both bits of mask 0x80100 set in the word at
  # OPENSSL_ia32cap_P+8 -> bn_sqrx8x_internal, else bn_sqr8x_internal.
  675. movl OPENSSL_ia32cap_P+8(%rip),%eax
  676. andl $0x80100,%eax
  677. cmpl $0x80100,%eax
  678. jne .Lsqr8x_nox
  # NOTE(review): the register state after each call (%r8/%rcx here,
  # %rdi/%r9 on the other branch, plus %rbp and %rax on both) is defined
  # by the helpers and cannot be checked from this view. The subtraction
  # loop counts %rcx up to zero, so a negative byte count is expected.
  # In each branch, the .byte after the call encodes "movq %xmm1,%rdi",
  # restoring the destination pointer.
  679. call bn_sqrx8x_internal
  680. leaq (%r8,%rcx,1),%rbx
  681. movq %rcx,%r9
  682. movq %rcx,%rdx
  683. .byte 102,72,15,126,207
  684. sarq $3+2,%rcx
  685. jmp .Lsqr8x_sub
  686. .align 32
  687. .Lsqr8x_nox:
  688. call bn_sqr8x_internal
  689. leaq (%rdi,%r9,1),%rbx
  690. movq %r9,%rcx
  691. movq %r9,%rdx
  692. .byte 102,72,15,126,207
  693. sarq $3+2,%rcx
  694. jmp .Lsqr8x_sub
  695. .align 32
  # .Lsqr8x_sub: destination = (%rbx) vector - (%rbp) vector, four words
  # per iteration. CF carries the borrow between iterations; leaq, movq
  # and incq do not modify it.
  696. .Lsqr8x_sub:
  697. movq 0(%rbx),%r12
  698. movq 8(%rbx),%r13
  699. movq 16(%rbx),%r14
  700. movq 24(%rbx),%r15
  701. leaq 32(%rbx),%rbx
  702. sbbq 0(%rbp),%r12
  703. sbbq 8(%rbp),%r13
  704. sbbq 16(%rbp),%r14
  705. sbbq 24(%rbp),%r15
  706. leaq 32(%rbp),%rbp
  707. movq %r12,0(%rdi)
  708. movq %r13,8(%rdi)
  709. movq %r14,16(%rdi)
  710. movq %r15,24(%rdi)
  711. leaq 32(%rdi),%rdi
  712. incq %rcx
  713. jnz .Lsqr8x_sub
  # Fold the final borrow into %rax (value supplied by the helper), rewind
  # %rbx and %rdi by adding %r9, and broadcast %rax into %xmm1 as the
  # select mask (the .byte sequence encodes "movq %rax,%xmm1").
  714. sbbq $0,%rax
  715. leaq (%rbx,%r9,1),%rbx
  716. leaq (%rdi,%r9,1),%rdi
  717. .byte 102,72,15,110,200
  718. pxor %xmm0,%xmm0
  719. pshufd $0,%xmm1,%xmm1
  720. movq 40(%rsp),%rsi
  721. .cfi_def_cfa %rsi,8
  722. jmp .Lsqr8x_cond_copy
  723. .align 32
  # .Lsqr8x_cond_copy: branch-free select, 32 bytes per iteration.
  # destination = ((%rbx) data AND %xmm1) OR (destination AND mask'),
  # where mask' comes from pcmpeqd of a zeroed %xmm0 against %xmm1 (the
  # complement when %xmm1 is 0 or all-ones). The words read through %rbx
  # are zeroed, as are the words %rdx bytes away from them. %r9 counts up
  # to zero in steps of 32.
  724. .Lsqr8x_cond_copy:
  725. movdqa 0(%rbx),%xmm2
  726. movdqa 16(%rbx),%xmm3
  727. leaq 32(%rbx),%rbx
  728. movdqu 0(%rdi),%xmm4
  729. movdqu 16(%rdi),%xmm5
  730. leaq 32(%rdi),%rdi
  731. movdqa %xmm0,-32(%rbx)
  732. movdqa %xmm0,-16(%rbx)
  733. movdqa %xmm0,-32(%rbx,%rdx,1)
  734. movdqa %xmm0,-16(%rbx,%rdx,1)
  735. pcmpeqd %xmm1,%xmm0
  736. pand %xmm1,%xmm2
  737. pand %xmm1,%xmm3
  738. pand %xmm0,%xmm4
  739. pand %xmm0,%xmm5
  740. pxor %xmm0,%xmm0
  741. por %xmm2,%xmm4
  742. por %xmm3,%xmm5
  743. movdqu %xmm4,-32(%rdi)
  744. movdqu %xmm5,-16(%rdi)
  745. addq $32,%r9
  746. jnz .Lsqr8x_cond_copy
  # Epilogue: %rsi holds the entry %rsp loaded from 40(%rsp) above;
  # restore callee-saved registers and return 1.
  747. movq $1,%rax
  748. movq -48(%rsi),%r15
  749. .cfi_restore %r15
  750. movq -40(%rsi),%r14
  751. .cfi_restore %r14
  752. movq -32(%rsi),%r13
  753. .cfi_restore %r13
  754. movq -24(%rsi),%r12
  755. .cfi_restore %r12
  756. movq -16(%rsi),%rbp
  757. .cfi_restore %rbp
  758. movq -8(%rsi),%rbx
  759. .cfi_restore %rbx
  760. leaq (%rsi),%rsp
  761. .cfi_def_cfa_register %rsp
  762. .Lsqr8x_epilogue:
  # Bytes 0xf3,0xc3 encode "rep ret".
  763. .byte 0xf3,0xc3
  764. .cfi_endproc
  765. .size bn_sqr8x_mont,.-bn_sqr8x_mont
  # bn_mulx4x_mont -- four-words-per-iteration multiply built on mulx and
  # the two independent carry chains of adcx (CF) and adox (OF).
  # .Lmul4x_enter branches to .Lmulx4x_enter when both bits of mask
  # 0x80100 are set in the capability word; no .globl directive for this
  # symbol is visible here.
  #
  # Frame layout, established at .Lmulx4x_page_walk_done and in the body:
  #    0(%rsp)  byte length, 8*count
  #    8(%rsp)  pointer to the next word of the second operand
  #   16(%rsp)  end pointer of the second operand, %rdx + 8*count
  #   24(%rsp)  the word loaded from (%r8)
  #   32(%rsp)  destination pointer (%rdi)
  #   40(%rsp)  entry %rsp
  #   48(%rsp)  inner-loop trip count, count/4 - 1
  #   64(%rsp)  start of the scratch vector
  766. .type bn_mulx4x_mont,@function
  767. .align 32
  768. bn_mulx4x_mont:
  769. .cfi_startproc
  770. movq %rsp,%rax
  771. .cfi_def_cfa_register %rax
  772. .Lmulx4x_enter:
  773. pushq %rbx
  774. .cfi_offset %rbx,-16
  775. pushq %rbp
  776. .cfi_offset %rbp,-24
  777. pushq %r12
  778. .cfi_offset %r12,-32
  779. pushq %r13
  780. .cfi_offset %r13,-40
  781. pushq %r14
  782. .cfi_offset %r14,-48
  783. pushq %r15
  784. .cfi_offset %r15,-56
  785. .Lmulx4x_prologue:
  # %r9 = 8*count (bytes); frame base %rbp = (%rsp - 72 - 8*count)
  # rounded down to 128 bytes.
  786. shll $3,%r9d
  787. xorq %r10,%r10
  788. subq %r9,%r10
  789. movq (%r8),%r8
  790. leaq -72(%rsp,%r10,1),%rbp
  791. andq $-128,%rbp
  # Move %rsp down to %rbp in 4096-byte steps, reading a word at each step.
  792. movq %rsp,%r11
  793. subq %rbp,%r11
  794. andq $-4096,%r11
  795. leaq (%r11,%rbp,1),%rsp
  796. movq (%rsp),%r10
  797. cmpq %rbp,%rsp
  798. ja .Lmulx4x_page_walk
  799. jmp .Lmulx4x_page_walk_done
  800. .align 16
  801. .Lmulx4x_page_walk:
  802. leaq -4096(%rsp),%rsp
  803. movq (%rsp),%r10
  804. cmpq %rbp,%rsp
  805. ja .Lmulx4x_page_walk
  806. .Lmulx4x_page_walk_done:
  # Fill in the frame slots listed in the header. The escape encodes
  # CFA = *(%rsp + 40) + 8.
  807. leaq (%rdx,%r9,1),%r10
  808. movq %r9,0(%rsp)
  809. shrq $5,%r9
  810. movq %r10,16(%rsp)
  811. subq $1,%r9
  812. movq %r8,24(%rsp)
  813. movq %rdi,32(%rsp)
  814. movq %rax,40(%rsp)
  815. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
  816. movq %r9,48(%rsp)
  817. jmp .Lmulx4x_body
  818. .align 32
  819. .Lmulx4x_body:
  # First row. %rdx (mulx's implicit multiplicand) alternates between the
  # current second-operand word, kept in %r9, and the row factor
  # %r8 = low64((%rsi)[0] * word) * 24(%rsp). %rbp is kept at zero as the
  # "add carry only" operand, and %rbx walks the scratch vector.
  820. leaq 8(%rdx),%rdi
  821. movq (%rdx),%rdx
  822. leaq 64+32(%rsp),%rbx
  823. movq %rdx,%r9
  824. mulxq 0(%rsi),%r8,%rax
  825. mulxq 8(%rsi),%r11,%r14
  826. addq %rax,%r11
  827. movq %rdi,8(%rsp)
  828. mulxq 16(%rsi),%r12,%r13
  829. adcq %r14,%r12
  830. adcq $0,%r13
  831. movq %r8,%rdi
  832. imulq 24(%rsp),%r8
  833. xorq %rbp,%rbp
  834. mulxq 24(%rsi),%rax,%r14
  835. movq %r8,%rdx
  836. leaq 32(%rsi),%rsi
  837. adcxq %rax,%r13
  838. adcxq %rbp,%r14
  839. mulxq 0(%rcx),%rax,%r10
  840. adcxq %rax,%rdi
  841. adoxq %r11,%r10
  842. mulxq 8(%rcx),%rax,%r11
  843. adcxq %rax,%r10
  844. adoxq %r12,%r11
  # The .byte sequence encodes "mulxq 16(%rcx),%rax,%r12" with a 32-bit
  # displacement (compare the plain-mnemonic form at the same position
  # in .Lmulx4x_outer).
  845. .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
  846. movq 48(%rsp),%rdi
  847. movq %r10,-32(%rbx)
  848. adcxq %rax,%r11
  849. adoxq %r13,%r12
  850. mulxq 24(%rcx),%rax,%r15
  851. movq %r9,%rdx
  852. movq %r11,-24(%rbx)
  853. adcxq %rax,%r12
  854. adoxq %rbp,%r15
  855. leaq 32(%rcx),%rcx
  856. movq %r12,-16(%rbx)
  857. jmp .Lmulx4x_1st
  858. .align 32
  # .Lmulx4x_1st: remaining 4-word groups of the first row; %rdi counts
  # the groups down (loaded from 48(%rsp)).
  859. .Lmulx4x_1st:
  860. adcxq %rbp,%r15
  861. mulxq 0(%rsi),%r10,%rax
  862. adcxq %r14,%r10
  863. mulxq 8(%rsi),%r11,%r14
  864. adcxq %rax,%r11
  865. mulxq 16(%rsi),%r12,%rax
  866. adcxq %r14,%r12
  867. mulxq 24(%rsi),%r13,%r14
  # Two 0x67 prefix bytes are emitted in front of the next instruction
  # (presumably padding for code alignment -- confirm).
  868. .byte 0x67,0x67
  869. movq %r8,%rdx
  870. adcxq %rax,%r13
  871. adcxq %rbp,%r14
  872. leaq 32(%rsi),%rsi
  873. leaq 32(%rbx),%rbx
  874. adoxq %r15,%r10
  875. mulxq 0(%rcx),%rax,%r15
  876. adcxq %rax,%r10
  877. adoxq %r15,%r11
  878. mulxq 8(%rcx),%rax,%r15
  879. adcxq %rax,%r11
  880. adoxq %r15,%r12
  881. mulxq 16(%rcx),%rax,%r15
  882. movq %r10,-40(%rbx)
  883. adcxq %rax,%r12
  884. movq %r11,-32(%rbx)
  885. adoxq %r15,%r13
  886. mulxq 24(%rcx),%rax,%r15
  887. movq %r9,%rdx
  888. movq %r12,-24(%rbx)
  889. adcxq %rax,%r13
  890. adoxq %rbp,%r15
  891. leaq 32(%rcx),%rcx
  892. movq %r13,-16(%rbx)
  893. decq %rdi
  894. jnz .Lmulx4x_1st
  # Row tail: %rax = byte length, %rdi = next second-operand pointer. Top
  # word goes to -8(%rbx); %r15 = 0 or all-ones records the carry out and
  # is stored at (%rbx) at the top of .Lmulx4x_outer.
  895. movq 0(%rsp),%rax
  896. movq 8(%rsp),%rdi
  897. adcq %rbp,%r15
  898. addq %r15,%r14
  899. sbbq %r15,%r15
  900. movq %r14,-8(%rbx)
  901. jmp .Lmulx4x_outer
  902. .align 32
  # .Lmulx4x_outer: rows 1..count-1. %rsi and %rcx are rewound by the
  # byte length, %rbx is reset to the start of the scratch vector, and
  # the previous row's words are added in through the adox chain.
  # xorl %ebp,%ebp zeroes %rbp and clears both CF and OF.
  903. .Lmulx4x_outer:
  904. movq (%rdi),%rdx
  905. leaq 8(%rdi),%rdi
  906. subq %rax,%rsi
  907. movq %r15,(%rbx)
  908. leaq 64+32(%rsp),%rbx
  909. subq %rax,%rcx
  910. mulxq 0(%rsi),%r8,%r11
  911. xorl %ebp,%ebp
  912. movq %rdx,%r9
  913. mulxq 8(%rsi),%r14,%r12
  914. adoxq -32(%rbx),%r8
  915. adcxq %r14,%r11
  916. mulxq 16(%rsi),%r15,%r13
  917. adoxq -24(%rbx),%r11
  918. adcxq %r15,%r12
  919. adoxq -16(%rbx),%r12
  920. adcxq %rbp,%r13
  921. adoxq %rbp,%r13
  922. movq %rdi,8(%rsp)
  923. movq %r8,%r15
  924. imulq 24(%rsp),%r8
  925. xorl %ebp,%ebp
  926. mulxq 24(%rsi),%rax,%r14
  927. movq %r8,%rdx
  928. adcxq %rax,%r13
  929. adoxq -8(%rbx),%r13
  930. adcxq %rbp,%r14
  931. leaq 32(%rsi),%rsi
  932. adoxq %rbp,%r14
  933. mulxq 0(%rcx),%rax,%r10
  934. adcxq %rax,%r15
  935. adoxq %r11,%r10
  936. mulxq 8(%rcx),%rax,%r11
  937. adcxq %rax,%r10
  938. adoxq %r12,%r11
  939. mulxq 16(%rcx),%rax,%r12
  940. movq %r10,-32(%rbx)
  941. adcxq %rax,%r11
  942. adoxq %r13,%r12
  943. mulxq 24(%rcx),%rax,%r15
  944. movq %r9,%rdx
  945. movq %r11,-24(%rbx)
  946. leaq 32(%rcx),%rcx
  947. adcxq %rax,%r12
  948. adoxq %rbp,%r15
  949. movq 48(%rsp),%rdi
  950. movq %r12,-16(%rbx)
  951. jmp .Lmulx4x_inner
  952. .align 32
  # .Lmulx4x_inner: remaining 4-word groups of the row; same shape as
  # .Lmulx4x_1st plus adcx adds of the scratch words at 0..24(%rbx).
  953. .Lmulx4x_inner:
  954. mulxq 0(%rsi),%r10,%rax
  955. adcxq %rbp,%r15
  956. adoxq %r14,%r10
  957. mulxq 8(%rsi),%r11,%r14
  958. adcxq 0(%rbx),%r10
  959. adoxq %rax,%r11
  960. mulxq 16(%rsi),%r12,%rax
  961. adcxq 8(%rbx),%r11
  962. adoxq %r14,%r12
  963. mulxq 24(%rsi),%r13,%r14
  964. movq %r8,%rdx
  965. adcxq 16(%rbx),%r12
  966. adoxq %rax,%r13
  967. adcxq 24(%rbx),%r13
  968. adoxq %rbp,%r14
  969. leaq 32(%rsi),%rsi
  970. leaq 32(%rbx),%rbx
  971. adcxq %rbp,%r14
  972. adoxq %r15,%r10
  973. mulxq 0(%rcx),%rax,%r15
  974. adcxq %rax,%r10
  975. adoxq %r15,%r11
  976. mulxq 8(%rcx),%rax,%r15
  977. adcxq %rax,%r11
  978. adoxq %r15,%r12
  979. mulxq 16(%rcx),%rax,%r15
  980. movq %r10,-40(%rbx)
  981. adcxq %rax,%r12
  982. adoxq %r15,%r13
  983. mulxq 24(%rcx),%rax,%r15
  984. movq %r9,%rdx
  985. movq %r11,-32(%rbx)
  986. movq %r12,-24(%rbx)
  987. adcxq %rax,%r13
  988. adoxq %rbp,%r15
  989. leaq 32(%rcx),%rcx
  990. movq %r13,-16(%rbx)
  991. decq %rdi
  992. jnz .Lmulx4x_inner
  # Row tail. %rbp is still zero here, so "subq 0(%rbx),%rbp" serves to
  # turn the carry word stored at (%rbx) into CF for the following adcq
  # (%rbp is re-zeroed at the top of the next row). %r15 again becomes
  # 0 or all-ones from the carry out. The loop ends when the
  # second-operand pointer reaches the end pointer in 16(%rsp).
  993. movq 0(%rsp),%rax
  994. movq 8(%rsp),%rdi
  995. adcq %rbp,%r15
  996. subq 0(%rbx),%rbp
  997. adcq %r15,%r14
  998. sbbq %r15,%r15
  999. movq %r14,-8(%rbx)
  1000. cmpq 16(%rsp),%rdi
  1001. jne .Lmulx4x_outer
  # Final subtraction setup: %rbx = scratch start, %rcx rewound, %r15
  # negated to 0 or 1, %rdx = byte length, %rax = number of 4-word
  # groups, %rdi = destination.
  1002. leaq 64(%rsp),%rbx
  1003. subq %rax,%rcx
  1004. negq %r15
  1005. movq %rax,%rdx
  1006. shrq $3+2,%rax
  1007. movq 32(%rsp),%rdi
  1008. jmp .Lmulx4x_sub
  1009. .align 32
  # .Lmulx4x_sub: destination = scratch - (%rcx) array, four words per
  # iteration; the borrow stays in CF across leaq, movq and decq.
  1010. .Lmulx4x_sub:
  1011. movq 0(%rbx),%r11
  1012. movq 8(%rbx),%r12
  1013. movq 16(%rbx),%r13
  1014. movq 24(%rbx),%r14
  1015. leaq 32(%rbx),%rbx
  1016. sbbq 0(%rcx),%r11
  1017. sbbq 8(%rcx),%r12
  1018. sbbq 16(%rcx),%r13
  1019. sbbq 24(%rcx),%r14
  1020. leaq 32(%rcx),%rcx
  1021. movq %r11,0(%rdi)
  1022. movq %r12,8(%rdi)
  1023. movq %r13,16(%rdi)
  1024. movq %r14,24(%rdi)
  1025. leaq 32(%rdi),%rdi
  1026. decq %rax
  1027. jnz .Lmulx4x_sub
  # %r15 = top carry minus final borrow (expected 0 or all-ones). The
  # .byte sequence encodes "movq %r15,%xmm1" and pshufd broadcasts it as
  # the select mask. %rbx and %rdi are rewound, and %rsi receives the
  # entry %rsp for the epilogue.
  1028. sbbq $0,%r15
  1029. leaq 64(%rsp),%rbx
  1030. subq %rdx,%rdi
  1031. .byte 102,73,15,110,207
  1032. pxor %xmm0,%xmm0
  1033. pshufd $0,%xmm1,%xmm1
  1034. movq 40(%rsp),%rsi
  1035. .cfi_def_cfa %rsi,8
  1036. jmp .Lmulx4x_cond_copy
  1037. .align 32
  # .Lmulx4x_cond_copy: branch-free select, 32 bytes per iteration.
  # destination = (scratch AND %xmm1) OR (destination AND mask'), where
  # mask' comes from pcmpeqd of a zeroed %xmm0 against %xmm1. Scratch
  # words are zeroed as they are read; %rdx counts the bytes down.
  1038. .Lmulx4x_cond_copy:
  1039. movdqa 0(%rbx),%xmm2
  1040. movdqa 16(%rbx),%xmm3
  1041. leaq 32(%rbx),%rbx
  1042. movdqu 0(%rdi),%xmm4
  1043. movdqu 16(%rdi),%xmm5
  1044. leaq 32(%rdi),%rdi
  1045. movdqa %xmm0,-32(%rbx)
  1046. movdqa %xmm0,-16(%rbx)
  1047. pcmpeqd %xmm1,%xmm0
  1048. pand %xmm1,%xmm2
  1049. pand %xmm1,%xmm3
  1050. pand %xmm0,%xmm4
  1051. pand %xmm0,%xmm5
  1052. pxor %xmm0,%xmm0
  1053. por %xmm2,%xmm4
  1054. por %xmm3,%xmm5
  1055. movdqu %xmm4,-32(%rdi)
  1056. movdqu %xmm5,-16(%rdi)
  1057. subq $32,%rdx
  1058. jnz .Lmulx4x_cond_copy
  # %rdx is zero on loop exit, so this store zeroes the word just past
  # the copied scratch range. Then restore callee-saved registers and
  # return 1.
  1059. movq %rdx,(%rbx)
  1060. movq $1,%rax
  1061. movq -48(%rsi),%r15
  1062. .cfi_restore %r15
  1063. movq -40(%rsi),%r14
  1064. .cfi_restore %r14
  1065. movq -32(%rsi),%r13
  1066. .cfi_restore %r13
  1067. movq -24(%rsi),%r12
  1068. .cfi_restore %r12
  1069. movq -16(%rsi),%rbp
  1070. .cfi_restore %rbp
  1071. movq -8(%rsi),%rbx
  1072. .cfi_restore %rbx
  1073. leaq (%rsi),%rsp
  1074. .cfi_def_cfa_register %rsp
  1075. .Lmulx4x_epilogue:
  # Bytes 0xf3,0xc3 encode "rep ret".
  1076. .byte 0xf3,0xc3
  1077. .cfi_endproc
  1078. .size bn_mulx4x_mont,.-bn_mulx4x_mont
  # NUL-terminated ASCII identification string embedded after the code:
  # "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  1079. .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  1080. .align 16