x86_64-mont5.s 54 KB


  1. .text
  # ---------------------------------------------------------------------
  # _bn_mul_mont_gather5 (SysV AMD64 register usage as seen in this code)
  #   rdi : destination word array (written in L$sub / L$copy)
  #   rsi : first multiplicand word array (read as (%rsi,%r15,8))
  #   rdx : base of the table the second multiplicand is gathered from
  #   rcx : word array multiplied by the per-row factor in rbp
  #         (presumably the modulus - TODO confirm against the C prototype)
  #   r8  : pointer to a 64-bit constant (presumably n0 = -1/N mod 2^64)
  #   r9d : word count "num" (zero-extended below)
  #   8(%rsp) on entry : 32-bit table index (first stack argument)
  # Returns 1 in rax.
  # If num is a multiple of 8 the work is handed to L$mul4x_enter with the
  # third dword of _OPENSSL_ia32cap_P in r11d; otherwise the generic
  # one-word-at-a-time path at L$mul_enter is used.
  # ---------------------------------------------------------------------
  2. .globl _bn_mul_mont_gather5
  3. .p2align 6
  4. _bn_mul_mont_gather5:
  # Writing r9d zero-extends it into r9.
  5. movl %r9d,%r9d
  # rax keeps the entry stack pointer for the epilogue / argument access.
  6. movq %rsp,%rax
  7. testl $7,%r9d
  8. jnz L$mul_enter
  9. movl _OPENSSL_ia32cap_P+8(%rip),%r11d
  10. jmp L$mul4x_enter
  11. .p2align 4
  12. L$mul_enter:
  # xmm5 = table index taken from the first stack argument.
  13. movd 8(%rsp),%xmm5
  14. pushq %rbx
  15. pushq %rbp
  16. pushq %r12
  17. pushq %r13
  18. pushq %r14
  19. pushq %r15
  # Frame: r10 = (rsp - 280 - 8*num) rounded down to a 1024-byte boundary.
  20. negq %r9
  21. movq %rsp,%r11
  22. leaq -280(%rsp,%r9,8),%r10
  23. negq %r9
  24. andq $-1024,%r10
  # Move rsp down towards r10 one 4096-byte page at a time, reading a word
  # from every page on the way (stack probe).
  25. subq %r10,%r11
  26. andq $-4096,%r11
  27. leaq (%r10,%r11,1),%rsp
  28. movq (%rsp),%r11
  29. cmpq %r10,%rsp
  30. ja L$mul_page_walk
  31. jmp L$mul_page_walk_done
  32. L$mul_page_walk:
  33. leaq -4096(%rsp),%rsp
  34. movq (%rsp),%r11
  35. cmpq %r10,%rsp
  36. ja L$mul_page_walk
  37. L$mul_page_walk_done:
  38. leaq L$inc(%rip),%r10
  # Entry rsp is parked just above the num+1 temporary words at (%rsp).
  39. movq %rax,8(%rsp,%r9,8)
  40. L$mul_body:
  # r12 = table + 128 so that one 256-byte row is addressed as -128..+112.
  41. leaq 128(%rdx),%r12
  # L$inc is defined outside this view; presumably xmm0 = initial counters
  # and xmm1 = per-step increment - TODO confirm.
  42. movdqa 0(%r10),%xmm0
  43. movdqa 16(%r10),%xmm1
  # r10 = 16-byte aligned base such that the 16 masks live at 112..352(%r10),
  # i.e. above the temporary words and the saved-rsp slot.
  44. leaq 24-112(%rsp,%r9,8),%r10
  45. andq $-16,%r10
  # Broadcast the index to all four dwords, then build 16 masks: each
  # pcmpeqd yields all-ones lanes where the running counter equals the index.
  46. pshufd $0,%xmm5,%xmm5
  47. movdqa %xmm1,%xmm4
  48. movdqa %xmm1,%xmm2
  49. paddd %xmm0,%xmm1
  50. pcmpeqd %xmm5,%xmm0
  # 0x67 is an address-size prefix emitted in front of the next instruction
  # (presumably instruction-length padding for alignment/decoding).
  51. .byte 0x67
  52. movdqa %xmm4,%xmm3
  53. paddd %xmm1,%xmm2
  54. pcmpeqd %xmm5,%xmm1
  55. movdqa %xmm0,112(%r10)
  56. movdqa %xmm4,%xmm0
  57. paddd %xmm2,%xmm3
  58. pcmpeqd %xmm5,%xmm2
  59. movdqa %xmm1,128(%r10)
  60. movdqa %xmm4,%xmm1
  61. paddd %xmm3,%xmm0
  62. pcmpeqd %xmm5,%xmm3
  63. movdqa %xmm2,144(%r10)
  64. movdqa %xmm4,%xmm2
  65. paddd %xmm0,%xmm1
  66. pcmpeqd %xmm5,%xmm0
  67. movdqa %xmm3,160(%r10)
  68. movdqa %xmm4,%xmm3
  69. paddd %xmm1,%xmm2
  70. pcmpeqd %xmm5,%xmm1
  71. movdqa %xmm0,176(%r10)
  72. movdqa %xmm4,%xmm0
  73. paddd %xmm2,%xmm3
  74. pcmpeqd %xmm5,%xmm2
  75. movdqa %xmm1,192(%r10)
  76. movdqa %xmm4,%xmm1
  77. paddd %xmm3,%xmm0
  78. pcmpeqd %xmm5,%xmm3
  79. movdqa %xmm2,208(%r10)
  80. movdqa %xmm4,%xmm2
  81. paddd %xmm0,%xmm1
  82. pcmpeqd %xmm5,%xmm0
  83. movdqa %xmm3,224(%r10)
  84. movdqa %xmm4,%xmm3
  85. paddd %xmm1,%xmm2
  86. pcmpeqd %xmm5,%xmm1
  87. movdqa %xmm0,240(%r10)
  88. movdqa %xmm4,%xmm0
  89. paddd %xmm2,%xmm3
  90. pcmpeqd %xmm5,%xmm2
  91. movdqa %xmm1,256(%r10)
  92. movdqa %xmm4,%xmm1
  93. paddd %xmm3,%xmm0
  94. pcmpeqd %xmm5,%xmm3
  95. movdqa %xmm2,272(%r10)
  96. movdqa %xmm4,%xmm2
  97. paddd %xmm0,%xmm1
  98. pcmpeqd %xmm5,%xmm0
  99. movdqa %xmm3,288(%r10)
  100. movdqa %xmm4,%xmm3
  101. paddd %xmm1,%xmm2
  102. pcmpeqd %xmm5,%xmm1
  103. movdqa %xmm0,304(%r10)
  104. paddd %xmm2,%xmm3
  105. .byte 0x67
  106. pcmpeqd %xmm5,%xmm2
  107. movdqa %xmm1,320(%r10)
  108. pcmpeqd %xmm5,%xmm3
  109. movdqa %xmm2,336(%r10)
  # Gather word 0: AND every 16-byte slice of the first 256-byte table row
  # with its mask and OR the results together. Every slice is read; none of
  # the load addresses depend on the index.
  110. pand 64(%r12),%xmm0
  111. pand 80(%r12),%xmm1
  112. pand 96(%r12),%xmm2
  113. movdqa %xmm3,352(%r10)
  114. pand 112(%r12),%xmm3
  115. por %xmm2,%xmm0
  116. por %xmm3,%xmm1
  117. movdqa -128(%r12),%xmm4
  118. movdqa -112(%r12),%xmm5
  119. movdqa -96(%r12),%xmm2
  120. pand 112(%r10),%xmm4
  121. movdqa -80(%r12),%xmm3
  122. pand 128(%r10),%xmm5
  123. por %xmm4,%xmm0
  124. pand 144(%r10),%xmm2
  125. por %xmm5,%xmm1
  126. pand 160(%r10),%xmm3
  127. por %xmm2,%xmm0
  128. por %xmm3,%xmm1
  129. movdqa -64(%r12),%xmm4
  130. movdqa -48(%r12),%xmm5
  131. movdqa -32(%r12),%xmm2
  132. pand 176(%r10),%xmm4
  133. movdqa -16(%r12),%xmm3
  134. pand 192(%r10),%xmm5
  135. por %xmm4,%xmm0
  136. pand 208(%r10),%xmm2
  137. por %xmm5,%xmm1
  138. pand 224(%r10),%xmm3
  139. por %xmm2,%xmm0
  140. por %xmm3,%xmm1
  141. movdqa 0(%r12),%xmm4
  142. movdqa 16(%r12),%xmm5
  143. movdqa 32(%r12),%xmm2
  144. pand 240(%r10),%xmm4
  145. movdqa 48(%r12),%xmm3
  146. pand 256(%r10),%xmm5
  147. por %xmm4,%xmm0
  148. pand 272(%r10),%xmm2
  149. por %xmm5,%xmm1
  150. pand 288(%r10),%xmm3
  151. por %xmm2,%xmm0
  152. por %xmm3,%xmm1
  153. por %xmm1,%xmm0
  # Fold the high 64 bits onto the low 64 bits (0x4e swaps the two halves).
  154. pshufd $0x4e,%xmm0,%xmm1
  155. por %xmm1,%xmm0
  # Advance to the next 256-byte table row.
  156. leaq 256(%r12),%r12
  # Encoded "movq %xmm0,%rbx": rbx = gathered multiplier word.
  157. .byte 102,72,15,126,195
  # r8 now holds the constant itself instead of the pointer to it.
  158. movq (%r8),%r8
  159. movq (%rsi),%rax
  # r14 = outer (row) counter, r15 = inner (column) counter.
  160. xorq %r14,%r14
  161. xorq %r15,%r15
  162. movq %r8,%rbp
  163. mulq %rbx
  164. movq %rax,%r10
  165. movq (%rcx),%rax
  # rbp = r8 * (low word of first product): per-row reduction factor.
  166. imulq %r10,%rbp
  167. movq %rdx,%r11
  168. mulq %rbp
  169. addq %rax,%r10
  170. movq 8(%rsi),%rax
  171. adcq $0,%rdx
  172. movq %rdx,%r13
  173. leaq 1(%r15),%r15
  174. jmp L$1st_enter
  175. .p2align 4
  # First row: for each column accumulate (%rsi)[j]*rbx and (%rcx)[j]*rbp
  # and store the running word at -16(%rsp,%r15,8).
  176. L$1st:
  177. addq %rax,%r13
  178. movq (%rsi,%r15,8),%rax
  179. adcq $0,%rdx
  180. addq %r11,%r13
  181. movq %r10,%r11
  182. adcq $0,%rdx
  183. movq %r13,-16(%rsp,%r15,8)
  184. movq %rdx,%r13
  185. L$1st_enter:
  186. mulq %rbx
  187. addq %rax,%r11
  188. movq (%rcx,%r15,8),%rax
  189. adcq $0,%rdx
  190. leaq 1(%r15),%r15
  191. movq %rdx,%r10
  192. mulq %rbp
  193. cmpq %r9,%r15
  194. jne L$1st
  # Row tail: flush the last two words and the carry-out word.
  195. addq %rax,%r13
  196. adcq $0,%rdx
  197. addq %r11,%r13
  198. adcq $0,%rdx
  199. movq %r13,-16(%rsp,%r9,8)
  200. movq %rdx,%r13
  201. movq %r10,%r11
  202. xorq %rdx,%rdx
  203. addq %r11,%r13
  204. adcq $0,%rdx
  205. movq %r13,-8(%rsp,%r9,8)
  206. movq %rdx,(%rsp,%r9,8)
  207. leaq 1(%r14),%r14
  208. jmp L$outer
  209. .p2align 4
  # Remaining rows: gather the next multiplier word with the stored masks,
  # then add its row into the temporary words at (%rsp).
  210. L$outer:
  # rdx = mask base + 128 (same table as 112..352(%r10) above, re-derived
  # from rsp and num because r10 has been reused).
  211. leaq 24+128(%rsp,%r9,8),%rdx
  212. andq $-16,%rdx
  213. pxor %xmm4,%xmm4
  214. pxor %xmm5,%xmm5
  215. movdqa -128(%r12),%xmm0
  216. movdqa -112(%r12),%xmm1
  217. movdqa -96(%r12),%xmm2
  218. movdqa -80(%r12),%xmm3
  219. pand -128(%rdx),%xmm0
  220. pand -112(%rdx),%xmm1
  221. por %xmm0,%xmm4
  222. pand -96(%rdx),%xmm2
  223. por %xmm1,%xmm5
  224. pand -80(%rdx),%xmm3
  225. por %xmm2,%xmm4
  226. por %xmm3,%xmm5
  227. movdqa -64(%r12),%xmm0
  228. movdqa -48(%r12),%xmm1
  229. movdqa -32(%r12),%xmm2
  230. movdqa -16(%r12),%xmm3
  231. pand -64(%rdx),%xmm0
  232. pand -48(%rdx),%xmm1
  233. por %xmm0,%xmm4
  234. pand -32(%rdx),%xmm2
  235. por %xmm1,%xmm5
  236. pand -16(%rdx),%xmm3
  237. por %xmm2,%xmm4
  238. por %xmm3,%xmm5
  239. movdqa 0(%r12),%xmm0
  240. movdqa 16(%r12),%xmm1
  241. movdqa 32(%r12),%xmm2
  242. movdqa 48(%r12),%xmm3
  243. pand 0(%rdx),%xmm0
  244. pand 16(%rdx),%xmm1
  245. por %xmm0,%xmm4
  246. pand 32(%rdx),%xmm2
  247. por %xmm1,%xmm5
  248. pand 48(%rdx),%xmm3
  249. por %xmm2,%xmm4
  250. por %xmm3,%xmm5
  251. movdqa 64(%r12),%xmm0
  252. movdqa 80(%r12),%xmm1
  253. movdqa 96(%r12),%xmm2
  254. movdqa 112(%r12),%xmm3
  255. pand 64(%rdx),%xmm0
  256. pand 80(%rdx),%xmm1
  257. por %xmm0,%xmm4
  258. pand 96(%rdx),%xmm2
  259. por %xmm1,%xmm5
  260. pand 112(%rdx),%xmm3
  261. por %xmm2,%xmm4
  262. por %xmm3,%xmm5
  263. por %xmm5,%xmm4
  264. pshufd $0x4e,%xmm4,%xmm0
  265. por %xmm4,%xmm0
  266. leaq 256(%r12),%r12
  267. movq (%rsi),%rax
  # Encoded "movq %xmm0,%rbx": rbx = multiplier word for this row.
  268. .byte 102,72,15,126,195
  269. xorq %r15,%r15
  270. movq %r8,%rbp
  271. movq (%rsp),%r10
  272. mulq %rbx
  273. addq %rax,%r10
  274. movq (%rcx),%rax
  275. adcq $0,%rdx
  276. imulq %r10,%rbp
  277. movq %rdx,%r11
  278. mulq %rbp
  279. addq %rax,%r10
  280. movq 8(%rsi),%rax
  281. adcq $0,%rdx
  282. movq 8(%rsp),%r10
  283. movq %rdx,%r13
  284. leaq 1(%r15),%r15
  285. jmp L$inner_enter
  286. .p2align 4
  # Same column loop as L$1st, additionally adding the previous row's word
  # loaded from (%rsp,%r15,8).
  287. L$inner:
  288. addq %rax,%r13
  289. movq (%rsi,%r15,8),%rax
  290. adcq $0,%rdx
  291. addq %r10,%r13
  292. movq (%rsp,%r15,8),%r10
  293. adcq $0,%rdx
  294. movq %r13,-16(%rsp,%r15,8)
  295. movq %rdx,%r13
  296. L$inner_enter:
  297. mulq %rbx
  298. addq %rax,%r11
  299. movq (%rcx,%r15,8),%rax
  300. adcq $0,%rdx
  301. addq %r11,%r10
  302. movq %rdx,%r11
  303. adcq $0,%r11
  304. leaq 1(%r15),%r15
  305. mulq %rbp
  306. cmpq %r9,%r15
  307. jne L$inner
  308. addq %rax,%r13
  309. adcq $0,%rdx
  310. addq %r10,%r13
  311. movq (%rsp,%r9,8),%r10
  312. adcq $0,%rdx
  313. movq %r13,-16(%rsp,%r9,8)
  314. movq %rdx,%r13
  315. xorq %rdx,%rdx
  316. addq %r11,%r13
  317. adcq $0,%rdx
  318. addq %r10,%r13
  319. adcq $0,%rdx
  320. movq %r13,-8(%rsp,%r9,8)
  321. movq %rdx,(%rsp,%r9,8)
  322. leaq 1(%r14),%r14
  323. cmpq %r9,%r14
  324. jb L$outer
  # Final step: rdi[] = temp[] - (%rcx)[] with borrow propagation.
  # The xor clears CF before the first sbb; lea/mov/dec inside the loop
  # leave CF untouched, so the borrow chains across iterations.
  325. xorq %r14,%r14
  326. movq (%rsp),%rax
  327. leaq (%rsp),%rsi
  328. movq %r9,%r15
  329. jmp L$sub
  330. .p2align 4
  331. L$sub: sbbq (%rcx,%r14,8),%rax
  332. movq %rax,(%rdi,%r14,8)
  333. movq 8(%rsi,%r14,8),%rax
  334. leaq 1(%r14),%r14
  335. decq %r15
  336. jnz L$sub
  # rax = top temporary word minus the final borrow; rbx = ~rax. The pair is
  # used as complementary select masks below (presumably rax is 0 or -1).
  337. sbbq $0,%rax
  338. movq $-1,%rbx
  339. xorq %rax,%rbx
  340. xorq %r14,%r14
  341. movq %r9,%r15
  # Branch-free select: rdi[i] = (rdi[i] & rbx) | (temp[i] & rax).
  # Each temporary word is overwritten with the loop index afterwards.
  342. L$copy:
  343. movq (%rdi,%r14,8),%rcx
  344. movq (%rsp,%r14,8),%rdx
  345. andq %rbx,%rcx
  346. andq %rax,%rdx
  347. movq %r14,(%rsp,%r14,8)
  348. orq %rcx,%rdx
  349. movq %rdx,(%rdi,%r14,8)
  350. leaq 1(%r14),%r14
  351. subq $1,%r15
  352. jnz L$copy
  # Epilogue: reload the entry rsp, return 1, restore callee-saved registers
  # from the six slots pushed below the entry rsp.
  353. movq 8(%rsp,%r9,8),%rsi
  354. movq $1,%rax
  355. movq -48(%rsi),%r15
  356. movq -40(%rsi),%r14
  357. movq -32(%rsi),%r13
  358. movq -24(%rsi),%r12
  359. movq -16(%rsi),%rbp
  360. movq -8(%rsi),%rbx
  361. leaq (%rsi),%rsp
  362. L$mul_epilogue:
  # Encoded "rep ret".
  363. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # bn_mul4x_mont_gather5: frame setup + wrapper around mul4x_internal.
  # Reached in this view through L$mul4x_enter from _bn_mul_mont_gather5,
  # with the same argument registers, rax = entry rsp and r11d = third
  # dword of _OPENSSL_ia32cap_P. Returns 1 in rax.
  # ---------------------------------------------------------------------
  364. .p2align 5
  365. bn_mul4x_mont_gather5:
  366. .byte 0x67
  367. movq %rsp,%rax
  368. L$mul4x_enter:
  # If every bit of 0x80108 is set in the capability word, use the variant
  # at L$mulx4x_enter (outside this view). Presumably these are the
  # BMI1/BMI2/ADX feature bits - TODO confirm against the ia32cap layout.
  369. andl $0x80108,%r11d
  370. cmpl $0x80108,%r11d
  371. je L$mulx4x_enter
  372. pushq %rbx
  373. pushq %rbp
  374. pushq %r12
  375. pushq %r13
  376. pushq %r14
  377. pushq %r15
  378. L$mul4x_prologue:
  379. .byte 0x67
  # r9 = num*8 (bytes); r10 = 3*num*8; r9 is then negated for the frame math.
  380. shll $3,%r9d
  381. leaq (%r9,%r9,2),%r10
  382. negq %r9
  # Choose the frame base in rbp from the distance (mod 4096) between the
  # candidate frame (rsp - 320 - 2*num*8) and the destination rdi.
  # NOTE(review): presumably done to keep the frame and the output from
  # aliasing within a 4 KiB page - confirm against the generator source.
  383. leaq -320(%rsp,%r9,2),%r11
  384. movq %rsp,%rbp
  385. subq %rdi,%r11
  386. andq $4095,%r11
  387. cmpq %r11,%r10
  388. jb L$mul4xsp_alt
  389. subq %r11,%rbp
  390. leaq -320(%rbp,%r9,2),%rbp
  391. jmp L$mul4xsp_done
  392. .p2align 5
  393. L$mul4xsp_alt:
  394. leaq 4096-320(,%r9,2),%r10
  395. leaq -320(%rbp,%r9,2),%rbp
  396. subq %r10,%r11
  # mov (not xor) is used to zero r10 so that CF from the subq survives
  # for the cmovc: r11 is clamped to 0 when the subtraction borrowed.
  397. movq $0,%r10
  398. cmovcq %r10,%r11
  399. subq %r11,%rbp
  400. L$mul4xsp_done:
  # Align the frame to 64 bytes, then lower rsp to it page by page, reading
  # one word per 4096-byte page (stack probe).
  401. andq $-64,%rbp
  402. movq %rsp,%r11
  403. subq %rbp,%r11
  404. andq $-4096,%r11
  405. leaq (%r11,%rbp,1),%rsp
  406. movq (%rsp),%r10
  407. cmpq %rbp,%rsp
  408. ja L$mul4x_page_walk
  409. jmp L$mul4x_page_walk_done
  410. L$mul4x_page_walk:
  411. leaq -4096(%rsp),%rsp
  412. movq (%rsp),%r10
  413. cmpq %rbp,%rsp
  414. ja L$mul4x_page_walk
  415. L$mul4x_page_walk_done:
  # r9 back to +num*8; entry rsp saved at 40(%rsp) for the epilogue.
  416. negq %r9
  417. movq %rax,40(%rsp)
  418. L$mul4x_body:
  419. call mul4x_internal
  # Epilogue: return 1, restore callee-saved registers and the entry rsp.
  420. movq 40(%rsp),%rsi
  421. movq $1,%rax
  422. movq -48(%rsi),%r15
  423. movq -40(%rsi),%r14
  424. movq -32(%rsi),%r13
  425. movq -24(%rsi),%r12
  426. movq -16(%rsi),%rbp
  427. movq -8(%rsi),%rbx
  428. leaq (%rsi),%rsp
  429. L$mul4x_epilogue:
  # Encoded "rep ret".
  430. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # mul4x_internal: 4-way unrolled multiply core shared by
  # bn_mul4x_mont_gather5 and _bn_power5.
  # Inputs as used below:
  #   rax : caller's entry rsp, so 8(%rax) is the 32-bit table index
  #   rdx : table base        rsi : multiplicand words
  #   rcx : second word array (presumably the modulus - TODO confirm)
  #   r8  : pointer to the 64-bit constant used for the reduction factor
  #   r9  : num*8 (bytes)     rdi : destination pointer (only stashed here)
  # Offsets of the form N+8(%rsp) account for the return address pushed by
  # the call. Does not return directly: it ends with a jump to
  # L$sqr4x_sub_entry, which is outside this view.
  # ---------------------------------------------------------------------
  431. .p2align 5
  432. mul4x_internal:
  # r13 = table + 128 + num*256: end marker for r12, which advances by 256
  # per gathered word. r9 is restored to num*8 by the matching shrq.
  433. shlq $5,%r9
  434. movd 8(%rax),%xmm5
  435. leaq L$inc(%rip),%rax
  436. leaq 128(%rdx,%r9,1),%r13
  437. shrq $5,%r9
  # L$inc is defined outside this view; presumably initial counters and
  # increment for the mask generation - TODO confirm.
  438. movdqa 0(%rax),%xmm0
  439. movdqa 16(%rax),%xmm1
  # r10 chosen so that the 16 masks are stored at 112..352(%r10), i.e.
  # starting at rsp + 88 + num*8.
  440. leaq 88-112(%rsp,%r9,1),%r10
  441. leaq 128(%rdx),%r12
  # Broadcast the index and build 16 equality masks (all-ones lanes where
  # the running counter equals the index).
  442. pshufd $0,%xmm5,%xmm5
  443. movdqa %xmm1,%xmm4
  # 0x67 bytes are address-size prefixes in front of the next instruction
  # (presumably instruction-length padding).
  444. .byte 0x67,0x67
  445. movdqa %xmm1,%xmm2
  446. paddd %xmm0,%xmm1
  447. pcmpeqd %xmm5,%xmm0
  448. .byte 0x67
  449. movdqa %xmm4,%xmm3
  450. paddd %xmm1,%xmm2
  451. pcmpeqd %xmm5,%xmm1
  452. movdqa %xmm0,112(%r10)
  453. movdqa %xmm4,%xmm0
  454. paddd %xmm2,%xmm3
  455. pcmpeqd %xmm5,%xmm2
  456. movdqa %xmm1,128(%r10)
  457. movdqa %xmm4,%xmm1
  458. paddd %xmm3,%xmm0
  459. pcmpeqd %xmm5,%xmm3
  460. movdqa %xmm2,144(%r10)
  461. movdqa %xmm4,%xmm2
  462. paddd %xmm0,%xmm1
  463. pcmpeqd %xmm5,%xmm0
  464. movdqa %xmm3,160(%r10)
  465. movdqa %xmm4,%xmm3
  466. paddd %xmm1,%xmm2
  467. pcmpeqd %xmm5,%xmm1
  468. movdqa %xmm0,176(%r10)
  469. movdqa %xmm4,%xmm0
  470. paddd %xmm2,%xmm3
  471. pcmpeqd %xmm5,%xmm2
  472. movdqa %xmm1,192(%r10)
  473. movdqa %xmm4,%xmm1
  474. paddd %xmm3,%xmm0
  475. pcmpeqd %xmm5,%xmm3
  476. movdqa %xmm2,208(%r10)
  477. movdqa %xmm4,%xmm2
  478. paddd %xmm0,%xmm1
  479. pcmpeqd %xmm5,%xmm0
  480. movdqa %xmm3,224(%r10)
  481. movdqa %xmm4,%xmm3
  482. paddd %xmm1,%xmm2
  483. pcmpeqd %xmm5,%xmm1
  484. movdqa %xmm0,240(%r10)
  485. movdqa %xmm4,%xmm0
  486. paddd %xmm2,%xmm3
  487. pcmpeqd %xmm5,%xmm2
  488. movdqa %xmm1,256(%r10)
  489. movdqa %xmm4,%xmm1
  490. paddd %xmm3,%xmm0
  491. pcmpeqd %xmm5,%xmm3
  492. movdqa %xmm2,272(%r10)
  493. movdqa %xmm4,%xmm2
  494. paddd %xmm0,%xmm1
  495. pcmpeqd %xmm5,%xmm0
  496. movdqa %xmm3,288(%r10)
  497. movdqa %xmm4,%xmm3
  498. paddd %xmm1,%xmm2
  499. pcmpeqd %xmm5,%xmm1
  500. movdqa %xmm0,304(%r10)
  501. paddd %xmm2,%xmm3
  502. .byte 0x67
  503. pcmpeqd %xmm5,%xmm2
  504. movdqa %xmm1,320(%r10)
  505. pcmpeqd %xmm5,%xmm3
  506. movdqa %xmm2,336(%r10)
  # Gather word 0: mask every 16-byte slice of the first 256-byte row and
  # OR the results; all slices are read regardless of the index value.
  507. pand 64(%r12),%xmm0
  508. pand 80(%r12),%xmm1
  509. pand 96(%r12),%xmm2
  510. movdqa %xmm3,352(%r10)
  511. pand 112(%r12),%xmm3
  512. por %xmm2,%xmm0
  513. por %xmm3,%xmm1
  514. movdqa -128(%r12),%xmm4
  515. movdqa -112(%r12),%xmm5
  516. movdqa -96(%r12),%xmm2
  517. pand 112(%r10),%xmm4
  518. movdqa -80(%r12),%xmm3
  519. pand 128(%r10),%xmm5
  520. por %xmm4,%xmm0
  521. pand 144(%r10),%xmm2
  522. por %xmm5,%xmm1
  523. pand 160(%r10),%xmm3
  524. por %xmm2,%xmm0
  525. por %xmm3,%xmm1
  526. movdqa -64(%r12),%xmm4
  527. movdqa -48(%r12),%xmm5
  528. movdqa -32(%r12),%xmm2
  529. pand 176(%r10),%xmm4
  530. movdqa -16(%r12),%xmm3
  531. pand 192(%r10),%xmm5
  532. por %xmm4,%xmm0
  533. pand 208(%r10),%xmm2
  534. por %xmm5,%xmm1
  535. pand 224(%r10),%xmm3
  536. por %xmm2,%xmm0
  537. por %xmm3,%xmm1
  538. movdqa 0(%r12),%xmm4
  539. movdqa 16(%r12),%xmm5
  540. movdqa 32(%r12),%xmm2
  541. pand 240(%r10),%xmm4
  542. movdqa 48(%r12),%xmm3
  543. pand 256(%r10),%xmm5
  544. por %xmm4,%xmm0
  545. pand 272(%r10),%xmm2
  546. por %xmm5,%xmm1
  547. pand 288(%r10),%xmm3
  548. por %xmm2,%xmm0
  549. por %xmm3,%xmm1
  550. por %xmm1,%xmm0
  # Fold the high 64 bits onto the low 64 bits.
  551. pshufd $0x4e,%xmm0,%xmm1
  552. por %xmm1,%xmm0
  553. leaq 256(%r12),%r12
  # Encoded "movq %xmm0,%rbx": rbx = gathered multiplier word.
  554. .byte 102,72,15,126,195
  # Stash the table end marker and the destination pointer in the frame;
  # rdi and r13 are reused as accumulators below.
  555. movq %r13,16+8(%rsp)
  556. movq %rdi,56+8(%rsp)
  557. movq (%r8),%r8
  558. movq (%rsi),%rax
  # rsi now points past the end of the multiplicand and r9 = -num*8, so
  # (%rsi,%r9,1) is word 0 and the loop counter r15 can count up to zero.
  559. leaq (%rsi,%r9,1),%rsi
  560. negq %r9
  561. movq %r8,%rbp
  562. mulq %rbx
  563. movq %rax,%r10
  564. movq (%rcx),%rax
  # rbp = r8 * (low word of first product): per-row reduction factor.
  565. imulq %r10,%rbp
  # r14 = pointer into the temporary result words (starts at 64+8(%rsp)).
  566. leaq 64+8(%rsp),%r14
  567. movq %rdx,%r11
  568. mulq %rbp
  569. addq %rax,%r10
  570. movq 8(%rsi,%r9,1),%rax
  571. adcq $0,%rdx
  572. movq %rdx,%rdi
  573. mulq %rbx
  574. addq %rax,%r11
  575. movq 8(%rcx),%rax
  576. adcq $0,%rdx
  577. movq %rdx,%r10
  578. mulq %rbp
  579. addq %rax,%rdi
  580. movq 16(%rsi,%r9,1),%rax
  581. adcq $0,%rdx
  582. addq %r11,%rdi
  583. leaq 32(%r9),%r15
  584. leaq 32(%rcx),%rcx
  585. adcq $0,%rdx
  586. movq %rdi,(%r14)
  587. movq %rdx,%r13
  588. jmp L$1st4x
  589. .p2align 5
  # First row, four columns per iteration; r15 runs from 32-num*8 up to 0.
  # rdi and r13 alternate as the carry/accumulator registers.
  590. L$1st4x:
  591. mulq %rbx
  592. addq %rax,%r10
  593. movq -16(%rcx),%rax
  594. leaq 32(%r14),%r14
  595. adcq $0,%rdx
  596. movq %rdx,%r11
  597. mulq %rbp
  598. addq %rax,%r13
  599. movq -8(%rsi,%r15,1),%rax
  600. adcq $0,%rdx
  601. addq %r10,%r13
  602. adcq $0,%rdx
  603. movq %r13,-24(%r14)
  604. movq %rdx,%rdi
  605. mulq %rbx
  606. addq %rax,%r11
  607. movq -8(%rcx),%rax
  608. adcq $0,%rdx
  609. movq %rdx,%r10
  610. mulq %rbp
  611. addq %rax,%rdi
  612. movq (%rsi,%r15,1),%rax
  613. adcq $0,%rdx
  614. addq %r11,%rdi
  615. adcq $0,%rdx
  616. movq %rdi,-16(%r14)
  617. movq %rdx,%r13
  618. mulq %rbx
  619. addq %rax,%r10
  620. movq 0(%rcx),%rax
  621. adcq $0,%rdx
  622. movq %rdx,%r11
  623. mulq %rbp
  624. addq %rax,%r13
  625. movq 8(%rsi,%r15,1),%rax
  626. adcq $0,%rdx
  627. addq %r10,%r13
  628. adcq $0,%rdx
  629. movq %r13,-8(%r14)
  630. movq %rdx,%rdi
  631. mulq %rbx
  632. addq %rax,%r11
  633. movq 8(%rcx),%rax
  634. adcq $0,%rdx
  635. movq %rdx,%r10
  636. mulq %rbp
  637. addq %rax,%rdi
  638. movq 16(%rsi,%r15,1),%rax
  639. adcq $0,%rdx
  640. addq %r11,%rdi
  641. leaq 32(%rcx),%rcx
  642. adcq $0,%rdx
  643. movq %rdi,(%r14)
  644. movq %rdx,%r13
  645. addq $32,%r15
  646. jnz L$1st4x
  # Row tail: last two columns, then rewind rcx by num*8 and leave the
  # row's carry-out bit in rdi.
  647. mulq %rbx
  648. addq %rax,%r10
  649. movq -16(%rcx),%rax
  650. leaq 32(%r14),%r14
  651. adcq $0,%rdx
  652. movq %rdx,%r11
  653. mulq %rbp
  654. addq %rax,%r13
  655. movq -8(%rsi),%rax
  656. adcq $0,%rdx
  657. addq %r10,%r13
  658. adcq $0,%rdx
  659. movq %r13,-24(%r14)
  660. movq %rdx,%rdi
  661. mulq %rbx
  662. addq %rax,%r11
  663. movq -8(%rcx),%rax
  664. adcq $0,%rdx
  665. movq %rdx,%r10
  666. mulq %rbp
  667. addq %rax,%rdi
  668. movq (%rsi,%r9,1),%rax
  669. adcq $0,%rdx
  670. addq %r11,%rdi
  671. adcq $0,%rdx
  672. movq %rdi,-16(%r14)
  673. movq %rdx,%r13
  674. leaq (%rcx,%r9,1),%rcx
  675. xorq %rdi,%rdi
  676. addq %r10,%r13
  677. adcq $0,%rdi
  678. movq %r13,-8(%r14)
  679. jmp L$outer4x
  680. .p2align 5
  # Remaining rows. r14 sits just past the temporary words here, so
  # 16+128(%r14) is the mask table base + 128 (masks at -128..112(%rdx)).
  681. L$outer4x:
  682. leaq 16+128(%r14),%rdx
  683. pxor %xmm4,%xmm4
  684. pxor %xmm5,%xmm5
  685. movdqa -128(%r12),%xmm0
  686. movdqa -112(%r12),%xmm1
  687. movdqa -96(%r12),%xmm2
  688. movdqa -80(%r12),%xmm3
  689. pand -128(%rdx),%xmm0
  690. pand -112(%rdx),%xmm1
  691. por %xmm0,%xmm4
  692. pand -96(%rdx),%xmm2
  693. por %xmm1,%xmm5
  694. pand -80(%rdx),%xmm3
  695. por %xmm2,%xmm4
  696. por %xmm3,%xmm5
  697. movdqa -64(%r12),%xmm0
  698. movdqa -48(%r12),%xmm1
  699. movdqa -32(%r12),%xmm2
  700. movdqa -16(%r12),%xmm3
  701. pand -64(%rdx),%xmm0
  702. pand -48(%rdx),%xmm1
  703. por %xmm0,%xmm4
  704. pand -32(%rdx),%xmm2
  705. por %xmm1,%xmm5
  706. pand -16(%rdx),%xmm3
  707. por %xmm2,%xmm4
  708. por %xmm3,%xmm5
  709. movdqa 0(%r12),%xmm0
  710. movdqa 16(%r12),%xmm1
  711. movdqa 32(%r12),%xmm2
  712. movdqa 48(%r12),%xmm3
  713. pand 0(%rdx),%xmm0
  714. pand 16(%rdx),%xmm1
  715. por %xmm0,%xmm4
  716. pand 32(%rdx),%xmm2
  717. por %xmm1,%xmm5
  718. pand 48(%rdx),%xmm3
  719. por %xmm2,%xmm4
  720. por %xmm3,%xmm5
  721. movdqa 64(%r12),%xmm0
  722. movdqa 80(%r12),%xmm1
  723. movdqa 96(%r12),%xmm2
  724. movdqa 112(%r12),%xmm3
  725. pand 64(%rdx),%xmm0
  726. pand 80(%rdx),%xmm1
  727. por %xmm0,%xmm4
  728. pand 96(%rdx),%xmm2
  729. por %xmm1,%xmm5
  730. pand 112(%rdx),%xmm3
  731. por %xmm2,%xmm4
  732. por %xmm3,%xmm5
  733. por %xmm5,%xmm4
  734. pshufd $0x4e,%xmm4,%xmm0
  735. por %xmm4,%xmm0
  736. leaq 256(%r12),%r12
  # Encoded "movq %xmm0,%rbx": rbx = multiplier word for this row.
  737. .byte 102,72,15,126,195
  # r10 = temporary word 0 (r9 is negative, so this indexes back to the
  # start of the temporary words).
  738. movq (%r14,%r9,1),%r10
  739. movq %r8,%rbp
  740. mulq %rbx
  741. addq %rax,%r10
  742. movq (%rcx),%rax
  743. adcq $0,%rdx
  744. imulq %r10,%rbp
  745. movq %rdx,%r11
  # Store the previous row's carry-out bit as the top word, then rewind r14
  # to the start of the temporary words.
  746. movq %rdi,(%r14)
  747. leaq (%r14,%r9,1),%r14
  748. mulq %rbp
  749. addq %rax,%r10
  750. movq 8(%rsi,%r9,1),%rax
  751. adcq $0,%rdx
  752. movq %rdx,%rdi
  753. mulq %rbx
  754. addq %rax,%r11
  755. movq 8(%rcx),%rax
  756. adcq $0,%rdx
  757. addq 8(%r14),%r11
  758. adcq $0,%rdx
  759. movq %rdx,%r10
  760. mulq %rbp
  761. addq %rax,%rdi
  762. movq 16(%rsi,%r9,1),%rax
  763. adcq $0,%rdx
  764. addq %r11,%rdi
  765. leaq 32(%r9),%r15
  766. leaq 32(%rcx),%rcx
  767. adcq $0,%rdx
  768. movq %rdx,%r13
  769. jmp L$inner4x
  770. .p2align 5
  # Same four-column step as L$1st4x, additionally adding the previous
  # row's words read through r14; results are written one word behind.
  771. L$inner4x:
  772. mulq %rbx
  773. addq %rax,%r10
  774. movq -16(%rcx),%rax
  775. adcq $0,%rdx
  776. addq 16(%r14),%r10
  777. leaq 32(%r14),%r14
  778. adcq $0,%rdx
  779. movq %rdx,%r11
  780. mulq %rbp
  781. addq %rax,%r13
  782. movq -8(%rsi,%r15,1),%rax
  783. adcq $0,%rdx
  784. addq %r10,%r13
  785. adcq $0,%rdx
  786. movq %rdi,-32(%r14)
  787. movq %rdx,%rdi
  788. mulq %rbx
  789. addq %rax,%r11
  790. movq -8(%rcx),%rax
  791. adcq $0,%rdx
  792. addq -8(%r14),%r11
  793. adcq $0,%rdx
  794. movq %rdx,%r10
  795. mulq %rbp
  796. addq %rax,%rdi
  797. movq (%rsi,%r15,1),%rax
  798. adcq $0,%rdx
  799. addq %r11,%rdi
  800. adcq $0,%rdx
  801. movq %r13,-24(%r14)
  802. movq %rdx,%r13
  803. mulq %rbx
  804. addq %rax,%r10
  805. movq 0(%rcx),%rax
  806. adcq $0,%rdx
  807. addq (%r14),%r10
  808. adcq $0,%rdx
  809. movq %rdx,%r11
  810. mulq %rbp
  811. addq %rax,%r13
  812. movq 8(%rsi,%r15,1),%rax
  813. adcq $0,%rdx
  814. addq %r10,%r13
  815. adcq $0,%rdx
  816. movq %rdi,-16(%r14)
  817. movq %rdx,%rdi
  818. mulq %rbx
  819. addq %rax,%r11
  820. movq 8(%rcx),%rax
  821. adcq $0,%rdx
  822. addq 8(%r14),%r11
  823. adcq $0,%rdx
  824. movq %rdx,%r10
  825. mulq %rbp
  826. addq %rax,%rdi
  827. movq 16(%rsi,%r15,1),%rax
  828. adcq $0,%rdx
  829. addq %r11,%rdi
  830. leaq 32(%rcx),%rcx
  831. adcq $0,%rdx
  832. movq %r13,-8(%r14)
  833. movq %rdx,%r13
  834. addq $32,%r15
  835. jnz L$inner4x
  836. mulq %rbx
  837. addq %rax,%r10
  838. movq -16(%rcx),%rax
  839. adcq $0,%rdx
  840. addq 16(%r14),%r10
  841. leaq 32(%r14),%r14
  842. adcq $0,%rdx
  843. movq %rdx,%r11
  844. mulq %rbp
  845. addq %rax,%r13
  846. movq -8(%rsi),%rax
  847. adcq $0,%rdx
  848. addq %r10,%r13
  849. adcq $0,%rdx
  850. movq %rdi,-32(%r14)
  851. movq %rdx,%rdi
  852. mulq %rbx
  853. addq %rax,%r11
  # Operand swap for the last product: rax takes the reduction factor and
  # rbp takes the last word of (%rcx)[], which stays in rbp for the
  # comparison after the outer loop.
  854. movq %rbp,%rax
  855. movq -8(%rcx),%rbp
  856. adcq $0,%rdx
  857. addq -8(%r14),%r11
  858. adcq $0,%rdx
  859. movq %rdx,%r10
  860. mulq %rbp
  861. addq %rax,%rdi
  862. movq (%rsi,%r9,1),%rax
  863. adcq $0,%rdx
  864. addq %r11,%rdi
  865. adcq $0,%rdx
  866. movq %r13,-24(%r14)
  867. movq %rdx,%r13
  868. movq %rdi,-16(%r14)
  869. leaq (%rcx,%r9,1),%rcx
  870. xorq %rdi,%rdi
  871. addq %r10,%r13
  872. adcq $0,%rdi
  873. addq (%r14),%r13
  874. adcq $0,%rdi
  875. movq %r13,-8(%r14)
  # Loop until r12 reaches the end marker stored at function entry.
  876. cmpq 16+8(%rsp),%r12
  877. jb L$outer4x
  # rax = 0 - (rdi | CF), where CF comes from (last word of (%rcx)[]) minus
  # (top result word). r15 is zero here (inner-loop exit condition), so the
  # adcq turns CF into 0/1.
  # NOTE(review): presumably the "needs final subtraction" mask consumed at
  # L$sqr4x_sub_entry - that code is outside this view.
  878. xorq %rax,%rax
  879. subq %r13,%rbp
  880. adcq %r15,%r15
  881. orq %r15,%rdi
  882. subq %rdi,%rax
  # Register setup for L$sqr4x_sub_entry: rbx = start of temporary words,
  # rbp = (%rcx)[] base, r12 = word 0 minus 1, r13..r15 = words 1..3,
  # rcx = -num/4 (r9 is -num*8, arithmetic shift by 5), rdi = destination.
  883. leaq (%r14,%r9,1),%rbx
  884. movq (%rcx),%r12
  885. leaq (%rcx),%rbp
  886. movq %r9,%rcx
  887. sarq $3+2,%rcx
  888. movq 56+8(%rsp),%rdi
  889. decq %r12
  890. xorq %r10,%r10
  891. movq 8(%rbp),%r13
  892. movq 16(%rbp),%r14
  893. movq 24(%rbp),%r15
  894. jmp L$sqr4x_sub_entry
  # ---------------------------------------------------------------------
  # _bn_power5: five back-to-back calls of __bn_sqr8x_internal +
  # __bn_post4x_internal followed by one mul4x_internal (table gather).
  # Register arguments are used like in _bn_mul_mont_gather5: rdi = output,
  # rsi = input words, rdx = table, rcx = second word array, r8 = pointer
  # to a 64-bit constant, r9d = num, first stack argument = table index
  # (read by mul4x_internal through rax). Returns 1 in rax.
  # NOTE(review): presumably computes a^(2^5) * table[index] in the
  # Montgomery domain - inferred from the call sequence, confirm.
  # ---------------------------------------------------------------------
  895. .globl _bn_power5
  896. .p2align 5
  897. _bn_power5:
  898. movq %rsp,%rax
  # Same capability test as in the 4x multiply path; the alternative
  # implementation at L$powerx5_enter is outside this view.
  899. movl _OPENSSL_ia32cap_P+8(%rip),%r11d
  900. andl $0x80108,%r11d
  901. cmpl $0x80108,%r11d
  902. je L$powerx5_enter
  903. pushq %rbx
  904. pushq %rbp
  905. pushq %r12
  906. pushq %r13
  907. pushq %r14
  908. pushq %r15
  909. L$power5_prologue:
  # r9 = num*8, r10 = 3*num*8 (32-bit lea zero-extends), then r9 negated.
  910. shll $3,%r9d
  911. leal (%r9,%r9,2),%r10d
  912. negq %r9
  # r8 = the constant itself (kept in the frame at 32(%rsp) below).
  913. movq (%r8),%r8
  # Frame placement relative to rdi modulo 4096, same scheme as in
  # bn_mul4x_mont_gather5.
  914. leaq -320(%rsp,%r9,2),%r11
  915. movq %rsp,%rbp
  916. subq %rdi,%r11
  917. andq $4095,%r11
  918. cmpq %r11,%r10
  919. jb L$pwr_sp_alt
  920. subq %r11,%rbp
  921. leaq -320(%rbp,%r9,2),%rbp
  922. jmp L$pwr_sp_done
  923. .p2align 5
  924. L$pwr_sp_alt:
  925. leaq 4096-320(,%r9,2),%r10
  926. leaq -320(%rbp,%r9,2),%rbp
  927. subq %r10,%r11
  # mov (not xor) keeps CF from the subq alive for the cmovc.
  928. movq $0,%r10
  929. cmovcq %r10,%r11
  930. subq %r11,%rbp
  931. L$pwr_sp_done:
  # 64-byte align the frame and lower rsp to it page by page, reading one
  # word per 4096-byte page (stack probe).
  932. andq $-64,%rbp
  933. movq %rsp,%r11
  934. subq %rbp,%r11
  935. andq $-4096,%r11
  936. leaq (%r11,%rbp,1),%rsp
  937. movq (%rsp),%r10
  938. cmpq %rbp,%rsp
  939. ja L$pwr_page_walk
  940. jmp L$pwr_page_walk_done
  941. L$pwr_page_walk:
  942. leaq -4096(%rsp),%rsp
  943. movq (%rsp),%r10
  944. cmpq %rbp,%rsp
  945. ja L$pwr_page_walk
  946. L$pwr_page_walk_done:
  # r10 = -num*8, r9 = +num*8; constant at 32(%rsp), entry rsp at 40(%rsp).
  947. movq %r9,%r10
  948. negq %r9
  949. movq %r8,32(%rsp)
  950. movq %rax,40(%rsp)
  951. L$power5_body:
  # Encoded movq GPR->XMM: park rdi in xmm1, rcx in xmm2, r10 in xmm3 and
  # rdx in xmm4 across the internal calls.
  952. .byte 102,72,15,110,207
  953. .byte 102,72,15,110,209
  954. .byte 102,73,15,110,218
  955. .byte 102,72,15,110,226
  # Five square + post-process rounds (both helpers have bodies that are
  # not fully visible in this view).
  956. call __bn_sqr8x_internal
  957. call __bn_post4x_internal
  958. call __bn_sqr8x_internal
  959. call __bn_post4x_internal
  960. call __bn_sqr8x_internal
  961. call __bn_post4x_internal
  962. call __bn_sqr8x_internal
  963. call __bn_post4x_internal
  964. call __bn_sqr8x_internal
  965. call __bn_post4x_internal
  # Encoded movq XMM->GPR: rcx = xmm2, rdx = xmm4 (values parked above).
  966. .byte 102,72,15,126,209
  967. .byte 102,72,15,126,226
  # Set up mul4x_internal: rdi takes the current rsi, rax = entry rsp (for
  # the stack index argument), r8 = pointer to the constant in the frame.
  968. movq %rsi,%rdi
  969. movq 40(%rsp),%rax
  970. leaq 32(%rsp),%r8
  971. call mul4x_internal
  # Epilogue: return 1, restore callee-saved registers and the entry rsp.
  972. movq 40(%rsp),%rsi
  973. movq $1,%rax
  974. movq -48(%rsi),%r15
  975. movq -40(%rsi),%r14
  976. movq -32(%rsi),%r13
  977. movq -24(%rsi),%r12
  978. movq -16(%rsi),%rbp
  979. movq -8(%rsi),%rbx
  980. leaq (%rsi),%rsp
  981. L$power5_epilogue:
  # Encoded "rep ret".
  982. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # _bn_sqr8x_internal / __bn_sqr8x_internal: squaring core.
  # Inputs as used below:
  #   rsi : input words
  #   r9  : num*8 (bytes)
  #   r10 : -num*8 (as set up by _bn_power5 before its first call;
  #         NOTE(review): later calls presumably rely on the post-processing
  #         helper re-establishing it - not visible here)
  #   xmm2: value moved into rbp at the very end (parked rcx in _bn_power5)
  # The double-width result is built in the frame starting at 48+8(%rsp)
  # (the +8 accounts for the return address). Phases:
  #   1. L$sqr4x_1st / L$sqr4x_outer / L$sqr4x_inner: off-diagonal products
  #      a[i]*a[j].
  #   2. L$sqr4x_shift_n_add: double those words and add the squares a[i]^2.
  # Execution then falls through into __bn_sqr8x_reduction (no ret here).
  # ---------------------------------------------------------------------
  983. .globl _bn_sqr8x_internal
  984. .private_extern _bn_sqr8x_internal
  985. .p2align 5
  986. _bn_sqr8x_internal:
  987. __bn_sqr8x_internal:
  # rbp = 32 - num*8 (negative running offset); rsi = one past the input;
  # r14 = a[0], r15 = a[1] are the two fixed multiplicands of the first pass.
  988. leaq 32(%r10),%rbp
  989. leaq (%rsi,%r9,1),%rsi
  990. movq %r9,%rcx
  991. movq -32(%rsi,%rbp,1),%r14
  992. leaq 48+8(%rsp,%r9,2),%rdi
  993. movq -24(%rsi,%rbp,1),%rax
  994. leaq -32(%rdi,%rbp,1),%rdi
  995. movq -16(%rsi,%rbp,1),%rbx
  996. movq %rax,%r15
  997. mulq %r14
  998. movq %rax,%r10
  999. movq %rbx,%rax
  1000. movq %rdx,%r11
  1001. movq %r10,-24(%rdi,%rbp,1)
  1002. mulq %r14
  1003. addq %rax,%r11
  1004. movq %rbx,%rax
  1005. adcq $0,%rdx
  1006. movq %r11,-16(%rdi,%rbp,1)
  1007. movq %rdx,%r10
  1008. movq -8(%rsi,%rbp,1),%rbx
  1009. mulq %r15
  1010. movq %rax,%r12
  1011. movq %rbx,%rax
  1012. movq %rdx,%r13
  1013. leaq (%rbp),%rcx
  1014. mulq %r14
  1015. addq %rax,%r10
  1016. movq %rbx,%rax
  1017. movq %rdx,%r11
  1018. adcq $0,%r11
  1019. addq %r12,%r10
  1020. adcq $0,%r11
  1021. movq %r10,-8(%rdi,%rcx,1)
  1022. jmp L$sqr4x_1st
  1023. .p2align 5
  # First pass: multiply every remaining input word by r14 and r15 and
  # store (no previous partial result to add). rcx counts up to zero.
  1024. L$sqr4x_1st:
  1025. movq (%rsi,%rcx,1),%rbx
  1026. mulq %r15
  1027. addq %rax,%r13
  1028. movq %rbx,%rax
  1029. movq %rdx,%r12
  1030. adcq $0,%r12
  1031. mulq %r14
  1032. addq %rax,%r11
  1033. movq %rbx,%rax
  1034. movq 8(%rsi,%rcx,1),%rbx
  1035. movq %rdx,%r10
  1036. adcq $0,%r10
  1037. addq %r13,%r11
  1038. adcq $0,%r10
  1039. mulq %r15
  1040. addq %rax,%r12
  1041. movq %rbx,%rax
  1042. movq %r11,(%rdi,%rcx,1)
  1043. movq %rdx,%r13
  1044. adcq $0,%r13
  1045. mulq %r14
  1046. addq %rax,%r10
  1047. movq %rbx,%rax
  1048. movq 16(%rsi,%rcx,1),%rbx
  1049. movq %rdx,%r11
  1050. adcq $0,%r11
  1051. addq %r12,%r10
  1052. adcq $0,%r11
  1053. mulq %r15
  1054. addq %rax,%r13
  1055. movq %rbx,%rax
  1056. movq %r10,8(%rdi,%rcx,1)
  1057. movq %rdx,%r12
  1058. adcq $0,%r12
  1059. mulq %r14
  1060. addq %rax,%r11
  1061. movq %rbx,%rax
  1062. movq 24(%rsi,%rcx,1),%rbx
  1063. movq %rdx,%r10
  1064. adcq $0,%r10
  1065. addq %r13,%r11
  1066. adcq $0,%r10
  1067. mulq %r15
  1068. addq %rax,%r12
  1069. movq %rbx,%rax
  1070. movq %r11,16(%rdi,%rcx,1)
  1071. movq %rdx,%r13
  1072. adcq $0,%r13
  1073. leaq 32(%rcx),%rcx
  1074. mulq %r14
  1075. addq %rax,%r10
  1076. movq %rbx,%rax
  1077. movq %rdx,%r11
  1078. adcq $0,%r11
  1079. addq %r12,%r10
  1080. adcq $0,%r11
  1081. movq %r10,-8(%rdi,%rcx,1)
  1082. cmpq $0,%rcx
  1083. jne L$sqr4x_1st
  1084. mulq %r15
  1085. addq %rax,%r13
  1086. leaq 16(%rbp),%rbp
  1087. adcq $0,%rdx
  1088. addq %r11,%r13
  1089. adcq $0,%rdx
  1090. movq %r13,(%rdi)
  1091. movq %rdx,%r12
  1092. movq %rdx,8(%rdi)
  1093. jmp L$sqr4x_outer
  1094. .p2align 5
  # Subsequent passes: take the next pair of input words (rbp advances by
  # 16 bytes per pass) as r14/r15 and accumulate their products into the
  # words already stored.
  1095. L$sqr4x_outer:
  1096. movq -32(%rsi,%rbp,1),%r14
  1097. leaq 48+8(%rsp,%r9,2),%rdi
  1098. movq -24(%rsi,%rbp,1),%rax
  1099. leaq -32(%rdi,%rbp,1),%rdi
  1100. movq -16(%rsi,%rbp,1),%rbx
  1101. movq %rax,%r15
  1102. mulq %r14
  1103. movq -24(%rdi,%rbp,1),%r10
  1104. addq %rax,%r10
  1105. movq %rbx,%rax
  1106. adcq $0,%rdx
  1107. movq %r10,-24(%rdi,%rbp,1)
  1108. movq %rdx,%r11
  1109. mulq %r14
  1110. addq %rax,%r11
  1111. movq %rbx,%rax
  1112. adcq $0,%rdx
  1113. addq -16(%rdi,%rbp,1),%r11
  1114. movq %rdx,%r10
  1115. adcq $0,%r10
  1116. movq %r11,-16(%rdi,%rbp,1)
  1117. xorq %r12,%r12
  1118. movq -8(%rsi,%rbp,1),%rbx
  1119. mulq %r15
  1120. addq %rax,%r12
  1121. movq %rbx,%rax
  1122. adcq $0,%rdx
  1123. addq -8(%rdi,%rbp,1),%r12
  1124. movq %rdx,%r13
  1125. adcq $0,%r13
  1126. mulq %r14
  1127. addq %rax,%r10
  1128. movq %rbx,%rax
  1129. adcq $0,%rdx
  1130. addq %r12,%r10
  1131. movq %rdx,%r11
  1132. adcq $0,%r11
  1133. movq %r10,-8(%rdi,%rbp,1)
  1134. leaq (%rbp),%rcx
  1135. jmp L$sqr4x_inner
  1136. .p2align 5
  # Two input words per iteration, each multiplied by r15 and r14 and
  # added to the stored partial result; rcx counts up to zero.
  1137. L$sqr4x_inner:
  1138. movq (%rsi,%rcx,1),%rbx
  1139. mulq %r15
  1140. addq %rax,%r13
  1141. movq %rbx,%rax
  1142. movq %rdx,%r12
  1143. adcq $0,%r12
  1144. addq (%rdi,%rcx,1),%r13
  1145. adcq $0,%r12
  # 0x67 is an address-size prefix in front of the next instruction
  # (presumably instruction-length padding).
  1146. .byte 0x67
  1147. mulq %r14
  1148. addq %rax,%r11
  1149. movq %rbx,%rax
  1150. movq 8(%rsi,%rcx,1),%rbx
  1151. movq %rdx,%r10
  1152. adcq $0,%r10
  1153. addq %r13,%r11
  1154. adcq $0,%r10
  1155. mulq %r15
  1156. addq %rax,%r12
  1157. movq %r11,(%rdi,%rcx,1)
  1158. movq %rbx,%rax
  1159. movq %rdx,%r13
  1160. adcq $0,%r13
  1161. addq 8(%rdi,%rcx,1),%r12
  1162. leaq 16(%rcx),%rcx
  1163. adcq $0,%r13
  1164. mulq %r14
  1165. addq %rax,%r10
  1166. movq %rbx,%rax
  1167. adcq $0,%rdx
  1168. addq %r12,%r10
  1169. movq %rdx,%r11
  1170. adcq $0,%r11
  1171. movq %r10,-8(%rdi,%rcx,1)
  1172. cmpq $0,%rcx
  1173. jne L$sqr4x_inner
  1174. .byte 0x67
  1175. mulq %r15
  1176. addq %rax,%r13
  1177. adcq $0,%rdx
  1178. addq %r11,%r13
  1179. adcq $0,%rdx
  1180. movq %r13,(%rdi)
  1181. movq %rdx,%r12
  1182. movq %rdx,8(%rdi)
  1183. addq $16,%rbp
  1184. jnz L$sqr4x_outer
  # Last pass (rbp == 0): products among the final four input words, which
  # are addressed with fixed negative offsets from rsi.
  1185. movq -32(%rsi),%r14
  1186. leaq 48+8(%rsp,%r9,2),%rdi
  1187. movq -24(%rsi),%rax
  1188. leaq -32(%rdi,%rbp,1),%rdi
  1189. movq -16(%rsi),%rbx
  1190. movq %rax,%r15
  1191. mulq %r14
  1192. addq %rax,%r10
  1193. movq %rbx,%rax
  1194. movq %rdx,%r11
  1195. adcq $0,%r11
  1196. mulq %r14
  1197. addq %rax,%r11
  1198. movq %rbx,%rax
  1199. movq %r10,-24(%rdi)
  1200. movq %rdx,%r10
  1201. adcq $0,%r10
  1202. addq %r13,%r11
  1203. movq -8(%rsi),%rbx
  1204. adcq $0,%r10
  1205. mulq %r15
  1206. addq %rax,%r12
  1207. movq %rbx,%rax
  1208. movq %r11,-16(%rdi)
  1209. movq %rdx,%r13
  1210. adcq $0,%r13
  1211. mulq %r14
  1212. addq %rax,%r10
  1213. movq %rbx,%rax
  1214. movq %rdx,%r11
  1215. adcq $0,%r11
  1216. addq %r12,%r10
  1217. adcq $0,%r11
  1218. movq %r10,-8(%rdi)
  1219. mulq %r15
  1220. addq %rax,%r13
  1221. movq -16(%rsi),%rax
  1222. adcq $0,%rdx
  1223. addq %r11,%r13
  1224. adcq $0,%rdx
  1225. movq %r13,(%rdi)
  1226. movq %rdx,%r12
  1227. movq %rdx,8(%rdi)
  # Final cross product of the last two input words; the top word of the
  # temporary is cleared with r15 = 0.
  1228. mulq %rbx
  1229. addq $16,%rbp
  1230. xorq %r14,%r14
  1231. subq %r9,%rbp
  1232. xorq %r15,%r15
  1233. addq %r12,%rax
  1234. adcq $0,%rdx
  1235. movq %rax,8(%rdi)
  1236. movq %rdx,16(%rdi)
  1237. movq %r15,24(%rdi)
  # Phase 2 setup: rbp = 16 - num*8, rax = a[0], rdi = start of temporary.
  # Doubling idiom used throughout: lea (carry_in,word,2) computes
  # 2*word + carry_in and shrq $63 extracts the bit shifted out. rcx is
  # zero here (exit condition of the loops above), r14 carries the bit
  # between word pairs.
  1238. movq -16(%rsi,%rbp,1),%rax
  1239. leaq 48+8(%rsp),%rdi
  1240. xorq %r10,%r10
  1241. movq 8(%rdi),%r11
  1242. leaq (%r14,%r10,2),%r12
  1243. shrq $63,%r10
  1244. leaq (%rcx,%r11,2),%r13
  1245. shrq $63,%r11
  1246. orq %r10,%r13
  1247. movq 16(%rdi),%r10
  1248. movq %r11,%r14
  # mulq %rax squares the current input word. r15 is 0 or -1 (from sbb
  # below); negq converts it back into CF so the adc chain continues
  # across the intervening flag-clobbering instructions.
  1249. mulq %rax
  1250. negq %r15
  1251. movq 24(%rdi),%r11
  1252. adcq %rax,%r12
  1253. movq -8(%rsi,%rbp,1),%rax
  1254. movq %r12,(%rdi)
  1255. adcq %rdx,%r13
  1256. leaq (%r14,%r10,2),%rbx
  1257. movq %r13,8(%rdi)
  # Save the carry-out as 0 / -1 in r15.
  1258. sbbq %r15,%r15
  1259. shrq $63,%r10
  1260. leaq (%rcx,%r11,2),%r8
  1261. shrq $63,%r11
  1262. orq %r10,%r8
  1263. movq 32(%rdi),%r10
  1264. movq %r11,%r14
  1265. mulq %rax
  1266. negq %r15
  1267. movq 40(%rdi),%r11
  1268. adcq %rax,%rbx
  1269. movq 0(%rsi,%rbp,1),%rax
  1270. movq %rbx,16(%rdi)
  1271. adcq %rdx,%r8
  1272. leaq 16(%rbp),%rbp
  1273. movq %r8,24(%rdi)
  1274. sbbq %r15,%r15
  1275. leaq 64(%rdi),%rdi
  1276. jmp L$sqr4x_shift_n_add
  1277. .p2align 5
  # Four input words (eight result words) per iteration: double the stored
  # cross products and add the corresponding squares.
  1278. L$sqr4x_shift_n_add:
  1279. leaq (%r14,%r10,2),%r12
  1280. shrq $63,%r10
  1281. leaq (%rcx,%r11,2),%r13
  1282. shrq $63,%r11
  1283. orq %r10,%r13
  1284. movq -16(%rdi),%r10
  1285. movq %r11,%r14
  1286. mulq %rax
  1287. negq %r15
  1288. movq -8(%rdi),%r11
  1289. adcq %rax,%r12
  1290. movq -8(%rsi,%rbp,1),%rax
  1291. movq %r12,-32(%rdi)
  1292. adcq %rdx,%r13
  1293. leaq (%r14,%r10,2),%rbx
  1294. movq %r13,-24(%rdi)
  1295. sbbq %r15,%r15
  1296. shrq $63,%r10
  1297. leaq (%rcx,%r11,2),%r8
  1298. shrq $63,%r11
  1299. orq %r10,%r8
  1300. movq 0(%rdi),%r10
  1301. movq %r11,%r14
  1302. mulq %rax
  1303. negq %r15
  1304. movq 8(%rdi),%r11
  1305. adcq %rax,%rbx
  1306. movq 0(%rsi,%rbp,1),%rax
  1307. movq %rbx,-16(%rdi)
  1308. adcq %rdx,%r8
  1309. leaq (%r14,%r10,2),%r12
  1310. movq %r8,-8(%rdi)
  1311. sbbq %r15,%r15
  1312. shrq $63,%r10
  1313. leaq (%rcx,%r11,2),%r13
  1314. shrq $63,%r11
  1315. orq %r10,%r13
  1316. movq 16(%rdi),%r10
  1317. movq %r11,%r14
  1318. mulq %rax
  1319. negq %r15
  1320. movq 24(%rdi),%r11
  1321. adcq %rax,%r12
  1322. movq 8(%rsi,%rbp,1),%rax
  1323. movq %r12,0(%rdi)
  1324. adcq %rdx,%r13
  1325. leaq (%r14,%r10,2),%rbx
  1326. movq %r13,8(%rdi)
  1327. sbbq %r15,%r15
  1328. shrq $63,%r10
  1329. leaq (%rcx,%r11,2),%r8
  1330. shrq $63,%r11
  1331. orq %r10,%r8
  1332. movq 32(%rdi),%r10
  1333. movq %r11,%r14
  1334. mulq %rax
  1335. negq %r15
  1336. movq 40(%rdi),%r11
  1337. adcq %rax,%rbx
  1338. movq 16(%rsi,%rbp,1),%rax
  1339. movq %rbx,16(%rdi)
  1340. adcq %rdx,%r8
  1341. movq %r8,24(%rdi)
  1342. sbbq %r15,%r15
  1343. leaq 64(%rdi),%rdi
  1344. addq $32,%rbp
  1345. jnz L$sqr4x_shift_n_add
  # Tail: last two input words (four result words).
  1346. leaq (%r14,%r10,2),%r12
  1347. .byte 0x67
  1348. shrq $63,%r10
  1349. leaq (%rcx,%r11,2),%r13
  1350. shrq $63,%r11
  1351. orq %r10,%r13
  1352. movq -16(%rdi),%r10
  1353. movq %r11,%r14
  1354. mulq %rax
  1355. negq %r15
  1356. movq -8(%rdi),%r11
  1357. adcq %rax,%r12
  1358. movq -8(%rsi),%rax
  1359. movq %r12,-32(%rdi)
  1360. adcq %rdx,%r13
  1361. leaq (%r14,%r10,2),%rbx
  1362. movq %r13,-24(%rdi)
  1363. sbbq %r15,%r15
  1364. shrq $63,%r10
  1365. leaq (%rcx,%r11,2),%r8
  1366. shrq $63,%r11
  1367. orq %r10,%r8
  1368. mulq %rax
  1369. negq %r15
  1370. adcq %rax,%rbx
  1371. adcq %rdx,%r8
  1372. movq %rbx,-16(%rdi)
  1373. movq %r8,-8(%rdi)
  # Encoded "movq %xmm2,%rbp"; execution continues into
  # __bn_sqr8x_reduction directly below.
  1374. .byte 102,72,15,126,213
  # ----------------------------------------------------------------------
  # __bn_sqr8x_reduction
  #
  # Word-by-word reduction of the multi-word value kept in the stack buffer
  # that starts at 48+8(%rsp).  Each outer pass takes eight 64-bit words of
  # the buffer, derives a multiplier for each from the constant stored at
  # 32+8(%rsp), and adds multiplier * (word array at %rbp) into the buffer.
  #
  # NOTE(review): this has the shape of a Montgomery reduction (%rbp =
  # modulus, 32+8(%rsp) = n0, %r9 = operand length in bytes, %xmm2/%xmm3 =
  # saved copies of the modulus pointer and length) -- the code that sets
  # these up is outside this chunk; confirm against the callers.
  #
  # The instructions immediately above this label fall through into it.
  #
  # Stack slots written here:
  #   0+8(%rsp)  = %rbp + %r9   (bound that %rbp is compared against)
  #   8+8(%rsp)  = buffer + 2*%r9 (bound for %rdi; the word it points at is
  #                also used to carry the top carry between outer passes)
  #   48+8(%rsp) .. 48+56+8(%rsp) = the eight multipliers of the current pass
  #
  # On return: %rax = carry out of the last pass, %rsi = 0, %rbp and %r9 are
  # reloaded from %xmm2 / %xmm3, %rcx = word at -8(%rbp) read before that
  # reload, %rdx = value of 8+8(%rsp), %r8-%r15 = last eight words stored.
  # ----------------------------------------------------------------------
  1375. __bn_sqr8x_reduction:
  1376. xorq %rax,%rax
  1377. leaq (%r9,%rbp,1),%rcx
  1378. leaq 48+8(%rsp,%r9,2),%rdx
  1379. movq %rcx,0+8(%rsp)
  1380. leaq 48+8(%rsp,%r9,1),%rdi
  1381. movq %rdx,8+8(%rsp)
  # %rdi was set to buffer + %r9; negating %r9 lets the first statement of
  # the loop below rewind %rdi by that amount.
  1382. negq %r9
  1383. jmp L$8x_reduction_loop
  1384. .p2align 5
  # ---- outer loop: one pass per eight words of the buffer ----------------
  1385. L$8x_reduction_loop:
  1386. leaq (%rdi,%r9,1),%rdi
  # The lone 0x66 / 0x67 bytes in this routine are instruction prefixes
  # glued onto the following instruction; presumably padding for alignment.
  1387. .byte 0x66
  1388. movq 0(%rdi),%rbx
  1389. movq 8(%rdi),%r9
  1390. movq 16(%rdi),%r10
  1391. movq 24(%rdi),%r11
  1392. movq 32(%rdi),%r12
  1393. movq 40(%rdi),%r13
  1394. movq 48(%rdi),%r14
  1395. movq 56(%rdi),%r15
  # Park the carry of the previous pass (0 on the first pass) in the word
  # at (%rdx); L$8x_tail_done adds it back in.
  1396. movq %rax,(%rdx)
  1397. leaq 64(%rdi),%rdi
  1398. .byte 0x67
  # r8 = lowest word, rbx = first multiplier = lowest word * 32+8(%rsp).
  1399. movq %rbx,%r8
  1400. imulq 32+8(%rsp),%rbx
  1401. movq 0(%rbp),%rax
  1402. movl $8,%ecx
  1403. jmp L$8x_reduce
  1404. .p2align 5
  # ---- L$8x_reduce: 8 iterations, %ecx counts 8..1 -----------------------
  # Each iteration adds %rbx * (8 words at %rbp) to r8..r15 and shifts the
  # window down one word (the new r8 is the old r9 plus carries, and so on).
  1405. L$8x_reduce:
  1406. mulq %rbx
  1407. movq 8(%rbp),%rax
  # The low product word is never added: negq only sets CF = (r8 != 0).
  # NOTE(review): presumably valid because %rbx was chosen so that
  # r8 + low word is 0 mod 2^64 -- confirm against the definition of the
  # constant at 32+8(%rsp).
  1408. negq %r8
  1409. movq %rdx,%r8
  1410. adcq $0,%r8
  1411. mulq %rbx
  1412. addq %rax,%r9
  1413. movq 16(%rbp),%rax
  1414. adcq $0,%rdx
  1415. addq %r9,%r8
  # Save this iteration's multiplier; with %rcx running 8..1 the slots
  # 48+56+8(%rsp) down to 48+8(%rsp) are filled in that order.
  1416. movq %rbx,48-8+8(%rsp,%rcx,8)
  1417. movq %rdx,%r9
  1418. adcq $0,%r9
  1419. mulq %rbx
  1420. addq %rax,%r10
  1421. movq 24(%rbp),%rax
  1422. adcq $0,%rdx
  1423. addq %r10,%r9
  # Start computing the next multiplier: rsi = constant * new r8 (below).
  1424. movq 32+8(%rsp),%rsi
  1425. movq %rdx,%r10
  1426. adcq $0,%r10
  1427. mulq %rbx
  1428. addq %rax,%r11
  1429. movq 32(%rbp),%rax
  1430. adcq $0,%rdx
  1431. imulq %r8,%rsi
  1432. addq %r11,%r10
  1433. movq %rdx,%r11
  1434. adcq $0,%r11
  1435. mulq %rbx
  1436. addq %rax,%r12
  1437. movq 40(%rbp),%rax
  1438. adcq $0,%rdx
  1439. addq %r12,%r11
  1440. movq %rdx,%r12
  1441. adcq $0,%r12
  1442. mulq %rbx
  1443. addq %rax,%r13
  1444. movq 48(%rbp),%rax
  1445. adcq $0,%rdx
  1446. addq %r13,%r12
  1447. movq %rdx,%r13
  1448. adcq $0,%r13
  1449. mulq %rbx
  1450. addq %rax,%r14
  1451. movq 56(%rbp),%rax
  1452. adcq $0,%rdx
  1453. addq %r14,%r13
  1454. movq %rdx,%r14
  1455. adcq $0,%r14
  1456. mulq %rbx
  # Switch to the multiplier for the next iteration.
  1457. movq %rsi,%rbx
  1458. addq %rax,%r15
  1459. movq 0(%rbp),%rax
  1460. adcq $0,%rdx
  1461. addq %r15,%r14
  1462. movq %rdx,%r15
  1463. adcq $0,%r15
  1464. decl %ecx
  1465. jnz L$8x_reduce
  # Move to the next 8 words at %rbp.  If %rbp reached the bound saved at
  # 0+8(%rsp) there is nothing more to multiply; jae is taken with CF
  # clear, so the adc chain at L$8x_no_tail starts without a carry.
  1466. leaq 64(%rbp),%rbp
  1467. xorq %rax,%rax
  1468. movq 8+8(%rsp),%rdx
  1469. cmpq 0+8(%rsp),%rbp
  1470. jae L$8x_no_tail
  1471. .byte 0x66
  # Fold the next eight buffer words into r8..r15; sbbq turns the
  # carry-out into 0 / -1 in %rsi so it survives the multiply loop.
  1472. addq 0(%rdi),%r8
  1473. adcq 8(%rdi),%r9
  1474. adcq 16(%rdi),%r10
  1475. adcq 24(%rdi),%r11
  1476. adcq 32(%rdi),%r12
  1477. adcq 40(%rdi),%r13
  1478. adcq 48(%rdi),%r14
  1479. adcq 56(%rdi),%r15
  1480. sbbq %rsi,%rsi
  # Reload the first multiplier saved by L$8x_reduce (written when %rcx
  # was 8).
  1481. movq 48+56+8(%rsp),%rbx
  1482. movl $8,%ecx
  1483. movq 0(%rbp),%rax
  1484. jmp L$8x_tail
  1485. .p2align 5
  # ---- L$8x_tail: replay the 8 saved multipliers against the next 8 ------
  # words at %rbp.  Each iteration writes one finished word to (%rdi) and
  # advances %rdi by 8.
  1486. L$8x_tail:
  1487. mulq %rbx
  1488. addq %rax,%r8
  1489. movq 8(%rbp),%rax
  1490. movq %r8,(%rdi)
  1491. movq %rdx,%r8
  1492. adcq $0,%r8
  1493. mulq %rbx
  1494. addq %rax,%r9
  1495. movq 16(%rbp),%rax
  1496. adcq $0,%rdx
  1497. addq %r9,%r8
  1498. leaq 8(%rdi),%rdi
  1499. movq %rdx,%r9
  1500. adcq $0,%r9
  1501. mulq %rbx
  1502. addq %rax,%r10
  1503. movq 24(%rbp),%rax
  1504. adcq $0,%rdx
  1505. addq %r10,%r9
  1506. movq %rdx,%r10
  1507. adcq $0,%r10
  1508. mulq %rbx
  1509. addq %rax,%r11
  1510. movq 32(%rbp),%rax
  1511. adcq $0,%rdx
  1512. addq %r11,%r10
  1513. movq %rdx,%r11
  1514. adcq $0,%r11
  1515. mulq %rbx
  1516. addq %rax,%r12
  1517. movq 40(%rbp),%rax
  1518. adcq $0,%rdx
  1519. addq %r12,%r11
  1520. movq %rdx,%r12
  1521. adcq $0,%r12
  1522. mulq %rbx
  1523. addq %rax,%r13
  1524. movq 48(%rbp),%rax
  1525. adcq $0,%rdx
  1526. addq %r13,%r12
  1527. movq %rdx,%r13
  1528. adcq $0,%r13
  1529. mulq %rbx
  1530. addq %rax,%r14
  1531. movq 56(%rbp),%rax
  1532. adcq $0,%rdx
  1533. addq %r14,%r13
  1534. movq %rdx,%r14
  1535. adcq $0,%r14
  1536. mulq %rbx
  # Fetch the multiplier for the next iteration from the saved slots.
  1537. movq 48-16+8(%rsp,%rcx,8),%rbx
  1538. addq %rax,%r15
  1539. adcq $0,%rdx
  1540. addq %r15,%r14
  1541. movq 0(%rbp),%rax
  1542. movq %rdx,%r15
  1543. adcq $0,%r15
  1544. decl %ecx
  1545. jnz L$8x_tail
  1546. leaq 64(%rbp),%rbp
  1547. movq 8+8(%rsp),%rdx
  1548. cmpq 0+8(%rsp),%rbp
  1549. jae L$8x_tail_done
  # More words remain at %rbp: restart from the first saved multiplier,
  # re-create CF from the 0/-1 mask in %rsi, fold in the next eight buffer
  # words and save the new carry in %rsi again.
  1550. movq 48+56+8(%rsp),%rbx
  1551. negq %rsi
  1552. movq 0(%rbp),%rax
  1553. adcq 0(%rdi),%r8
  1554. adcq 8(%rdi),%r9
  1555. adcq 16(%rdi),%r10
  1556. adcq 24(%rdi),%r11
  1557. adcq 32(%rdi),%r12
  1558. adcq 40(%rdi),%r13
  1559. adcq 48(%rdi),%r14
  1560. adcq 56(%rdi),%r15
  1561. sbbq %rsi,%rsi
  1562. movl $8,%ecx
  1563. jmp L$8x_tail
  1564. .p2align 5
  # ---- end of a pass ------------------------------------------------------
  # Add the carry word parked at (%rdx) by the outer loop, ripple it through
  # r9..r15 and collect the carry-out in %rax; then negq %rsi restores CF
  # from the saved carry mask for the adc chain that follows.
  1565. L$8x_tail_done:
  1566. xorq %rax,%rax
  1567. addq (%rdx),%r8
  1568. adcq $0,%r9
  1569. adcq $0,%r10
  1570. adcq $0,%r11
  1571. adcq $0,%r12
  1572. adcq $0,%r13
  1573. adcq $0,%r14
  1574. adcq $0,%r15
  1575. adcq $0,%rax
  1576. negq %rsi
  # Add the top eight buffer words of this pass and write the sums back.
  1577. L$8x_no_tail:
  1578. adcq 0(%rdi),%r8
  1579. adcq 8(%rdi),%r9
  1580. adcq 16(%rdi),%r10
  1581. adcq 24(%rdi),%r11
  1582. adcq 32(%rdi),%r12
  1583. adcq 40(%rdi),%r13
  1584. adcq 48(%rdi),%r14
  1585. adcq 56(%rdi),%r15
  1586. adcq $0,%rax
  1587. movq -8(%rbp),%rcx
  1588. xorq %rsi,%rsi
  # Hand-encoded: movq %xmm2,%rbp
  1589. .byte 102,72,15,126,213
  1590. movq %r8,0(%rdi)
  1591. movq %r9,8(%rdi)
  # Hand-encoded: movq %xmm3,%r9
  1592. .byte 102,73,15,126,217
  1593. movq %r10,16(%rdi)
  1594. movq %r11,24(%rdi)
  1595. movq %r12,32(%rdi)
  1596. movq %r13,40(%rdi)
  1597. movq %r14,48(%rdi)
  1598. movq %r15,56(%rdi)
  1599. leaq 64(%rdi),%rdi
  # Repeat until %rdi reaches the bound kept in %rdx (= 8+8(%rsp)).
  1600. cmpq %rdx,%rdi
  1601. jb L$8x_reduction_loop
  # Hand-encoded: rep ret
  1602. .byte 0xf3,0xc3
  1603. .p2align 5
  # ----------------------------------------------------------------------
  # __bn_post4x_internal
  #
  # Branch-free conditional subtraction, four 64-bit words per iteration:
  #     dst[i] = src[i] + (~n[i] & mask) + carry
  # where the very first word of n is decremented before being inverted, so
  # that with mask = all-ones the sum equals src - n, and with mask = 0 the
  # source is copied unchanged.
  #
  # In:   %rbp = n (word array that is subtracted)
  #       %rdi, %r9: source starts at %rdi + %r9
  #       %r9  = loop length; %r9 >> 5 is incremented until it reaches zero,
  #              so the code relies on %r9 being a negative multiple of 32
  #       %rax = selector, negated into the mask
  #              (assumes %rax is 0 or 1 on entry so the mask is all-zeros
  #              or all-ones -- TODO confirm at the call sites)
  #       %xmm1 (low 64 bits) = destination pointer
  # Out:  %r10 = incoming %r9, %r9 = -(incoming %r9),
  #       %rsi = destination start, %rdi = destination end,
  #       %rbp and %rbx advanced past the data they walked.
  # Clobbers %rax, %rbx, %rcx, %r12-%r15, flags.
  # ----------------------------------------------------------------------
  1604. __bn_post4x_internal:
  1605. movq 0(%rbp),%r12
  1606. leaq (%rdi,%r9,1),%rbx
  1607. movq %r9,%rcx
  # Hand-encoded: movq %xmm1,%rdi
  1608. .byte 102,72,15,126,207
  1609. negq %rax
  # Hand-encoded: movq %xmm1,%rsi
  1610. .byte 102,72,15,126,206
  # rcx = r9 / 32 (arithmetic shift): one count per 4-word group.
  1611. sarq $3+2,%rcx
  # ~(n[0] - 1) == -n[0]: folds the "+1" of the two's complement into the
  # first word so no initial carry is needed (r10 = 0 below).
  1612. decq %r12
  1613. xorq %r10,%r10
  1614. movq 8(%rbp),%r13
  1615. movq 16(%rbp),%r14
  1616. movq 24(%rbp),%r15
  1617. jmp L$sqr4x_sub_entry
  1618. .p2align 4
  1619. L$sqr4x_sub:
  1620. movq 0(%rbp),%r12
  1621. movq 8(%rbp),%r13
  1622. movq 16(%rbp),%r14
  1623. movq 24(%rbp),%r15
  1624. L$sqr4x_sub_entry:
  1625. leaq 32(%rbp),%rbp
  1626. notq %r12
  1627. notq %r13
  1628. notq %r14
  1629. notq %r15
  1630. andq %rax,%r12
  1631. andq %rax,%r13
  1632. andq %rax,%r14
  1633. andq %rax,%r15
  # r10 holds the carry of the previous group as 0 / -1; negq turns it
  # back into CF for the adc chain.
  1634. negq %r10
  1635. adcq 0(%rbx),%r12
  1636. adcq 8(%rbx),%r13
  1637. adcq 16(%rbx),%r14
  1638. adcq 24(%rbx),%r15
  1639. movq %r12,0(%rdi)
  1640. leaq 32(%rbx),%rbx
  1641. movq %r13,8(%rdi)
  # Capture the carry-out before incq/jnz; mov and lea leave flags alone.
  1642. sbbq %r10,%r10
  1643. movq %r14,16(%rdi)
  1644. movq %r15,24(%rdi)
  1645. leaq 32(%rdi),%rdi
  1646. incq %rcx
  1647. jnz L$sqr4x_sub
  1648. movq %r9,%r10
  1649. negq %r9
  # Hand-encoded: rep ret
  1650. .byte 0xf3,0xc3
  1651. .p2align 5
  # ----------------------------------------------------------------------
  # bn_mulx4x_mont_gather5
  #
  # Frame set-up / tear-down wrapper around mulx4x_internal.
  #
  # Register use visible here: %r8 is dereferenced once (pointer to a 64-bit
  # constant, kept at 32(%rsp)), %r9d is a count that is scaled by 8, %rdi
  # is used to choose the frame address, and the original %rsp is kept in
  # %rax / 40(%rsp) so mulx4x_internal can read the caller's stack argument
  # at 8(%rax).
  # NOTE(review): presumably the System V arguments are
  # (rp, ap, table, np, n0, num, power) as for _bn_mul_mont_gather5 -- the
  # C prototype is not part of this file; confirm.
  #
  # Returns 1 in %rax.  %rbx, %rbp, %r12-%r15 are saved and restored.
  # ----------------------------------------------------------------------
  1652. bn_mulx4x_mont_gather5:
  1653. movq %rsp,%rax
  # Second entry label; NOTE(review): presumably jumped to with %rax
  # already holding the caller's %rsp -- the jump site is outside this chunk.
  1654. L$mulx4x_enter:
  1655. pushq %rbx
  1656. pushq %rbp
  1657. pushq %r12
  1658. pushq %r13
  1659. pushq %r14
  1660. pushq %r15
  1661. L$mulx4x_prologue:
  # r9 = count * 8 (the 32-bit shift also clears the upper half),
  # r10 = 3 * r9, then r9 is negated and stays negative from here on.
  1662. shll $3,%r9d
  1663. leaq (%r9,%r9,2),%r10
  1664. negq %r9
  1665. movq (%r8),%r8
  # Choose the frame base in %rbp: at least 320 + 2*|r9| bytes below %rsp,
  # further adjusted by the distance to %rdi modulo 4096.
  # NOTE(review): presumably so the frame does not share page offsets with
  # the 3*|r9| bytes starting at %rdi -- confirm against the generator.
  1666. leaq -320(%rsp,%r9,2),%r11
  1667. movq %rsp,%rbp
  1668. subq %rdi,%r11
  1669. andq $4095,%r11
  1670. cmpq %r11,%r10
  1671. jb L$mulx4xsp_alt
  1672. subq %r11,%rbp
  1673. leaq -320(%rbp,%r9,2),%rbp
  1674. jmp L$mulx4xsp_done
  1675. L$mulx4xsp_alt:
  1676. leaq 4096-320(,%r9,2),%r10
  1677. leaq -320(%rbp,%r9,2),%rbp
  1678. subq %r10,%r11
  # movq (not xor) is used to zero %r10 so that the borrow from the subq
  # above is still in CF for cmovc: the extra adjustment is clamped to 0.
  1679. movq $0,%r10
  1680. cmovcq %r10,%r11
  1681. subq %r11,%rbp
  1682. L$mulx4xsp_done:
  1683. andq $-64,%rbp
  # Lower %rsp to the new frame in steps of at most 4096 bytes, reading
  # one word at every step.
  # NOTE(review): looks like a stack probe so each page is touched in
  # order -- confirm.
  1684. movq %rsp,%r11
  1685. subq %rbp,%r11
  1686. andq $-4096,%r11
  1687. leaq (%r11,%rbp,1),%rsp
  1688. movq (%rsp),%r10
  1689. cmpq %rbp,%rsp
  1690. ja L$mulx4x_page_walk
  1691. jmp L$mulx4x_page_walk_done
  1692. L$mulx4x_page_walk:
  1693. leaq -4096(%rsp),%rsp
  1694. movq (%rsp),%r10
  1695. cmpq %rbp,%rsp
  1696. ja L$mulx4x_page_walk
  1697. L$mulx4x_page_walk_done:
  # 32(%rsp) = constant loaded from (%r8); 40(%rsp) = caller's %rsp.
  # Inside the callee these are 32+8(%rsp) / 40+8(%rsp) because of the
  # return address pushed by call.
  1698. movq %r8,32(%rsp)
  1699. movq %rax,40(%rsp)
  1700. L$mulx4x_body:
  1701. call mulx4x_internal
  # Epilogue: recover the caller's %rsp and reload the six registers that
  # were pushed just below it.
  1702. movq 40(%rsp),%rsi
  1703. movq $1,%rax
  1704. movq -48(%rsi),%r15
  1705. movq -40(%rsi),%r14
  1706. movq -32(%rsi),%r13
  1707. movq -24(%rsi),%r12
  1708. movq -16(%rsi),%rbp
  1709. movq -8(%rsi),%rbx
  1710. leaq (%rsi),%rsp
  1711. L$mulx4x_epilogue:
  # Hand-encoded: rep ret
  1712. .byte 0xf3,0xc3
  1713. .p2align 5
  # ----------------------------------------------------------------------
  # mulx4x_internal
  #
  # Multiply-and-reduce core built on mulx/adcx/adox, four 64-bit words per
  # inner iteration.  One multiplier word at a time is fetched from the
  # table at %rdx by AND-ing every candidate with a compare mask and OR-ing
  # the results, so every table entry is read whichever one is selected.
  #
  # In (as set up by the two callers visible in this chunk):
  #   %rax = caller's original %rsp; the 32-bit value at 8(%rax) selects
  #          the table entry
  #   %rdx = table, %rsi = multiplicand words, %rcx = second word array
  #          (multiplied by the per-row factor derived from 32+8(%rsp))
  #   %rdi = saved to 56+8(%rsp) and handed on in %rdx at the end
  #   %r9  = length; bn_mulx4x_mont_gather5 passes the negated byte count
  #   32+8(%rsp) = 64-bit constant stored by the caller
  # NOTE(review): presumably %rsi = a, %rcx = modulus n, %rdi = result
  # pointer and the constant is Montgomery's n0 -- confirm with the C side.
  #
  # Frame slots written here (offsets as seen inside this routine):
  #   0+8(%rsp)  = incoming %r9
  #   8+8(%rsp)  = current table row pointer
  #   16+8(%rsp) = 128 + table + 32*|r9|  (bound for the row pointer)
  #   24+8(%rsp) = |r9|/32 - 1            (inner loop count)
  #   56+8(%rsp) = incoming %rdi
  #   64+8(%rsp) onwards = running result, followed by the 16 compare masks
  #
  # Does not return by itself: it ends with a jump to L$sqrx4x_sub_entry,
  # which is defined outside this chunk.
  # ----------------------------------------------------------------------
  1714. mulx4x_internal:
  1715. movq %r9,8(%rsp)
  1716. movq %r9,%r10
  1717. negq %r9
  1718. shlq $5,%r9
  1719. negq %r10
  1720. leaq 128(%rdx,%r9,1),%r13
  1721. shrq $5+5,%r9
  1722. movd 8(%rax),%xmm5
  1723. subq $1,%r9
  # L$inc is defined outside this chunk; NOTE(review): presumably the
  # initial index pattern (xmm0) and the per-step increment (xmm1).
  1724. leaq L$inc(%rip),%rax
  1725. movq %r13,16+8(%rsp)
  1726. movq %r9,24+8(%rsp)
  1727. movq %rdi,56+8(%rsp)
  1728. movdqa 0(%rax),%xmm0
  1729. movdqa 16(%rax),%xmm1
  1730. leaq 88-112(%rsp,%r10,1),%r10
  1731. leaq 128(%rdx),%rdi
  # ---- build 16 compare masks ---------------------------------------------
  # xmm5 = selector broadcast to all four lanes, xmm4 = increment.  Each
  # step adds the increment to the running index vector and compares it
  # with xmm5; the resulting masks go to 112(%r10) .. 352(%r10).
  # The lone 0x67 bytes are prefixes on the next instruction, presumably
  # alignment padding.
  1732. pshufd $0,%xmm5,%xmm5
  1733. movdqa %xmm1,%xmm4
  1734. .byte 0x67
  1735. movdqa %xmm1,%xmm2
  1736. .byte 0x67
  1737. paddd %xmm0,%xmm1
  1738. pcmpeqd %xmm5,%xmm0
  1739. movdqa %xmm4,%xmm3
  1740. paddd %xmm1,%xmm2
  1741. pcmpeqd %xmm5,%xmm1
  1742. movdqa %xmm0,112(%r10)
  1743. movdqa %xmm4,%xmm0
  1744. paddd %xmm2,%xmm3
  1745. pcmpeqd %xmm5,%xmm2
  1746. movdqa %xmm1,128(%r10)
  1747. movdqa %xmm4,%xmm1
  1748. paddd %xmm3,%xmm0
  1749. pcmpeqd %xmm5,%xmm3
  1750. movdqa %xmm2,144(%r10)
  1751. movdqa %xmm4,%xmm2
  1752. paddd %xmm0,%xmm1
  1753. pcmpeqd %xmm5,%xmm0
  1754. movdqa %xmm3,160(%r10)
  1755. movdqa %xmm4,%xmm3
  1756. paddd %xmm1,%xmm2
  1757. pcmpeqd %xmm5,%xmm1
  1758. movdqa %xmm0,176(%r10)
  1759. movdqa %xmm4,%xmm0
  1760. paddd %xmm2,%xmm3
  1761. pcmpeqd %xmm5,%xmm2
  1762. movdqa %xmm1,192(%r10)
  1763. movdqa %xmm4,%xmm1
  1764. paddd %xmm3,%xmm0
  1765. pcmpeqd %xmm5,%xmm3
  1766. movdqa %xmm2,208(%r10)
  1767. movdqa %xmm4,%xmm2
  1768. paddd %xmm0,%xmm1
  1769. pcmpeqd %xmm5,%xmm0
  1770. movdqa %xmm3,224(%r10)
  1771. movdqa %xmm4,%xmm3
  1772. paddd %xmm1,%xmm2
  1773. pcmpeqd %xmm5,%xmm1
  1774. movdqa %xmm0,240(%r10)
  1775. movdqa %xmm4,%xmm0
  1776. paddd %xmm2,%xmm3
  1777. pcmpeqd %xmm5,%xmm2
  1778. movdqa %xmm1,256(%r10)
  1779. movdqa %xmm4,%xmm1
  1780. paddd %xmm3,%xmm0
  1781. pcmpeqd %xmm5,%xmm3
  1782. movdqa %xmm2,272(%r10)
  1783. movdqa %xmm4,%xmm2
  1784. paddd %xmm0,%xmm1
  1785. pcmpeqd %xmm5,%xmm0
  1786. movdqa %xmm3,288(%r10)
  1787. movdqa %xmm4,%xmm3
  1788. .byte 0x67
  1789. paddd %xmm1,%xmm2
  1790. pcmpeqd %xmm5,%xmm1
  1791. movdqa %xmm0,304(%r10)
  1792. paddd %xmm2,%xmm3
  1793. pcmpeqd %xmm5,%xmm2
  1794. movdqa %xmm1,320(%r10)
  1795. pcmpeqd %xmm5,%xmm3
  1796. movdqa %xmm2,336(%r10)
  # ---- gather the first multiplier word -----------------------------------
  # %rdi = table + 128.  The last four masks are still in xmm0-xmm3 and are
  # applied to 64..112(%rdi) directly; the other twelve are read back from
  # the mask area.  All 256 bytes of the row are touched.
  1797. pand 64(%rdi),%xmm0
  1798. pand 80(%rdi),%xmm1
  1799. pand 96(%rdi),%xmm2
  1800. movdqa %xmm3,352(%r10)
  1801. pand 112(%rdi),%xmm3
  1802. por %xmm2,%xmm0
  1803. por %xmm3,%xmm1
  1804. movdqa -128(%rdi),%xmm4
  1805. movdqa -112(%rdi),%xmm5
  1806. movdqa -96(%rdi),%xmm2
  1807. pand 112(%r10),%xmm4
  1808. movdqa -80(%rdi),%xmm3
  1809. pand 128(%r10),%xmm5
  1810. por %xmm4,%xmm0
  1811. pand 144(%r10),%xmm2
  1812. por %xmm5,%xmm1
  1813. pand 160(%r10),%xmm3
  1814. por %xmm2,%xmm0
  1815. por %xmm3,%xmm1
  1816. movdqa -64(%rdi),%xmm4
  1817. movdqa -48(%rdi),%xmm5
  1818. movdqa -32(%rdi),%xmm2
  1819. pand 176(%r10),%xmm4
  1820. movdqa -16(%rdi),%xmm3
  1821. pand 192(%r10),%xmm5
  1822. por %xmm4,%xmm0
  1823. pand 208(%r10),%xmm2
  1824. por %xmm5,%xmm1
  1825. pand 224(%r10),%xmm3
  1826. por %xmm2,%xmm0
  1827. por %xmm3,%xmm1
  1828. movdqa 0(%rdi),%xmm4
  1829. movdqa 16(%rdi),%xmm5
  1830. movdqa 32(%rdi),%xmm2
  1831. pand 240(%r10),%xmm4
  1832. movdqa 48(%rdi),%xmm3
  1833. pand 256(%r10),%xmm5
  1834. por %xmm4,%xmm0
  1835. pand 272(%r10),%xmm2
  1836. por %xmm5,%xmm1
  1837. pand 288(%r10),%xmm3
  1838. por %xmm2,%xmm0
  1839. por %xmm3,%xmm1
  # Merge the two accumulators and fold the high quadword onto the low one.
  # NOTE(review): the merge here is pxor whereas L$mulx4x_outer uses por;
  # equivalent only if at most one lane across all masks is non-zero --
  # confirm that the masks are one-hot.
  1840. pxor %xmm1,%xmm0
  1841. pshufd $0x4e,%xmm0,%xmm1
  1842. por %xmm1,%xmm0
  # Next table row is 256 bytes further on.
  1843. leaq 256(%rdi),%rdi
  # Hand-encoded: movq %xmm0,%rdx   (selected multiplier word)
  1844. .byte 102,72,15,126,194
  1845. leaq 64+32+8(%rsp),%rbx
  # ---- first row: head (words 0..3) ---------------------------------------
  # r9 keeps the multiplier word; mulx takes its implicit operand from rdx.
  1846. movq %rdx,%r9
  1847. mulxq 0(%rsi),%r8,%rax
  1848. mulxq 8(%rsi),%r11,%r12
  1849. addq %rax,%r11
  1850. mulxq 16(%rsi),%rax,%r13
  1851. adcq %rax,%r12
  1852. adcq $0,%r13
  1853. mulxq 24(%rsi),%rax,%r14
  # r15 = lowest product word; r8 = r15 * constant at 32+8(%rsp) becomes
  # the second multiplier (applied to the words at %rcx).
  1854. movq %r8,%r15
  1855. imulq 32+8(%rsp),%r8
  # Zeroes %rbp and clears both CF and OF, starting fresh adcx (CF) and
  # adox (OF) carry chains.
  1856. xorq %rbp,%rbp
  1857. movq %r8,%rdx
  1858. movq %rdi,8+8(%rsp)
  1859. leaq 32(%rsi),%rsi
  1860. adcxq %rax,%r13
  1861. adcxq %rbp,%r14
  1862. mulxq 0(%rcx),%rax,%r10
  1863. adcxq %rax,%r15
  1864. adoxq %r11,%r10
  1865. mulxq 8(%rcx),%rax,%r11
  1866. adcxq %rax,%r10
  1867. adoxq %r12,%r11
  1868. mulxq 16(%rcx),%rax,%r12
  # %rdi doubles as the inner loop counter from here on.
  1869. movq 24+8(%rsp),%rdi
  1870. movq %r10,-32(%rbx)
  1871. adcxq %rax,%r11
  1872. adoxq %r13,%r12
  1873. mulxq 24(%rcx),%rax,%r15
  1874. movq %r9,%rdx
  1875. movq %r11,-24(%rbx)
  1876. adcxq %rax,%r12
  1877. adoxq %rbp,%r15
  1878. leaq 32(%rcx),%rcx
  1879. movq %r12,-16(%rbx)
  1880. jmp L$mulx4x_1st
  1881. .p2align 5
  # ---- first row: remaining words, 4 per iteration ------------------------
  # %rdx alternates between the table word (kept in r9) for the products
  # with (%rsi) and the derived factor (kept in r8) for those with (%rcx).
  1882. L$mulx4x_1st:
  1883. adcxq %rbp,%r15
  1884. mulxq 0(%rsi),%r10,%rax
  1885. adcxq %r14,%r10
  1886. mulxq 8(%rsi),%r11,%r14
  1887. adcxq %rax,%r11
  1888. mulxq 16(%rsi),%r12,%rax
  1889. adcxq %r14,%r12
  1890. mulxq 24(%rsi),%r13,%r14
  1891. .byte 0x67,0x67
  1892. movq %r8,%rdx
  1893. adcxq %rax,%r13
  1894. adcxq %rbp,%r14
  1895. leaq 32(%rsi),%rsi
  1896. leaq 32(%rbx),%rbx
  1897. adoxq %r15,%r10
  1898. mulxq 0(%rcx),%rax,%r15
  1899. adcxq %rax,%r10
  1900. adoxq %r15,%r11
  1901. mulxq 8(%rcx),%rax,%r15
  1902. adcxq %rax,%r11
  1903. adoxq %r15,%r12
  1904. mulxq 16(%rcx),%rax,%r15
  1905. movq %r10,-40(%rbx)
  1906. adcxq %rax,%r12
  1907. movq %r11,-32(%rbx)
  1908. adoxq %r15,%r13
  1909. mulxq 24(%rcx),%rax,%r15
  1910. movq %r9,%rdx
  1911. movq %r12,-24(%rbx)
  1912. adcxq %rax,%r13
  1913. adoxq %rbp,%r15
  1914. leaq 32(%rcx),%rcx
  1915. movq %r13,-16(%rbx)
  # decq leaves CF untouched, so the adcx chain survives the loop branch.
  1916. decq %rdi
  1917. jnz L$mulx4x_1st
  # Row finished: rax = saved %r9 (used to rewind %rsi, and %rcx/%rbx in
  # the outer loop), top word stored, carry-out of the row kept in %rbp.
  1918. movq 8(%rsp),%rax
  1919. adcq %rbp,%r15
  1920. leaq (%rsi,%rax,1),%rsi
  1921. addq %r15,%r14
  1922. movq 8+8(%rsp),%rdi
  1923. adcq %rbp,%rbp
  1924. movq %r14,-8(%rbx)
  1925. jmp L$mulx4x_outer
  1926. .p2align 5
  # ---- outer loop: one pass per further table row -------------------------
  # Gathers the next multiplier word exactly as above, this time reading
  # all 16 masks back from memory (addressed relative to %rbx).
  1927. L$mulx4x_outer:
  1928. leaq 16-256(%rbx),%r10
  1929. pxor %xmm4,%xmm4
  1930. .byte 0x67,0x67
  1931. pxor %xmm5,%xmm5
  1932. movdqa -128(%rdi),%xmm0
  1933. movdqa -112(%rdi),%xmm1
  1934. movdqa -96(%rdi),%xmm2
  1935. pand 256(%r10),%xmm0
  1936. movdqa -80(%rdi),%xmm3
  1937. pand 272(%r10),%xmm1
  1938. por %xmm0,%xmm4
  1939. pand 288(%r10),%xmm2
  1940. por %xmm1,%xmm5
  1941. pand 304(%r10),%xmm3
  1942. por %xmm2,%xmm4
  1943. por %xmm3,%xmm5
  1944. movdqa -64(%rdi),%xmm0
  1945. movdqa -48(%rdi),%xmm1
  1946. movdqa -32(%rdi),%xmm2
  1947. pand 320(%r10),%xmm0
  1948. movdqa -16(%rdi),%xmm3
  1949. pand 336(%r10),%xmm1
  1950. por %xmm0,%xmm4
  1951. pand 352(%r10),%xmm2
  1952. por %xmm1,%xmm5
  1953. pand 368(%r10),%xmm3
  1954. por %xmm2,%xmm4
  1955. por %xmm3,%xmm5
  1956. movdqa 0(%rdi),%xmm0
  1957. movdqa 16(%rdi),%xmm1
  1958. movdqa 32(%rdi),%xmm2
  1959. pand 384(%r10),%xmm0
  1960. movdqa 48(%rdi),%xmm3
  1961. pand 400(%r10),%xmm1
  1962. por %xmm0,%xmm4
  1963. pand 416(%r10),%xmm2
  1964. por %xmm1,%xmm5
  1965. pand 432(%r10),%xmm3
  1966. por %xmm2,%xmm4
  1967. por %xmm3,%xmm5
  1968. movdqa 64(%rdi),%xmm0
  1969. movdqa 80(%rdi),%xmm1
  1970. movdqa 96(%rdi),%xmm2
  1971. pand 448(%r10),%xmm0
  1972. movdqa 112(%rdi),%xmm3
  1973. pand 464(%r10),%xmm1
  1974. por %xmm0,%xmm4
  1975. pand 480(%r10),%xmm2
  1976. por %xmm1,%xmm5
  1977. pand 496(%r10),%xmm3
  1978. por %xmm2,%xmm4
  1979. por %xmm3,%xmm5
  1980. por %xmm5,%xmm4
  1981. pshufd $0x4e,%xmm4,%xmm0
  1982. por %xmm4,%xmm0
  1983. leaq 256(%rdi),%rdi
  # Hand-encoded: movq %xmm0,%rdx   (selected multiplier word)
  1984. .byte 102,72,15,126,194
  # Park the previous row's carry at (%rbx), then rewind %rbx by the
  # length in %rax so it points 32 bytes into the running result again.
  1985. movq %rbp,(%rbx)
  1986. leaq 32(%rbx,%rax,1),%rbx
  # Head of the row: products with (%rsi) on the CF chain, previously
  # stored result words added on the OF chain.
  1987. mulxq 0(%rsi),%r8,%r11
  1988. xorq %rbp,%rbp
  1989. movq %rdx,%r9
  1990. mulxq 8(%rsi),%r14,%r12
  1991. adoxq -32(%rbx),%r8
  1992. adcxq %r14,%r11
  1993. mulxq 16(%rsi),%r15,%r13
  1994. adoxq -24(%rbx),%r11
  1995. adcxq %r15,%r12
  1996. mulxq 24(%rsi),%rdx,%r14
  1997. adoxq -16(%rbx),%r12
  1998. adcxq %rdx,%r13
  # Rewind %rcx to the start of its array as well.
  1999. leaq (%rcx,%rax,1),%rcx
  2000. leaq 32(%rsi),%rsi
  2001. adoxq -8(%rbx),%r13
  2002. adcxq %rbp,%r14
  2003. adoxq %rbp,%r14
  # Derive this row's second multiplier from the new lowest word.
  2004. movq %r8,%r15
  2005. imulq 32+8(%rsp),%r8
  2006. movq %r8,%rdx
  2007. xorq %rbp,%rbp
  2008. movq %rdi,8+8(%rsp)
  2009. mulxq 0(%rcx),%rax,%r10
  2010. adcxq %rax,%r15
  2011. adoxq %r11,%r10
  2012. mulxq 8(%rcx),%rax,%r11
  2013. adcxq %rax,%r10
  2014. adoxq %r12,%r11
  2015. mulxq 16(%rcx),%rax,%r12
  2016. adcxq %rax,%r11
  2017. adoxq %r13,%r12
  2018. mulxq 24(%rcx),%rax,%r15
  2019. movq %r9,%rdx
  2020. movq 24+8(%rsp),%rdi
  2021. movq %r10,-32(%rbx)
  2022. adcxq %rax,%r12
  2023. movq %r11,-24(%rbx)
  2024. adoxq %rbp,%r15
  2025. movq %r12,-16(%rbx)
  2026. leaq 32(%rcx),%rcx
  2027. jmp L$mulx4x_inner
  2028. .p2align 5
  # ---- inner loop: like L$mulx4x_1st, plus the stored result words --------
  2029. L$mulx4x_inner:
  2030. mulxq 0(%rsi),%r10,%rax
  2031. adcxq %rbp,%r15
  2032. adoxq %r14,%r10
  2033. mulxq 8(%rsi),%r11,%r14
  2034. adcxq 0(%rbx),%r10
  2035. adoxq %rax,%r11
  2036. mulxq 16(%rsi),%r12,%rax
  2037. adcxq 8(%rbx),%r11
  2038. adoxq %r14,%r12
  2039. mulxq 24(%rsi),%r13,%r14
  2040. movq %r8,%rdx
  2041. adcxq 16(%rbx),%r12
  2042. adoxq %rax,%r13
  2043. adcxq 24(%rbx),%r13
  2044. adoxq %rbp,%r14
  2045. leaq 32(%rsi),%rsi
  2046. leaq 32(%rbx),%rbx
  2047. adcxq %rbp,%r14
  2048. adoxq %r15,%r10
  2049. mulxq 0(%rcx),%rax,%r15
  2050. adcxq %rax,%r10
  2051. adoxq %r15,%r11
  2052. mulxq 8(%rcx),%rax,%r15
  2053. adcxq %rax,%r11
  2054. adoxq %r15,%r12
  2055. mulxq 16(%rcx),%rax,%r15
  2056. movq %r10,-40(%rbx)
  2057. adcxq %rax,%r12
  2058. adoxq %r15,%r13
  2059. movq %r11,-32(%rbx)
  2060. mulxq 24(%rcx),%rax,%r15
  2061. movq %r9,%rdx
  2062. leaq 32(%rcx),%rcx
  2063. movq %r12,-24(%rbx)
  2064. adcxq %rax,%r13
  2065. adoxq %rbp,%r15
  2066. movq %r13,-16(%rbx)
  2067. decq %rdi
  2068. jnz L$mulx4x_inner
  2069. movq 0+8(%rsp),%rax
  2070. adcq %rbp,%r15
  # %rdi is zero here (the loop counter just ran out), so this subtraction
  # only sets CF when the carry word parked at (%rbx) is non-zero; the
  # result in %rdi is overwritten on the next line.
  2071. subq 0(%rbx),%rdi
  2072. movq 8+8(%rsp),%rdi
  2073. movq 16+8(%rsp),%r10
  2074. adcq %r15,%r14
  2075. leaq (%rsi,%rax,1),%rsi
  2076. adcq %rbp,%rbp
  2077. movq %r14,-8(%rbx)
  # Continue while the table row pointer is below the bound at 16+8(%rsp).
  2078. cmpq %r10,%rdi
  2079. jb L$mulx4x_outer
  # ---- set up the final conditional subtraction ---------------------------
  # r10 = last word of the %rcx array, rbp = start of that array,
  # rdi = start of the running result, rcx = length / 32 (arithmetic).
  # rax = -(top carry | (last result word > last array word)): 0 or -1.
  # rdx = pointer saved at 56+8(%rsp).  The first array word is decremented
  # before being handed on.
  # NOTE(review): register contract of L$sqrx4x_sub_entry is outside this
  # chunk; it looks analogous to L$sqr4x_sub_entry -- confirm.
  2080. movq -8(%rcx),%r10
  2081. movq %rbp,%r8
  2082. movq (%rcx,%rax,1),%r12
  2083. leaq (%rcx,%rax,1),%rbp
  2084. movq %rax,%rcx
  2085. leaq (%rbx,%rax,1),%rdi
  2086. xorl %eax,%eax
  2087. xorq %r15,%r15
  2088. subq %r14,%r10
  2089. adcq %r15,%r15
  2090. orq %r15,%r8
  2091. sarq $3+2,%rcx
  2092. subq %r8,%rax
  2093. movq 56+8(%rsp),%rdx
  2094. decq %r12
  2095. movq 8(%rbp),%r13
  2096. xorq %r8,%r8
  2097. movq 16(%rbp),%r14
  2098. movq 24(%rbp),%r15
  2099. jmp L$sqrx4x_sub_entry
  2100. .p2align 5
  # ----------------------------------------------------------------------
  # bn_powerx5
  #
  # Builds the same kind of stack frame as bn_mulx4x_mont_gather5, then
  # runs __bn_sqrx8x_internal + __bn_postx4x_internal five times in a row
  # followed by one mulx4x_internal call.
  # NOTE(review): presumably five modular squarings (x^32) followed by a
  # multiplication with a table entry -- the helpers' contracts lie partly
  # outside this chunk; confirm.
  #
  # Register use visible here: %r8 is dereferenced once (constant kept at
  # 32(%rsp)), %r9d is a count scaled by 8, the caller's %rsp is kept at
  # 40(%rsp), and %rdi / %rcx / %rdx are parked in %xmm1 / %xmm2 / %xmm4
  # across the helper calls.
  #
  # Returns 1 in %rax.  %rbx, %rbp, %r12-%r15 are saved and restored.
  # ----------------------------------------------------------------------
  2101. bn_powerx5:
  2102. movq %rsp,%rax
  # Second entry label; NOTE(review): presumably jumped to with %rax
  # already holding the caller's %rsp -- the jump site is outside this chunk.
  2103. L$powerx5_enter:
  2104. pushq %rbx
  2105. pushq %rbp
  2106. pushq %r12
  2107. pushq %r13
  2108. pushq %r14
  2109. pushq %r15
  2110. L$powerx5_prologue:
  # r9 = count * 8, r10 = 3 * r9, then r9 is negated.
  2111. shll $3,%r9d
  2112. leaq (%r9,%r9,2),%r10
  2113. negq %r9
  2114. movq (%r8),%r8
  # Frame base selection, identical in structure to the mulx4x wrapper:
  # at least 320 + 2*|r9| bytes below %rsp, adjusted by the distance to
  # %rdi modulo 4096.
  2115. leaq -320(%rsp,%r9,2),%r11
  2116. movq %rsp,%rbp
  2117. subq %rdi,%r11
  2118. andq $4095,%r11
  2119. cmpq %r11,%r10
  2120. jb L$pwrx_sp_alt
  2121. subq %r11,%rbp
  2122. leaq -320(%rbp,%r9,2),%rbp
  2123. jmp L$pwrx_sp_done
  2124. .p2align 5
  2125. L$pwrx_sp_alt:
  2126. leaq 4096-320(,%r9,2),%r10
  2127. leaq -320(%rbp,%r9,2),%rbp
  2128. subq %r10,%r11
  # movq (not xor) keeps the borrow of the subq above alive for cmovc.
  2129. movq $0,%r10
  2130. cmovcq %r10,%r11
  2131. subq %r11,%rbp
  2132. L$pwrx_sp_done:
  2133. andq $-64,%rbp
  # Walk %rsp down to the new frame in steps of at most 4096 bytes, reading
  # one word per step.
  # NOTE(review): looks like a stack probe -- confirm.
  2134. movq %rsp,%r11
  2135. subq %rbp,%r11
  2136. andq $-4096,%r11
  2137. leaq (%r11,%rbp,1),%rsp
  2138. movq (%rsp),%r10
  2139. cmpq %rbp,%rsp
  2140. ja L$pwrx_page_walk
  2141. jmp L$pwrx_page_walk_done
  2142. L$pwrx_page_walk:
  2143. leaq -4096(%rsp),%rsp
  2144. movq (%rsp),%r10
  2145. cmpq %rbp,%rsp
  2146. ja L$pwrx_page_walk
  2147. L$pwrx_page_walk_done:
  # r10 keeps the negated length, r9 becomes positive again; xmm0 is
  # zeroed (the squaring helper stores it to clear its buffer).
  2148. movq %r9,%r10
  2149. negq %r9
  2150. pxor %xmm0,%xmm0
  # Hand-encoded: movq %rdi,%xmm1
  2151. .byte 102,72,15,110,207
  # Hand-encoded: movq %rcx,%xmm2
  2152. .byte 102,72,15,110,209
  # Hand-encoded: movq %r10,%xmm3
  2153. .byte 102,73,15,110,218
  # Hand-encoded: movq %rdx,%xmm4
  2154. .byte 102,72,15,110,226
  # 32(%rsp) = constant loaded from (%r8); 40(%rsp) = caller's %rsp.
  2155. movq %r8,32(%rsp)
  2156. movq %rax,40(%rsp)
  2157. L$powerx5_body:
  2158. call __bn_sqrx8x_internal
  2159. call __bn_postx4x_internal
  2160. call __bn_sqrx8x_internal
  2161. call __bn_postx4x_internal
  2162. call __bn_sqrx8x_internal
  2163. call __bn_postx4x_internal
  2164. call __bn_sqrx8x_internal
  2165. call __bn_postx4x_internal
  2166. call __bn_sqrx8x_internal
  2167. call __bn_postx4x_internal
  # Arrange the registers mulx4x_internal reads: %r9 from %r10, %rdi from
  # %rsi, %rcx and %rdx from their parked copies, %rax = caller's %rsp.
  # NOTE(review): relies on the values __bn_postx4x_internal leaves in
  # %r10 and %rsi; that routine is outside this chunk -- confirm.
  2168. movq %r10,%r9
  2169. movq %rsi,%rdi
  # Hand-encoded: movq %xmm2,%rcx
  2170. .byte 102,72,15,126,209
  # Hand-encoded: movq %xmm4,%rdx
  2171. .byte 102,72,15,126,226
  2172. movq 40(%rsp),%rax
  2173. call mulx4x_internal
  # Epilogue: recover the caller's %rsp and reload the six pushed registers.
  2174. movq 40(%rsp),%rsi
  2175. movq $1,%rax
  2176. movq -48(%rsi),%r15
  2177. movq -40(%rsi),%r14
  2178. movq -32(%rsi),%r13
  2179. movq -24(%rsi),%r12
  2180. movq -16(%rsi),%rbp
  2181. movq -8(%rsi),%rbx
  2182. leaq (%rsi),%rsp
  2183. L$powerx5_epilogue:
  # Hand-encoded: rep ret
  2184. .byte 0xf3,0xc3
  2185. .globl _bn_sqrx8x_internal
  2186. .private_extern _bn_sqrx8x_internal
  2187. .p2align 5
  2188. _bn_sqrx8x_internal:
  2189. __bn_sqrx8x_internal:
  2190. leaq 48+8(%rsp),%rdi
  2191. leaq (%rsi,%r9,1),%rbp
  2192. movq %r9,0+8(%rsp)
  2193. movq %rbp,8+8(%rsp)
  2194. jmp L$sqr8x_zero_start
  2195. .p2align 5
  2196. .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
  2197. L$sqrx8x_zero:
  2198. .byte 0x3e
  2199. movdqa %xmm0,0(%rdi)
  2200. movdqa %xmm0,16(%rdi)
  2201. movdqa %xmm0,32(%rdi)
  2202. movdqa %xmm0,48(%rdi)
  2203. L$sqr8x_zero_start:
  2204. movdqa %xmm0,64(%rdi)
  2205. movdqa %xmm0,80(%rdi)
  2206. movdqa %xmm0,96(%rdi)
  2207. movdqa %xmm0,112(%rdi)
  2208. leaq 128(%rdi),%rdi
  2209. subq $64,%r9
  2210. jnz L$sqrx8x_zero
  2211. movq 0(%rsi),%rdx
  2212. xorq %r10,%r10
  2213. xorq %r11,%r11
  2214. xorq %r12,%r12
  2215. xorq %r13,%r13
  2216. xorq %r14,%r14
  2217. xorq %r15,%r15
  2218. leaq 48+8(%rsp),%rdi
  2219. xorq %rbp,%rbp
  2220. jmp L$sqrx8x_outer_loop
  2221. .p2align 5
  2222. L$sqrx8x_outer_loop:
  2223. mulxq 8(%rsi),%r8,%rax
  2224. adcxq %r9,%r8
  2225. adoxq %rax,%r10
  2226. mulxq 16(%rsi),%r9,%rax
  2227. adcxq %r10,%r9
  2228. adoxq %rax,%r11
  2229. .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
  2230. adcxq %r11,%r10
  2231. adoxq %rax,%r12
  2232. .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
  2233. adcxq %r12,%r11
  2234. adoxq %rax,%r13
  2235. mulxq 40(%rsi),%r12,%rax
  2236. adcxq %r13,%r12
  2237. adoxq %rax,%r14
  2238. mulxq 48(%rsi),%r13,%rax
  2239. adcxq %r14,%r13
  2240. adoxq %r15,%rax
  2241. mulxq 56(%rsi),%r14,%r15
  2242. movq 8(%rsi),%rdx
  2243. adcxq %rax,%r14
  2244. adoxq %rbp,%r15
  2245. adcq 64(%rdi),%r15
  2246. movq %r8,8(%rdi)
  2247. movq %r9,16(%rdi)
  2248. sbbq %rcx,%rcx
  2249. xorq %rbp,%rbp
  2250. mulxq 16(%rsi),%r8,%rbx
  2251. mulxq 24(%rsi),%r9,%rax
  2252. adcxq %r10,%r8
  2253. adoxq %rbx,%r9
  2254. mulxq 32(%rsi),%r10,%rbx
  2255. adcxq %r11,%r9
  2256. adoxq %rax,%r10
  2257. .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
  2258. adcxq %r12,%r10
  2259. adoxq %rbx,%r11
  2260. .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
  2261. adcxq %r13,%r11
  2262. adoxq %r14,%r12
  2263. .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
  2264. movq 16(%rsi),%rdx
  2265. adcxq %rax,%r12
  2266. adoxq %rbx,%r13
  2267. adcxq %r15,%r13
  2268. adoxq %rbp,%r14
  2269. adcxq %rbp,%r14
  2270. movq %r8,24(%rdi)
  2271. movq %r9,32(%rdi)
  2272. mulxq 24(%rsi),%r8,%rbx
  2273. mulxq 32(%rsi),%r9,%rax
  2274. adcxq %r10,%r8
  2275. adoxq %rbx,%r9
  2276. mulxq 40(%rsi),%r10,%rbx
  2277. adcxq %r11,%r9
  2278. adoxq %rax,%r10
  2279. .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
  2280. adcxq %r12,%r10
  2281. adoxq %r13,%r11
  2282. .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
  2283. .byte 0x3e
  2284. movq 24(%rsi),%rdx
  2285. adcxq %rbx,%r11
  2286. adoxq %rax,%r12
  2287. adcxq %r14,%r12
  2288. movq %r8,40(%rdi)
  2289. movq %r9,48(%rdi)
  2290. mulxq 32(%rsi),%r8,%rax
  2291. adoxq %rbp,%r13
  2292. adcxq %rbp,%r13
  2293. mulxq 40(%rsi),%r9,%rbx
  2294. adcxq %r10,%r8
  2295. adoxq %rax,%r9
  2296. mulxq 48(%rsi),%r10,%rax
  2297. adcxq %r11,%r9
  2298. adoxq %r12,%r10
  2299. mulxq 56(%rsi),%r11,%r12
  2300. movq 32(%rsi),%rdx
  2301. movq 40(%rsi),%r14
  2302. adcxq %rbx,%r10
  2303. adoxq %rax,%r11
  2304. movq 48(%rsi),%r15
  2305. adcxq %r13,%r11
  2306. adoxq %rbp,%r12
  2307. adcxq %rbp,%r12
  2308. movq %r8,56(%rdi)
  2309. movq %r9,64(%rdi)
  2310. mulxq %r14,%r9,%rax
  2311. movq 56(%rsi),%r8
  2312. adcxq %r10,%r9
  2313. mulxq %r15,%r10,%rbx
  2314. adoxq %rax,%r10
  2315. adcxq %r11,%r10
  2316. mulxq %r8,%r11,%rax
  2317. movq %r14,%rdx
  2318. adoxq %rbx,%r11
  2319. adcxq %r12,%r11
  2320. adcxq %rbp,%rax
  2321. mulxq %r15,%r14,%rbx
  2322. mulxq %r8,%r12,%r13
  2323. movq %r15,%rdx
  2324. leaq 64(%rsi),%rsi
  2325. adcxq %r14,%r11
  2326. adoxq %rbx,%r12
  2327. adcxq %rax,%r12
  2328. adoxq %rbp,%r13
  2329. .byte 0x67,0x67
  2330. mulxq %r8,%r8,%r14
  2331. adcxq %r8,%r13
  2332. adcxq %rbp,%r14
  2333. cmpq 8+8(%rsp),%rsi
  2334. je L$sqrx8x_outer_break
  2335. negq %rcx
  2336. movq $-8,%rcx
  2337. movq %rbp,%r15
  2338. movq 64(%rdi),%r8
  2339. adcxq 72(%rdi),%r9
  2340. adcxq 80(%rdi),%r10
  2341. adcxq 88(%rdi),%r11
  2342. adcq 96(%rdi),%r12
  2343. adcq 104(%rdi),%r13
  2344. adcq 112(%rdi),%r14
  2345. adcq 120(%rdi),%r15
  2346. leaq (%rsi),%rbp
  2347. leaq 128(%rdi),%rdi
  2348. sbbq %rax,%rax
  2349. movq -64(%rsi),%rdx
  2350. movq %rax,16+8(%rsp)
  2351. movq %rdi,24+8(%rsp)
  2352. xorl %eax,%eax
  2353. jmp L$sqrx8x_loop
  2354. .p2align 5
  2355. L$sqrx8x_loop:
  2356. movq %r8,%rbx
  2357. mulxq 0(%rbp),%rax,%r8
  2358. adcxq %rax,%rbx
  2359. adoxq %r9,%r8
  2360. mulxq 8(%rbp),%rax,%r9
  2361. adcxq %rax,%r8
  2362. adoxq %r10,%r9
  2363. mulxq 16(%rbp),%rax,%r10
  2364. adcxq %rax,%r9
  2365. adoxq %r11,%r10
  2366. mulxq 24(%rbp),%rax,%r11
  2367. adcxq %rax,%r10
  2368. adoxq %r12,%r11
  2369. .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
  2370. adcxq %rax,%r11
  2371. adoxq %r13,%r12
  2372. mulxq 40(%rbp),%rax,%r13
  2373. adcxq %rax,%r12
  2374. adoxq %r14,%r13
  2375. mulxq 48(%rbp),%rax,%r14
  2376. movq %rbx,(%rdi,%rcx,8)
  2377. movl $0,%ebx
  2378. adcxq %rax,%r13
  2379. adoxq %r15,%r14
  2380. .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
  2381. movq 8(%rsi,%rcx,8),%rdx
  2382. adcxq %rax,%r14
  2383. adoxq %rbx,%r15
  2384. adcxq %rbx,%r15
  2385. .byte 0x67
  2386. incq %rcx
  2387. jnz L$sqrx8x_loop
  2388. leaq 64(%rbp),%rbp
  2389. movq $-8,%rcx
  2390. cmpq 8+8(%rsp),%rbp
  2391. je L$sqrx8x_break
  2392. subq 16+8(%rsp),%rbx
  2393. .byte 0x66
  2394. movq -64(%rsi),%rdx
  2395. adcxq 0(%rdi),%r8
  2396. adcxq 8(%rdi),%r9
  2397. adcq 16(%rdi),%r10
  2398. adcq 24(%rdi),%r11
  2399. adcq 32(%rdi),%r12
  2400. adcq 40(%rdi),%r13
  2401. adcq 48(%rdi),%r14
  2402. adcq 56(%rdi),%r15
  2403. leaq 64(%rdi),%rdi
  2404. .byte 0x67
  2405. sbbq %rax,%rax
  2406. xorl %ebx,%ebx
  2407. movq %rax,16+8(%rsp)
  2408. jmp L$sqrx8x_loop
  2409. .p2align 5
  2410. L$sqrx8x_break:
  2411. xorq %rbp,%rbp
  2412. subq 16+8(%rsp),%rbx
  2413. adcxq %rbp,%r8
  2414. movq 24+8(%rsp),%rcx
  2415. adcxq %rbp,%r9
  2416. movq 0(%rsi),%rdx
  2417. adcq $0,%r10
  2418. movq %r8,0(%rdi)
  2419. adcq $0,%r11
  2420. adcq $0,%r12
  2421. adcq $0,%r13
  2422. adcq $0,%r14
  2423. adcq $0,%r15
  2424. cmpq %rcx,%rdi
  2425. je L$sqrx8x_outer_loop
  2426. movq %r9,8(%rdi)
  2427. movq 8(%rcx),%r9
  2428. movq %r10,16(%rdi)
  2429. movq 16(%rcx),%r10
  2430. movq %r11,24(%rdi)
  2431. movq 24(%rcx),%r11
  2432. movq %r12,32(%rdi)
  2433. movq 32(%rcx),%r12
  2434. movq %r13,40(%rdi)
  2435. movq 40(%rcx),%r13
  2436. movq %r14,48(%rdi)
  2437. movq 48(%rcx),%r14
  2438. movq %r15,56(%rdi)
  2439. movq 56(%rcx),%r15
  2440. movq %rcx,%rdi
  2441. jmp L$sqrx8x_outer_loop
  2442. .p2align 5
  2443. L$sqrx8x_outer_break:
  2444. movq %r9,72(%rdi)
  2445. .byte 102,72,15,126,217
  2446. movq %r10,80(%rdi)
  2447. movq %r11,88(%rdi)
  2448. movq %r12,96(%rdi)
  2449. movq %r13,104(%rdi)
  2450. movq %r14,112(%rdi)
  2451. leaq 48+8(%rsp),%rdi
  2452. movq (%rsi,%rcx,1),%rdx
  2453. movq 8(%rdi),%r11
  2454. xorq %r10,%r10
  2455. movq 0+8(%rsp),%r9
  2456. adoxq %r11,%r11
  2457. movq 16(%rdi),%r12
  2458. movq 24(%rdi),%r13
  2459. .p2align 5
  2460. L$sqrx4x_shift_n_add:
  2461. mulxq %rdx,%rax,%rbx
  2462. adoxq %r12,%r12
  2463. adcxq %r10,%rax
  2464. .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
  2465. .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
  2466. adoxq %r13,%r13
  2467. adcxq %r11,%rbx
  2468. movq 40(%rdi),%r11
  2469. movq %rax,0(%rdi)
  2470. movq %rbx,8(%rdi)
  2471. mulxq %rdx,%rax,%rbx
  2472. adoxq %r10,%r10
  2473. adcxq %r12,%rax
  2474. movq 16(%rsi,%rcx,1),%rdx
  2475. movq 48(%rdi),%r12
  2476. adoxq %r11,%r11
  2477. adcxq %r13,%rbx
  2478. movq 56(%rdi),%r13
  2479. movq %rax,16(%rdi)
  2480. movq %rbx,24(%rdi)
  2481. mulxq %rdx,%rax,%rbx
  2482. adoxq %r12,%r12
  2483. adcxq %r10,%rax
  2484. movq 24(%rsi,%rcx,1),%rdx
  2485. leaq 32(%rcx),%rcx
  2486. movq 64(%rdi),%r10
  2487. adoxq %r13,%r13
  2488. adcxq %r11,%rbx
  2489. movq 72(%rdi),%r11
  2490. movq %rax,32(%rdi)
  2491. movq %rbx,40(%rdi)
  2492. mulxq %rdx,%rax,%rbx
  2493. adoxq %r10,%r10
  2494. adcxq %r12,%rax
  2495. jrcxz L$sqrx4x_shift_n_add_break
  2496. .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
  2497. adoxq %r11,%r11
  2498. adcxq %r13,%rbx
  2499. movq 80(%rdi),%r12
  2500. movq 88(%rdi),%r13
  2501. movq %rax,48(%rdi)
  2502. movq %rbx,56(%rdi)
  2503. leaq 64(%rdi),%rdi
  2504. nop
  2505. jmp L$sqrx4x_shift_n_add
  2506. .p2align 5
  2507. L$sqrx4x_shift_n_add_break:
  2508. adcxq %r13,%rbx
  2509. movq %rax,48(%rdi)
  2510. movq %rbx,56(%rdi)
  2511. leaq 64(%rdi),%rdi
  2512. .byte 102,72,15,126,213
  # ---------------------------------------------------------------------
  # __bn_sqrx8x_reduction  (file-local helper, no .globl)
  #
  # Reduces a multi-word value held on the stack, eight 64-bit words per
  # outer pass, using mulx with the two independent carry chains of
  # adcx (CF) and adox (OF).  The identification string at the end of
  # this file names it "Montgomery Multiplication", so this is presumably
  # the Montgomery reduction step after squaring -- confirm against the
  # generator source.
  #
  # Registers read on entry (as used by the code below):
  #   rbp   pointer to the words used as mulx memory operands
  #         (presumably the modulus -- confirm)
  #   r9    byte length added to rbp to form the end bound
  #   rdi   saved as the end pointer of the outer loop
  #   xmm2  copied back into rbp at the end of every outer pass
  #   xmm3  copied into rcx at the end of every outer pass
  #
  # Stack slots (all offsets are written N+8(%rsp) in this routine):
  #    0+8   end bound for rbp  (rbp + r9 - 64 at entry)
  #    8+8   rdi at entry; the outer loop ends once rdi+64 is not below it
  #   16+8   carry parked between eight-word groups as 0 / -1 (sbb result)
  #   24+8   carry word handed over from the previous outer pass
  #   32+8   constant multiplied with the low word to derive each factor
  #          (presumably the Montgomery n0 value -- confirm)
  #   48+8.. start of the value being reduced; its first eight words are
  #          also reused to store the eight factors of the current pass
  #
  # Leaves via "rep ret".  rax holds the carry word left by the last pass.
  # ---------------------------------------------------------------------
  2513. __bn_sqrx8x_reduction:
  # rax = 0: carry word handed into the first outer pass.
  2514. xorl %eax,%eax
  2515. movq 32+8(%rsp),%rbx
  # rdx = first word of the value on the stack.
  2516. movq 48+8(%rsp),%rdx
  # rcx = rbp + r9 - 64, kept in 0+8(%rsp) as the end bound for rbp.
  2517. leaq -64(%rbp,%r9,1),%rcx
  2518. movq %rcx,0+8(%rsp)
  2519. movq %rdi,8+8(%rsp)
  # rdi now walks the value, starting at its first word.
  2520. leaq 48+8(%rsp),%rdi
  2521. jmp L$sqrx8x_reduction_loop
  2522. .p2align 5
  # --- Outer pass: load eight words into r8..r15 and derive the first factor.
  2523. L$sqrx8x_reduction_loop:
  2524. movq 8(%rdi),%r9
  2525. movq 16(%rdi),%r10
  2526. movq 24(%rdi),%r11
  2527. movq 32(%rdi),%r12
  # r8 = low word; rdx = low word * rbx (low 64 bits) = first factor.
  2528. movq %rdx,%r8
  2529. imulq %rbx,%rdx
  2530. movq 40(%rdi),%r13
  2531. movq 48(%rdi),%r14
  2532. movq 56(%rdi),%r15
  # Park the carry word of the previous pass; it is added back at
  # L$sqrx8x_tail_done.
  2533. movq %rax,24+8(%rsp)
  2534. leaq 64(%rdi),%rdi
  # rsi = 0; the xor also clears CF and OF for the adcx/adox chains.
  2535. xorq %rsi,%rsi
  # Eight rounds: rcx counts -8 .. -1.
  2536. movq $-8,%rcx
  2537. jmp L$sqrx8x_reduce
  2538. .p2align 5
  # --- One round: multiply the factor in rdx by the eight words at
  # 0..56(%rbp) and fold the products into r8..r15.  The window moves
  # down one word per round (new r8 = hi(product 0) + old r9, and so on).
  2539. L$sqrx8x_reduce:
  2540. movq %r8,%rbx
  2541. mulxq 0(%rbp),%rax,%r8
  # Low product word + old r8; the sum in rax is not stored anywhere.
  2542. adcxq %rbx,%rax
  2543. adoxq %r9,%r8
  2544. mulxq 8(%rbp),%rbx,%r9
  2545. adcxq %rbx,%r8
  2546. adoxq %r10,%r9
  2547. mulxq 16(%rbp),%rbx,%r10
  2548. adcxq %rbx,%r9
  2549. adoxq %r11,%r10
  2550. mulxq 24(%rbp),%rbx,%r11
  2551. adcxq %rbx,%r10
  2552. adoxq %r12,%r11
  # Byte-encoded instruction: mulxq 32(%rbp),%rbx,%r12
  2553. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
  # Save the current factor in rax, then compute the next factor:
  # rbx = low 64 bits of (new r8 * [32+8(%rsp)]).  mulx leaves the flags
  # untouched, so both carry chains survive.  The high half lands in rdx
  # and is immediately replaced by the saved current factor.
  2554. movq %rdx,%rax
  2555. movq %r8,%rdx
  2556. adcxq %rbx,%r11
  2557. adoxq %r13,%r12
  2558. mulxq 32+8(%rsp),%rbx,%rdx
  2559. movq %rax,%rdx
  # Record this round's factor at 48+8(%rsp) + 8*(rcx+8) for the tail loop.
  2560. movq %rax,64+48+8(%rsp,%rcx,8)
  2561. mulxq 40(%rbp),%rax,%r13
  2562. adcxq %rax,%r12
  2563. adoxq %r14,%r13
  2564. mulxq 48(%rbp),%rax,%r14
  2565. adcxq %rax,%r13
  2566. adoxq %r15,%r14
  2567. mulxq 56(%rbp),%rax,%r15
  # Switch to the next factor for the following round.
  2568. movq %rbx,%rdx
  2569. adcxq %rax,%r14
  # rsi is zero: these two adds only fold the outstanding OF and CF into r15.
  2570. adoxq %rsi,%r15
  2571. adcxq %rsi,%r15
  # Three 0x67 prefix bytes in front of the incq below; the incq has no
  # memory operand, so these look like padding -- confirm.
  2572. .byte 0x67,0x67,0x67
  2573. incq %rcx
  2574. jnz L$sqrx8x_reduce
  # rax = 0 (rsi is still zero).
  2575. movq %rsi,%rax
  # If rbp already reached the end bound there are no further word groups.
  # jae is taken only with CF clear, so the adcq chain at
  # L$sqrx8x_no_tail then starts without a carry-in.
  2576. cmpq 0+8(%rsp),%rbp
  2577. jae L$sqrx8x_no_tail
  # Set up the tail: reload the first saved factor, add the next eight
  # value words into r8..r15, advance rbp and rdi by 64 bytes.
  2578. movq 48+8(%rsp),%rdx
  2579. addq 0(%rdi),%r8
  2580. leaq 64(%rbp),%rbp
  2581. movq $-8,%rcx
  2582. adcxq 8(%rdi),%r9
  2583. adcxq 16(%rdi),%r10
  2584. adcq 24(%rdi),%r11
  2585. adcq 32(%rdi),%r12
  2586. adcq 40(%rdi),%r13
  2587. adcq 48(%rdi),%r14
  2588. adcq 56(%rdi),%r15
  2589. leaq 64(%rdi),%rdi
  # rax = 0 or -1 depending on the carry out of the chain above.
  2590. sbbq %rax,%rax
  # rsi = 0, CF and OF cleared for the tail loop.
  2591. xorq %rsi,%rsi
  2592. movq %rax,16+8(%rsp)
  2593. jmp L$sqrx8x_tail
  2594. .p2align 5
  # --- Tail round: same multiply-accumulate against the current group at
  # rbp, replaying the factors saved by the reduce loop.  Each round
  # finishes one word and stores it at (rdi,rcx,8); rcx is negative, so
  # the store lands in the group that was just consumed.
  2595. L$sqrx8x_tail:
  2596. movq %r8,%rbx
  2597. mulxq 0(%rbp),%rax,%r8
  2598. adcxq %rax,%rbx
  2599. adoxq %r9,%r8
  2600. mulxq 8(%rbp),%rax,%r9
  2601. adcxq %rax,%r8
  2602. adoxq %r10,%r9
  2603. mulxq 16(%rbp),%rax,%r10
  2604. adcxq %rax,%r9
  2605. adoxq %r11,%r10
  2606. mulxq 24(%rbp),%rax,%r11
  2607. adcxq %rax,%r10
  2608. adoxq %r12,%r11
  # Byte-encoded instruction: mulxq 32(%rbp),%rax,%r12
  2609. .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
  2610. adcxq %rax,%r11
  2611. adoxq %r13,%r12
  2612. mulxq 40(%rbp),%rax,%r13
  2613. adcxq %rax,%r12
  2614. adoxq %r14,%r13
  2615. mulxq 48(%rbp),%rax,%r14
  2616. adcxq %rax,%r13
  2617. adoxq %r15,%r14
  2618. mulxq 56(%rbp),%rax,%r15
  # Fetch the factor for the next round (slot rcx+9 of the saved factors).
  2619. movq 72+48+8(%rsp,%rcx,8),%rdx
  2620. adcxq %rax,%r14
  2621. adoxq %rsi,%r15
  # Store the finished word of this round.
  2622. movq %rbx,(%rdi,%rcx,8)
  2623. movq %r8,%rbx
  2624. adcxq %rsi,%r15
  2625. incq %rcx
  2626. jnz L$sqrx8x_tail
  2627. cmpq 0+8(%rsp),%rbp
  2628. jae L$sqrx8x_tail_done
  # More groups remain.  rsi is 0 and the parked value is 0 or -1, so this
  # subtraction sets CF exactly when a carry was parked.
  2629. subq 16+8(%rsp),%rsi
  2630. movq 48+8(%rsp),%rdx
  2631. leaq 64(%rbp),%rbp
  2632. adcq 0(%rdi),%r8
  2633. adcq 8(%rdi),%r9
  2634. adcq 16(%rdi),%r10
  2635. adcq 24(%rdi),%r11
  2636. adcq 32(%rdi),%r12
  2637. adcq 40(%rdi),%r13
  2638. adcq 48(%rdi),%r14
  2639. adcq 56(%rdi),%r15
  2640. leaq 64(%rdi),%rdi
  2641. sbbq %rax,%rax
  # rcx is 0 here; reset it to -8 for the next eight rounds.
  2642. subq $8,%rcx
  2643. xorq %rsi,%rsi
  2644. movq %rax,16+8(%rsp)
  2645. jmp L$sqrx8x_tail
  2646. .p2align 5
  # --- All groups processed: add the carry word of the previous outer
  # pass and ripple it through r8..r15 into rax.
  2647. L$sqrx8x_tail_done:
  2648. xorq %rax,%rax
  2649. addq 24+8(%rsp),%r8
  2650. adcq $0,%r9
  2651. adcq $0,%r10
  2652. adcq $0,%r11
  2653. adcq $0,%r12
  2654. adcq $0,%r13
  2655. adcq $0,%r14
  2656. adcq $0,%r15
  2657. adcq $0,%rax
  # Turn the parked 0 / -1 carry back into CF for the chain below.
  2658. subq 16+8(%rsp),%rsi
  # --- Add the top eight value words, write the result back, and prepare
  # the next outer pass.  The instructions mixed into the adcq chain
  # (movq from xmm, plain movq load) do not modify the flags.
  2659. L$sqrx8x_no_tail:
  2660. adcq 0(%rdi),%r8
  # Byte-encoded instruction: movq %xmm3,%rcx
  2661. .byte 102,72,15,126,217
  2662. adcq 8(%rdi),%r9
  # rsi = word at 56(%rbp); it is not read again in this routine before
  # being cleared at the loop top or before returning.
  2663. movq 56(%rbp),%rsi
  # Byte-encoded instruction: movq %xmm2,%rbp
  2664. .byte 102,72,15,126,213
  2665. adcq 16(%rdi),%r10
  2666. adcq 24(%rdi),%r11
  2667. adcq 32(%rdi),%r12
  2668. adcq 40(%rdi),%r13
  2669. adcq 48(%rdi),%r14
  2670. adcq 56(%rdi),%r15
  # rax = carry word for the next pass (or the value left on return).
  2671. adcq $0,%rax
  2672. movq 32+8(%rsp),%rbx
  # rdx = low word for the next pass, read at rdi + rcx + 64 (rcx was just
  # reloaded from xmm3; presumably a negative byte length -- confirm).
  2673. movq 64(%rdi,%rcx,1),%rdx
  2674. movq %r8,0(%rdi)
  # r8 is reused as "end of the group just written" for the loop test.
  2675. leaq 64(%rdi),%r8
  2676. movq %r9,8(%rdi)
  2677. movq %r10,16(%rdi)
  2678. movq %r11,24(%rdi)
  2679. movq %r12,32(%rdi)
  2680. movq %r13,40(%rdi)
  2681. movq %r14,48(%rdi)
  2682. movq %r15,56(%rdi)
  2683. leaq 64(%rdi,%rcx,1),%rdi
  # Continue while rdi+64 (held in r8) is below the pointer saved at entry.
  2684. cmpq 8+8(%rsp),%r8
  2685. jb L$sqrx8x_reduction_loop
  # Byte-encoded "rep ret".
  2686. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # __bn_postx4x_internal  (file-local helper, no .globl)
  #
  # Branch-free masked add, four 64-bit words per iteration:
  #     out[i] = in[i] + (~m[i] & mask) + carry
  # where in = (%rdi), m = (%rbp), out = (%rdx) and mask = -rax.
  # Only the very first word of m is decremented before being
  # complemented.  With mask = all-ones and a non-zero first word this adds
  # the two's complement of m, i.e. computes in - m; with mask = 0 it
  # copies in to out.  Presumably this is the final conditional
  # subtraction of the modulus, with rax = 0 or 1 on entry -- confirm.
  #
  # Registers read on entry (as used by the code below):
  #   rax   negated to form the mask
  #   rbp   pointer to m
  #   rcx   shifted right arithmetically by 5 and then counted up to zero,
  #         so it is expected to be negative (presumably minus the byte
  #         length, 32 bytes per iteration -- confirm)
  #   rdi   pointer to in
  #   xmm1  low quadword copied to both rdx (output pointer) and rsi
  # On exit: r10 = entry rcx, r9 = -(entry rcx); rbp, rdi and rdx have
  # advanced past the processed words.  Leaves via "rep ret".
  # ---------------------------------------------------------------------
  2687. .p2align 5
  2688. __bn_postx4x_internal:
  2689. movq 0(%rbp),%r12
  # Keep two copies of the entry value of rcx.
  2690. movq %rcx,%r10
  2691. movq %rcx,%r9
  # rax becomes the AND mask used by every andnq below.
  2692. negq %rax
  # rcx /= 32 (arithmetic shift): number of four-word iterations, negative.
  2693. sarq $3+2,%rcx
  # Byte-encoded instruction: movq %xmm1,%rdx
  2694. .byte 102,72,15,126,202
  # Byte-encoded instruction: movq %xmm1,%rsi
  2695. .byte 102,72,15,126,206
  # First word only: ~(m[0] - 1) equals -m[0], which supplies the "+1" of
  # the two's complement.
  2696. decq %r12
  2697. movq 8(%rbp),%r13
  # r8 = 0: no carry parked before the first iteration.
  2698. xorq %r8,%r8
  2699. movq 16(%rbp),%r14
  2700. movq 24(%rbp),%r15
  # The first group is already loaded; skip the loads at the loop head.
  2701. jmp L$sqrx4x_sub_entry
  2702. .p2align 4
  2703. L$sqrx4x_sub:
  2704. movq 0(%rbp),%r12
  2705. movq 8(%rbp),%r13
  2706. movq 16(%rbp),%r14
  2707. movq 24(%rbp),%r15
  2708. L$sqrx4x_sub_entry:
  # r12..r15 = ~word & mask.
  2709. andnq %rax,%r12,%r12
  2710. leaq 32(%rbp),%rbp
  2711. andnq %rax,%r13,%r13
  2712. andnq %rax,%r14,%r14
  2713. andnq %rax,%r15,%r15
  # r8 is 0 or -1; negq sets CF exactly when a carry was parked.
  2714. negq %r8
  2715. adcq 0(%rdi),%r12
  2716. adcq 8(%rdi),%r13
  2717. adcq 16(%rdi),%r14
  2718. adcq 24(%rdi),%r15
  2719. movq %r12,0(%rdx)
  2720. leaq 32(%rdi),%rdi
  2721. movq %r13,8(%rdx)
  # Park the carry out of the adcq chain as 0 / -1; the movq and leaq in
  # between leave the flags unchanged.
  2722. sbbq %r8,%r8
  2723. movq %r14,16(%rdx)
  2724. movq %r15,24(%rdx)
  2725. leaq 32(%rdx),%rdx
  2726. incq %rcx
  2727. jnz L$sqrx4x_sub
  # r9 = -(entry rcx).
  2728. negq %r9
  # Byte-encoded "rep ret".
  2729. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # _bn_get_bits5  (exported)
  #
  # Returns in eax the 5-bit field that starts at bit offset esi within
  # the little-endian byte array at rdi.
  #   rdi  base pointer
  #   esi  bit offset
  # Clobbers ecx, rsi, r10, r11 and the flags.  No stack use.
  #
  # The field is taken from a single 16-bit load.  When the bit position
  # inside the 16-bit unit is above 11, the five bits would cross the end
  # of that unit, so the load is moved one byte forward and the shift is
  # reduced by 8 instead.  Both adjustments use cmov, not a branch.
  # ---------------------------------------------------------------------
  2730. .globl _bn_get_bits5
  2731. .p2align 4
  2732. _bn_get_bits5:
  # r10 = base, r11 = base + 1 (alternative load address).
  2733. leaq 0(%rdi),%r10
  2734. leaq 1(%rdi),%r11
  2735. movl %esi,%ecx
  # esi = index of the 16-bit unit, ecx = bit position inside it (0..15).
  2736. shrl $4,%esi
  2737. andl $15,%ecx
  # eax = bit position - 8, the shift to use with the base + 1 address.
  2738. leal -8(%rcx),%eax
  # Positions 12..15 (unsigned above 11) switch to the base + 1 variant.
  2739. cmpl $11,%ecx
  2740. cmovaq %r11,%r10
  2741. cmoval %eax,%ecx
  # Zero-extending 16-bit load, then shift the field down and keep 5 bits.
  2742. movzwl (%r10,%rsi,2),%eax
  2743. shrl %cl,%eax
  2744. andl $31,%eax
  # Byte-encoded "rep ret".
  2745. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # _bn_scatter5  (exported)
  #
  # Copies esi 64-bit words from consecutive addresses at rdi into a
  # table at rdx, writing them 256 bytes apart starting at rdx + 8*rcx.
  #   rdi  source words (read sequentially)
  #   esi  number of words; 0 returns immediately
  #   rdx  table base
  #   rcx  slot index, scaled by 8 bytes (presumably 0..31, since 32 slots
  #        of 8 bytes fill one 256-byte row -- confirm)
  # Clobbers rax, rdi, rdx, esi and the flags.  No stack use.
  # ---------------------------------------------------------------------
  2746. .globl _bn_scatter5
  2747. .p2align 4
  2748. _bn_scatter5:
  # Nothing to do for a zero count.
  2749. cmpl $0,%esi
  2750. jz L$scatter_epilogue
  # rdx = address of the selected slot in the first row.
  2751. leaq (%rdx,%rcx,8),%rdx
  2752. L$scatter:
  2753. movq (%rdi),%rax
  2754. leaq 8(%rdi),%rdi
  2755. movq %rax,(%rdx)
  # Same slot, next 256-byte row.
  2756. leaq 256(%rdx),%rdx
  2757. subl $1,%esi
  2758. jnz L$scatter
  2759. L$scatter_epilogue:
  # Byte-encoded "rep ret".
  2760. .byte 0xf3,0xc3
  # ---------------------------------------------------------------------
  # _bn_gather5  (exported)
  #
  # Reads esi 64-bit words out of a table laid out in 256-byte rows
  # (32 slots of 8 bytes per row) and writes them consecutively to rdi.
  #   rdi  output words
  #   esi  number of words / rows
  #   rdx  table base; accessed with movdqa, so it must be 16-byte aligned
  #   ecx  slot index to select
  #
  # The slot is selected without indexed addressing.  A 256-byte table of
  # 32 quadword masks is first built on the stack, where a lane is
  # all-ones only if its counter equals ecx.  Each loop iteration then
  # loads the complete row, ANDs it with the masks and ORs everything
  # together, so the addresses touched do not depend on ecx.  An index
  # outside 0..31 matches no lane and yields zero words.
  #
  # Stack: rsp is saved in r10, lowered by 0x108 bytes, then aligned down
  # to 16; it is restored from r10 before returning.
  # Clobbers rax, rdi, esi, r10, r11, xmm0-xmm5 and the flags.
  #
  # NOTE(review): unlike _bn_scatter5 there is no test for esi == 0; the
  # loop body runs once before the count is decremented and tested.
  # ---------------------------------------------------------------------
  2761. .globl _bn_gather5
  2762. .p2align 5
  2763. _bn_gather5:
  2764. L$SEH_begin_bn_gather5:
  # Byte-encoded instruction: leaq (%rsp),%r10   (save the stack pointer)
  2765. .byte 0x4c,0x8d,0x14,0x24
  # Byte-encoded instruction: subq $0x108,%rsp
  2766. .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
  2767. leaq L$inc(%rip),%rax
  2768. andq $-16,%rsp
  2769. movd %ecx,%xmm5
  # xmm0 = counters {0,0,1,1}, xmm1 = step {2,2,2,2} from L$inc.
  2770. movdqa 0(%rax),%xmm0
  2771. movdqa 16(%rax),%xmm1
  # Both pointers are biased by +128 so that offsets -128..112 cover
  # 256 bytes: r11 for the table row, rax for the mask area at rsp.
  2772. leaq 128(%rdx),%r11
  2773. leaq 128(%rsp),%rax
  # Broadcast the index to all four dword lanes.
  2774. pshufd $0,%xmm5,%xmm5
  # xmm4 keeps the step.  The sequence below rotates through xmm0..xmm3:
  # each register is advanced to the next pair of counters with paddd,
  # compared with the index with pcmpeqd, and the resulting mask is
  # stored.  Sixteen 16-byte masks are written to -128..112(%rax).
  2775. movdqa %xmm1,%xmm4
  2776. movdqa %xmm1,%xmm2
  2777. paddd %xmm0,%xmm1
  2778. pcmpeqd %xmm5,%xmm0
  2779. movdqa %xmm4,%xmm3
  2780. paddd %xmm1,%xmm2
  2781. pcmpeqd %xmm5,%xmm1
  2782. movdqa %xmm0,-128(%rax)
  2783. movdqa %xmm4,%xmm0
  2784. paddd %xmm2,%xmm3
  2785. pcmpeqd %xmm5,%xmm2
  2786. movdqa %xmm1,-112(%rax)
  2787. movdqa %xmm4,%xmm1
  2788. paddd %xmm3,%xmm0
  2789. pcmpeqd %xmm5,%xmm3
  2790. movdqa %xmm2,-96(%rax)
  2791. movdqa %xmm4,%xmm2
  2792. paddd %xmm0,%xmm1
  2793. pcmpeqd %xmm5,%xmm0
  2794. movdqa %xmm3,-80(%rax)
  2795. movdqa %xmm4,%xmm3
  2796. paddd %xmm1,%xmm2
  2797. pcmpeqd %xmm5,%xmm1
  2798. movdqa %xmm0,-64(%rax)
  2799. movdqa %xmm4,%xmm0
  2800. paddd %xmm2,%xmm3
  2801. pcmpeqd %xmm5,%xmm2
  2802. movdqa %xmm1,-48(%rax)
  2803. movdqa %xmm4,%xmm1
  2804. paddd %xmm3,%xmm0
  2805. pcmpeqd %xmm5,%xmm3
  2806. movdqa %xmm2,-32(%rax)
  2807. movdqa %xmm4,%xmm2
  2808. paddd %xmm0,%xmm1
  2809. pcmpeqd %xmm5,%xmm0
  2810. movdqa %xmm3,-16(%rax)
  2811. movdqa %xmm4,%xmm3
  2812. paddd %xmm1,%xmm2
  2813. pcmpeqd %xmm5,%xmm1
  2814. movdqa %xmm0,0(%rax)
  2815. movdqa %xmm4,%xmm0
  2816. paddd %xmm2,%xmm3
  2817. pcmpeqd %xmm5,%xmm2
  2818. movdqa %xmm1,16(%rax)
  2819. movdqa %xmm4,%xmm1
  2820. paddd %xmm3,%xmm0
  2821. pcmpeqd %xmm5,%xmm3
  2822. movdqa %xmm2,32(%rax)
  2823. movdqa %xmm4,%xmm2
  2824. paddd %xmm0,%xmm1
  2825. pcmpeqd %xmm5,%xmm0
  2826. movdqa %xmm3,48(%rax)
  2827. movdqa %xmm4,%xmm3
  2828. paddd %xmm1,%xmm2
  2829. pcmpeqd %xmm5,%xmm1
  2830. movdqa %xmm0,64(%rax)
  2831. movdqa %xmm4,%xmm0
  2832. paddd %xmm2,%xmm3
  2833. pcmpeqd %xmm5,%xmm2
  2834. movdqa %xmm1,80(%rax)
  2835. movdqa %xmm4,%xmm1
  2836. paddd %xmm3,%xmm0
  2837. pcmpeqd %xmm5,%xmm3
  2838. movdqa %xmm2,96(%rax)
  2839. movdqa %xmm4,%xmm2
  2840. movdqa %xmm3,112(%rax)
  2841. jmp L$gather
  2842. .p2align 5
  # --- One row per iteration: xmm4 and xmm5 are two OR accumulators;
  # every 16-byte chunk of the row is ANDed with its mask and merged in.
  2843. L$gather:
  2844. pxor %xmm4,%xmm4
  2845. pxor %xmm5,%xmm5
  2846. movdqa -128(%r11),%xmm0
  2847. movdqa -112(%r11),%xmm1
  2848. movdqa -96(%r11),%xmm2
  2849. pand -128(%rax),%xmm0
  2850. movdqa -80(%r11),%xmm3
  2851. pand -112(%rax),%xmm1
  2852. por %xmm0,%xmm4
  2853. pand -96(%rax),%xmm2
  2854. por %xmm1,%xmm5
  2855. pand -80(%rax),%xmm3
  2856. por %xmm2,%xmm4
  2857. por %xmm3,%xmm5
  2858. movdqa -64(%r11),%xmm0
  2859. movdqa -48(%r11),%xmm1
  2860. movdqa -32(%r11),%xmm2
  2861. pand -64(%rax),%xmm0
  2862. movdqa -16(%r11),%xmm3
  2863. pand -48(%rax),%xmm1
  2864. por %xmm0,%xmm4
  2865. pand -32(%rax),%xmm2
  2866. por %xmm1,%xmm5
  2867. pand -16(%rax),%xmm3
  2868. por %xmm2,%xmm4
  2869. por %xmm3,%xmm5
  2870. movdqa 0(%r11),%xmm0
  2871. movdqa 16(%r11),%xmm1
  2872. movdqa 32(%r11),%xmm2
  2873. pand 0(%rax),%xmm0
  2874. movdqa 48(%r11),%xmm3
  2875. pand 16(%rax),%xmm1
  2876. por %xmm0,%xmm4
  2877. pand 32(%rax),%xmm2
  2878. por %xmm1,%xmm5
  2879. pand 48(%rax),%xmm3
  2880. por %xmm2,%xmm4
  2881. por %xmm3,%xmm5
  2882. movdqa 64(%r11),%xmm0
  2883. movdqa 80(%r11),%xmm1
  2884. movdqa 96(%r11),%xmm2
  2885. pand 64(%rax),%xmm0
  2886. movdqa 112(%r11),%xmm3
  2887. pand 80(%rax),%xmm1
  2888. por %xmm0,%xmm4
  2889. pand 96(%rax),%xmm2
  2890. por %xmm1,%xmm5
  2891. pand 112(%rax),%xmm3
  2892. por %xmm2,%xmm4
  2893. por %xmm3,%xmm5
  # Merge the two accumulators, then step to the next 256-byte row.
  2894. por %xmm5,%xmm4
  2895. leaq 256(%r11),%r11
  # pshufd $0x4e swaps the two quadwords; OR-ing folds the selected word
  # into the low quadword whichever half it was in.
  2896. pshufd $0x4e,%xmm4,%xmm0
  2897. por %xmm4,%xmm0
  2898. movq %xmm0,(%rdi)
  2899. leaq 8(%rdi),%rdi
  2900. subl $1,%esi
  2901. jnz L$gather
  # Restore the stack pointer saved at entry.
  2902. leaq (%r10),%rsp
  # Byte-encoded "rep ret".
  2903. .byte 0xf3,0xc3
  2904. L$SEH_end_bn_gather5:
  # L$inc: 64-byte-aligned constants for building the selection masks.
  # Both _bn_gather5 and the code at L$mul_body load the first 16 bytes
  # as the starting lane counters and the second 16 bytes as the
  # per-step increment.
  2905. .p2align 6
  2906. L$inc:
  # Starting counters: dword lanes {0,0,1,1}, i.e. one value per quadword.
  2907. .long 0,0, 1,1
  # Increment applied between consecutive mask registers: {2,2,2,2}.
  2908. .long 2,2, 2,2
  # NUL-terminated ASCII identification string:
  # "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  2909. .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0