x86-mont.s
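# bn_mul_mont(rp,ap,bp,np,n0,num): Montgomery multiplication,
# rp = ap*bp*2^(-32*num) mod np. Returns 1 on success, or 0 (without doing
# any work) when num < 4 so the caller can fall back to a generic routine.
# The layout and labels match output of the CRYPTOGAMS x86-mont.pl perlasm
# module used by OpenSSL (see the .byte string at the end of the file).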

.text
.globl bn_mul_mont
.type bn_mul_mont,@function
.align 16
bn_mul_mont:
.L_bn_mul_mont_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
xorl %eax,%eax
movl 40(%esp),%edi
cmpl $4,%edi
jl .L000just_leave
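# Allocate a scratch frame of num+2 words below the stack, rounded down to a
# 64-byte boundary; the 2047/2048 arithmetic nudges the frame relative to the
# caller's argument area, apparently to avoid cache-bank aliasing.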
leal 20(%esp),%esi
leal 24(%esp),%edx
addl $2,%edi
negl %edi
leal -32(%esp,%edi,4),%ebp
negl %edi
movl %ebp,%eax
subl %edx,%eax
andl $2047,%eax
subl %eax,%ebp
xorl %ebp,%edx
andl $2048,%edx
xorl $2048,%edx
subl %edx,%ebp
andl $-64,%ebp
movl %esp,%eax
subl %ebp,%eax
andl $-4096,%eax
movl %esp,%edx
leal (%ebp,%eax,1),%esp
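# Touch the new frame one page at a time ("page walk") so that no guard page
# is skipped while lowering %esp.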
movl (%esp),%eax
cmpl %ebp,%esp
ja .L001page_walk
jmp .L002page_walk_done
.align 16
.L001page_walk:
leal -4096(%esp),%esp
movl (%esp),%eax
cmpl %ebp,%esp
ja .L001page_walk
.L002page_walk_done:
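# Stash the arguments in the new frame: 4(%esp)=rp, 8(%esp)=ap, 12(%esp)=bp,
# 16(%esp)=np, 20(%esp)=*n0, 24(%esp)=saved %esp; %ebx = num-1.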
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%ebp
movl 16(%esi),%esi
movl (%esi),%esi
movl %eax,4(%esp)
movl %ebx,8(%esp)
movl %ecx,12(%esp)
movl %ebp,16(%esp)
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %edx,24(%esp)
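# PIC-safe load of OPENSSL_ia32cap_P; bit 26 (SSE2) selects between the
# MMX/SSE2 path below and the plain integer path at .L004non_sse2.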
call .L003PIC_me_up
.L003PIC_me_up:
popl %eax
leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
btl $26,(%eax)
jnc .L004non_sse2
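# SSE2/MMX path: 32x32->64-bit products via pmuludq. %mm7 holds the
# 0xffffffff mask, %mm4 = bp[0], and the low word of %mm5 becomes the
# Montgomery factor m = (ap[0]*bp[0])*n0 mod 2^32.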
movl $-1,%eax
movd %eax,%mm7
movl 8(%esp),%esi
movl 12(%esp),%edi
movl 16(%esp),%ebp
xorl %edx,%edx
xorl %ecx,%ecx
movd (%edi),%mm4
movd (%esi),%mm5
movd (%ebp),%mm3
pmuludq %mm4,%mm5
movq %mm5,%mm2
movq %mm5,%mm0
pand %mm7,%mm0
pmuludq 20(%esp),%mm5
pmuludq %mm5,%mm3
paddq %mm0,%mm3
movd 4(%ebp),%mm1
movd 4(%esi),%mm0
psrlq $32,%mm2
psrlq $32,%mm3
incl %ecx
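# First pass: accumulate ap[j]*bp[0] + np[j]*m across j, keeping the two
# carry chains in %mm2/%mm3 and storing the reduced words into the scratch
# vector tp at 32(%esp).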
.align 16
.L0051st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
pand %mm7,%mm0
movd 4(%ebp,%ecx,4),%mm1
paddq %mm0,%mm3
movd 4(%esi,%ecx,4),%mm0
psrlq $32,%mm2
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
jl .L0051st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
pand %mm7,%mm0
paddq %mm0,%mm3
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm2
psrlq $32,%mm3
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
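# Outer loop over bp[i], i = 1..num-1: same multiply-and-reduce pass, now
# also folding in the previous tp words (loaded through %mm6).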
.L006outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
movd 32(%esp),%mm6
movd (%ebp),%mm3
pmuludq %mm4,%mm5
paddq %mm6,%mm5
movq %mm5,%mm0
movq %mm5,%mm2
pand %mm7,%mm0
pmuludq 20(%esp),%mm5
pmuludq %mm5,%mm3
paddq %mm0,%mm3
movd 36(%esp),%mm6
movd 4(%ebp),%mm1
movd 4(%esi),%mm0
psrlq $32,%mm2
psrlq $32,%mm3
paddq %mm6,%mm2
incl %ecx
decl %ebx
.L007inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
movd 36(%esp,%ecx,4),%mm6
pand %mm7,%mm0
movd 4(%ebp,%ecx,4),%mm1
paddq %mm0,%mm3
movd 4(%esi,%ecx,4),%mm0
psrlq $32,%mm2
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm3
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
jnz .L007inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
pand %mm7,%mm0
paddq %mm0,%mm3
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm2
psrlq $32,%mm3
movd 36(%esp,%ebx,4),%mm6
paddq %mm2,%mm3
paddq %mm6,%mm3
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
jle .L006outer
emms
jmp .L008common_tail
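# Plain integer path (no SSE2). If ap == bp and num is even, branch to the
# dedicated squaring code at .L009bn_sqr_mont; otherwise 28(%esp) caches
# &bp[num] as the outer-loop end pointer.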
.align 16
.L004non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
xorl %ecx,%ecx
movl %esi,%edx
andl $1,%ebp
subl %edi,%edx
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
jz .L009bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
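# First pass: tp[j] = ap[j]*bp[0] with mull/adcl carry propagation, then
# m = tp[0]*n0 mod 2^32 and fall into the reduction loop at .L0112ndmadd.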
.align 16
.L010mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
leal 1(%ecx),%ecx
adcl $0,%edx
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
jl .L010mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
addl %ebp,%eax
movl 16(%esp),%esi
adcl $0,%edx
imull 32(%esp),%edi
movl %eax,32(%esp,%ebx,4)
xorl %ecx,%ecx
movl %edx,36(%esp,%ebx,4)
movl %ecx,40(%esp,%ebx,4)
movl (%esi),%eax
mull %edi
addl 32(%esp),%eax
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
jmp .L0112ndmadd
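# Outer iterations i >= 1: tp[j] += ap[j]*bp[i], then derive m for this
# round and continue with the reduction below.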
.align 16
.L0121stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
leal 1(%ecx),%ecx
adcl $0,%edx
addl %eax,%ebp
movl (%esi,%ecx,4),%eax
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
jl .L0121stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
movl 20(%esp),%edi
adcl $0,%edx
movl 16(%esp),%esi
addl %eax,%ebp
adcl $0,%edx
imull 32(%esp),%edi
xorl %ecx,%ecx
addl 36(%esp,%ebx,4),%edx
movl %ebp,32(%esp,%ebx,4)
adcl $0,%ecx
movl (%esi),%eax
movl %edx,36(%esp,%ebx,4)
movl %ecx,40(%esp,%ebx,4)
mull %edi
addl 32(%esp),%eax
movl 4(%esi),%eax
adcl $0,%edx
movl $1,%ecx
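# Reduction pass: tp[j] += np[j]*m, storing the result shifted down by one
# word; at the end either fetch the next bp word and loop back to
# .L0121stmadd or fall through to the common tail.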
.align 16
.L0112ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
leal 1(%ecx),%ecx
adcl $0,%edx
addl %eax,%ebp
movl (%esi,%ecx,4),%eax
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
jl .L0112ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
adcl $0,%edx
addl %eax,%ebp
adcl $0,%edx
movl %ebp,28(%esp,%ebx,4)
xorl %eax,%eax
movl 12(%esp),%ecx
addl 36(%esp,%ebx,4),%edx
adcl 40(%esp,%ebx,4),%eax
leal 4(%ecx),%ecx
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
je .L008common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
jmp .L0121stmadd
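# Squaring path (ap == bp): off-diagonal products ap[j]*ap[i] are computed
# once and doubled on the fly, the diagonal squares are added separately.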
.align 16
.L009bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
mull %edi
movl %eax,32(%esp)
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
incl %ecx
.align 16
.L013sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
addl %ebp,%eax
leal 1(%ecx),%ecx
adcl $0,%edx
leal (%ebx,%eax,2),%ebp
shrl $31,%eax
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
jl .L013sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
addl %ebp,%eax
movl 20(%esp),%edi
adcl $0,%edx
movl 16(%esp),%esi
leal (%ebx,%eax,2),%ebp
imull 32(%esp),%edi
shrl $31,%eax
movl %ebp,32(%esp,%ecx,4)
leal (%eax,%edx,2),%ebp
movl (%esi),%eax
shrl $31,%edx
movl %ebp,36(%esp,%ecx,4)
movl %edx,40(%esp,%ecx,4)
mull %edi
addl 32(%esp),%eax
movl %ecx,%ebx
adcl $0,%edx
movl 4(%esi),%eax
movl $1,%ecx
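# Reduction loop for the squaring path: tp[j] += np[j]*m, two words per
# iteration; .L015sqrlast re-enters it for the final round.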
.align 16
.L0143rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
adcl $0,%edx
addl %eax,%ebp
movl 4(%esi,%ecx,4),%eax
adcl $0,%edx
movl %ebp,28(%esp,%ecx,4)
movl %edx,%ebp
mull %edi
addl 36(%esp,%ecx,4),%ebp
leal 2(%ecx),%ecx
adcl $0,%edx
addl %eax,%ebp
movl (%esi,%ecx,4),%eax
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
jl .L0143rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
adcl $0,%edx
addl %eax,%ebp
adcl $0,%edx
movl %ebp,28(%esp,%ebx,4)
movl 12(%esp),%ecx
xorl %eax,%eax
movl 8(%esp),%esi
addl 36(%esp,%ebx,4),%edx
adcl 40(%esp,%ebx,4),%eax
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
je .L008common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
movl %ecx,12(%esp)
mull %edi
addl 32(%esp,%ecx,4),%eax
adcl $0,%edx
movl %eax,32(%esp,%ecx,4)
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
je .L015sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 16
.L016sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
addl %ebp,%eax
leal (%eax,%eax,1),%ebp
adcl $0,%edx
shrl $31,%eax
addl 32(%esp,%ecx,4),%ebp
leal 1(%ecx),%ecx
adcl $0,%eax
addl %ebx,%ebp
adcl $0,%eax
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
jle .L016sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
.L015sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
addl 32(%esp,%ecx,4),%edx
movl (%esi),%eax
adcl $0,%ebp
movl %edx,32(%esp,%ecx,4)
movl %ebp,36(%esp,%ecx,4)
mull %edi
addl 32(%esp),%eax
leal -1(%ecx),%ebx
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
jmp .L0143rdmadd
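# Common tail: conditionally subtract the modulus (rp = tp - np with
# borrow), then use the borrow-derived masks in %eax/%edx to copy either
# tp or tp-np into rp, overwrite the scratch words, and restore %esp.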
.align 16
.L008common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
movl (%esi),%eax
movl %ebx,%ecx
xorl %edx,%edx
.align 16
.L017sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
jge .L017sub
sbbl $0,%eax
movl $-1,%edx
xorl %eax,%edx
jmp .L018copy
.align 16
.L018copy:
movl 32(%esp,%ebx,4),%esi
movl (%edi,%ebx,4),%ebp
movl %ecx,32(%esp,%ebx,4)
andl %eax,%esi
andl %edx,%ebp
orl %esi,%ebp
movl %ebp,(%edi,%ebx,4)
decl %ebx
jge .L018copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size bn_mul_mont,.-.L_bn_mul_mont_begin
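# "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"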
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
.comm OPENSSL_ia32cap_P,16,4