x86-mont.masm 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  ; Assembler preamble: toolchain check, instruction-set enablement, and
  ; opening of the code segment that holds _bn_mul_mont.
  ;
  ; Print a build-time notice when the assembler reports a version below 8.00.
  1. IF @Version LT 800
  2. ECHO MASM version 8.00 or later is strongly recommended.
  3. ENDIF
  ; Enable the .686 instruction set and the MMX/SSE mnemonics used below.
  4. .686
  5. .XMM
  ; For assemblers below 8.00, define XMMWORD here as a 16-byte-aligned
  ; structure made of two quadwords.
  6. IF @Version LT 800
  7. XMMWORD STRUCT 16
  8. DQ 2 dup (?)
  9. XMMWORD ENDS
  10. ENDIF
  11. .MODEL FLAT
  ; DOTNAME permits identifiers that begin with a dot (e.g. ".text$", ".bss").
  12. OPTION DOTNAME
  ; Open the code segment: PAGE alignment for assemblers below 8.00,
  ; ALIGN(64) otherwise.
  13. IF @Version LT 800
  14. .text$ SEGMENT PAGE 'CODE'
  15. ELSE
  16. .text$ SEGMENT ALIGN(64) 'CODE'
  17. ENDIF
  ; The EXTERN below is left disabled; _OPENSSL_ia32cap_P is declared with
  ; COMM in the .bss segment at the end of this file instead.
  18. ;EXTERN _OPENSSL_ia32cap_P:NEAR
  ; Start the procedure that follows on a 16-byte boundary.
  19. ALIGN 16
  ; ----------------------------------------------------------------------
  ; _bn_mul_mont -- word-serial Montgomery multiplication on 32-bit limbs
  ; (purpose per the identification string stored after this procedure).
  ;
  ; Stack arguments (all read relative to esp after the four pushes below).
  ; The short names are this comment block's shorthand only.
  ;   arg0 "r"   : pointer, result vector written by the common tail
  ;   arg1 "a"   : pointer, first multiplicand vector
  ;   arg2 "b"   : pointer, second multiplicand vector
  ;   arg3 "n"   : pointer, vector subtracted in the common tail
  ;                (presumably the modulus -- confirm against the caller)
  ;   arg4       : pointer; only the dword it points to ("n0") is used
  ;   arg5 "num" : limb count, compared as a signed value
  ; NOTE(review): this looks like OpenSSL's
  ;   bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the C prototype.
  ;
  ; Returns in eax: 0 when num < 4 (nothing is computed), otherwise 1.
  ; ebp, ebx, esi and edi are saved on entry and restored on exit.
  ; The SSE2 path uses mm0-mm7 and executes emms before the common tail.
  ;
  ; Private frame (addressed from the relocated esp):
  ;   [esp+0]   scratch (holds num-1 in the squaring path)
  ;   [esp+4]   r          [esp+8]   a
  ;   [esp+12]  b (reused as the running b pointer / outer index)
  ;   [esp+16]  n          [esp+20]  n0
  ;   [esp+24]  esp as it was right after the register pushes
  ;   [esp+28]  end-of-b pointer (generic non-SSE2 multiply path only)
  ;   [esp+32]  temporary vector "tp"; words up to index num+1 are written
  ; ----------------------------------------------------------------------
  20. _bn_mul_mont PROC PUBLIC
  21. $L_bn_mul_mont_begin::
  ; Save the registers that are restored before returning.
  22. push ebp
  23. push ebx
  24. push esi
  25. push edi
  ; eax = 0 is the return value used when num is too small.
  26. xor eax,eax
  ; edi = num (arg5); leave immediately when num < 4 (signed compare).
  27. mov edi,DWORD PTR 40[esp]
  28. cmp edi,4
  29. jl $L000just_leave
  ; esi = address of the argument block (arg0 slot),
  ; edx = address of the arg1 slot on the caller's stack.
  30. lea esi,DWORD PTR 20[esp]
  31. lea edx,DWORD PTR 24[esp]
  ; ebp = esp - 32 - 4*(num+2): candidate base for the private frame.
  ; edi is negated only for the address computation and ends up as num+2.
  32. add edi,2
  33. neg edi
  34. lea ebp,DWORD PTR [edi*4+esp-32]
  35. neg edi
  ; Lower ebp so that (ebp - edx) becomes a multiple of 2048 ...
  36. mov eax,ebp
  37. sub eax,edx
  38. and eax,2047
  39. sub ebp,eax
  ; ... then lower it by another 2048 when bit 11 of ebp equals bit 11 of
  ; edx, so the two addresses end up differing in that bit
  ; (presumably to avoid cache/bank aliasing -- TODO confirm intent).
  40. xor edx,ebp
  41. and edx,2048
  42. xor edx,2048
  43. sub ebp,edx
  ; Round the frame base down to a 64-byte boundary.
  44. and ebp,-64
  ; Move esp down to ebp in steps of at most 4096 bytes, reading one dword
  ; at every step. edx keeps the pre-relocation esp for the epilogue.
  45. mov eax,esp
  46. sub eax,ebp
  47. and eax,-4096
  48. mov edx,esp
  49. lea esp,DWORD PTR [eax*1+ebp]
  50. mov eax,DWORD PTR [esp]
  51. cmp esp,ebp
  52. ja $L001page_walk
  53. jmp $L002page_walk_done
  54. ALIGN 16
  55. $L001page_walk:
  56. lea esp,DWORD PTR [esp-4096]
  57. mov eax,DWORD PTR [esp]
  58. cmp esp,ebp
  59. ja $L001page_walk
  60. $L002page_walk_done:
  ; Copy the arguments into the private frame; arg4 is dereferenced so the
  ; frame holds the n0 value itself rather than the pointer.
  61. mov eax,DWORD PTR [esi]
  62. mov ebx,DWORD PTR 4[esi]
  63. mov ecx,DWORD PTR 8[esi]
  64. mov ebp,DWORD PTR 12[esi]
  65. mov esi,DWORD PTR 16[esi]
  66. mov esi,DWORD PTR [esi]
  67. mov DWORD PTR 4[esp],eax
  68. mov DWORD PTR 8[esp],ebx
  69. mov DWORD PTR 12[esp],ecx
  70. mov DWORD PTR 16[esp],ebp
  71. mov DWORD PTR 20[esp],esi
  ; ebx = (num+2) - 3 = num-1, the index of the last limb.
  72. lea ebx,DWORD PTR [edi-3]
  ; Remember the pre-relocation esp so the epilogue can restore it.
  73. mov DWORD PTR 24[esp],edx
  ; Select the code path: bit 26 of the first dword of _OPENSSL_ia32cap_P
  ; clear -> integer-only path, set -> MMX-register path using pmuludq.
  74. lea eax,DWORD PTR _OPENSSL_ia32cap_P
  75. bt DWORD PTR [eax],26
  76. jnc $L003non_sse2
  ; ---------------- SSE2 path ----------------
  ; mm7 = 0x00000000FFFFFFFF, mask for the low 32 bits of a 64-bit sum.
  77. mov eax,-1
  78. movd mm7,eax
  ; esi = a, edi = b, ebp = n; edx = outer index i = 0, ecx = inner index j = 0.
  79. mov esi,DWORD PTR 8[esp]
  80. mov edi,DWORD PTR 12[esp]
  81. mov ebp,DWORD PTR 16[esp]
  82. xor edx,edx
  83. xor ecx,ecx
  ; First row (i = 0): mm4 = b[0], mm5 = a[0], mm3 = n[0].
  84. movd mm4,DWORD PTR [edi]
  85. movd mm5,DWORD PTR [esi]
  86. movd mm3,DWORD PTR [ebp]
  ; mm5 = a[0]*b[0]; mm2 keeps the full product, mm0 its low word.
  87. pmuludq mm5,mm4
  88. movq mm2,mm5
  89. movq mm0,mm5
  90. pand mm0,mm7
  ; mm5 = low32(a[0]*b[0]) * n0 -- its low word is the multiplier applied
  ; to n for this row; mm3 = n[0]*that multiplier + low word of a[0]*b[0].
  91. pmuludq mm5,QWORD PTR 20[esp]
  92. pmuludq mm3,mm5
  93. paddq mm3,mm0
  ; Preload n[1] and a[1]; reduce both accumulators to their carry words.
  94. movd mm1,DWORD PTR 4[ebp]
  95. movd mm0,DWORD PTR 4[esi]
  96. psrlq mm2,32
  97. psrlq mm3,32
  98. inc ecx
  99. ALIGN 16
  ; Inner loop of the first row: mm2 accumulates a[j]*b[0], mm3 accumulates
  ; n[j]*multiplier plus the low word of mm2; the low word of mm3 is stored
  ; one slot below the current index (28+4*j from esp == tp[j-1]).
  100. $L0041st:
  101. pmuludq mm0,mm4
  102. pmuludq mm1,mm5
  103. paddq mm2,mm0
  104. paddq mm3,mm1
  105. movq mm0,mm2
  106. pand mm0,mm7
  107. movd mm1,DWORD PTR 4[ecx*4+ebp]
  108. paddq mm3,mm0
  109. movd mm0,DWORD PTR 4[ecx*4+esi]
  110. psrlq mm2,32
  111. movd DWORD PTR 28[ecx*4+esp],mm3
  112. psrlq mm3,32
  113. lea ecx,DWORD PTR 1[ecx]
  114. cmp ecx,ebx
  115. jl $L0041st
  ; Last limb of the first row, then store the combined 64-bit carry as
  ; the top two words of tp.
  116. pmuludq mm0,mm4
  117. pmuludq mm1,mm5
  118. paddq mm2,mm0
  119. paddq mm3,mm1
  120. movq mm0,mm2
  121. pand mm0,mm7
  122. paddq mm3,mm0
  123. movd DWORD PTR 28[ecx*4+esp],mm3
  124. psrlq mm2,32
  125. psrlq mm3,32
  126. paddq mm3,mm2
  127. movq QWORD PTR 32[ebx*4+esp],mm3
  ; i = 1
  128. inc edx
  ; Outer loop over the remaining words of b: same as the first row, but
  ; the previous tp words (loaded through mm6) are added in.
  129. $L005outer:
  130. xor ecx,ecx
  131. movd mm4,DWORD PTR [edx*4+edi]
  132. movd mm5,DWORD PTR [esi]
  133. movd mm6,DWORD PTR 32[esp]
  134. movd mm3,DWORD PTR [ebp]
  135. pmuludq mm5,mm4
  136. paddq mm5,mm6
  137. movq mm0,mm5
  138. movq mm2,mm5
  139. pand mm0,mm7
  140. pmuludq mm5,QWORD PTR 20[esp]
  141. pmuludq mm3,mm5
  142. paddq mm3,mm0
  143. movd mm6,DWORD PTR 36[esp]
  144. movd mm1,DWORD PTR 4[ebp]
  145. movd mm0,DWORD PTR 4[esi]
  146. psrlq mm2,32
  147. psrlq mm3,32
  148. paddq mm2,mm6
  149. inc ecx
  ; ebx is used as a down-counter for the inner loop and is reloaded from
  ; ecx once the loop finishes.
  150. dec ebx
  151. $L006inner:
  152. pmuludq mm0,mm4
  153. pmuludq mm1,mm5
  154. paddq mm2,mm0
  155. paddq mm3,mm1
  156. movq mm0,mm2
  157. movd mm6,DWORD PTR 36[ecx*4+esp]
  158. pand mm0,mm7
  159. movd mm1,DWORD PTR 4[ecx*4+ebp]
  160. paddq mm3,mm0
  161. movd mm0,DWORD PTR 4[ecx*4+esi]
  162. psrlq mm2,32
  163. movd DWORD PTR 28[ecx*4+esp],mm3
  164. psrlq mm3,32
  165. paddq mm2,mm6
  ; dec sets ZF for the jnz below; the lea in between leaves flags intact.
  166. dec ebx
  167. lea ecx,DWORD PTR 1[ecx]
  168. jnz $L006inner
  169. mov ebx,ecx
  ; Last limb of this row; fold both carries and the old top word of tp
  ; into the new top two words.
  170. pmuludq mm0,mm4
  171. pmuludq mm1,mm5
  172. paddq mm2,mm0
  173. paddq mm3,mm1
  174. movq mm0,mm2
  175. pand mm0,mm7
  176. paddq mm3,mm0
  177. movd DWORD PTR 28[ecx*4+esp],mm3
  178. psrlq mm2,32
  179. psrlq mm3,32
  180. movd mm6,DWORD PTR 36[ebx*4+esp]
  181. paddq mm3,mm2
  182. paddq mm3,mm6
  183. movq QWORD PTR 32[ebx*4+esp],mm3
  ; Next word of b; continue while i <= ebx.
  184. lea edx,DWORD PTR 1[edx]
  185. cmp edx,ebx
  186. jle $L005outer
  ; Leave MMX state before continuing with integer code.
  187. emms
  188. jmp $L007common_tail
  189. ALIGN 16
  ; ---------------- integer-only path ----------------
  190. $L003non_sse2:
  ; esi = a, edi = b, ecx = 0.
  ; ebp = (num & 1) | (a - b); eax = b + 4*num (one past the last word of b).
  ; ZF from the "or" survives the following mov, so the jz is taken exactly
  ; when a == b and num is even -> dedicated squaring code.
  191. mov esi,DWORD PTR 8[esp]
  192. lea ebp,DWORD PTR 1[ebx]
  193. mov edi,DWORD PTR 12[esp]
  194. xor ecx,ecx
  195. mov edx,esi
  196. and ebp,1
  197. sub edx,edi
  198. lea eax,DWORD PTR 4[ebx*4+edi]
  199. or ebp,edx
  200. mov edi,DWORD PTR [edi]
  201. jz $L008bn_sqr_mont
  ; Generic multiply: remember the end-of-b pointer, start with a[0], carry 0.
  202. mov DWORD PTR 28[esp],eax
  203. mov eax,DWORD PTR [esi]
  204. xor edx,edx
  205. ALIGN 16
  ; tp[j] = low(a[j]*b[0] + carry); edx carries the high word forward.
  206. $L009mull:
  207. mov ebp,edx
  208. mul edi
  209. add ebp,eax
  210. lea ecx,DWORD PTR 1[ecx]
  211. adc edx,0
  212. mov eax,DWORD PTR [ecx*4+esi]
  213. cmp ecx,ebx
  214. mov DWORD PTR 28[ecx*4+esp],ebp
  215. jl $L009mull
  ; Last limb of the first row; then edi = n0 * tp[0] (the multiplier for
  ; the reduction pass), esi = n, and the top words of tp are initialised.
  216. mov ebp,edx
  217. mul edi
  218. mov edi,DWORD PTR 20[esp]
  219. add eax,ebp
  220. mov esi,DWORD PTR 16[esp]
  221. adc edx,0
  222. imul edi,DWORD PTR 32[esp]
  223. mov DWORD PTR 32[ebx*4+esp],eax
  224. xor ecx,ecx
  225. mov DWORD PTR 36[ebx*4+esp],edx
  226. mov DWORD PTR 40[ebx*4+esp],ecx
  ; n[0]*multiplier + tp[0]: only the carry out of this sum is kept, because
  ; eax is overwritten with n[1] right after the add.
  227. mov eax,DWORD PTR [esi]
  228. mul edi
  229. add eax,DWORD PTR 32[esp]
  230. mov eax,DWORD PTR 4[esi]
  231. adc edx,0
  232. inc ecx
  233. jmp $L0102ndmadd
  234. ALIGN 16
  ; Multiply-accumulate pass for later rows: tp[j] += a[j]*b[i] with carry.
  235. $L0111stmadd:
  236. mov ebp,edx
  237. mul edi
  238. add ebp,DWORD PTR 32[ecx*4+esp]
  239. lea ecx,DWORD PTR 1[ecx]
  240. adc edx,0
  241. add ebp,eax
  242. mov eax,DWORD PTR [ecx*4+esi]
  243. adc edx,0
  244. cmp ecx,ebx
  245. mov DWORD PTR 28[ecx*4+esp],ebp
  246. jl $L0111stmadd
  ; Last limb of the row, propagate the carry into the two top words of tp,
  ; then set up the reduction pass exactly as after the first row.
  247. mov ebp,edx
  248. mul edi
  249. add eax,DWORD PTR 32[ebx*4+esp]
  250. mov edi,DWORD PTR 20[esp]
  251. adc edx,0
  252. mov esi,DWORD PTR 16[esp]
  253. add ebp,eax
  254. adc edx,0
  255. imul edi,DWORD PTR 32[esp]
  256. xor ecx,ecx
  257. add edx,DWORD PTR 36[ebx*4+esp]
  258. mov DWORD PTR 32[ebx*4+esp],ebp
  259. adc ecx,0
  260. mov eax,DWORD PTR [esi]
  261. mov DWORD PTR 36[ebx*4+esp],edx
  262. mov DWORD PTR 40[ebx*4+esp],ecx
  263. mul edi
  264. add eax,DWORD PTR 32[esp]
  265. mov eax,DWORD PTR 4[esi]
  266. adc edx,0
  267. mov ecx,1
  268. ALIGN 16
  ; Reduction pass: tp[j-1] = low(tp[j] + n[j]*multiplier + carry); the
  ; store offset 24+4*ecx (after the increment) shifts tp down by one word.
  269. $L0102ndmadd:
  270. mov ebp,edx
  271. mul edi
  272. add ebp,DWORD PTR 32[ecx*4+esp]
  273. lea ecx,DWORD PTR 1[ecx]
  274. adc edx,0
  275. add ebp,eax
  276. mov eax,DWORD PTR [ecx*4+esi]
  277. adc edx,0
  278. cmp ecx,ebx
  279. mov DWORD PTR 24[ecx*4+esp],ebp
  280. jl $L0102ndmadd
  ; Last limb of the reduction pass, then fold the carry into the top words.
  281. mov ebp,edx
  282. mul edi
  283. add ebp,DWORD PTR 32[ebx*4+esp]
  284. adc edx,0
  285. add ebp,eax
  286. adc edx,0
  287. mov DWORD PTR 28[ebx*4+esp],ebp
  288. xor eax,eax
  ; Advance the running b pointer by one word and stop once it reaches the
  ; end-of-b pointer saved at [esp+28]. The interleaved mov/lea instructions
  ; do not modify flags, so adc and je see the intended add/cmp results.
  289. mov ecx,DWORD PTR 12[esp]
  290. add edx,DWORD PTR 36[ebx*4+esp]
  291. adc eax,DWORD PTR 40[ebx*4+esp]
  292. lea ecx,DWORD PTR 4[ecx]
  293. mov DWORD PTR 32[ebx*4+esp],edx
  294. cmp ecx,DWORD PTR 28[esp]
  295. mov DWORD PTR 36[ebx*4+esp],eax
  296. je $L007common_tail
  ; Load the next word of b and restart the multiply-accumulate pass.
  297. mov edi,DWORD PTR [ecx]
  298. mov esi,DWORD PTR 8[esp]
  299. mov DWORD PTR 12[esp],ecx
  300. xor ecx,ecx
  301. xor edx,edx
  302. mov eax,DWORD PTR [esi]
  303. jmp $L0111stmadd
  304. ALIGN 16
  ; ---------------- squaring path (a == b, num even) ----------------
  ; [esp+0] = num-1, [esp+12] = outer index i (starts at 0, ecx is 0 here).
  305. $L008bn_sqr_mont:
  306. mov DWORD PTR [esp],ebx
  307. mov DWORD PTR 12[esp],ecx
  ; tp[0] = low(a[0]^2); the high word is split into edx = high >> 1 and
  ; ebx = high & 1 so that the doubled cross products below can absorb it.
  308. mov eax,edi
  309. mul edi
  310. mov DWORD PTR 32[esp],eax
  311. mov ebx,edx
  312. shr edx,1
  313. and ebx,1
  314. inc ecx
  315. ALIGN 16
  ; First row: each a[j]*a[0] (plus carry) is doubled on the fly --
  ; stored word = 2*low + bit shifted out of the previous word (ebx).
  316. $L012sqr:
  317. mov eax,DWORD PTR [ecx*4+esi]
  318. mov ebp,edx
  319. mul edi
  320. add eax,ebp
  321. lea ecx,DWORD PTR 1[ecx]
  322. adc edx,0
  323. lea ebp,DWORD PTR [eax*2+ebx]
  324. shr eax,31
  325. cmp ecx,DWORD PTR [esp]
  326. mov ebx,eax
  327. mov DWORD PTR 28[ecx*4+esp],ebp
  328. jl $L012sqr
  ; Last limb of the first row: store the doubled low and high words plus
  ; the final shifted-out bit, and compute the multiplier edi = n0 * tp[0].
  329. mov eax,DWORD PTR [ecx*4+esi]
  330. mov ebp,edx
  331. mul edi
  332. add eax,ebp
  333. mov edi,DWORD PTR 20[esp]
  334. adc edx,0
  335. mov esi,DWORD PTR 16[esp]
  336. lea ebp,DWORD PTR [eax*2+ebx]
  337. imul edi,DWORD PTR 32[esp]
  338. shr eax,31
  339. mov DWORD PTR 32[ecx*4+esp],ebp
  340. lea ebp,DWORD PTR [edx*2+eax]
  341. mov eax,DWORD PTR [esi]
  342. shr edx,31
  343. mov DWORD PTR 36[ecx*4+esp],ebp
  344. mov DWORD PTR 40[ecx*4+esp],edx
  ; n[0]*multiplier + tp[0]: only the carry is kept (eax is reloaded with
  ; n[1]); ebx = index of the last limb for the reduction loop.
  345. mul edi
  346. add eax,DWORD PTR 32[esp]
  347. mov ebx,ecx
  348. adc edx,0
  349. mov eax,DWORD PTR 4[esi]
  350. mov ecx,1
  351. ALIGN 16
  ; Reduction pass for the squaring path, processing two limbs per
  ; iteration (ecx advances by 2); results are stored one word lower.
  352. $L0133rdmadd:
  353. mov ebp,edx
  354. mul edi
  355. add ebp,DWORD PTR 32[ecx*4+esp]
  356. adc edx,0
  357. add ebp,eax
  358. mov eax,DWORD PTR 4[ecx*4+esi]
  359. adc edx,0
  360. mov DWORD PTR 28[ecx*4+esp],ebp
  361. mov ebp,edx
  362. mul edi
  363. add ebp,DWORD PTR 36[ecx*4+esp]
  364. lea ecx,DWORD PTR 2[ecx]
  365. adc edx,0
  366. add ebp,eax
  367. mov eax,DWORD PTR [ecx*4+esi]
  368. adc edx,0
  369. cmp ecx,ebx
  370. mov DWORD PTR 24[ecx*4+esp],ebp
  371. jl $L0133rdmadd
  ; Last limb of the reduction pass and carry fold into the top words.
  372. mov ebp,edx
  373. mul edi
  374. add ebp,DWORD PTR 32[ebx*4+esp]
  375. adc edx,0
  376. add ebp,eax
  377. adc edx,0
  378. mov DWORD PTR 28[ebx*4+esp],ebp
  ; ecx = outer index i, esi = a; finished when i equals the last index.
  379. mov ecx,DWORD PTR 12[esp]
  380. xor eax,eax
  381. mov esi,DWORD PTR 8[esp]
  382. add edx,DWORD PTR 36[ebx*4+esp]
  383. adc eax,DWORD PTR 40[ebx*4+esp]
  384. mov DWORD PTR 32[ebx*4+esp],edx
  385. cmp ecx,ebx
  386. mov DWORD PTR 36[ebx*4+esp],eax
  387. je $L007common_tail
  ; Next row: edi = a[i+1]; add its square to tp[i+1] and save the new i.
  388. mov edi,DWORD PTR 4[ecx*4+esi]
  389. lea ecx,DWORD PTR 1[ecx]
  390. mov eax,edi
  391. mov DWORD PTR 12[esp],ecx
  392. mul edi
  393. add eax,DWORD PTR 32[ecx*4+esp]
  394. adc edx,0
  395. mov DWORD PTR 32[ecx*4+esp],eax
  396. xor ebp,ebp
  ; If this was the last limb there are no cross products left for the row
  ; (the lea advances ecx without disturbing the flags from cmp).
  397. cmp ecx,ebx
  398. lea ecx,DWORD PTR 1[ecx]
  399. je $L014sqrlast
  ; Split the high word of the square as in the first row.
  400. mov ebx,edx
  401. shr edx,1
  402. and ebx,1
  403. ALIGN 16
  ; tp[j] += 2*(a[j]*a[i]) with the doubling bit carried in ebx.
  404. $L015sqradd:
  405. mov eax,DWORD PTR [ecx*4+esi]
  406. mov ebp,edx
  407. mul edi
  408. add eax,ebp
  409. lea ebp,DWORD PTR [eax*1+eax]
  410. adc edx,0
  411. shr eax,31
  412. add ebp,DWORD PTR 32[ecx*4+esp]
  413. lea ecx,DWORD PTR 1[ecx]
  414. adc eax,0
  415. add ebp,ebx
  416. adc eax,0
  417. cmp ecx,DWORD PTR [esp]
  418. mov DWORD PTR 28[ecx*4+esp],ebp
  419. mov ebx,eax
  420. jle $L015sqradd
  ; Double the final carry word: edx = low part, ebp = bit carried out.
  421. mov ebp,edx
  422. add edx,edx
  423. shr ebp,31
  424. add edx,ebx
  425. adc ebp,0
  ; Finish the row: compute the multiplier for the next reduction pass,
  ; add the row carry into the top words of tp, and re-enter the reduction
  ; loop with ebx = ecx-1 as the last-limb index.
  426. $L014sqrlast:
  427. mov edi,DWORD PTR 20[esp]
  428. mov esi,DWORD PTR 16[esp]
  429. imul edi,DWORD PTR 32[esp]
  430. add edx,DWORD PTR 32[ecx*4+esp]
  431. mov eax,DWORD PTR [esi]
  432. adc ebp,0
  433. mov DWORD PTR 32[ecx*4+esp],edx
  434. mov DWORD PTR 36[ecx*4+esp],ebp
  435. mul edi
  436. add eax,DWORD PTR 32[esp]
  437. lea ebx,DWORD PTR [ecx-1]
  438. adc edx,0
  439. mov ecx,1
  440. mov eax,DWORD PTR 4[esi]
  441. jmp $L0133rdmadd
  442. ALIGN 16
  ; ---------------- common tail: conditional final subtraction ----------
  ; ebp = n, edi = r, esi = &tp[0], eax = tp[0], ecx = ebx (loop counter),
  ; edx = 0. The xor also clears CF, so the first sbb starts without borrow.
  ; NOTE(review): every path appears to arrive here with ebx = num-1 -- confirm.
  443. $L007common_tail:
  444. mov ebp,DWORD PTR 16[esp]
  445. mov edi,DWORD PTR 4[esp]
  446. lea esi,DWORD PTR 32[esp]
  447. mov eax,DWORD PTR [esi]
  448. mov ecx,ebx
  449. xor edx,edx
  450. ALIGN 16
  ; r[k] = tp[k] - n[k] - borrow. dec, mov and lea all leave CF untouched,
  ; so the borrow chains across iterations; jge tests the flags set by dec.
  451. $L016sub:
  452. sbb eax,DWORD PTR [edx*4+ebp]
  453. mov DWORD PTR [edx*4+edi],eax
  454. dec ecx
  455. mov eax,DWORD PTR 4[edx*4+esi]
  456. lea edx,DWORD PTR 1[edx]
  457. jge $L016sub
  ; eax = (word of tp just past the subtracted range) - final borrow, used
  ; as a select mask (expected to be 0 or all ones -- TODO confirm);
  ; edx = bitwise complement of that mask.
  458. sbb eax,0
  459. mov edx,-1
  460. xor edx,eax
  461. jmp $L017copy
  462. ALIGN 16
  ; Branch-free select, from the top limb down:
  ;   r[k] = (tp[k] AND eax) OR (r[k] AND edx)
  ; i.e. keep the subtracted value when the mask is 0, otherwise restore the
  ; unsubtracted tp[k]. Each tp word is overwritten with the value left in
  ; ecx once it has been read.
  463. $L017copy:
  464. mov esi,DWORD PTR 32[ebx*4+esp]
  465. mov ebp,DWORD PTR [ebx*4+edi]
  466. mov DWORD PTR 32[ebx*4+esp],ecx
  467. and esi,eax
  468. and ebp,edx
  469. or ebp,esi
  470. mov DWORD PTR [ebx*4+edi],ebp
  471. dec ebx
  472. jge $L017copy
  ; Restore the pre-relocation stack pointer and report success (eax = 1).
  473. mov esp,DWORD PTR 24[esp]
  474. mov eax,1
  ; Shared exit: restore saved registers. eax is 0 when reached from the
  ; num < 4 check at the top, 1 otherwise.
  475. $L000just_leave:
  476. pop edi
  477. pop esi
  478. pop ebx
  479. pop ebp
  480. ret
  481. _bn_mul_mont ENDP
  ; NUL-terminated ASCII identification string embedded in the code segment:
  ; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
  482. DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
  483. DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
  484. DB 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
  485. DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
  486. DB 111,114,103,62,0
  487. .text$ ENDS
  ; Declare _OPENSSL_ia32cap_P as a communal array of four DWORDs; the
  ; procedure above tests bit 26 of its first DWORD to choose a code path.
  488. .bss SEGMENT 'BSS'
  489. COMM _OPENSSL_ia32cap_P:DWORD:4
  490. .bss ENDS
  491. END