x86_64-gf2m.s

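# x86_64-gf2m.s - GF(2^m) (binary polynomial) multiplication for x86_64.
# The copyright string at the end of the file identifies this as CRYPTOGAMS
# code by <appro@openssl.org>; it is apparently the assembler output of
# OpenSSL's x86_64-gf2m.pl. The '#' comments below are annotations inferred
# from the code itself.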
.text
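# _mul_1x1 - 64x64-bit carry-less multiplication over GF(2)[x].
#   in:  %rax = a, %rbp = b, %r8 = 0xf (nibble mask, preloaded by the caller)
#   out: %rdx:%rax = a*b with XOR in place of addition (no carries)
#   Uses 128+8 bytes of stack for a 16-entry window table and clobbers
#   %rbx, %rcx, %rsi, %rdi, %rbp and %r9-%r14.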
.type _mul_1x1,@function
.align 16
_mul_1x1:
.cfi_startproc
subq $128+8,%rsp
.cfi_adjust_cfa_offset 128+8
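# The window table below only covers the low 61 bits of a, so the top three
# bits (61-63) are handled here: each bit is turned into an all-ones mask via
# an arithmetic shift, ANDed with b, and the resulting b<<61..b<<63 terms are
# accumulated into the 128-bit result %rdx:%rax.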
movq $-1,%r9
leaq (%rax,%rax,1),%rsi
shrq $3,%r9
leaq (,%rax,4),%rdi
andq %rax,%r9
leaq (,%rax,8),%r12
sarq $63,%rax
leaq (%r9,%r9,1),%r10
sarq $63,%rsi
leaq (,%r9,4),%r11
andq %rbp,%rax
sarq $63,%rdi
movq %rax,%rdx
shlq $63,%rax
andq %rbp,%rsi
shrq $1,%rdx
movq %rsi,%rcx
shlq $62,%rsi
andq %rbp,%rdi
shrq $2,%rcx
xorq %rsi,%rax
movq %rdi,%rbx
shlq $61,%rdi
xorq %rcx,%rdx
shrq $3,%rbx
xorq %rdi,%rax
xorq %rbx,%rdx
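# Build the window table at 0..120(%rsp): entry i (i = 0..15) is i*a_lo over
# GF(2)[x], where a_lo = a & (2^61-1). The entries are XOR combinations of
# 1*, 2*, 4* and 8*a_lo (%r9, %r10, %r11, %r12), interleaved with the stores.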
movq %r9,%r13
movq $0,0(%rsp)
xorq %r10,%r13
movq %r9,8(%rsp)
movq %r11,%r14
movq %r10,16(%rsp)
xorq %r12,%r14
movq %r13,24(%rsp)
xorq %r11,%r9
movq %r11,32(%rsp)
xorq %r11,%r10
movq %r9,40(%rsp)
xorq %r11,%r13
movq %r10,48(%rsp)
xorq %r14,%r9
movq %r13,56(%rsp)
xorq %r14,%r10
movq %r12,64(%rsp)
xorq %r14,%r13
movq %r9,72(%rsp)
xorq %r11,%r9
movq %r10,80(%rsp)
xorq %r11,%r10
movq %r13,88(%rsp)
xorq %r11,%r13
movq %r14,96(%rsp)
movq %r8,%rsi
movq %r9,104(%rsp)
andq %rbp,%rsi
movq %r10,112(%rsp)
shrq $4,%rbp
movq %r13,120(%rsp)
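# Multiply: scan b (%rbp) four bits at a time (mask %r8 = 0xf); each nibble
# selects a table entry. Entries for alternating nibbles are accumulated in
# the integer registers (%rcx/%rbx folded into %rdx:%rax) and in the SSE
# registers (%xmm1 shifted with pslldq and XORed into %xmm0); the two halves
# are merged after the last nibble.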
movq %r8,%rdi
andq %rbp,%rdi
shrq $4,%rbp
movq (%rsp,%rsi,8),%xmm0
movq %r8,%rsi
andq %rbp,%rsi
shrq $4,%rbp
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $4,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $60,%rbx
xorq %rcx,%rax
pslldq $1,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $12,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $52,%rbx
xorq %rcx,%rax
pslldq $2,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $20,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $44,%rbx
xorq %rcx,%rax
pslldq $3,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $28,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $36,%rbx
xorq %rcx,%rax
pslldq $4,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $36,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $28,%rbx
xorq %rcx,%rax
pslldq $5,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $44,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $20,%rbx
xorq %rcx,%rax
pslldq $6,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $52,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $12,%rbx
xorq %rcx,%rax
pslldq $7,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %rcx,%rbx
shlq $60,%rcx
.byte 102,72,15,126,198          # movq %xmm0,%rsi
shrq $4,%rbx
xorq %rcx,%rax
psrldq $8,%xmm0
xorq %rbx,%rdx
.byte 102,72,15,126,199          # movq %xmm0,%rdi
xorq %rsi,%rax
xorq %rdi,%rdx
addq $128+8,%rsp
.cfi_adjust_cfa_offset -128-8
.byte 0xf3,0xc3                  # repz ret
.Lend_mul_1x1:
.cfi_endproc
.size _mul_1x1,.-_mul_1x1
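# bn_GF2m_mul_2x2 - 128x128-bit carry-less multiplication over GF(2)[x].
# Judging by OpenSSL's C prototype bn_GF2m_mul_2x2(r, a1, a0, b1, b0), the
# arguments arrive as %rdi = r[4] (least-significant word first), %rsi = a1,
# %rdx = a0, %rcx = b1, %r8 = b0, and the routine computes
# r = (a1*x^64 + a0)*(b1*x^64 + b0). If bit 33 of OPENSSL_ia32cap_P (the
# CPUID PCLMULQDQ flag) is set, the carry-less multiply instruction is used;
# otherwise control branches to the table-driven .Lvanilla_mul_2x2 path.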
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,@function
.align 16
bn_GF2m_mul_2x2:
.cfi_startproc
movq %rsp,%rax
movq OPENSSL_ia32cap_P(%rip),%r10
btq $33,%r10
jnc .Lvanilla_mul_2x2
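# PCLMULQDQ path: one Karatsuba step built from three carry-less multiplies,
# a1*b1, a0*b0 and (a1^a0)*(b1^b0). The pclmulqdq and 64-bit movq-to/from-xmm
# instructions are emitted as raw .byte sequences, apparently so that older
# assemblers can still build the file.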
.byte 102,72,15,110,198          # movq %rsi,%xmm0 (a1)
.byte 102,72,15,110,201          # movq %rcx,%xmm1 (b1)
.byte 102,72,15,110,210          # movq %rdx,%xmm2 (a0)
.byte 102,73,15,110,216          # movq %r8,%xmm3 (b0)
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
.byte 102,15,58,68,193,0         # pclmulqdq $0,%xmm1,%xmm0 = a1*b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
.byte 102,15,58,68,211,0         # pclmulqdq $0,%xmm3,%xmm2 = a0*b0
.byte 102,15,58,68,229,0         # pclmulqdq $0,%xmm5,%xmm4 = (a1^a0)*(b1^b0)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4                # %xmm4 = middle 128 bits
movdqa %xmm4,%xmm5
pslldq $8,%xmm4
psrldq $8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0(%rdi)             # r[0],r[1]
movdqu %xmm0,16(%rdi)            # r[2],r[3]
.byte 0xf3,0xc3                  # repz ret
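# Fallback for CPUs without PCLMULQDQ: the same Karatsuba decomposition, with
# each 64x64-bit product computed by _mul_1x1 above.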
.align 16
.Lvanilla_mul_2x2:
leaq -136(%rsp),%rsp
.cfi_adjust_cfa_offset 8*17
movq %r14,80(%rsp)
.cfi_rel_offset %r14,8*10
movq %r13,88(%rsp)
.cfi_rel_offset %r13,8*11
movq %r12,96(%rsp)
.cfi_rel_offset %r12,8*12
movq %rbp,104(%rsp)
.cfi_rel_offset %rbp,8*13
movq %rbx,112(%rsp)
.cfi_rel_offset %rbx,8*14
.Lbody_mul_2x2:
movq %rdi,32(%rsp)               # spill arguments: r, a1, a0, b1, b0
movq %rsi,40(%rsp)
movq %rdx,48(%rsp)
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
movq $0xf,%r8                    # nibble mask expected by _mul_1x1
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1                    # a1*b1
movq %rax,16(%rsp)
movq %rdx,24(%rsp)
movq 48(%rsp),%rax
movq 64(%rsp),%rbp
call _mul_1x1                    # a0*b0
movq %rax,0(%rsp)
movq %rdx,8(%rsp)
movq 40(%rsp),%rax
movq 56(%rsp),%rbp
xorq 48(%rsp),%rax
xorq 64(%rsp),%rbp
call _mul_1x1                    # (a1^a0)*(b1^b0)
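# Karatsuba recombination: with lo = a0*b0, hi = a1*b1 and
# mid = (a1^a0)*(b1^b0) (currently in %rdx:%rax), the middle 128 bits of the
# result are mid ^ lo ^ hi; lo and hi are reloaded from the stack below.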
movq 0(%rsp),%rbx
movq 8(%rsp),%rcx
movq 16(%rsp),%rdi
movq 24(%rsp),%rsi
movq 32(%rsp),%rbp
xorq %rdx,%rax
xorq %rcx,%rdx
xorq %rbx,%rax
movq %rbx,0(%rbp)
xorq %rdi,%rdx
movq %rsi,24(%rbp)
xorq %rsi,%rax
xorq %rsi,%rdx
xorq %rdx,%rax
movq %rdx,16(%rbp)
movq %rax,8(%rbp)
movq 80(%rsp),%r14
.cfi_restore %r14
movq 88(%rsp),%r13
.cfi_restore %r13
movq 96(%rsp),%r12
.cfi_restore %r12
movq 104(%rsp),%rbp
.cfi_restore %rbp
movq 112(%rsp),%rbx
.cfi_restore %rbx
leaq 136(%rsp),%rsp
.cfi_adjust_cfa_offset -8*17
.Lepilogue_mul_2x2:
.byte 0xf3,0xc3                  # repz ret
.Lend_mul_2x2:
.cfi_endproc
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
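# The byte string below decodes to the ASCII text
# "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>".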
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16