# x86_64-gf2m.s

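# GF(2^m) (carry-less, binary-polynomial) multiplication for x86_64.
# This is assembler output of the CRYPTOGAMS/OpenSSL perlasm script
# x86_64-gf2m.pl (see the version string at the end of the file); the
# leading underscores on global symbols and the L$ local-label prefix
# are the Mach-O (macOS) output flavour.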
.text
.p2align 4
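# _mul_1x1: carry-less 64x64-bit multiplication over GF(2)[x].
# Input:  a in %rax, b in %rbp, and the nibble mask 0xf in %r8
#         (loaded once by the caller and preserved across calls).
# Output: the 128-bit product a*b in %rdx:%rax.
# The top three bits of a are multiplied in separately first, so that
# a 16-entry table of multiples of the remaining 61-bit value can be
# built without overflow; b is then scanned in 4-bit windows.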
_mul_1x1:
	subq $128+8,%rsp
	movq $-1,%r9
	leaq (%rax,%rax,1),%rsi		# a<<1
	shrq $3,%r9			# %r9 = 0x1fffffffffffffff
	leaq (,%rax,4),%rdi		# a<<2
	andq %rax,%r9			# a1 = a with the top three bits cleared
	leaq (,%rax,8),%r12		# a8 = a1<<3
	sarq $63,%rax			# broadcast bit 63 of a
	leaq (%r9,%r9,1),%r10		# a2 = a1<<1
	sarq $63,%rsi			# broadcast bit 62 of a
	leaq (,%r9,4),%r11		# a4 = a1<<2
	andq %rbp,%rax			# b if bit 63 of a is set, else 0
	sarq $63,%rdi			# broadcast bit 61 of a
	movq %rax,%rdx
	shlq $63,%rax			# b*x^63, low half ...
	andq %rbp,%rsi
	shrq $1,%rdx			# ... and high half
	movq %rsi,%rcx
	shlq $62,%rsi
	andq %rbp,%rdi
	shrq $2,%rcx
	xorq %rsi,%rax			# accumulate b*x^62
	movq %rdi,%rbx
	shlq $61,%rdi
	xorq %rcx,%rdx
	shrq $3,%rbx
	xorq %rdi,%rax			# accumulate b*x^61
	xorq %rbx,%rdx
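# Build the window table tab[i] = i*a1, i = 0..15, at (%rsp).  With
# the top three bits of a1 clear, even tab[15] = a1^(a1<<1)^(a1<<2)^
# (a1<<3) still fits in 64 bits.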
	movq %r9,%r13
	movq $0,0(%rsp)			# tab[0] = 0
	xorq %r10,%r13			# a1^a2
	movq %r9,8(%rsp)		# tab[1] = a1
	movq %r11,%r14
	movq %r10,16(%rsp)		# tab[2] = a2
	xorq %r12,%r14			# a4^a8
	movq %r13,24(%rsp)		# tab[3] = a1^a2
	xorq %r11,%r9			# a1^a4
	movq %r11,32(%rsp)		# tab[4] = a4
	xorq %r11,%r10			# a2^a4
	movq %r9,40(%rsp)		# tab[5] = a1^a4
	xorq %r11,%r13			# a1^a2^a4
	movq %r10,48(%rsp)		# tab[6] = a2^a4
	xorq %r14,%r9			# a1^a8
	movq %r13,56(%rsp)		# tab[7] = a1^a2^a4
	xorq %r14,%r10			# a2^a8
	movq %r12,64(%rsp)		# tab[8] = a8
	xorq %r14,%r13			# a1^a2^a8
	movq %r9,72(%rsp)		# tab[9] = a1^a8
	xorq %r11,%r9			# a1^a4^a8
	movq %r10,80(%rsp)		# tab[10] = a2^a8
	xorq %r11,%r10			# a2^a4^a8
	movq %r13,88(%rsp)		# tab[11] = a1^a2^a8
	xorq %r11,%r13			# a1^a2^a4^a8
	movq %r14,96(%rsp)		# tab[12] = a4^a8
	movq %r8,%rsi
	movq %r9,104(%rsp)		# tab[13] = a1^a4^a8
	andq %rbp,%rsi			# nibble 0 of b
	movq %r10,112(%rsp)		# tab[14] = a2^a4^a8
	shrq $4,%rbp
	movq %r13,120(%rsp)		# tab[15] = a1^a2^a4^a8
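# Fully unrolled main loop: b is consumed four bits at a time.
# Odd-numbered nibbles are accumulated in %rdx:%rax with paired
# shlq/shrq, while even-numbered nibbles go through %xmm0, where
# pslldq supplies the byte-aligned (multiple-of-8) bit shifts.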
	movq %r8,%rdi
	andq %rbp,%rdi			# nibble 1
	shrq $4,%rbp
	movq (%rsp,%rsi,8),%xmm0	# tab[nibble 0]
	movq %r8,%rsi
	andq %rbp,%rsi			# nibble 2
	shrq $4,%rbp
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 1]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $4,%rcx
	andq %rbp,%rdi			# nibble 3
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 2]
	shrq $60,%rbx
	xorq %rcx,%rax
	pslldq $1,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# nibble 4
	shrq $4,%rbp
	pxor %xmm1,%xmm0
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 3]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $12,%rcx
	andq %rbp,%rdi			# nibble 5
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 4]
	shrq $52,%rbx
	xorq %rcx,%rax
	pslldq $2,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# nibble 6
	shrq $4,%rbp
	pxor %xmm1,%xmm0
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 5]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $20,%rcx
	andq %rbp,%rdi			# nibble 7
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 6]
	shrq $44,%rbx
	xorq %rcx,%rax
	pslldq $3,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# nibble 8
	shrq $4,%rbp
	pxor %xmm1,%xmm0
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 7]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $28,%rcx
	andq %rbp,%rdi			# nibble 9
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 8]
	shrq $36,%rbx
	xorq %rcx,%rax
	pslldq $4,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# nibble 10
	shrq $4,%rbp
	pxor %xmm1,%xmm0
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 9]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $36,%rcx
	andq %rbp,%rdi			# nibble 11
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 10]
	shrq $28,%rbx
	xorq %rcx,%rax
	pslldq $5,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# nibble 12
	shrq $4,%rbp
	pxor %xmm1,%xmm0
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 11]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $44,%rcx
	andq %rbp,%rdi			# nibble 13
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 12]
	shrq $20,%rbx
	xorq %rcx,%rax
	pslldq $6,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# nibble 14
	shrq $4,%rbp
	pxor %xmm1,%xmm0
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 13]
	movq %r8,%rdi
	movq %rcx,%rbx
	shlq $52,%rcx
	andq %rbp,%rdi			# nibble 15
	movq (%rsp,%rsi,8),%xmm1	# tab[nibble 14]
	shrq $12,%rbx
	xorq %rcx,%rax
	pslldq $7,%xmm1
	movq %r8,%rsi
	shrq $4,%rbp
	xorq %rbx,%rdx
	andq %rbp,%rsi			# leftover of the unrolled pattern; result unused
	shrq $4,%rbp
	pxor %xmm1,%xmm0
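# Nibble 15, then the two accumulators are merged: %xmm0 holds the
# even-nibble contribution, %rdx:%rax everything else.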
	movq (%rsp,%rdi,8),%rcx		# tab[nibble 15]
	movq %rcx,%rbx
	shlq $60,%rcx
	.byte 102,72,15,126,198		# movq %xmm0,%rsi
	shrq $4,%rbx
	xorq %rcx,%rax
	psrldq $8,%xmm0
	xorq %rbx,%rdx
	.byte 102,72,15,126,199		# movq %xmm0,%rdi
	xorq %rsi,%rax
	xorq %rdi,%rdx
	addq $128+8,%rsp
	.byte 0xf3,0xc3			# rep ret
L$end_mul_1x1:
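# bn_GF2m_mul_2x2: 128x128-bit carry-less multiplication,
# r[0..3] = (a1*x^64 + a0) * (b1*x^64 + b0) over GF(2)[x].
# SysV arguments: r in %rdi, a1 in %rsi, a0 in %rdx, b1 in %rcx,
# b0 in %r8.  Bit 33 of OPENSSL_ia32cap_P is the PCLMULQDQ feature
# flag: if set, the product is computed below with three carry-less
# multiplies (one level of Karatsuba); otherwise control branches to
# the integer fallback at L$vanilla_mul_2x2.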
.globl _bn_GF2m_mul_2x2
.p2align 4
_bn_GF2m_mul_2x2:
	movq %rsp,%rax
	movq _OPENSSL_ia32cap_P(%rip),%r10
	btq $33,%r10			# PCLMULQDQ bit
	jnc L$vanilla_mul_2x2
	.byte 102,72,15,110,198		# movq %rsi,%xmm0 = a1
	.byte 102,72,15,110,201		# movq %rcx,%xmm1 = b1
	.byte 102,72,15,110,210		# movq %rdx,%xmm2 = a0
	.byte 102,73,15,110,216		# movq %r8,%xmm3  = b0
	movdqa %xmm0,%xmm4
	movdqa %xmm1,%xmm5
	.byte 102,15,58,68,193,0	# pclmulqdq $0,%xmm1,%xmm0 = a1*b1
	pxor %xmm2,%xmm4		# a1^a0
	pxor %xmm3,%xmm5		# b1^b0
	.byte 102,15,58,68,211,0	# pclmulqdq $0,%xmm3,%xmm2 = a0*b0
	.byte 102,15,58,68,229,0	# pclmulqdq $0,%xmm5,%xmm4 = (a1^a0)*(b1^b0)
	xorps %xmm0,%xmm4
	xorps %xmm2,%xmm4		# middle Karatsuba term
	movdqa %xmm4,%xmm5
	pslldq $8,%xmm4
	psrldq $8,%xmm5
	pxor %xmm4,%xmm2		# low 128 bits of the product
	pxor %xmm5,%xmm0		# high 128 bits of the product
	movdqu %xmm2,0(%rdi)
	movdqu %xmm0,16(%rdi)
	.byte 0xf3,0xc3			# rep ret
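# Integer fallback: the same Karatsuba identity built from three
# calls to _mul_1x1 (a1*b1, a0*b0 and (a1^a0)*(b1^b0)), with the
# partial products staged in the 0..24(%rsp) scratch area.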
.p2align 4
L$vanilla_mul_2x2:
	leaq -136(%rsp),%rsp
	movq %r14,80(%rsp)
	movq %r13,88(%rsp)
	movq %r12,96(%rsp)
	movq %rbp,104(%rsp)
	movq %rbx,112(%rsp)
L$body_mul_2x2:
	movq %rdi,32(%rsp)		# save r
	movq %rsi,40(%rsp)		# save a1
	movq %rdx,48(%rsp)		# save a0
	movq %rcx,56(%rsp)		# save b1
	movq %r8,64(%rsp)		# save b0
	movq $0xf,%r8			# nibble mask for _mul_1x1
	movq %rsi,%rax
	movq %rcx,%rbp
	call _mul_1x1			# a1*b1
	movq %rax,16(%rsp)
	movq %rdx,24(%rsp)
	movq 48(%rsp),%rax
	movq 64(%rsp),%rbp
	call _mul_1x1			# a0*b0
	movq %rax,0(%rsp)
	movq %rdx,8(%rsp)
	movq 40(%rsp),%rax
	movq 56(%rsp),%rbp
	xorq 48(%rsp),%rax		# a1^a0
	xorq 64(%rsp),%rbp		# b1^b0
	call _mul_1x1			# (a1^a0)*(b1^b0) in %rdx:%rax
	movq 0(%rsp),%rbx		# lo(a0*b0)
	movq 8(%rsp),%rcx		# hi(a0*b0)
	movq 16(%rsp),%rdi		# lo(a1*b1)
	movq 24(%rsp),%rsi		# hi(a1*b1)
	movq 32(%rsp),%rbp		# r
	xorq %rdx,%rax			# Karatsuba recombination ...
	xorq %rcx,%rdx
	xorq %rbx,%rax
	movq %rbx,0(%rbp)		# r[0] = lo(a0*b0)
	xorq %rdi,%rdx
	movq %rsi,24(%rbp)		# r[3] = hi(a1*b1)
	xorq %rsi,%rax
	xorq %rsi,%rdx
	xorq %rdx,%rax
	movq %rdx,16(%rbp)		# r[2]
	movq %rax,8(%rbp)		# r[1]
	movq 80(%rsp),%r14
	movq 88(%rsp),%r13
	movq 96(%rsp),%r12
	movq 104(%rsp),%rbp
	movq 112(%rsp),%rbx
	leaq 136(%rsp),%rsp
L$epilogue_mul_2x2:
	.byte 0xf3,0xc3			# rep ret
L$end_mul_2x2:
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
# "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.p2align 4