;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
align 32
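; all-ones constants: w1 doubles as the pmaddwd partner (each sample gets
; paired with 1) and as the seed for the word rounding bias; dw1 seeds the
; dword rounding bias for the two-input int16 path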
dw1: times 8  dd 1
w1 : times 16 dw 1

SECTION .text

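; mix_2_1_%1_float: out[i] = in1[i]*coeffp[index1] + in2[i]*coeffp[index2]
; over len floats.  %1 selects the aligned (a) or unaligned (u) variant; the
; aligned entry falls through to the unaligned one whenever any pointer is
; not mmsize-aligned.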
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:
%endif
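; broadcast the two coefficients, then rewrite len as a negative byte offset
; from the end of the buffers so the loop counts up toward zero and the add
; sets the flags for the branch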
    VBROADCASTSS m4, [coeffpq + 4*index1q]
    VBROADCASTSS m5, [coeffpq + 4*index2q]
    shl lend, 2
    add in1q, lenq
    add in2q, lenq
    add outq, lenq
    neg lenq
.next:
%ifidn %1, a
    mulps m0, m4, [in1q + lenq         ]
    mulps m1, m5, [in2q + lenq         ]
    mulps m2, m4, [in1q + lenq + mmsize]
    mulps m3, m5, [in2q + lenq + mmsize]
%else
    movu  m0, [in1q + lenq         ]
    movu  m1, [in2q + lenq         ]
    movu  m2, [in1q + lenq + mmsize]
    movu  m3, [in2q + lenq + mmsize]
    mulps m0, m0, m4
    mulps m1, m1, m5
    mulps m2, m2, m4
    mulps m3, m3, m5
%endif
    addps m0, m0, m1
    addps m2, m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add lenq, mmsize*2
        jl .next
    REP_RET
%endmacro
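
; mix_1_1_%1_float: out[i] = in[i]*coeffp[index], the single-input variant
; of the loop above, likewise processing two vectors per iteration.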
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:
%endif
    VBROADCASTSS m2, [coeffpq + 4*indexq]
    shl lenq, 2
    add inq , lenq
    add outq, lenq
    neg lenq
.next:
%ifidn %1, a
    mulps m0, m2, [inq + lenq         ]
    mulps m1, m2, [inq + lenq + mmsize]
%else
    movu  m0, [inq + lenq         ]
    movu  m1, [inq + lenq + mmsize]
    mulps m0, m0, m2
    mulps m1, m1, m2
%endif
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m1
    add lenq, mmsize*2
        jl .next
    REP_RET
%endmacro
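
; mix_1_1_%1_int16: fixed-point single-input mix.  Each 32-bit coeffp entry
; packs the coefficient in its low 16 bits and a right-shift count in bits
; 16..31.  Samples are interleaved with 1 and the coefficient with the
; rounding bias 1 << (shift-1), so a single pmaddwd computes
; sample*coeff + bias per output dword; psrad rescales and packssdw
; saturates back to int16.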
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
    test inq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:
%endif
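; m5 ends up holding (coeff, bias) word pairs and m4 the shift count; len is
; doubled because the samples are two bytes each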
    movd      m4, [coeffpq + 4*indexq]
    SPLATW    m5, m4
    psllq     m4, 32
    psrlq     m4, 48
    mova      m0, [w1]
    psllw     m0, m4
    psrlw     m0, 1
    punpcklwd m5, m0
    add lenq, lenq
    add inq , lenq
    add outq, lenq
    neg lenq
.next:
    mov%1     m0, [inq + lenq         ]
    mov%1     m2, [inq + lenq + mmsize]
    mova      m1, m0
    mova      m3, m2
    punpcklwd m0, [w1]
    punpckhwd m1, [w1]
    punpcklwd m2, [w1]
    punpckhwd m3, [w1]
    pmaddwd   m0, m5
    pmaddwd   m1, m5
    pmaddwd   m2, m5
    pmaddwd   m3, m5
    psrad     m0, m4
    psrad     m1, m4
    psrad     m2, m4
    psrad     m3, m4
    packssdw  m0, m1
    packssdw  m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro
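
; mix_2_1_%1_int16: fixed-point two-input mix.  Input words are interleaved
; as (in1, in2) pairs and the coefficients as (c1, c2), so pmaddwd yields
; in1*c1 + in2*c2 per dword; the rounding bias is added as dwords before the
; arithmetic shift and saturating repack.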
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
    test in1q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test in2q, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
    test outq, mmsize-1
        jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:
%endif
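; unlike the 1:1 case, pmaddwd is spent combining the two inputs, so the
; rounding bias 1 << (shift-1) lives in m7 as dwords and is added separately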
    movd      m4, [coeffpq + 4*index1q]
    movd      m6, [coeffpq + 4*index2q]
    SPLATW    m5, m4
    SPLATW    m6, m6
    psllq     m4, 32
    psrlq     m4, 48
    mova      m7, [dw1]
    pslld     m7, m4
    psrld     m7, 1
    punpcklwd m5, m6
    add lend, lend
    add in1q, lenq
    add in2q, lenq
    add outq, lenq
    neg lenq
.next:
    mov%1     m0, [in1q + lenq         ]
    mov%1     m2, [in2q + lenq         ]
    mova      m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    mov%1     m2, [in1q + lenq + mmsize]
    mov%1     m6, [in2q + lenq + mmsize]
    mova      m3, m2
    punpcklwd m2, m6
    punpckhwd m3, m6
    pmaddwd   m0, m5
    pmaddwd   m1, m5
    pmaddwd   m2, m5
    pmaddwd   m3, m5
    paddd     m0, m7
    paddd     m1, m7
    paddd     m2, m7
    paddd     m3, m7
    psrad     m0, m4
    psrad     m1, m4
    psrad     m2, m4
    psrad     m3, m4
    packssdw  m0, m1
    packssdw  m2, m3
    mov%1 [outq + lenq         ], m0
    mov%1 [outq + lenq + mmsize], m2
    add lenq, mmsize*2
        jl .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro
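
; instantiate unaligned (u) and aligned (a) entry points per instruction set:
; MMX and SSE2 carry the int16 paths, SSE and AVX the float paths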
INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a

INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif