; rematrix.asm (5.9 KB)

  ;******************************************************************************
  ;* Copyright (c) 2012 Michael Niedermayer
  ;*
  ;* This file is part of FFmpeg.
  ;*
  ;* FFmpeg is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* FFmpeg is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with FFmpeg; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  %include "libavutil/x86/x86util.asm"

  SECTION_RODATA 32

  ; Two tables of ones, 32 bytes each.
  ;   dw1 - eight dwords of 1. MIX2_INT16 loads it and shifts it to form the
  ;         per-dword rounding bias added before the final arithmetic shift.
  ;   w1  - sixteen words of 1. MIX1_INT16 shifts a copy of it to form its
  ;         rounding bias and also interleaves it with the input samples so
  ;         that pmaddwd adds that bias as part of the multiply.
  dw1:    times 8  dd 1
  w1:     times 16 dw 1

  SECTION .text
  ;------------------------------------------------------------------------------
  ; MIX2_FLT a|u
  ;
  ; Emits mix_2_1_<a|u>_float(out, in1, in2, coeffp, index1, index2, len) for
  ; the instruction set selected by the preceding INIT_* line.
  ;
  ; Per 32-bit float element i:
  ;     out[i] = in1[i] * coeffp[index1] + in2[i] * coeffp[index2]
  ;
  ; len is scaled by 4 below, i.e. it counts 4-byte elements.  Only the low
  ; 32 bits of len are used (the shift is done on lend).
  ;
  ; The loop is bottom-tested and advances 2*mmsize bytes per pass, so it always
  ; executes at least once and only handles whole 2*mmsize-byte chunks.  Nothing
  ; here checks that; callers are presumably expected to pass a non-zero len
  ; that is a multiple of the chunk size.
  ;
  ; The "a" variant uses aligned loads/stores.  If any of in1/in2/out is not
  ; mmsize-aligned it jumps to the label mix_2_1_float_u_int<SUFFIX>, which is
  ; only emitted by the "u" variant, so "u" has to be instantiated for the same
  ; instruction set as well.
  ;------------------------------------------------------------------------------
  %macro MIX2_FLT 1
  cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
  %ifidn %1, a
      ; Any misaligned pointer -> continue in the unaligned implementation.
      test            in1q, mmsize-1
      jne             mix_2_1_float_u_int %+ SUFFIX
      test            in2q, mmsize-1
      jne             mix_2_1_float_u_int %+ SUFFIX
      test            outq, mmsize-1
      jne             mix_2_1_float_u_int %+ SUFFIX
  %else
  ; Entry point shared with the aligned variant's fallback path.
  mix_2_1_float_u_int %+ SUFFIX:
  %endif
      ; m4 / m5 = the two coefficients, replicated across every lane.
      VBROADCASTSS    m4, [coeffpq + 4*index1q]
      VBROADCASTSS    m5, [coeffpq + 4*index2q]

      ; Turn len into a byte count, move all three pointers to the end of their
      ; buffers and walk a negative offset up towards zero.
      shl             lend, 2
      add             in1q, lenq
      add             in2q, lenq
      add             outq, lenq
      neg             lenq

  .next:
  %ifidn %1, a
      ; Aligned inputs: multiply straight from memory.
      mulps           m0, m4, [in1q + lenq]
      mulps           m1, m5, [in2q + lenq]
      mulps           m2, m4, [in1q + lenq + mmsize]
      mulps           m3, m5, [in2q + lenq + mmsize]
  %else
      ; Unaligned inputs: load explicitly, then scale.
      movu            m0, [in1q + lenq]
      movu            m1, [in2q + lenq]
      movu            m2, [in1q + lenq + mmsize]
      movu            m3, [in2q + lenq + mmsize]
      mulps           m0, m0, m4
      mulps           m1, m1, m5
      mulps           m2, m2, m4
      mulps           m3, m3, m5
  %endif
      ; Sum the two scaled inputs and store two vectors of results.
      addps           m0, m0, m1
      addps           m2, m2, m3
      mov%1           [outq + lenq], m0
      mov%1           [outq + lenq + mmsize], m2

      add             lenq, mmsize*2
      jl              .next               ; offset still negative -> more data
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; MIX1_FLT a|u
  ;
  ; Emits mix_1_1_<a|u>_float(out, in, coeffp, index, len) for the instruction
  ; set selected by the preceding INIT_* line.
  ;
  ; Per 32-bit float element i:
  ;     out[i] = in[i] * coeffp[index]
  ;
  ; len is scaled by 4 below, i.e. it counts 4-byte elements; the full lenq
  ; register is used.
  ;
  ; The loop is bottom-tested and advances 2*mmsize bytes per pass, so it always
  ; executes at least once and only handles whole 2*mmsize-byte chunks.  Nothing
  ; here checks that; callers are presumably expected to pass a non-zero len
  ; that is a multiple of the chunk size.
  ;
  ; The "a" variant uses aligned loads/stores.  If in or out is not
  ; mmsize-aligned it jumps to the label mix_1_1_float_u_int<SUFFIX>, which is
  ; only emitted by the "u" variant, so "u" has to be instantiated for the same
  ; instruction set as well.
  ;------------------------------------------------------------------------------
  %macro MIX1_FLT 1
  cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
  %ifidn %1, a
      ; Any misaligned pointer -> continue in the unaligned implementation.
      test            inq, mmsize-1
      jne             mix_1_1_float_u_int %+ SUFFIX
      test            outq, mmsize-1
      jne             mix_1_1_float_u_int %+ SUFFIX
  %else
  ; Entry point shared with the aligned variant's fallback path.
  mix_1_1_float_u_int %+ SUFFIX:
  %endif
      ; m2 = the coefficient, replicated across every lane.
      VBROADCASTSS    m2, [coeffpq + 4*indexq]

      ; Turn len into a byte count, move both pointers to the end of their
      ; buffers and walk a negative offset up towards zero.
      shl             lenq, 2
      add             inq, lenq
      add             outq, lenq
      neg             lenq

  .next:
  %ifidn %1, a
      ; Aligned input: multiply straight from memory.
      mulps           m0, m2, [inq + lenq]
      mulps           m1, m2, [inq + lenq + mmsize]
  %else
      ; Unaligned input: load explicitly, then scale.
      movu            m0, [inq + lenq]
      movu            m1, [inq + lenq + mmsize]
      mulps           m0, m0, m2
      mulps           m1, m1, m2
  %endif
      mov%1           [outq + lenq], m0
      mov%1           [outq + lenq + mmsize], m1

      add             lenq, mmsize*2
      jl              .next               ; offset still negative -> more data
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; MIX1_INT16 a|u
  ;
  ; Emits mix_1_1_<a|u>_int16(out, in, coeffp, index, len): scales 16-bit
  ; samples by one fixed-point coefficient.
  ;
  ; The 32-bit entry at coeffp + 4*index supplies two things:
  ;   - a 16-bit multiplier, broadcast with SPLATW (two-operand form from
  ;     x86util.asm, expected to replicate word 0 of the entry);
  ;   - bits 16..31, isolated below and used as the right-shift count.
  ;
  ; Per sample:
  ;     round  = (1 << shift) >> 1                    (computed in 16-bit lanes)
  ;     out[i] = saturate_int16((in[i] * mul + round) >> shift)
  ;
  ; len is doubled below, i.e. it counts 2-byte samples.  The loop is
  ; bottom-tested and advances 2*mmsize bytes per pass, so it always executes
  ; at least once and only handles whole chunks; this is not checked here and
  ; is presumably guaranteed by the caller.
  ;
  ; The "a" variant falls back to the label mix_1_1_int16_u_int<SUFFIX>, emitted
  ; only by the "u" variant, when in or out is not mmsize-aligned.
  ;------------------------------------------------------------------------------
  %macro MIX1_INT16 1
  cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
  %ifidn %1, a
      ; Any misaligned pointer -> continue in the unaligned implementation.
      test            inq, mmsize-1
      jne             mix_1_1_int16_u_int %+ SUFFIX
      test            outq, mmsize-1
      jne             mix_1_1_int16_u_int %+ SUFFIX
  %else
  ; Entry point shared with the aligned variant's fallback path.
  mix_1_1_int16_u_int %+ SUFFIX:
  %endif
      movd            m4, [coeffpq + 4*indexq]    ; raw 32-bit coefficient entry
      SPLATW          m5, m4                      ; m5 = multiplier in every word
      psllq           m4, 32                      ; \ keep only bits 16..31 of the
      psrlq           m4, 48                      ; / entry: m4 = shift count
      mova            m0, [w1]
      psllw           m0, m4                      ; \ m0 = (1 << shift) >> 1
      psrlw           m0, 1                       ; / in every word (rounding)
      punpcklwd       m5, m0                      ; m5 = (mul, round) word pairs

      ; Turn len into a byte count, move both pointers to the end of their
      ; buffers and walk a negative offset up towards zero.
      add             lenq, lenq
      add             inq, lenq
      add             outq, lenq
      neg             lenq

  .next:
      mov%1           m0, [inq + lenq]
      mov%1           m2, [inq + lenq + mmsize]
      mova            m1, m0
      mova            m3, m2
      ; Pair every sample with a constant 1 so that pmaddwd against the
      ; (mul, round) pairs yields sample*mul + 1*round per dword.
      punpcklwd       m0, [w1]
      punpckhwd       m1, [w1]
      punpcklwd       m2, [w1]
      punpckhwd       m3, [w1]
      pmaddwd         m0, m5
      pmaddwd         m1, m5
      pmaddwd         m2, m5
      pmaddwd         m3, m5
      ; Scale back down (arithmetic shift) and narrow with signed saturation.
      psrad           m0, m4
      psrad           m1, m4
      psrad           m2, m4
      psrad           m3, m4
      packssdw        m0, m1
      packssdw        m2, m3
      mov%1           [outq + lenq], m0
      mov%1           [outq + lenq + mmsize], m2

      add             lenq, mmsize*2
      jl              .next               ; offset still negative -> more data
  %if mmsize == 8
      ; Only reached when the macro is expanded with 8-byte vector registers.
      emms
  %endif
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; MIX2_INT16 a|u
  ;
  ; Emits mix_2_1_<a|u>_int16(out, in1, in2, coeffp, index1, index2, len): mixes
  ; two streams of 16-bit samples with fixed-point coefficients.
  ;
  ; The 32-bit entries at coeffp + 4*index1 and coeffp + 4*index2 each supply a
  ; 16-bit multiplier, broadcast with SPLATW (two-operand form from x86util.asm,
  ; expected to replicate word 0 of the entry).  The right-shift count is taken
  ; from bits 16..31 of the index1 entry only.
  ;
  ; Per sample:
  ;     round  = (1 << shift) >> 1                    (computed in 32-bit lanes)
  ;     out[i] = saturate_int16((in1[i]*mul1 + in2[i]*mul2 + round) >> shift)
  ;
  ; len is doubled below, i.e. it counts 2-byte samples; the doubling is done
  ; on lend, so only the low 32 bits of len are used.  The loop is bottom-tested
  ; and advances 2*mmsize bytes per pass, so it always executes at least once
  ; and only handles whole chunks; this is not checked here and is presumably
  ; guaranteed by the caller.
  ;
  ; The "a" variant falls back to the label mix_2_1_int16_u_int<SUFFIX>, emitted
  ; only by the "u" variant, when any of in1/in2/out is not mmsize-aligned.
  ;------------------------------------------------------------------------------
  %macro MIX2_INT16 1
  cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
  %ifidn %1, a
      ; Any misaligned pointer -> continue in the unaligned implementation.
      test            in1q, mmsize-1
      jne             mix_2_1_int16_u_int %+ SUFFIX
      test            in2q, mmsize-1
      jne             mix_2_1_int16_u_int %+ SUFFIX
      test            outq, mmsize-1
      jne             mix_2_1_int16_u_int %+ SUFFIX
  %else
  ; Entry point shared with the aligned variant's fallback path.
  mix_2_1_int16_u_int %+ SUFFIX:
  %endif
      movd            m4, [coeffpq + 4*index1q]   ; raw entry for in1
      movd            m6, [coeffpq + 4*index2q]   ; raw entry for in2
      SPLATW          m5, m4                      ; m5 = mul1 in every word
      SPLATW          m6, m6                      ; m6 = mul2 in every word
      psllq           m4, 32                      ; \ keep only bits 16..31 of the
      psrlq           m4, 48                      ; / index1 entry: shift count
      mova            m7, [dw1]
      pslld           m7, m4                      ; \ m7 = (1 << shift) >> 1
      psrld           m7, 1                       ; / in every dword (rounding)
      punpcklwd       m5, m6                      ; m5 = (mul1, mul2) word pairs

      ; Turn len into a byte count, move all three pointers to the end of their
      ; buffers and walk a negative offset up towards zero.
      add             lend, lend
      add             in1q, lenq
      add             in2q, lenq
      add             outq, lenq
      neg             lenq

  .next:
      ; Interleave in1/in2 samples into (in1, in2) word pairs; m6 is free to be
      ; reused as a temporary now that mul2 lives in m5.
      mov%1           m0, [in1q + lenq]
      mov%1           m2, [in2q + lenq]
      mova            m1, m0
      punpcklwd       m0, m2
      punpckhwd       m1, m2
      mov%1           m2, [in1q + lenq + mmsize]
      mov%1           m6, [in2q + lenq + mmsize]
      mova            m3, m2
      punpcklwd       m2, m6
      punpckhwd       m3, m6
      ; in1*mul1 + in2*mul2 per dword.
      pmaddwd         m0, m5
      pmaddwd         m1, m5
      pmaddwd         m2, m5
      pmaddwd         m3, m5
      ; Add the rounding bias, scale back down, narrow with signed saturation.
      paddd           m0, m7
      paddd           m1, m7
      paddd           m2, m7
      paddd           m3, m7
      psrad           m0, m4
      psrad           m1, m4
      psrad           m2, m4
      psrad           m3, m4
      packssdw        m0, m1
      packssdw        m2, m3
      mov%1           [outq + lenq], m0
      mov%1           [outq + lenq + mmsize], m2

      add             lenq, mmsize*2
      jl              .next               ; offset still negative -> more data
  %if mmsize == 8
      ; Only reached when the macro is expanded with 8-byte vector registers.
      emms
  %endif
      RET
  %endmacro
  ;------------------------------------------------------------------------------
  ; Kernel instantiations.
  ;
  ; Each macro is expanded as "u" and then "a" under the same INIT_* setting:
  ; the "a" expansion jumps to a label that only the matching "u" expansion
  ; emits.
  ;------------------------------------------------------------------------------

  ; Float kernels, SSE.
  INIT_XMM sse
  MIX2_FLT u
  MIX2_FLT a
  MIX1_FLT u
  MIX1_FLT a

  ; 16-bit integer kernels, SSE2.
  INIT_XMM sse2
  MIX1_INT16 u
  MIX1_INT16 a
  MIX2_INT16 u
  MIX2_INT16 a

  ; Float kernels, AVX; built only when HAVE_AVX_EXTERNAL is non-zero.
  %if HAVE_AVX_EXTERNAL
  INIT_YMM avx
  MIX2_FLT u
  MIX2_FLT a
  MIX1_FLT u
  MIX1_FLT a
  %endif