gradfun.c
/*
 * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavfilter/gradfun.h"

#if HAVE_INLINE_ASM

DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
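
/*
 * Both SIMD filter_line kernels below implement the same per-pixel math as
 * the C reference ff_gradfun_filter_line_c() declared in libavfilter/gradfun.h.
 * A rough scalar sketch of that math, kept out of the build and written only
 * for illustration (the function name here is made up):
 */
#if 0
static void gradfun_filter_line_sketch(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
                                       int width, int thresh, const uint16_t *dithers)
{
    for (int x = 0; x < width; x++) {
        int pix   = src[x] << 7;              // promote to 9.7 fixed point
        int delta = dc[x / 2] - pix;          // distance from the local low-pass value
        int m     = abs(delta) * thresh >> 16;
        m = FFMAX(0, 127 - m);                // fade the correction out near real edges
        m = m * m * delta >> 14;
        pix += m + dithers[x & 7];            // debanding correction plus ordered dither
        dst[x] = av_clip_uint8(pix >> 7);
    }
}
#endif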

#if HAVE_MMXEXT_INLINE
static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
                                       int width, int thresh,
                                       const uint16_t *dithers)
{
    intptr_t x;
    if (width & 3) {
        x = width & ~3;
        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
        width = x;
    }
    x = -width;
    __asm__ volatile(
        "movd %4, %%mm5 \n"
        "pxor %%mm7, %%mm7 \n"
        "pshufw $0, %%mm5, %%mm5 \n"
        "movq %6, %%mm6 \n"
        "movq (%5), %%mm3 \n"
        "movq 8(%5), %%mm4 \n"
        "1: \n"
        "movd (%2,%0), %%mm0 \n"
        "movd (%3,%0), %%mm1 \n"
        "punpcklbw %%mm7, %%mm0 \n"
        "punpcklwd %%mm1, %%mm1 \n"
        "psllw $7, %%mm0 \n"
        "pxor %%mm2, %%mm2 \n"
        "psubw %%mm0, %%mm1 \n" // delta = dc - pix
        "psubw %%mm1, %%mm2 \n"
        "pmaxsw %%mm1, %%mm2 \n"
        "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
        "psubw %%mm6, %%mm2 \n"
        "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
        "pmullw %%mm2, %%mm2 \n"
        "paddw %%mm3, %%mm0 \n" // pix += dither
        "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
        "pmulhw %%mm2, %%mm1 \n"
        "paddw %%mm1, %%mm0 \n" // pix += m
        "psraw $7, %%mm0 \n"
        "packuswb %%mm0, %%mm0 \n"
        "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
        "add $4, %0 \n"
        "jnl 2f \n"
        "movd (%2,%0), %%mm0 \n"
        "movd (%3,%0), %%mm1 \n"
        "punpcklbw %%mm7, %%mm0 \n"
        "punpcklwd %%mm1, %%mm1 \n"
        "psllw $7, %%mm0 \n"
        "pxor %%mm2, %%mm2 \n"
        "psubw %%mm0, %%mm1 \n" // delta = dc - pix
        "psubw %%mm1, %%mm2 \n"
        "pmaxsw %%mm1, %%mm2 \n"
        "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
        "psubw %%mm6, %%mm2 \n"
        "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
        "pmullw %%mm2, %%mm2 \n"
        "paddw %%mm4, %%mm0 \n" // pix += dither
        "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
        "pmulhw %%mm2, %%mm1 \n"
        "paddw %%mm1, %%mm0 \n" // pix += m
        "psraw $7, %%mm0 \n"
        "packuswb %%mm0, %%mm0 \n"
        "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
        "add $4, %0 \n"
        "jl 1b \n"
        "2: \n"
        "emms \n"
        :"+r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "r"(dithers), "m"(*pw_7f)
        :"memory"
    );
}
#endif
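
/*
 * The SSSE3 variant below computes the same thing on 8 pixels per iteration;
 * pabsw replaces the pxor/psubw/pmaxsw absolute-value sequence used above.
 */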

#if HAVE_SSSE3_INLINE
static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
                                      int width, int thresh, const uint16_t *dithers)
{
    intptr_t x;
    if (width & 7) {
        // could be 10% faster if I somehow eliminated this
        x = width & ~7;
        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
        width = x;
    }
    x = -width;
    __asm__ volatile(
        "movd %4, %%xmm5 \n"
        "pxor %%xmm7, %%xmm7 \n"
        "pshuflw $0, %%xmm5, %%xmm5 \n"
        "movdqa %6, %%xmm6 \n"
        "punpcklqdq %%xmm5, %%xmm5 \n"
        "movdqa %5, %%xmm4 \n"
        "1: \n"
        "movq (%2,%0), %%xmm0 \n"
        "movq (%3,%0), %%xmm1 \n"
        "punpcklbw %%xmm7, %%xmm0 \n"
        "punpcklwd %%xmm1, %%xmm1 \n"
        "psllw $7, %%xmm0 \n"
        "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
        "pabsw %%xmm1, %%xmm2 \n"
        "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
        "psubw %%xmm6, %%xmm2 \n"
        "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
        "pmullw %%xmm2, %%xmm2 \n"
        "psllw $2, %%xmm1 \n"
        "paddw %%xmm4, %%xmm0 \n" // pix += dither
        "pmulhw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
        "paddw %%xmm1, %%xmm0 \n" // pix += m
        "psraw $7, %%xmm0 \n"
        "packuswb %%xmm0, %%xmm0 \n"
        "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
        "add $8, %0 \n"
        "jl 1b \n"
        :"+&r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
}
#endif /* HAVE_SSSE3_INLINE */
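
/*
 * gradfun_blur_line_sse2() below does the vertical running-sum step: for each
 * output element it sums a 2x2 block of source pixels (the pw_ff mask and
 * psrlw split the packed bytes into even/odd word lanes), adds the running
 * column sum from buf1, stores the updated sum into buf, and writes the
 * difference against the previous buf contents into dc. A rough scalar
 * sketch, kept out of the build and written only for illustration (the
 * function name here is made up):
 */
#if 0
static void gradfun_blur_line_sketch(uint16_t *dc, uint16_t *buf, const uint16_t *buf1,
                                     const uint8_t *src, int src_linesize, int width)
{
    for (int x = 0; x < width; x++) {
        int v = buf1[x]
              + src[2 * x] + src[2 * x + 1]                                 // current row
              + src[2 * x + src_linesize] + src[2 * x + 1 + src_linesize];  // next row
        int old = buf[x];
        buf[x] = v;        // store the updated running sum
        dc[x]  = v - old;  // emit the difference to the old buf value
    }
}
#endif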

#if HAVE_SSE2_INLINE
static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1,
                                   const uint8_t *src, int src_linesize, int width)
{
#define BLURV(load)\
    intptr_t x = -2*width;\
    __asm__ volatile(\
        "movdqa %6, %%xmm7 \n"\
        "1: \n"\
        load" (%4,%0), %%xmm0 \n"\
        load" (%5,%0), %%xmm1 \n"\
        "movdqa %%xmm0, %%xmm2 \n"\
        "movdqa %%xmm1, %%xmm3 \n"\
        "psrlw $8, %%xmm0 \n"\
        "psrlw $8, %%xmm1 \n"\
        "pand %%xmm7, %%xmm2 \n"\
        "pand %%xmm7, %%xmm3 \n"\
        "paddw %%xmm1, %%xmm0 \n"\
        "paddw %%xmm3, %%xmm2 \n"\
        "paddw %%xmm2, %%xmm0 \n"\
        "paddw (%2,%0), %%xmm0 \n"\
        "movdqa (%1,%0), %%xmm1 \n"\
        "movdqa %%xmm0, (%1,%0) \n"\
        "psubw %%xmm1, %%xmm0 \n"\
        "movdqa %%xmm0, (%3,%0) \n"\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(x)\
        :"r"(buf+width),\
         "r"(buf1+width),\
         "r"(dc+width),\
         "r"(src+width*2),\
         "r"(src+width*2+src_linesize),\
         "m"(*pw_ff)\
        :"memory"\
    );
    if (((intptr_t) src | src_linesize) & 15) {
        BLURV("movdqu");
    } else {
        BLURV("movdqa");
    }
}
#endif /* HAVE_SSE2_INLINE */

#endif /* HAVE_INLINE_ASM */
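
/*
 * The checks below run in order of increasing capability, so the most capable
 * implementation supported by the CPU wins (e.g. the SSSE3 filter_line
 * overrides the MMXEXT one when both are available).
 */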
av_cold void ff_gradfun_init_x86(GradFunContext *gf)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_MMXEXT_INLINE
    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        gf->filter_line = gradfun_filter_line_mmxext;
#endif
#if HAVE_SSSE3_INLINE
    if (cpu_flags & AV_CPU_FLAG_SSSE3)
        gf->filter_line = gradfun_filter_line_ssse3;
#endif
#if HAVE_SSE2_INLINE
    if (cpu_flags & AV_CPU_FLAG_SSE2)
        gf->blur_line = gradfun_blur_line_sse2;
#endif
}