vf_gradfun.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. /*
  2. * Copyright (C) 2009 Loren Merritt <lorenm@u.washignton.edu>
  3. *
  4. * This file is part of MPlayer.
  5. *
  6. * MPlayer is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * MPlayer is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with MPlayer; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19. */
  20. /*
  21. * Debanding algorithm (from gradfun2db by prunedtree):
  22. * Boxblur.
  23. * Foreach pixel, if it's within threshold of the blurred value, make it closer.
  24. * So now we have a smoothed and higher bitdepth version of all the shallow
  25. * gradients, while leaving detailed areas untouched.
  26. * Dither it back to 8bit.
  27. */
  28. #include <stdio.h>
  29. #include <stdlib.h>
  30. #include <string.h>
  31. #include <inttypes.h>
  32. #include "config.h"
  33. #include "cpudetect.h"
  34. #include "img_format.h"
  35. #include "mp_image.h"
  36. #include "vf.h"
  37. #include "libvo/fastmemcpy.h"
  38. #include "libavutil/avutil.h"
  39. #include "libavutil/x86_cpu.h"
  40. struct vf_priv_s {
  41. int thresh;
  42. int radius;
  43. uint16_t *buf;
  44. void (*filter_line)(uint8_t *dst, uint8_t *src, uint16_t *dc,
  45. int width, int thresh, const uint16_t *dithers);
  46. void (*blur_line)(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
  47. uint8_t *src, int sstride, int width);
  48. };
  49. static const uint16_t __attribute__((aligned(16))) pw_7f[8] = {127,127,127,127,127,127,127,127};
  50. static const uint16_t __attribute__((aligned(16))) pw_ff[8] = {255,255,255,255,255,255,255,255};
  51. static const uint16_t __attribute__((aligned(16))) dither[8][8] = {
  52. { 0, 96, 24,120, 6,102, 30,126 },
  53. { 64, 32, 88, 56, 70, 38, 94, 62 },
  54. { 16,112, 8,104, 22,118, 14,110 },
  55. { 80, 48, 72, 40, 86, 54, 78, 46 },
  56. { 4,100, 28,124, 2, 98, 26,122 },
  57. { 68, 36, 92, 60, 66, 34, 90, 58 },
  58. { 20,116, 12,108, 18,114, 10,106 },
  59. { 84, 52, 76, 44, 82, 50, 74, 42 },
  60. };
  61. static void filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc,
  62. int width, int thresh, const uint16_t *dithers)
  63. {
  64. int x;
  65. for (x=0; x<width; x++, dc+=x&1) {
  66. int pix = src[x]<<7;
  67. int delta = dc[0] - pix;
  68. int m = abs(delta) * thresh >> 16;
  69. m = FFMAX(0, 127-m);
  70. m = m*m*delta >> 14;
  71. pix += m + dithers[x&7];
  72. dst[x] = av_clip_uint8(pix>>7);
  73. }
  74. }
  75. static void blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
  76. uint8_t *src, int sstride, int width)
  77. {
  78. int x, v, old;
  79. for (x=0; x<width; x++) {
  80. v = buf1[x] + src[2*x] + src[2*x+1] + src[2*x+sstride] + src[2*x+1+sstride];
  81. old = buf[x];
  82. buf[x] = v;
  83. dc[x] = v - old;
  84. }
  85. }
  /*
   * MMX2 version of the per-row debanding kernel (same arguments as
   * filter_line_c).
   *
   * The asm loop handles four pixels per iteration. When width is not a
   * multiple of 4, the trailing width&3 pixels are first processed by
   * filter_line_c and the asm loop then covers only the leading part.
   *
   * Operands: %0 = x (runs from -width up to 0 in steps of 4 and is used as
   * a byte offset from the end pointers), %1 = dst+width, %2 = src+width,
   * %3 = dc+width/2, %4 = thresh, %5 = dithers, %6 = pw_7f.
   * Per iteration 4 source bytes and 4 bytes of dc (two 16-bit values) are
   * loaded; punpcklwd duplicates each dc word so that one dc value serves
   * two adjacent pixels.
   *
   * The dither row is loaded once with movq, so only its first four entries
   * are used, repeated for every group of four pixels.
   *
   * NOTE(review): the loop condition is tested at the bottom (jl 1b), so the
   * body runs once even when the remaining width is 0 - callers presumably
   * never pass width < 4; confirm against put_image's size check.
   * NOTE(review): mm0-mm7 are modified but not listed as clobbers.
   */
  86. #if HAVE_MMX2
  87. static void filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc,
  88. int width, int thresh, const uint16_t *dithers)
  89. {
  90. intptr_t x;
  /* scalar fallback for the last width&3 pixels */
  91. if (width&3) {
  92. x = width&~3;
  93. filter_line_c(dst+x, src+x, dc+x/2, width-x, thresh, dithers);
  94. width = x;
  95. }
  /* negative offset counting up to 0, relative to the end-of-row pointers */
  96. x = -width;
  97. __asm__ volatile(
  98. "movd %4, %%mm5 \n"
  99. "pxor %%mm7, %%mm7 \n"
  100. "pshufw $0, %%mm5, %%mm5 \n"
  101. "movq %6, %%mm6 \n"
  102. "movq %5, %%mm4 \n"
  103. "1: \n"
  104. "movd (%2,%0), %%mm0 \n"
  105. "movd (%3,%0), %%mm1 \n"
  106. "punpcklbw %%mm7, %%mm0 \n"
  107. "punpcklwd %%mm1, %%mm1 \n"
  108. "psllw $7, %%mm0 \n"
  109. "pxor %%mm2, %%mm2 \n"
  110. "psubw %%mm0, %%mm1 \n" // delta = dc - pix
  111. "psubw %%mm1, %%mm2 \n"
  112. "pmaxsw %%mm1, %%mm2 \n"
  113. "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
  114. "psubw %%mm6, %%mm2 \n"
  115. "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
  116. "pmullw %%mm2, %%mm2 \n"
  117. "paddw %%mm4, %%mm0 \n" // pix += dither
  118. "pmulhw %%mm2, %%mm1 \n"
  119. "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
  120. "paddw %%mm1, %%mm0 \n" // pix += m
  121. "psraw $7, %%mm0 \n"
  122. "packuswb %%mm0, %%mm0 \n"
  123. "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
  124. "add $4, %0 \n"
  125. "jl 1b \n"
  126. "emms \n"
  127. :"+r"(x)
  128. :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
  129. "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
  130. :"memory"
  131. );
  132. }
  133. #endif
  /*
   * SSSE3 version of the per-row debanding kernel (same arguments as
   * filter_line_c).
   *
   * The asm loop handles eight pixels per iteration. When width is not a
   * multiple of 8, the trailing width&7 pixels are first processed by
   * filter_line_c and the asm loop then covers only the leading part.
   *
   * Operands: %0 = x (runs from -width up to 0 in steps of 8 and is used as
   * a byte offset from the end pointers), %1 = dst+width, %2 = src+width,
   * %3 = dc+width/2, %4 = thresh, %5 = dithers, %6 = pw_7f.
   * Per iteration 8 source bytes and 8 bytes of dc (four 16-bit values) are
   * loaded; punpcklwd duplicates each dc word so that one dc value serves
   * two adjacent pixels.
   *
   * The dither row and pw_7f are fetched with movdqa (an aligned load), so
   * `dithers` presumably has to be 16-byte aligned; the rows of the
   * file's dither table are.
   *
   * NOTE(review): the loop condition is tested at the bottom (jl 1b), so the
   * body runs once even when the remaining width is 0 - callers presumably
   * never pass width < 8; confirm against put_image's size check.
   * NOTE(review): the xmm registers used are not listed as clobbers.
   */
  134. #if HAVE_SSSE3
  135. static void filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
  136. int width, int thresh, const uint16_t *dithers)
  137. {
  138. intptr_t x;
  /* scalar fallback for the last width&7 pixels */
  139. if (width&7) {
  140. // could be 10% faster if I somehow eliminated this
  141. x = width&~7;
  142. filter_line_c(dst+x, src+x, dc+x/2, width-x, thresh, dithers);
  143. width = x;
  144. }
  /* negative offset counting up to 0, relative to the end-of-row pointers */
  145. x = -width;
  146. __asm__ volatile(
  147. "movd %4, %%xmm5 \n"
  148. "pxor %%xmm7, %%xmm7 \n"
  149. "pshuflw $0,%%xmm5, %%xmm5 \n"
  150. "movdqa %6, %%xmm6 \n"
  151. "punpcklqdq %%xmm5, %%xmm5 \n"
  152. "movdqa %5, %%xmm4 \n"
  153. "1: \n"
  154. "movq (%2,%0), %%xmm0 \n"
  155. "movq (%3,%0), %%xmm1 \n"
  156. "punpcklbw %%xmm7, %%xmm0 \n"
  157. "punpcklwd %%xmm1, %%xmm1 \n"
  158. "psllw $7, %%xmm0 \n"
  159. "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
  160. "pabsw %%xmm1, %%xmm2 \n"
  161. "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
  162. "psubw %%xmm6, %%xmm2 \n"
  163. "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
  164. "pmullw %%xmm2, %%xmm2 \n"
  165. "psllw $1, %%xmm2 \n"
  166. "paddw %%xmm4, %%xmm0 \n" // pix += dither
  167. "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
  168. "paddw %%xmm1, %%xmm0 \n" // pix += m
  169. "psraw $7, %%xmm0 \n"
  170. "packuswb %%xmm0, %%xmm0 \n"
  171. "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
  172. "add $8, %0 \n"
  173. "jl 1b \n"
  174. :"+&r"(x)
  175. :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
  176. "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
  177. :"memory"
  178. );
  179. }
  180. #endif // HAVE_SSSE3
  /*
   * SSE2 version of blur_line_c.
   *
   * BLURV(load) expands to the whole function body; `load` is the
   * instruction used to fetch the two source rows ("movdqu" or "movdqa").
   *
   * Operands: %0 = x (byte offset, starts at -2*width and advances by 16
   * until it reaches 0), %1 = buf+width, %2 = buf1+width, %3 = dc+width,
   * %4 = src+width*2, %5 = src+width*2+sstride, %6 = pw_ff.
   *
   * Each iteration loads 16 bytes from both source rows, splits them into
   * odd bytes (psrlw $8) and even bytes (pand with pw_ff), and adds the four
   * resulting word vectors, giving eight 2x2 pixel sums. The buf1 row is
   * added, the result replaces the buf row, and (new buf - old buf) is
   * written to dc - the same arithmetic as blur_line_c, eight columns at a
   * time.
   *
   * buf, buf1 and dc are accessed with movdqa / a paddw memory operand, so
   * they presumably must be 16-byte aligned at every offset the loop visits.
   * NOTE(review): since x advances by 16 bytes, the loop covers width rounded
   * up to a multiple of 8 columns, and runs once even for width 0 - the
   * buffers must have room for that; confirm against the sizes in config().
   * NOTE(review): the xmm registers used are not listed as clobbers.
   */
  181. #if HAVE_SSE && HAVE_6REGS
  182. #define BLURV(load)\
  183. intptr_t x = -2*width;\
  184. __asm__ volatile(\
  185. "movdqa %6, %%xmm7 \n"\
  186. "1: \n"\
  187. load" (%4,%0), %%xmm0 \n"\
  188. load" (%5,%0), %%xmm1 \n"\
  189. "movdqa %%xmm0, %%xmm2 \n"\
  190. "movdqa %%xmm1, %%xmm3 \n"\
  191. "psrlw $8, %%xmm0 \n"\
  192. "psrlw $8, %%xmm1 \n"\
  193. "pand %%xmm7, %%xmm2 \n"\
  194. "pand %%xmm7, %%xmm3 \n"\
  195. "paddw %%xmm1, %%xmm0 \n"\
  196. "paddw %%xmm3, %%xmm2 \n"\
  197. "paddw %%xmm2, %%xmm0 \n"\
  198. "paddw (%2,%0), %%xmm0 \n"\
  199. "movdqa (%1,%0), %%xmm1 \n"\
  200. "movdqa %%xmm0, (%1,%0) \n"\
  201. "psubw %%xmm1, %%xmm0 \n"\
  202. "movdqa %%xmm0, (%3,%0) \n"\
  203. "add $16, %0 \n"\
  204. "jl 1b \n"\
  205. :"+&r"(x)\
  206. :"r"(buf+width),\
  207. "r"(buf1+width),\
  208. "r"(dc+width),\
  209. "r"(src+width*2),\
  210. "r"(src+width*2+sstride),\
  211. "m"(*pw_ff)\
  212. :"memory"\
  213. );
  /*
   * Entry point installed as blur_line when SSE2 is available: uses
   * unaligned source loads if either the source pointer or the stride is
   * not a multiple of 16, aligned loads otherwise.
   */
  214. static void blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
  215. uint8_t *src, int sstride, int width)
  216. {
  217. if (((intptr_t)src|sstride)&15) {
  218. BLURV("movdqu");
  219. } else {
  220. BLURV("movdqa");
  221. }
  222. }
  223. #endif // HAVE_6REGS && HAVE_SSE
  224. static void filter(struct vf_priv_s *ctx, uint8_t *dst, uint8_t *src,
  225. int width, int height, int dstride, int sstride, int r)
  226. {
  227. int bstride = ((width+15)&~15)/2;
  228. int y;
  229. uint32_t dc_factor = (1<<21)/(r*r);
  230. uint16_t *dc = ctx->buf+16;
  231. uint16_t *buf = ctx->buf+bstride+32;
  232. int thresh = ctx->thresh;
  233. memset(dc, 0, (bstride+16)*sizeof(*buf));
  234. for (y=0; y<r; y++)
  235. ctx->blur_line(dc, buf+y*bstride, buf+(y-1)*bstride, src+2*y*sstride, sstride, width/2);
  236. for (;;) {
  237. if (y < height-r) {
  238. int mod = ((y+r)/2)%r;
  239. uint16_t *buf0 = buf+mod*bstride;
  240. uint16_t *buf1 = buf+(mod?mod-1:r-1)*bstride;
  241. int x, v;
  242. ctx->blur_line(dc, buf0, buf1, src+(y+r)*sstride, sstride, width/2);
  243. for (x=v=0; x<r; x++)
  244. v += dc[x];
  245. for (; x<width/2; x++) {
  246. v += dc[x] - dc[x-r];
  247. dc[x-r] = v * dc_factor >> 16;
  248. }
  249. for (; x<(width+r+1)/2; x++)
  250. dc[x-r] = v * dc_factor >> 16;
  251. for (x=-r/2; x<0; x++)
  252. dc[x] = dc[0];
  253. }
  254. if (y == r) {
  255. for (y=0; y<r; y++)
  256. ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
  257. }
  258. ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
  259. if (++y >= height) break;
  260. ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
  261. if (++y >= height) break;
  262. }
  263. }
  264. static void get_image(struct vf_instance *vf, mp_image_t *mpi)
  265. {
  266. if (mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
  267. // ok, we can do pp in-place:
  268. vf->dmpi = vf_get_image(vf->next, mpi->imgfmt,
  269. mpi->type, mpi->flags, mpi->width, mpi->height);
  270. mpi->planes[0] = vf->dmpi->planes[0];
  271. mpi->stride[0] = vf->dmpi->stride[0];
  272. mpi->width = vf->dmpi->width;
  273. if (mpi->flags&MP_IMGFLAG_PLANAR){
  274. mpi->planes[1] = vf->dmpi->planes[1];
  275. mpi->planes[2] = vf->dmpi->planes[2];
  276. mpi->stride[1] = vf->dmpi->stride[1];
  277. mpi->stride[2] = vf->dmpi->stride[2];
  278. }
  279. mpi->flags |= MP_IMGFLAG_DIRECT;
  280. }
  281. static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
  282. {
  283. mp_image_t *dmpi = vf->dmpi;
  284. int p;
  285. if (!(mpi->flags&MP_IMGFLAG_DIRECT)) {
  286. // no DR, so get a new image. hope we'll get DR buffer:
  287. dmpi = vf_get_image(vf->next,mpi->imgfmt, MP_IMGTYPE_TEMP,
  288. MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
  289. mpi->w, mpi->h);
  290. }
  291. vf_clone_mpi_attributes(dmpi, mpi);
  292. for (p=0; p<mpi->num_planes; p++) {
  293. int w = mpi->w;
  294. int h = mpi->h;
  295. int r = vf->priv->radius;
  296. if (p) {
  297. w >>= mpi->chroma_x_shift;
  298. h >>= mpi->chroma_y_shift;
  299. r = ((r>>mpi->chroma_x_shift) + (r>>mpi->chroma_y_shift)) / 2;
  300. r = av_clip((r+1)&~1,4,32);
  301. }
  302. if (FFMIN(w,h) > 2*r)
  303. filter(vf->priv, dmpi->planes[p], mpi->planes[p], w, h,
  304. dmpi->stride[p], mpi->stride[p], r);
  305. else if (dmpi->planes[p] != mpi->planes[p])
  306. memcpy_pic(dmpi->planes[p], mpi->planes[p], w, h,
  307. dmpi->stride[p], mpi->stride[p]);
  308. }
  309. return vf_next_put_image(vf, dmpi, pts);
  310. }
  311. static int query_format(struct vf_instance *vf, unsigned int fmt)
  312. {
  313. switch (fmt){
  314. case IMGFMT_YVU9:
  315. case IMGFMT_IF09:
  316. case IMGFMT_YV12:
  317. case IMGFMT_I420:
  318. case IMGFMT_IYUV:
  319. case IMGFMT_CLPL:
  320. case IMGFMT_Y800:
  321. case IMGFMT_Y8:
  322. case IMGFMT_NV12:
  323. case IMGFMT_NV21:
  324. case IMGFMT_444P:
  325. case IMGFMT_422P:
  326. case IMGFMT_411P:
  327. case IMGFMT_HM12:
  328. return vf_next_query_format(vf,fmt);
  329. }
  330. return 0;
  331. }
  332. static int config(struct vf_instance *vf,
  333. int width, int height, int d_width, int d_height,
  334. unsigned int flags, unsigned int outfmt)
  335. {
  336. free(vf->priv->buf);
  337. vf->priv->buf = av_mallocz((((width+15)&~15)*(vf->priv->radius+1)/2+32)*sizeof(uint16_t));
  338. return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
  339. }
  340. static void uninit(struct vf_instance *vf)
  341. {
  342. if (!vf->priv) return;
  343. av_free(vf->priv->buf);
  344. free(vf->priv);
  345. vf->priv = NULL;
  346. }
  347. static int vf_open(vf_instance_t *vf, char *args)
  348. {
  349. float thresh = 1.2;
  350. int radius = 16;
  351. vf->get_image=get_image;
  352. vf->put_image=put_image;
  353. vf->query_format=query_format;
  354. vf->config=config;
  355. vf->uninit=uninit;
  356. vf->priv=malloc(sizeof(struct vf_priv_s));
  357. memset(vf->priv, 0, sizeof(struct vf_priv_s));
  358. if (args) sscanf(args, "%f:%d", &thresh, &radius);
  359. vf->priv->thresh = (1<<15)/av_clipf(thresh,0.51,255);
  360. vf->priv->radius = av_clip((radius+1)&~1,4,32);
  361. vf->priv->blur_line = blur_line_c;
  362. vf->priv->filter_line = filter_line_c;
  363. #if HAVE_SSE && HAVE_6REGS
  364. if (gCpuCaps.hasSSE2)
  365. vf->priv->blur_line = blur_line_sse2;
  366. #endif
  367. #if HAVE_MMX2
  368. if (gCpuCaps.hasMMX2)
  369. vf->priv->filter_line = filter_line_mmx2;
  370. #endif
  371. #if HAVE_SSSE3
  372. if (gCpuCaps.hasSSSE3)
  373. vf->priv->filter_line = filter_line_ssse3;
  374. #endif
  375. return 1;
  376. }
  /*
   * Exported descriptor of the gradfun filter. The initializer is
   * positional; the field order is defined by vf_info_t (declared outside
   * this file, presumably in vf.h). The values appear to be: description,
   * filter name, author, comment, open callback, option table - verify
   * against the vf_info_t declaration.
   */
  377. const vf_info_t vf_info_gradfun = {
  378. "gradient deband",
  379. "gradfun",
  380. "Loren Merritt",
  381. "",
  382. vf_open,
  383. NULL
  384. };
  384. };