/* hevc_deblock.c */
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "libavutil/intreadwrite.h"
#include "libavutil/macros.h"
#include "libavutil/mem_internal.h"

#include "libavcodec/hevc/dsp.h"

#include "checkasm.h"
  24. static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
  25. #define SIZEOF_PIXEL ((bit_depth + 7) / 8)
  26. #define BUF_STRIDE (16 * 2)
  27. #define BUF_LINES (16)
  28. // large buffer sizes based on high bit depth
  29. #define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES)
  30. #define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2)
  31. #define randomize_buffers(buf0, buf1, size) \
  32. do { \
  33. uint32_t mask = pixel_mask[(bit_depth - 8) >> 1]; \
  34. int k; \
  35. for (k = 0; k < size; k += 4) { \
  36. uint32_t r = rnd() & mask; \
  37. AV_WN32A(buf0 + k, r); \
  38. AV_WN32A(buf1 + k, r); \
  39. } \
  40. } while (0)
  41. static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c)
  42. {
  43. // see tctable[] in hevc_filter.c, we check full range
  44. int32_t tc[2] = { rnd() % 25, rnd() % 25 };
  45. // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c
  46. // variant) functions, see deblocking_filter_CTB() in hevc_filter.c
  47. uint8_t no_p[2] = { rnd() & c, rnd() & c };
  48. uint8_t no_q[2] = { rnd() & c, rnd() & c };
  49. LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
  50. LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
  51. declare_func(void, uint8_t *pix, ptrdiff_t stride,
  52. const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q);
  53. if (check_func(c ? h->hevc_h_loop_filter_chroma_c : h->hevc_h_loop_filter_chroma,
  54. "hevc_h_loop_filter_chroma%d%s", bit_depth, c ? "_full" : ""))
  55. {
  56. randomize_buffers(buf0, buf1, BUF_SIZE);
  57. call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
  58. call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
  59. if (memcmp(buf0, buf1, BUF_SIZE))
  60. fail();
  61. bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
  62. }
  63. if (check_func(c ? h->hevc_v_loop_filter_chroma_c : h->hevc_v_loop_filter_chroma,
  64. "hevc_v_loop_filter_chroma%d%s", bit_depth, c ? "_full" : ""))
  65. {
  66. randomize_buffers(buf0, buf1, BUF_SIZE);
  67. call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
  68. call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
  69. if (memcmp(buf0, buf1, BUF_SIZE))
  70. fail();
  71. bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q);
  72. }
  73. }
  74. #define P3 buf[-4 * xstride]
  75. #define P2 buf[-3 * xstride]
  76. #define P1 buf[-2 * xstride]
  77. #define P0 buf[-1 * xstride]
  78. #define Q0 buf[0 * xstride]
  79. #define Q1 buf[1 * xstride]
  80. #define Q2 buf[2 * xstride]
  81. #define Q3 buf[3 * xstride]
  82. #define TC25(x) ((tc[x] * 5 + 1) >> 1)
  83. #define MASK(x) (uint16_t)(x & ((1 << (bit_depth)) - 1))
  84. #define GET(x) ((SIZEOF_PIXEL == 1) ? *(uint8_t*)(&x) : *(uint16_t*)(&x))
  85. #define SET(x, y) do { \
  86. uint16_t z = MASK(y); \
  87. if (SIZEOF_PIXEL == 1) \
  88. *(uint8_t*)(&x) = z; \
  89. else \
  90. *(uint16_t*)(&x) = z; \
  91. } while (0)
  92. #define RANDCLIP(x, diff) av_clip(GET(x) - (diff), 0, \
  93. (1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1)
  94. // NOTE: this function doesn't work 'correctly' in that it won't always choose
  95. // strong/strong or weak/weak, in most cases it tends to but will sometimes mix
  96. // weak/strong or even skip sometimes. This is more useful to test correctness
  97. // for these functions, though it does make benching them difficult. The easiest
  98. // way to bench these functions is to check an overall decode since there are too
  99. // many paths and ways to trigger the deblock: we would have to bench all
  100. // permutations of weak/strong/skip/nd_q/nd_p/no_q/no_p and it quickly becomes
  101. // too much.
  102. static void randomize_luma_buffers(int type, int *beta, int32_t tc[2],
  103. uint8_t *buf, ptrdiff_t xstride, ptrdiff_t ystride, int bit_depth)
  104. {
  105. int i, j, b3, tc25, tc25diff, b3diff;
  106. // both tc & beta are unscaled inputs
  107. // minimum useful value is 1, full range 0-24
  108. tc[0] = (rnd() % 25) + 1;
  109. tc[1] = (rnd() % 25) + 1;
  110. // minimum useful value for 8bit is 8
  111. *beta = (rnd() % 57) + 8;
  112. switch (type) {
  113. case 0: // strong
  114. for (j = 0; j < 2; j++) {
  115. tc25 = TC25(j) << (bit_depth - 8);
  116. tc25diff = FFMAX(tc25 - 1, 0);
  117. // 4 lines per tc
  118. for (i = 0; i < 4; i++) {
  119. b3 = (*beta << (bit_depth - 8)) >> 3;
  120. SET(P0, rnd() % (1 << bit_depth));
  121. SET(Q0, RANDCLIP(P0, tc25diff));
  122. // p3 - p0 up to beta3 budget
  123. b3diff = rnd() % b3;
  124. SET(P3, RANDCLIP(P0, b3diff));
  125. // q3 - q0, reduced budget
  126. b3diff = rnd() % FFMAX(b3 - b3diff, 1);
  127. SET(Q3, RANDCLIP(Q0, b3diff));
  128. // same concept, budget across 4 pixels
  129. b3 -= b3diff = rnd() % FFMAX(b3, 1);
  130. SET(P2, RANDCLIP(P0, b3diff));
  131. b3 -= b3diff = rnd() % FFMAX(b3, 1);
  132. SET(Q2, RANDCLIP(Q0, b3diff));
  133. // extra reduced budget for weighted pixels
  134. b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
  135. SET(P1, RANDCLIP(P0, b3diff));
  136. b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
  137. SET(Q1, RANDCLIP(Q0, b3diff));
  138. buf += ystride;
  139. }
  140. }
  141. break;
  142. case 1: // weak
  143. for (j = 0; j < 2; j++) {
  144. tc25 = TC25(j) << (bit_depth - 8);
  145. tc25diff = FFMAX(tc25 - 1, 0);
  146. // 4 lines per tc
  147. for (i = 0; i < 4; i++) {
  148. // Weak filtering is signficantly simpler to activate as
  149. // we only need to satisfy d0 + d3 < beta, which
  150. // can be simplified to d0 + d0 < beta. Using the above
  151. // derivations but substiuting b3 for b1 and ensuring
  152. // that P0/Q0 are at least 1/2 tc25diff apart (tending
  153. // towards 1/2 range).
  154. b3 = (*beta << (bit_depth - 8)) >> 1;
  155. SET(P0, rnd() % (1 << bit_depth));
  156. SET(Q0, RANDCLIP(P0, tc25diff >> 1) +
  157. (tc25diff >> 1) * (P0 < (1 << (bit_depth - 1))) ? 1 : -1);
  158. // p3 - p0 up to beta3 budget
  159. b3diff = rnd() % b3;
  160. SET(P3, RANDCLIP(P0, b3diff));
  161. // q3 - q0, reduced budget
  162. b3diff = rnd() % FFMAX(b3 - b3diff, 1);
  163. SET(Q3, RANDCLIP(Q0, b3diff));
  164. // same concept, budget across 4 pixels
  165. b3 -= b3diff = rnd() % FFMAX(b3, 1);
  166. SET(P2, RANDCLIP(P0, b3diff));
  167. b3 -= b3diff = rnd() % FFMAX(b3, 1);
  168. SET(Q2, RANDCLIP(Q0, b3diff));
  169. // extra reduced budget for weighted pixels
  170. b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
  171. SET(P1, RANDCLIP(P0, b3diff));
  172. b3 -= b3diff = rnd() % FFMAX(b3 - (1 << (bit_depth - 8)), 1);
  173. SET(Q1, RANDCLIP(Q0, b3diff));
  174. buf += ystride;
  175. }
  176. }
  177. break;
  178. case 2: // none
  179. *beta = 0; // ensure skip
  180. for (i = 0; i < 8; i++) {
  181. // we can just fill with completely random data, nothing should be touched.
  182. SET(P3, rnd()); SET(P2, rnd()); SET(P1, rnd()); SET(P0, rnd());
  183. SET(Q0, rnd()); SET(Q1, rnd()); SET(Q2, rnd()); SET(Q3, rnd());
  184. buf += ystride;
  185. }
  186. break;
  187. }
  188. }
  189. static void check_deblock_luma(HEVCDSPContext *h, int bit_depth, int c)
  190. {
  191. const char *type;
  192. const char *types[3] = { "strong", "weak", "skip" };
  193. int beta;
  194. int32_t tc[2] = {0};
  195. uint8_t no_p[2] = { rnd() & c, rnd() & c };
  196. uint8_t no_q[2] = { rnd() & c, rnd() & c };
  197. LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]);
  198. LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]);
  199. uint8_t *ptr0 = buf0 + BUF_OFFSET,
  200. *ptr1 = buf1 + BUF_OFFSET;
  201. declare_func(void, uint8_t *pix, ptrdiff_t stride, int beta,
  202. const int32_t *tc, const uint8_t *no_p, const uint8_t *no_q);
  203. memset(buf0, 0, BUF_SIZE);
  204. for (int j = 0; j < 3; j++) {
  205. type = types[j];
  206. if (check_func(c ? h->hevc_h_loop_filter_luma_c : h->hevc_h_loop_filter_luma,
  207. "hevc_h_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : ""))
  208. {
  209. randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, 16 * SIZEOF_PIXEL, SIZEOF_PIXEL, bit_depth);
  210. memcpy(buf1, buf0, BUF_SIZE);
  211. call_ref(ptr0, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
  212. call_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
  213. if (memcmp(buf0, buf1, BUF_SIZE))
  214. fail();
  215. bench_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
  216. }
  217. if (check_func(c ? h->hevc_v_loop_filter_luma_c : h->hevc_v_loop_filter_luma,
  218. "hevc_v_loop_filter_luma%d_%s%s", bit_depth, type, c ? "_full" : ""))
  219. {
  220. randomize_luma_buffers(j, &beta, tc, buf0 + BUF_OFFSET, SIZEOF_PIXEL, 16 * SIZEOF_PIXEL, bit_depth);
  221. memcpy(buf1, buf0, BUF_SIZE);
  222. call_ref(ptr0, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
  223. call_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
  224. if (memcmp(buf0, buf1, BUF_SIZE))
  225. fail();
  226. bench_new(ptr1, 16 * SIZEOF_PIXEL, beta, tc, no_p, no_q);
  227. }
  228. }
  229. }
  230. void checkasm_check_hevc_deblock(void)
  231. {
  232. HEVCDSPContext h;
  233. int bit_depth;
  234. for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
  235. ff_hevc_dsp_init(&h, bit_depth);
  236. check_deblock_chroma(&h, bit_depth, 0);
  237. }
  238. report("chroma");
  239. for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
  240. ff_hevc_dsp_init(&h, bit_depth);
  241. check_deblock_chroma(&h, bit_depth, 1);
  242. }
  243. report("chroma_full");
  244. for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
  245. ff_hevc_dsp_init(&h, bit_depth);
  246. check_deblock_luma(&h, bit_depth, 0);
  247. }
  248. report("luma");
  249. for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
  250. ff_hevc_dsp_init(&h, bit_depth);
  251. check_deblock_luma(&h, bit_depth, 1);
  252. }
  253. report("luma_full");
  254. }