/* tests/checkasm/vc1dsp.c */
  1. /*
  2. * Copyright (c) 2022 Ben Avison
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19. */
  20. #include <string.h>
  21. #include "checkasm.h"
  22. #include "libavcodec/vc1dsp.h"
  23. #include "libavutil/common.h"
  24. #include "libavutil/internal.h"
  25. #include "libavutil/intreadwrite.h"
  26. #include "libavutil/mem.h"
  27. #include "libavutil/mem_internal.h"
/* Build a test-table entry mapping a function's name to the byte offset of
 * its function pointer within VC1DSPContext. The SIZED variant additionally
 * records the block dimensions the function operates on. */
#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },

/* Descriptor for one DSP function under test. */
typedef struct {
    const char *name;  /* function name, used as the checkasm report label */
    size_t offset;     /* offset of the function pointer in VC1DSPContext */
    int width;         /* block width in samples (sized tests only) */
    int height;        /* block height in samples (sized tests only) */
} test;

/* Row-major float matrix with flexible-array storage; allocated by
 * new_matrix() with width * height elements in d[]. */
typedef struct matrix {
    size_t width;
    size_t height;
    float d[];
} matrix;
/* 8-point and 4-point transform basis matrices, used to synthesise valid
 * inverse-quantized coefficients in
 * generate_inverse_quantized_transform_coefficients(). The constants
 * correspond to the VC-1 integer transform; T8t and T4t are the transposes
 * of T8 and T4 respectively (verifiable by inspection of the rows/columns). */
static const matrix T8 = { 8, 8, {
        12,  12,  12,  12,  12,  12,  12,  12,
        16,  15,   9,   4,  -4,  -9, -15, -16,
        16,   6,  -6, -16, -16,  -6,   6,  16,
        15,  -4, -16,  -9,   9,  16,   4, -15,
        12, -12, -12,  12,  12, -12, -12,  12,
         9, -16,   4,  15, -15,  -4,  16,  -9,
         6, -16,  16,  -6,  -6,  16, -16,   6,
         4,  -9,  15, -16,  16, -15,   9,  -4
} };

static const matrix T4 = { 4, 4, {
        17,  17,  17,  17,
        22,  10, -10, -22,
        17, -17, -17,  17,
        10, -22,  22, -10
} };

/* Transpose of T8. */
static const matrix T8t = { 8, 8, {
        12,  16,  16,  15,  12,   9,   6,   4,
        12,  15,   6,  -4, -12, -16, -16,  -9,
        12,   9,  -6, -16, -12,   4,  16,  15,
        12,   4, -16,  -9,  12,  15,  -6, -16,
        12,  -4, -16,   9,  12, -15,  -6,  16,
        12,  -9,  -6,  16, -12,  -4,  16, -15,
        12, -15,   6,   4, -12,  16, -16,   9,
        12, -16,  16, -15,  12,  -9,   6,  -4
} };

/* Transpose of T4. */
static const matrix T4t = { 4, 4, {
        17,  22,  17,  10,
        17,  10, -17, -22,
        17, -10, -17,  22,
        17, -22,  17, -10
} };
  73. static matrix *new_matrix(size_t width, size_t height)
  74. {
  75. matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
  76. if (out == NULL) {
  77. fprintf(stderr, "Memory allocation failure\n");
  78. exit(EXIT_FAILURE);
  79. }
  80. out->width = width;
  81. out->height = height;
  82. return out;
  83. }
  84. static matrix *multiply(const matrix *a, const matrix *b)
  85. {
  86. matrix *out;
  87. if (a->width != b->height) {
  88. fprintf(stderr, "Incompatible multiplication\n");
  89. exit(EXIT_FAILURE);
  90. }
  91. out = new_matrix(b->width, a->height);
  92. for (int j = 0; j < out->height; ++j)
  93. for (int i = 0; i < out->width; ++i) {
  94. float sum = 0;
  95. for (int k = 0; k < a->width; ++k)
  96. sum += a->d[j * a->width + k] * b->d[k * b->width + i];
  97. out->d[j * out->width + i] = sum;
  98. }
  99. return out;
  100. }
  101. static void normalise(matrix *a)
  102. {
  103. for (int j = 0; j < a->height; ++j)
  104. for (int i = 0; i < a->width; ++i) {
  105. float *p = a->d + j * a->width + i;
  106. *p *= 64;
  107. if (a->height == 4)
  108. *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
  109. else
  110. *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
  111. if (a->width == 4)
  112. *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
  113. else
  114. *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
  115. }
  116. }
  117. static void divide_and_round_nearest(matrix *a, float by)
  118. {
  119. for (int j = 0; j < a->height; ++j)
  120. for (int i = 0; i < a->width; ++i) {
  121. float *p = a->d + j * a->width + i;
  122. *p = rintf(*p / by);
  123. }
  124. }
  125. static void tweak(matrix *a)
  126. {
  127. for (int j = 4; j < a->height; ++j)
  128. for (int i = 0; i < a->width; ++i) {
  129. float *p = a->d + j * a->width + i;
  130. *p += 1;
  131. }
  132. }
/* The VC-1 spec places restrictions on the values permitted at three
 * different stages:
 * - D: the input coefficients in frequency domain
 * - E: the intermediate coefficients, inverse-transformed only horizontally
 * - R: the fully inverse-transformed coefficients
 *
 * To fully cater for the ranges specified requires various intermediate
 * values to be held to 17-bit precision; yet these conditions do not appear
 * to be utilised in real-world streams. At least some assembly
 * implementations have chosen to restrict these values to 16-bit precision,
 * to accelerate the decoding of real-world streams at the cost of strict
 * adherence to the spec. To avoid our test marking these as failures,
 * reduce our random inputs.
 */
/* Divisor applied to the nominal D/E/R value ranges described above. */
#define ATTENUATION 4
  148. static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height)
  149. {
  150. matrix *raw, *tmp, *D, *E, *R;
  151. raw = new_matrix(width, height);
  152. for (int i = 0; i < width * height; ++i)
  153. raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION;
  154. tmp = multiply(height == 8 ? &T8 : &T4, raw);
  155. D = multiply(tmp, width == 8 ? &T8t : &T4t);
  156. normalise(D);
  157. divide_and_round_nearest(D, 1);
  158. for (int i = 0; i < width * height; ++i) {
  159. if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
  160. /* Rare, so simply try again */
  161. av_free(raw);
  162. av_free(tmp);
  163. av_free(D);
  164. return generate_inverse_quantized_transform_coefficients(width, height);
  165. }
  166. }
  167. E = multiply(D, width == 8 ? &T8 : &T4);
  168. divide_and_round_nearest(E, 8);
  169. for (int i = 0; i < width * height; ++i)
  170. if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
  171. /* Rare, so simply try again */
  172. av_free(raw);
  173. av_free(tmp);
  174. av_free(D);
  175. av_free(E);
  176. return generate_inverse_quantized_transform_coefficients(width, height);
  177. }
  178. R = multiply(height == 8 ? &T8t : &T4t, E);
  179. tweak(R);
  180. divide_and_round_nearest(R, 128);
  181. for (int i = 0; i < width * height; ++i)
  182. if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
  183. /* Rare, so simply try again */
  184. av_free(raw);
  185. av_free(tmp);
  186. av_free(D);
  187. av_free(E);
  188. av_free(R);
  189. return generate_inverse_quantized_transform_coefficients(width, height);
  190. }
  191. av_free(raw);
  192. av_free(tmp);
  193. av_free(E);
  194. av_free(R);
  195. return D;
  196. }
/* Fill the buffer pair name##0 / name##1 with identical random 16-bit
 * values, using aligned stores. */
#define RANDOMIZE_BUFFER16(name, size) \
    do { \
        int i; \
        for (i = 0; i < size; ++i) { \
            uint16_t r = rnd(); \
            AV_WN16A(name##0 + i, r); \
            AV_WN16A(name##1 + i, r); \
        } \
    } while (0)

/* Fill the buffer pair name##0 / name##1 with identical random bytes. */
#define RANDOMIZE_BUFFER8(name, size) \
    do { \
        int i; \
        for (i = 0; i < size; ++i) { \
            uint8_t r = rnd(); \
            name##0[i] = r; \
            name##1[i] = r; \
        } \
    } while (0)

/* Fill the buffer pair with identical random bytes clustered around 0x80:
 * each sample is 0x80 +/- a delta whose magnitude is geometrically weighted
 * towards small values (a 7-bit random with its top bit set, right-shifted
 * by a random 0..8). Presumably this biases the loop-filter inputs towards
 * the data-dependent filtering paths — see check_loop_filter(). */
#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size) \
    do { \
        uint8_t *p##0 = name##0, *p##1 = name##1; \
        int i = (size); \
        while (i-- > 0) { \
            int x = 0x80 | (rnd() & 0x7F); \
            x >>= rnd() % 9; \
            if (rnd() & 1) \
                x = -x; \
            *p##1++ = *p##0++ = 0x80 + x; \
        } \
    } while (0)
  227. static void check_inv_trans_inplace(void)
  228. {
  229. /* Inverse transform input coefficients are stored in a 16-bit buffer
  230. * with row stride of 8 coefficients irrespective of transform size.
  231. * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
  232. * are stored in column-major order, and the outputs are written back
  233. * to the input buffer, so we oversize it slightly to catch overruns. */
  234. LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
  235. LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
  236. VC1DSPContext h;
  237. ff_vc1dsp_init(&h);
  238. if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
  239. matrix *coeffs;
  240. declare_func(void, int16_t *);
  241. RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
  242. coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
  243. for (int j = 0; j < 8; ++j)
  244. for (int i = 0; i < 8; ++i) {
  245. int idx = 8 + i * 8 + j;
  246. inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
  247. }
  248. call_ref(inv_trans_in0 + 8);
  249. call_new(inv_trans_in1 + 8);
  250. if (memcmp(inv_trans_in0, inv_trans_in1, 10 * 8 * sizeof (int16_t)))
  251. fail();
  252. bench_new(inv_trans_in1 + 8);
  253. av_free(coeffs);
  254. }
  255. }
  256. static void check_inv_trans_adding(void)
  257. {
  258. /* Inverse transform input coefficients are stored in a 16-bit buffer
  259. * with row stride of 8 coefficients irrespective of transform size. */
  260. LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
  261. LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
  262. /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
  263. * added with saturation to an array of unsigned 8-bit values. Oversize
  264. * this by 8 samples left and right and one row above and below. */
  265. LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
  266. LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
  267. VC1DSPContext h;
  268. const test tests[] = {
  269. VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
  270. VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
  271. VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
  272. VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
  273. VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
  274. VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
  275. VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
  276. };
  277. ff_vc1dsp_init(&h);
  278. for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
  279. void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
  280. if (check_func(func, "vc1dsp.%s", tests[t].name)) {
  281. matrix *coeffs;
  282. declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
  283. RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
  284. RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
  285. coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
  286. for (int j = 0; j < tests[t].height; ++j)
  287. for (int i = 0; i < tests[t].width; ++i) {
  288. int idx = j * 8 + i;
  289. inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
  290. }
  291. call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0);
  292. call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
  293. if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24))
  294. fail();
  295. bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1 + 8);
  296. av_free(coeffs);
  297. }
  298. }
  299. }
  300. static void check_loop_filter(void)
  301. {
  302. /* Deblocking filter buffers are big enough to hold a 16x16 block,
  303. * plus 16 columns left and 4 rows above to hold filter inputs
  304. * (depending on whether v or h neighbouring block edge, oversized
  305. * horizontally to maintain 16-byte alignment) plus 16 columns and
  306. * 4 rows below to catch write overflows */
  307. LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
  308. LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
  309. VC1DSPContext h;
  310. const test tests[] = {
  311. VC1DSP_TEST(vc1_v_loop_filter4)
  312. VC1DSP_TEST(vc1_h_loop_filter4)
  313. VC1DSP_TEST(vc1_v_loop_filter8)
  314. VC1DSP_TEST(vc1_h_loop_filter8)
  315. VC1DSP_TEST(vc1_v_loop_filter16)
  316. VC1DSP_TEST(vc1_h_loop_filter16)
  317. };
  318. ff_vc1dsp_init(&h);
  319. for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
  320. void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
  321. declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
  322. if (check_func(func, "vc1dsp.%s", tests[t].name)) {
  323. for (int count = 1000; count > 0; --count) {
  324. int pq = rnd() % 31 + 1;
  325. RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
  326. call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
  327. call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
  328. if (memcmp(filter_buf0, filter_buf1, 24 * 48))
  329. fail();
  330. }
  331. }
  332. for (int j = 0; j < 24; ++j)
  333. for (int i = 0; i < 48; ++i)
  334. filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
  335. if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
  336. bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
  337. if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
  338. bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
  339. }
  340. }
/* Run vc1_unescape_buffer over the current escaped0/escaped1 contents 100
 * times with random source/destination offsets and lengths, comparing the
 * returned length and the unescaped output between the C reference and the
 * function under test. Relies on escaped*, unescaped*, len0/len1 and the
 * *_offset/_len locals declared by the enclosing function. */
#define TEST_UNESCAPE \
    do { \
        for (int count = 100; count > 0; --count) { \
            escaped_offset = rnd() & 7; \
            unescaped_offset = rnd() & 7; \
            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7); \
            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE); \
            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE)) \
                fail(); \
        } \
    } while (0)
  354. static void check_unescape(void)
  355. {
  356. /* This appears to be a typical length of buffer in use */
  357. #define LOG2_UNESCAPE_BUF_SIZE 17
  358. #define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
  359. LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
  360. LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
  361. LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
  362. LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
  363. VC1DSPContext h;
  364. ff_vc1dsp_init(&h);
  365. if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
  366. int len0, len1, escaped_offset, unescaped_offset, escaped_len;
  367. declare_func(int, const uint8_t *, int, uint8_t *);
  368. /* Test data which consists of escapes sequences packed as tightly as possible */
  369. for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
  370. escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
  371. TEST_UNESCAPE;
  372. /* Test random data */
  373. RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
  374. TEST_UNESCAPE;
  375. /* Test data with escape sequences at random intervals */
  376. for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
  377. int gap, gap_msb;
  378. escaped1[x+0] = escaped0[x+0] = 0;
  379. escaped1[x+1] = escaped0[x+1] = 0;
  380. escaped1[x+2] = escaped0[x+2] = 3;
  381. escaped1[x+3] = escaped0[x+3] = rnd() & 3;
  382. gap_msb = 2u << (rnd() % 8);
  383. gap = (rnd() &~ -gap_msb) | gap_msb;
  384. x += gap;
  385. }
  386. TEST_UNESCAPE;
  387. /* Test data which is known to contain no escape sequences */
  388. memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
  389. memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
  390. TEST_UNESCAPE;
  391. /* Benchmark the no-escape-sequences case */
  392. bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
  393. }
  394. }
  395. static void check_mspel_pixels(void)
  396. {
  397. LOCAL_ALIGNED_16(uint8_t, src0, [32 * 32]);
  398. LOCAL_ALIGNED_16(uint8_t, src1, [32 * 32]);
  399. LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 32]);
  400. LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 32]);
  401. VC1DSPContext h;
  402. const test tests[] = {
  403. VC1DSP_SIZED_TEST(put_vc1_mspel_pixels_tab[0][0], 16, 16)
  404. VC1DSP_SIZED_TEST(put_vc1_mspel_pixels_tab[1][0], 8, 8)
  405. VC1DSP_SIZED_TEST(avg_vc1_mspel_pixels_tab[0][0], 16, 16)
  406. VC1DSP_SIZED_TEST(avg_vc1_mspel_pixels_tab[1][0], 8, 8)
  407. };
  408. ff_vc1dsp_init(&h);
  409. for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
  410. void (*func)(uint8_t *, const uint8_t*, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
  411. if (check_func(func, "vc1dsp.%s", tests[t].name)) {
  412. declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, const uint8_t*, ptrdiff_t, int);
  413. RANDOMIZE_BUFFER8(dst, 32 * 32);
  414. RANDOMIZE_BUFFER8(src, 32 * 32);
  415. call_ref(dst0, src0, 32, 0);
  416. call_new(dst1, src1, 32, 0);
  417. if (memcmp(dst0, dst1, 32 * 32)) {
  418. fail();
  419. }
  420. bench_new(dst1, src0, 32, 0);
  421. }
  422. }
  423. }
/* checkasm entry point: run every VC-1 DSP test group and report results
 * under the group names used by the checkasm harness. */
void checkasm_check_vc1dsp(void)
{
    check_inv_trans_inplace();
    check_inv_trans_adding();
    report("inv_trans");

    check_loop_filter();
    report("loop_filter");

    check_unescape();
    report("unescape_buffer");

    check_mspel_pixels();
    report("mspel_pixels");
}
  435. }