enc.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830
  1. // Copyright 2011 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // Speed-critical encoding functions.
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
  13. #include <assert.h>
  14. #include <stdlib.h> // for abs()
  15. #include "./dsp.h"
  16. #include "../enc/vp8i_enc.h"
  17. static WEBP_INLINE uint8_t clip_8b(int v) {
  18. return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
  19. }
  20. #if !WEBP_NEON_OMIT_C_CODE
  21. static WEBP_INLINE int clip_max(int v, int max) {
  22. return (v > max) ? max : v;
  23. }
  24. #endif // !WEBP_NEON_OMIT_C_CODE
  25. //------------------------------------------------------------------------------
  26. // Compute susceptibility based on DCT-coeff histograms:
  27. // the higher, the "easier" the macroblock is to compress.
  28. const int VP8DspScan[16 + 4 + 4] = {
  29. // Luma
  30. 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  31. 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  32. 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  33. 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
  34. 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
  35. 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
  36. };
  37. // general-purpose util function
  38. void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
  39. VP8Histogram* const histo) {
  40. int max_value = 0, last_non_zero = 1;
  41. int k;
  42. for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
  43. const int value = distribution[k];
  44. if (value > 0) {
  45. if (value > max_value) max_value = value;
  46. last_non_zero = k;
  47. }
  48. }
  49. histo->max_value = max_value;
  50. histo->last_non_zero = last_non_zero;
  51. }
  52. #if !WEBP_NEON_OMIT_C_CODE
  53. static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
  54. int start_block, int end_block,
  55. VP8Histogram* const histo) {
  56. int j;
  57. int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  58. for (j = start_block; j < end_block; ++j) {
  59. int k;
  60. int16_t out[16];
  61. VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
  62. // Convert coefficients to bin.
  63. for (k = 0; k < 16; ++k) {
  64. const int v = abs(out[k]) >> 3;
  65. const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
  66. ++distribution[clipped_value];
  67. }
  68. }
  69. VP8SetHistogramData(distribution, histo);
  70. }
  71. #endif // !WEBP_NEON_OMIT_C_CODE
  72. //------------------------------------------------------------------------------
// run-time tables (clip1[]: 766 bytes)
  74. static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
  75. // We declare this variable 'volatile' to prevent instruction reordering
  76. // and make sure it's set to true _last_ (so as to be thread-safe)
  77. static volatile int tables_ok = 0;
  78. static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
  79. if (!tables_ok) {
  80. int i;
  81. for (i = -255; i <= 255 + 255; ++i) {
  82. clip1[255 + i] = clip_8b(i);
  83. }
  84. tables_ok = 1;
  85. }
  86. }
  87. //------------------------------------------------------------------------------
  88. // Transforms (Paragraph 14.4)
  89. #if !WEBP_NEON_OMIT_C_CODE
  90. #define STORE(x, y, v) \
  91. dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
  92. static const int kC1 = 20091 + (1 << 16);
  93. static const int kC2 = 35468;
  94. #define MUL(a, b) (((a) * (b)) >> 16)
  95. static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
  96. uint8_t* dst) {
  97. int C[4 * 4], *tmp;
  98. int i;
  99. tmp = C;
  100. for (i = 0; i < 4; ++i) { // vertical pass
  101. const int a = in[0] + in[8];
  102. const int b = in[0] - in[8];
  103. const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
  104. const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
  105. tmp[0] = a + d;
  106. tmp[1] = b + c;
  107. tmp[2] = b - c;
  108. tmp[3] = a - d;
  109. tmp += 4;
  110. in++;
  111. }
  112. tmp = C;
  113. for (i = 0; i < 4; ++i) { // horizontal pass
  114. const int dc = tmp[0] + 4;
  115. const int a = dc + tmp[8];
  116. const int b = dc - tmp[8];
  117. const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
  118. const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
  119. STORE(0, i, a + d);
  120. STORE(1, i, b + c);
  121. STORE(2, i, b - c);
  122. STORE(3, i, a - d);
  123. tmp++;
  124. }
  125. }
  126. static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  127. int do_two) {
  128. ITransformOne(ref, in, dst);
  129. if (do_two) {
  130. ITransformOne(ref + 4, in + 16, dst + 4);
  131. }
  132. }
  133. static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  134. int i;
  135. int tmp[16];
  136. for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
  137. const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
  138. const int d1 = src[1] - ref[1];
  139. const int d2 = src[2] - ref[2];
  140. const int d3 = src[3] - ref[3];
  141. const int a0 = (d0 + d3); // 10b [-510,510]
  142. const int a1 = (d1 + d2);
  143. const int a2 = (d1 - d2);
  144. const int a3 = (d0 - d3);
  145. tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
  146. tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
  147. tmp[2 + i * 4] = (a0 - a1) * 8;
  148. tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  149. }
  150. for (i = 0; i < 4; ++i) {
  151. const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
  152. const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
  153. const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
  154. const int a3 = (tmp[0 + i] - tmp[12 + i]);
  155. out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
  156. out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
  157. out[8 + i] = (a0 - a1 + 7) >> 4;
  158. out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  159. }
  160. }
  161. #endif // !WEBP_NEON_OMIT_C_CODE
  162. static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
  163. int16_t* out) {
  164. VP8FTransform(src, ref, out);
  165. VP8FTransform(src + 4, ref + 4, out + 16);
  166. }
  167. #if !WEBP_NEON_OMIT_C_CODE
  168. static void FTransformWHT_C(const int16_t* in, int16_t* out) {
  169. // input is 12b signed
  170. int32_t tmp[16];
  171. int i;
  172. for (i = 0; i < 4; ++i, in += 64) {
  173. const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
  174. const int a1 = (in[1 * 16] + in[3 * 16]);
  175. const int a2 = (in[1 * 16] - in[3 * 16]);
  176. const int a3 = (in[0 * 16] - in[2 * 16]);
  177. tmp[0 + i * 4] = a0 + a1; // 14b
  178. tmp[1 + i * 4] = a3 + a2;
  179. tmp[2 + i * 4] = a3 - a2;
  180. tmp[3 + i * 4] = a0 - a1;
  181. }
  182. for (i = 0; i < 4; ++i) {
  183. const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
  184. const int a1 = (tmp[4 + i] + tmp[12+ i]);
  185. const int a2 = (tmp[4 + i] - tmp[12+ i]);
  186. const int a3 = (tmp[0 + i] - tmp[8 + i]);
  187. const int b0 = a0 + a1; // 16b
  188. const int b1 = a3 + a2;
  189. const int b2 = a3 - a2;
  190. const int b3 = a0 - a1;
  191. out[ 0 + i] = b0 >> 1; // 15b
  192. out[ 4 + i] = b1 >> 1;
  193. out[ 8 + i] = b2 >> 1;
  194. out[12 + i] = b3 >> 1;
  195. }
  196. }
  197. #endif // !WEBP_NEON_OMIT_C_CODE
  198. #undef MUL
  199. #undef STORE
  200. //------------------------------------------------------------------------------
  201. // Intra predictions
  202. static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  203. int j;
  204. for (j = 0; j < size; ++j) {
  205. memset(dst + j * BPS, value, size);
  206. }
  207. }
  208. static WEBP_INLINE void VerticalPred(uint8_t* dst,
  209. const uint8_t* top, int size) {
  210. int j;
  211. if (top != NULL) {
  212. for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  213. } else {
  214. Fill(dst, 127, size);
  215. }
  216. }
  217. static WEBP_INLINE void HorizontalPred(uint8_t* dst,
  218. const uint8_t* left, int size) {
  219. if (left != NULL) {
  220. int j;
  221. for (j = 0; j < size; ++j) {
  222. memset(dst + j * BPS, left[j], size);
  223. }
  224. } else {
  225. Fill(dst, 129, size);
  226. }
  227. }
  228. static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
  229. const uint8_t* top, int size) {
  230. int y;
  231. if (left != NULL) {
  232. if (top != NULL) {
  233. const uint8_t* const clip = clip1 + 255 - left[-1];
  234. for (y = 0; y < size; ++y) {
  235. const uint8_t* const clip_table = clip + left[y];
  236. int x;
  237. for (x = 0; x < size; ++x) {
  238. dst[x] = clip_table[top[x]];
  239. }
  240. dst += BPS;
  241. }
  242. } else {
  243. HorizontalPred(dst, left, size);
  244. }
  245. } else {
  246. // true motion without left samples (hence: with default 129 value)
  247. // is equivalent to VE prediction where you just copy the top samples.
  248. // Note that if top samples are not available, the default value is
  249. // then 129, and not 127 as in the VerticalPred case.
  250. if (top != NULL) {
  251. VerticalPred(dst, top, size);
  252. } else {
  253. Fill(dst, 129, size);
  254. }
  255. }
  256. }
  257. static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
  258. const uint8_t* top,
  259. int size, int round, int shift) {
  260. int DC = 0;
  261. int j;
  262. if (top != NULL) {
  263. for (j = 0; j < size; ++j) DC += top[j];
  264. if (left != NULL) { // top and left present
  265. for (j = 0; j < size; ++j) DC += left[j];
  266. } else { // top, but no left
  267. DC += DC;
  268. }
  269. DC = (DC + round) >> shift;
  270. } else if (left != NULL) { // left but no top
  271. for (j = 0; j < size; ++j) DC += left[j];
  272. DC += DC;
  273. DC = (DC + round) >> shift;
  274. } else { // no top, no left, nothing.
  275. DC = 0x80;
  276. }
  277. Fill(dst, DC, size);
  278. }
  279. //------------------------------------------------------------------------------
  280. // Chroma 8x8 prediction (paragraph 12.2)
  281. static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
  282. const uint8_t* top) {
  283. // U block
  284. DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  285. VerticalPred(C8VE8 + dst, top, 8);
  286. HorizontalPred(C8HE8 + dst, left, 8);
  287. TrueMotion(C8TM8 + dst, left, top, 8);
  288. // V block
  289. dst += 8;
  290. if (top != NULL) top += 8;
  291. if (left != NULL) left += 16;
  292. DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  293. VerticalPred(C8VE8 + dst, top, 8);
  294. HorizontalPred(C8HE8 + dst, left, 8);
  295. TrueMotion(C8TM8 + dst, left, top, 8);
  296. }
  297. //------------------------------------------------------------------------------
  298. // luma 16x16 prediction (paragraph 12.3)
  299. static void Intra16Preds_C(uint8_t* dst,
  300. const uint8_t* left, const uint8_t* top) {
  301. DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  302. VerticalPred(I16VE16 + dst, top, 16);
  303. HorizontalPred(I16HE16 + dst, left, 16);
  304. TrueMotion(I16TM16 + dst, left, top, 16);
  305. }
  306. //------------------------------------------------------------------------------
  307. // luma 4x4 prediction
  308. #define DST(x, y) dst[(x) + (y) * BPS]
  309. #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
  310. #define AVG2(a, b) (((a) + (b) + 1) >> 1)
  311. static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
  312. const uint8_t vals[4] = {
  313. AVG3(top[-1], top[0], top[1]),
  314. AVG3(top[ 0], top[1], top[2]),
  315. AVG3(top[ 1], top[2], top[3]),
  316. AVG3(top[ 2], top[3], top[4])
  317. };
  318. int i;
  319. for (i = 0; i < 4; ++i) {
  320. memcpy(dst + i * BPS, vals, 4);
  321. }
  322. }
  323. static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
  324. const int X = top[-1];
  325. const int I = top[-2];
  326. const int J = top[-3];
  327. const int K = top[-4];
  328. const int L = top[-5];
  329. WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
  330. WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
  331. WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
  332. WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
  333. }
  334. static void DC4(uint8_t* dst, const uint8_t* top) {
  335. uint32_t dc = 4;
  336. int i;
  337. for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  338. Fill(dst, dc >> 3, 4);
  339. }
  340. static void RD4(uint8_t* dst, const uint8_t* top) {
  341. const int X = top[-1];
  342. const int I = top[-2];
  343. const int J = top[-3];
  344. const int K = top[-4];
  345. const int L = top[-5];
  346. const int A = top[0];
  347. const int B = top[1];
  348. const int C = top[2];
  349. const int D = top[3];
  350. DST(0, 3) = AVG3(J, K, L);
  351. DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
  352. DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
  353. DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
  354. DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
  355. DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
  356. DST(3, 0) = AVG3(D, C, B);
  357. }
  358. static void LD4(uint8_t* dst, const uint8_t* top) {
  359. const int A = top[0];
  360. const int B = top[1];
  361. const int C = top[2];
  362. const int D = top[3];
  363. const int E = top[4];
  364. const int F = top[5];
  365. const int G = top[6];
  366. const int H = top[7];
  367. DST(0, 0) = AVG3(A, B, C);
  368. DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
  369. DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
  370. DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
  371. DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
  372. DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
  373. DST(3, 3) = AVG3(G, H, H);
  374. }
  375. static void VR4(uint8_t* dst, const uint8_t* top) {
  376. const int X = top[-1];
  377. const int I = top[-2];
  378. const int J = top[-3];
  379. const int K = top[-4];
  380. const int A = top[0];
  381. const int B = top[1];
  382. const int C = top[2];
  383. const int D = top[3];
  384. DST(0, 0) = DST(1, 2) = AVG2(X, A);
  385. DST(1, 0) = DST(2, 2) = AVG2(A, B);
  386. DST(2, 0) = DST(3, 2) = AVG2(B, C);
  387. DST(3, 0) = AVG2(C, D);
  388. DST(0, 3) = AVG3(K, J, I);
  389. DST(0, 2) = AVG3(J, I, X);
  390. DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
  391. DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
  392. DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
  393. DST(3, 1) = AVG3(B, C, D);
  394. }
  395. static void VL4(uint8_t* dst, const uint8_t* top) {
  396. const int A = top[0];
  397. const int B = top[1];
  398. const int C = top[2];
  399. const int D = top[3];
  400. const int E = top[4];
  401. const int F = top[5];
  402. const int G = top[6];
  403. const int H = top[7];
  404. DST(0, 0) = AVG2(A, B);
  405. DST(1, 0) = DST(0, 2) = AVG2(B, C);
  406. DST(2, 0) = DST(1, 2) = AVG2(C, D);
  407. DST(3, 0) = DST(2, 2) = AVG2(D, E);
  408. DST(0, 1) = AVG3(A, B, C);
  409. DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  410. DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  411. DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
  412. DST(3, 2) = AVG3(E, F, G);
  413. DST(3, 3) = AVG3(F, G, H);
  414. }
  415. static void HU4(uint8_t* dst, const uint8_t* top) {
  416. const int I = top[-2];
  417. const int J = top[-3];
  418. const int K = top[-4];
  419. const int L = top[-5];
  420. DST(0, 0) = AVG2(I, J);
  421. DST(2, 0) = DST(0, 1) = AVG2(J, K);
  422. DST(2, 1) = DST(0, 2) = AVG2(K, L);
  423. DST(1, 0) = AVG3(I, J, K);
  424. DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  425. DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  426. DST(3, 2) = DST(2, 2) =
  427. DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
  428. }
  429. static void HD4(uint8_t* dst, const uint8_t* top) {
  430. const int X = top[-1];
  431. const int I = top[-2];
  432. const int J = top[-3];
  433. const int K = top[-4];
  434. const int L = top[-5];
  435. const int A = top[0];
  436. const int B = top[1];
  437. const int C = top[2];
  438. DST(0, 0) = DST(2, 1) = AVG2(I, X);
  439. DST(0, 1) = DST(2, 2) = AVG2(J, I);
  440. DST(0, 2) = DST(2, 3) = AVG2(K, J);
  441. DST(0, 3) = AVG2(L, K);
  442. DST(3, 0) = AVG3(A, B, C);
  443. DST(2, 0) = AVG3(X, A, B);
  444. DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  445. DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  446. DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  447. DST(1, 3) = AVG3(L, K, J);
  448. }
  449. static void TM4(uint8_t* dst, const uint8_t* top) {
  450. int x, y;
  451. const uint8_t* const clip = clip1 + 255 - top[-1];
  452. for (y = 0; y < 4; ++y) {
  453. const uint8_t* const clip_table = clip + top[-2 - y];
  454. for (x = 0; x < 4; ++x) {
  455. dst[x] = clip_table[top[x]];
  456. }
  457. dst += BPS;
  458. }
  459. }
  460. #undef DST
  461. #undef AVG3
  462. #undef AVG2
  463. // Left samples are top[-5 .. -2], top_left is top[-1], top are
  464. // located at top[0..3], and top right is top[4..7]
  465. static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
  466. DC4(I4DC4 + dst, top);
  467. TM4(I4TM4 + dst, top);
  468. VE4(I4VE4 + dst, top);
  469. HE4(I4HE4 + dst, top);
  470. RD4(I4RD4 + dst, top);
  471. VR4(I4VR4 + dst, top);
  472. LD4(I4LD4 + dst, top);
  473. VL4(I4VL4 + dst, top);
  474. HD4(I4HD4 + dst, top);
  475. HU4(I4HU4 + dst, top);
  476. }
  477. //------------------------------------------------------------------------------
  478. // Metric
  479. #if !WEBP_NEON_OMIT_C_CODE
  480. static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  481. int w, int h) {
  482. int count = 0;
  483. int y, x;
  484. for (y = 0; y < h; ++y) {
  485. for (x = 0; x < w; ++x) {
  486. const int diff = (int)a[x] - b[x];
  487. count += diff * diff;
  488. }
  489. a += BPS;
  490. b += BPS;
  491. }
  492. return count;
  493. }
  494. static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
  495. return GetSSE(a, b, 16, 16);
  496. }
  497. static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
  498. return GetSSE(a, b, 16, 8);
  499. }
  500. static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
  501. return GetSSE(a, b, 8, 8);
  502. }
  503. static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
  504. return GetSSE(a, b, 4, 4);
  505. }
  506. #endif // !WEBP_NEON_OMIT_C_CODE
  507. static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
  508. int k, x, y;
  509. for (k = 0; k < 4; ++k) {
  510. uint32_t avg = 0;
  511. for (y = 0; y < 4; ++y) {
  512. for (x = 0; x < 4; ++x) {
  513. avg += ref[x + y * BPS];
  514. }
  515. }
  516. dc[k] = avg;
  517. ref += 4; // go to next 4x4 block.
  518. }
  519. }
  520. //------------------------------------------------------------------------------
  521. // Texture distortion
  522. //
  523. // We try to match the spectral content (weighted) between source and
  524. // reconstructed samples.
  525. #if !WEBP_NEON_OMIT_C_CODE
  526. // Hadamard transform
  527. // Returns the weighted sum of the absolute value of transformed coefficients.
  528. // w[] contains a row-major 4 by 4 symmetric matrix.
  529. static int TTransform(const uint8_t* in, const uint16_t* w) {
  530. int sum = 0;
  531. int tmp[16];
  532. int i;
  533. // horizontal pass
  534. for (i = 0; i < 4; ++i, in += BPS) {
  535. const int a0 = in[0] + in[2];
  536. const int a1 = in[1] + in[3];
  537. const int a2 = in[1] - in[3];
  538. const int a3 = in[0] - in[2];
  539. tmp[0 + i * 4] = a0 + a1;
  540. tmp[1 + i * 4] = a3 + a2;
  541. tmp[2 + i * 4] = a3 - a2;
  542. tmp[3 + i * 4] = a0 - a1;
  543. }
  544. // vertical pass
  545. for (i = 0; i < 4; ++i, ++w) {
  546. const int a0 = tmp[0 + i] + tmp[8 + i];
  547. const int a1 = tmp[4 + i] + tmp[12+ i];
  548. const int a2 = tmp[4 + i] - tmp[12+ i];
  549. const int a3 = tmp[0 + i] - tmp[8 + i];
  550. const int b0 = a0 + a1;
  551. const int b1 = a3 + a2;
  552. const int b2 = a3 - a2;
  553. const int b3 = a0 - a1;
  554. sum += w[ 0] * abs(b0);
  555. sum += w[ 4] * abs(b1);
  556. sum += w[ 8] * abs(b2);
  557. sum += w[12] * abs(b3);
  558. }
  559. return sum;
  560. }
  561. static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
  562. const uint16_t* const w) {
  563. const int sum1 = TTransform(a, w);
  564. const int sum2 = TTransform(b, w);
  565. return abs(sum2 - sum1) >> 5;
  566. }
  567. static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
  568. const uint16_t* const w) {
  569. int D = 0;
  570. int x, y;
  571. for (y = 0; y < 16 * BPS; y += 4 * BPS) {
  572. for (x = 0; x < 16; x += 4) {
  573. D += Disto4x4_C(a + x + y, b + x + y, w);
  574. }
  575. }
  576. return D;
  577. }
  578. #endif // !WEBP_NEON_OMIT_C_CODE
  579. //------------------------------------------------------------------------------
  580. // Quantization
  581. //
  582. static const uint8_t kZigzag[16] = {
  583. 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
  584. };
  585. // Simple quantization
  586. static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
  587. const VP8Matrix* const mtx) {
  588. int last = -1;
  589. int n;
  590. for (n = 0; n < 16; ++n) {
  591. const int j = kZigzag[n];
  592. const int sign = (in[j] < 0);
  593. const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
  594. if (coeff > mtx->zthresh_[j]) {
  595. const uint32_t Q = mtx->q_[j];
  596. const uint32_t iQ = mtx->iq_[j];
  597. const uint32_t B = mtx->bias_[j];
  598. int level = QUANTDIV(coeff, iQ, B);
  599. if (level > MAX_LEVEL) level = MAX_LEVEL;
  600. if (sign) level = -level;
  601. in[j] = level * (int)Q;
  602. out[n] = level;
  603. if (level) last = n;
  604. } else {
  605. out[n] = 0;
  606. in[j] = 0;
  607. }
  608. }
  609. return (last >= 0);
  610. }
  611. #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  612. static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
  613. const VP8Matrix* const mtx) {
  614. int nz;
  615. nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  616. nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  617. return nz;
  618. }
  619. #endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  620. //------------------------------------------------------------------------------
  621. // Block copy
  622. static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  623. int y;
  624. for (y = 0; y < h; ++y) {
  625. memcpy(dst, src, w);
  626. src += BPS;
  627. dst += BPS;
  628. }
  629. }
  630. static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
  631. Copy(src, dst, 4, 4);
  632. }
  633. static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
  634. Copy(src, dst, 16, 8);
  635. }
  636. //------------------------------------------------------------------------------
  637. // Initialization
  638. // Speed-critical function pointers. We have to initialize them to the default
  639. // implementations within VP8EncDspInit().
  640. VP8CHisto VP8CollectHistogram;
  641. VP8Idct VP8ITransform;
  642. VP8Fdct VP8FTransform;
  643. VP8Fdct VP8FTransform2;
  644. VP8WHT VP8FTransformWHT;
  645. VP8Intra4Preds VP8EncPredLuma4;
  646. VP8IntraPreds VP8EncPredLuma16;
  647. VP8IntraPreds VP8EncPredChroma8;
  648. VP8Metric VP8SSE16x16;
  649. VP8Metric VP8SSE8x8;
  650. VP8Metric VP8SSE16x8;
  651. VP8Metric VP8SSE4x4;
  652. VP8WMetric VP8TDisto4x4;
  653. VP8WMetric VP8TDisto16x16;
  654. VP8MeanMetric VP8Mean16x4;
  655. VP8QuantizeBlock VP8EncQuantizeBlock;
  656. VP8Quantize2Blocks VP8EncQuantize2Blocks;
  657. VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
  658. VP8BlockCopy VP8Copy4x4;
  659. VP8BlockCopy VP8Copy16x8;
  660. extern void VP8EncDspInitSSE2(void);
  661. extern void VP8EncDspInitSSE41(void);
  662. extern void VP8EncDspInitNEON(void);
  663. extern void VP8EncDspInitMIPS32(void);
  664. extern void VP8EncDspInitMIPSdspR2(void);
  665. extern void VP8EncDspInitMSA(void);
  666. WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
  667. VP8DspInit(); // common inverse transforms
  668. InitTables();
  669. // default C implementations
  670. #if !WEBP_NEON_OMIT_C_CODE
  671. VP8ITransform = ITransform_C;
  672. VP8FTransform = FTransform_C;
  673. VP8FTransformWHT = FTransformWHT_C;
  674. VP8TDisto4x4 = Disto4x4_C;
  675. VP8TDisto16x16 = Disto16x16_C;
  676. VP8CollectHistogram = CollectHistogram_C;
  677. VP8SSE16x16 = SSE16x16_C;
  678. VP8SSE16x8 = SSE16x8_C;
  679. VP8SSE8x8 = SSE8x8_C;
  680. VP8SSE4x4 = SSE4x4_C;
  681. #endif
  682. #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  683. VP8EncQuantizeBlock = QuantizeBlock_C;
  684. VP8EncQuantize2Blocks = Quantize2Blocks_C;
  685. #endif
  686. VP8FTransform2 = FTransform2_C;
  687. VP8EncPredLuma4 = Intra4Preds_C;
  688. VP8EncPredLuma16 = Intra16Preds_C;
  689. VP8EncPredChroma8 = IntraChromaPreds_C;
  690. VP8Mean16x4 = Mean16x4_C;
  691. VP8EncQuantizeBlockWHT = QuantizeBlock_C;
  692. VP8Copy4x4 = Copy4x4_C;
  693. VP8Copy16x8 = Copy16x8_C;
  694. // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  695. if (VP8GetCPUInfo != NULL) {
  696. #if defined(WEBP_HAVE_SSE2)
  697. if (VP8GetCPUInfo(kSSE2)) {
  698. VP8EncDspInitSSE2();
  699. #if defined(WEBP_HAVE_SSE41)
  700. if (VP8GetCPUInfo(kSSE4_1)) {
  701. VP8EncDspInitSSE41();
  702. }
  703. #endif
  704. }
  705. #endif
  706. #if defined(WEBP_USE_MIPS32)
  707. if (VP8GetCPUInfo(kMIPS32)) {
  708. VP8EncDspInitMIPS32();
  709. }
  710. #endif
  711. #if defined(WEBP_USE_MIPS_DSP_R2)
  712. if (VP8GetCPUInfo(kMIPSdspR2)) {
  713. VP8EncDspInitMIPSdspR2();
  714. }
  715. #endif
  716. #if defined(WEBP_USE_MSA)
  717. if (VP8GetCPUInfo(kMSA)) {
  718. VP8EncDspInitMSA();
  719. }
  720. #endif
  721. }
  722. #if defined(WEBP_HAVE_NEON)
  723. if (WEBP_NEON_OMIT_C_CODE ||
  724. (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
  725. VP8EncDspInitNEON();
  726. }
  727. #endif
  728. assert(VP8ITransform != NULL);
  729. assert(VP8FTransform != NULL);
  730. assert(VP8FTransformWHT != NULL);
  731. assert(VP8TDisto4x4 != NULL);
  732. assert(VP8TDisto16x16 != NULL);
  733. assert(VP8CollectHistogram != NULL);
  734. assert(VP8SSE16x16 != NULL);
  735. assert(VP8SSE16x8 != NULL);
  736. assert(VP8SSE8x8 != NULL);
  737. assert(VP8SSE4x4 != NULL);
  738. assert(VP8EncQuantizeBlock != NULL);
  739. assert(VP8EncQuantize2Blocks != NULL);
  740. assert(VP8FTransform2 != NULL);
  741. assert(VP8EncPredLuma4 != NULL);
  742. assert(VP8EncPredLuma16 != NULL);
  743. assert(VP8EncPredChroma8 != NULL);
  744. assert(VP8Mean16x4 != NULL);
  745. assert(VP8EncQuantizeBlockWHT != NULL);
  746. assert(VP8Copy4x4 != NULL);
  747. assert(VP8Copy16x8 != NULL);
  748. }