yuv_sse2.c 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // YUV->RGB conversion functions
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"

#if defined(WEBP_USE_SSE2)

#include <emmintrin.h>
#include <stdlib.h>
#include <string.h>

#include "./common_sse2.h"
  18. //-----------------------------------------------------------------------------
  19. // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
  20. // These constants are 14b fixed-point version of ITU-R BT.601 constants.
  21. // R = (19077 * y + 26149 * v - 14234) >> 6
  22. // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
  23. // B = (19077 * y + 33050 * u - 17685) >> 6
// Convert eight 16b luma/chroma samples to 16b R/G/B using the 14b
// fixed-point constants above. Inputs carry the 8b sample in the *upper*
// byte of each 16b lane (see Load_HI_16_SSE2), so _mm_mulhi_epu16(x, k)
// yields (sample * k) >> 8.
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                    const __m128i* const U0,
                                    const __m128i* const V0,
                                    __m128i* const R,
                                    __m128i* const G,
                                    __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419 = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708 = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  // R channel: Y1 + 26149 * V - 14234
  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  // G channel: Y1 - 6419 * U - 13320 * V + 8708
  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
  57. // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
  58. static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
  59. const __m128i zero = _mm_setzero_si128();
  60. return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
  61. }
  62. // Load and replicate the U/V samples
  63. static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
  64. const __m128i zero = _mm_setzero_si128();
  65. const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
  66. const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
  67. return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
  68. }
  69. // Convert 32 samples of YUV444 to R/G/B
  70. static void YUV444ToRGB_SSE2(const uint8_t* const y,
  71. const uint8_t* const u,
  72. const uint8_t* const v,
  73. __m128i* const R, __m128i* const G,
  74. __m128i* const B) {
  75. const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
  76. V0 = Load_HI_16_SSE2(v);
  77. ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
  78. }
  79. // Convert 32 samples of YUV420 to R/G/B
  80. static void YUV420ToRGB_SSE2(const uint8_t* const y,
  81. const uint8_t* const u,
  82. const uint8_t* const v,
  83. __m128i* const R, __m128i* const G,
  84. __m128i* const B) {
  85. const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
  86. V0 = Load_UV_HI_8_SSE2(v);
  87. ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
  88. }
  89. // Pack R/G/B/A results into 32b output.
  90. static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
  91. const __m128i* const G,
  92. const __m128i* const B,
  93. const __m128i* const A,
  94. uint8_t* const dst) {
  95. const __m128i rb = _mm_packus_epi16(*R, *B);
  96. const __m128i ga = _mm_packus_epi16(*G, *A);
  97. const __m128i rg = _mm_unpacklo_epi8(rb, ga);
  98. const __m128i ba = _mm_unpackhi_epi8(rb, ga);
  99. const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
  100. const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
  101. _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo);
  102. _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
  103. }
// Pack eight R/G/B/A 16b values into 16 bytes of RGBA4444 output
// (each channel keeps its 4 most significant bits).
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
                                              const __m128i* const G,
                                              const __m128i* const B,
                                              const __m128i* const A,
                                              uint8_t* const dst) {
#if (WEBP_SWAP_16BIT_CSP == 0)
  const __m128i rg0 = _mm_packus_epi16(*R, *G);
  const __m128i ba0 = _mm_packus_epi16(*B, *A);
#else
  // Byte-swapped 16b colorspace: emit the two output bytes in reverse order.
  const __m128i rg0 = _mm_packus_epi16(*B, *A);
  const __m128i ba0 = _mm_packus_epi16(*R, *G);
#endif
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0);  // rbrbrbrbrb...
  const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0);  // gagagagaga...
  // High nibble of each byte: r/b stays in place, g/a moves to the low nibble.
  const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
  const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
  const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
  _mm_storeu_si128((__m128i*)dst, rgba4444);
}
// Pack eight R/G/B 16b values into 16 bytes of RGB565 output
// (5 bits of R, 6 bits of G, 5 bits of B).
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
                                             const __m128i* const G,
                                             const __m128i* const B,
                                             uint8_t* const dst) {
  // Saturate each channel to 8b first.
  const __m128i r0 = _mm_packus_epi16(*R, *R);
  const __m128i g0 = _mm_packus_epi16(*G, *G);
  const __m128i b0 = _mm_packus_epi16(*B, *B);
  const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));  // R's top 5 bits, in place
  const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));  // B's top 5 bits, moved down
  // G's 6 bits are split across the two output bytes:
  const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);  // top 3 -> low bits of byte 0
  const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);  // next 3 -> high bits of byte 1
  const __m128i rg = _mm_or_si128(r1, g1);  // RRRRRGGG
  const __m128i gb = _mm_or_si128(g2, b1);  // GGGBBBBB
#if (WEBP_SWAP_16BIT_CSP == 0)
  const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
#else
  const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
#endif
  _mm_storeu_si128((__m128i*)dst, rgb565);
}
  146. // Pack the planar buffers
  147. // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
  148. // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
  149. static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
  150. __m128i* const in2, __m128i* const in3,
  151. __m128i* const in4, __m128i* const in5,
  152. uint8_t* const rgb) {
  153. // The input is 6 registers of sixteen 8b but for the sake of explanation,
  154. // let's take 6 registers of four 8b values.
  155. // To pack, we will keep taking one every two 8b integer and move it
  156. // around as follows:
  157. // Input:
  158. // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
  159. // Split the 6 registers in two sets of 3 registers: the first set as the even
  160. // 8b bytes, the second the odd ones:
  161. // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
  162. // Repeat the same permutations twice more:
  163. // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
  164. // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
  165. VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
  166. _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
  167. _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
  168. _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
  169. _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
  170. _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
  171. _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
  172. }
  173. void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  174. uint8_t* dst) {
  175. const __m128i kAlpha = _mm_set1_epi16(255);
  176. int n;
  177. for (n = 0; n < 32; n += 8, dst += 32) {
  178. __m128i R, G, B;
  179. YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
  180. PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
  181. }
  182. }
  183. void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  184. uint8_t* dst) {
  185. const __m128i kAlpha = _mm_set1_epi16(255);
  186. int n;
  187. for (n = 0; n < 32; n += 8, dst += 32) {
  188. __m128i R, G, B;
  189. YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
  190. PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
  191. }
  192. }
  193. void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  194. uint8_t* dst) {
  195. const __m128i kAlpha = _mm_set1_epi16(255);
  196. int n;
  197. for (n = 0; n < 32; n += 8, dst += 32) {
  198. __m128i R, G, B;
  199. YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
  200. PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
  201. }
  202. }
  203. void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
  204. const uint8_t* v, uint8_t* dst) {
  205. const __m128i kAlpha = _mm_set1_epi16(255);
  206. int n;
  207. for (n = 0; n < 32; n += 8, dst += 16) {
  208. __m128i R, G, B;
  209. YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
  210. PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
  211. }
  212. }
  213. void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  214. uint8_t* dst) {
  215. int n;
  216. for (n = 0; n < 32; n += 8, dst += 16) {
  217. __m128i R, G, B;
  218. YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
  219. PackAndStore565_SSE2(&R, &G, &B, dst);
  220. }
  221. }
  222. void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  223. uint8_t* dst) {
  224. __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  225. __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
  226. YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
  227. YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
  228. YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  229. YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
  230. // Cast to 8b and store as RRRRGGGGBBBB.
  231. rgb0 = _mm_packus_epi16(R0, R1);
  232. rgb1 = _mm_packus_epi16(R2, R3);
  233. rgb2 = _mm_packus_epi16(G0, G1);
  234. rgb3 = _mm_packus_epi16(G2, G3);
  235. rgb4 = _mm_packus_epi16(B0, B1);
  236. rgb5 = _mm_packus_epi16(B2, B3);
  237. // Pack as RGBRGBRGBRGB.
  238. PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
  239. }
  240. void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  241. uint8_t* dst) {
  242. __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  243. __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
  244. YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
  245. YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
  246. YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
  247. YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
  248. // Cast to 8b and store as BBBBGGGGRRRR.
  249. bgr0 = _mm_packus_epi16(B0, B1);
  250. bgr1 = _mm_packus_epi16(B2, B3);
  251. bgr2 = _mm_packus_epi16(G0, G1);
  252. bgr3 = _mm_packus_epi16(G2, G3);
  253. bgr4 = _mm_packus_epi16(R0, R1);
  254. bgr5= _mm_packus_epi16(R2, R3);
  255. // Pack as BGRBGRBGRBGR.
  256. PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
  257. }
  258. //-----------------------------------------------------------------------------
  259. // Arbitrary-length row conversion functions
  260. static void YuvToRgbaRow_SSE2(const uint8_t* y,
  261. const uint8_t* u, const uint8_t* v,
  262. uint8_t* dst, int len) {
  263. const __m128i kAlpha = _mm_set1_epi16(255);
  264. int n;
  265. for (n = 0; n + 8 <= len; n += 8, dst += 32) {
  266. __m128i R, G, B;
  267. YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
  268. PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
  269. y += 8;
  270. u += 4;
  271. v += 4;
  272. }
  273. for (; n < len; ++n) { // Finish off
  274. VP8YuvToRgba(y[0], u[0], v[0], dst);
  275. dst += 4;
  276. y += 1;
  277. u += (n & 1);
  278. v += (n & 1);
  279. }
  280. }
  281. static void YuvToBgraRow_SSE2(const uint8_t* y,
  282. const uint8_t* u, const uint8_t* v,
  283. uint8_t* dst, int len) {
  284. const __m128i kAlpha = _mm_set1_epi16(255);
  285. int n;
  286. for (n = 0; n + 8 <= len; n += 8, dst += 32) {
  287. __m128i R, G, B;
  288. YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
  289. PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
  290. y += 8;
  291. u += 4;
  292. v += 4;
  293. }
  294. for (; n < len; ++n) { // Finish off
  295. VP8YuvToBgra(y[0], u[0], v[0], dst);
  296. dst += 4;
  297. y += 1;
  298. u += (n & 1);
  299. v += (n & 1);
  300. }
  301. }
  302. static void YuvToArgbRow_SSE2(const uint8_t* y,
  303. const uint8_t* u, const uint8_t* v,
  304. uint8_t* dst, int len) {
  305. const __m128i kAlpha = _mm_set1_epi16(255);
  306. int n;
  307. for (n = 0; n + 8 <= len; n += 8, dst += 32) {
  308. __m128i R, G, B;
  309. YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
  310. PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
  311. y += 8;
  312. u += 4;
  313. v += 4;
  314. }
  315. for (; n < len; ++n) { // Finish off
  316. VP8YuvToArgb(y[0], u[0], v[0], dst);
  317. dst += 4;
  318. y += 1;
  319. u += (n & 1);
  320. v += (n & 1);
  321. }
  322. }
// Convert an arbitrary-length row of YUV420 to packed 24b RGB.
// The SIMD loop handles 32 pixels per iteration (4 groups of 8); the
// scalar loop finishes off the remaining (len % 32) pixels.
static void YuvToRgbRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    // 4:2:0: chroma offsets advance at half the luma rate.
    YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    // Cast to 8b and store as RRRRGGGGBBBB.
    rgb0 = _mm_packus_epi16(R0, R1);
    rgb1 = _mm_packus_epi16(R2, R3);
    rgb2 = _mm_packus_epi16(G0, G1);
    rgb3 = _mm_packus_epi16(G2, G3);
    rgb4 = _mm_packus_epi16(B0, B1);
    rgb5 = _mm_packus_epi16(B2, B3);
    // Pack as RGBRGBRGBRGB.
    PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    y += 32;
    u += 16;
    v += 16;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToRgb(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    u += (n & 1);   // step chroma every other pixel
    v += (n & 1);
  }
}
// Convert an arbitrary-length row of YUV420 to packed 24b BGR.
// Same structure as YuvToRgbRow_SSE2, with the R and B planes swapped.
static void YuvToBgrRow_SSE2(const uint8_t* y,
                             const uint8_t* u, const uint8_t* v,
                             uint8_t* dst, int len) {
  int n;
  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    // 4:2:0: chroma offsets advance at half the luma rate.
    YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
    YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    // Cast to 8b and store as BBBBGGGGRRRR.
    bgr0 = _mm_packus_epi16(B0, B1);
    bgr1 = _mm_packus_epi16(B2, B3);
    bgr2 = _mm_packus_epi16(G0, G1);
    bgr3 = _mm_packus_epi16(G2, G3);
    bgr4 = _mm_packus_epi16(R0, R1);
    bgr5 = _mm_packus_epi16(R2, R3);
    // Pack as BGRBGRBGRBGR.
    PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    y += 32;
    u += 16;
    v += 16;
  }
  for (; n < len; ++n) {   // Finish off
    VP8YuvToBgr(y[0], u[0], v[0], dst);
    dst += 3;
    y += 1;
    u += (n & 1);   // step chroma every other pixel
    v += (n & 1);
  }
}
  387. //------------------------------------------------------------------------------
  388. // Entry point
  389. extern void WebPInitSamplersSSE2(void);
  390. WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
  391. WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
  392. WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
  393. WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
  394. WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
  395. WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
  396. }
  397. //------------------------------------------------------------------------------
  398. // RGB24/32 -> YUV converters
  399. // Load eight 16b-words from *src.
  400. #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store eight 16b-words into *dst
  402. #define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
  403. // Function that inserts a value of the second half of the in buffer in between
  404. // every two char of the first half.
  405. static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
  406. const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
  407. out[0] = _mm_unpacklo_epi8(in[0], in[3]);
  408. out[1] = _mm_unpackhi_epi8(in[0], in[3]);
  409. out[2] = _mm_unpacklo_epi8(in[1], in[4]);
  410. out[3] = _mm_unpackhi_epi8(in[1], in[4]);
  411. out[4] = _mm_unpacklo_epi8(in[2], in[5]);
  412. out[5] = _mm_unpackhi_epi8(in[2], in[5]);
  413. }
  414. // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
  415. // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
  416. // Similar to PlanarTo24bHelper(), but in reverse order.
  417. static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
  418. const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
  419. __m128i tmp[6];
  420. tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
  421. tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
  422. tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
  423. tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
  424. tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
  425. tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
  426. RGB24PackedToPlanarHelper_SSE2(tmp, out);
  427. RGB24PackedToPlanarHelper_SSE2(out, tmp);
  428. RGB24PackedToPlanarHelper_SSE2(tmp, out);
  429. RGB24PackedToPlanarHelper_SSE2(out, tmp);
  430. RGB24PackedToPlanarHelper_SSE2(tmp, out);
  431. }
  432. // Convert 8 packed ARGB to r[], g[], b[]
  433. static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
  434. __m128i* const rgb /*in[6]*/) {
  435. const __m128i zero = _mm_setzero_si128();
  436. __m128i a0 = LOAD_16(argb + 0);
  437. __m128i a1 = LOAD_16(argb + 4);
  438. __m128i a2 = LOAD_16(argb + 8);
  439. __m128i a3 = LOAD_16(argb + 12);
  440. VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
  441. rgb[0] = _mm_unpacklo_epi8(a1, zero);
  442. rgb[1] = _mm_unpackhi_epi8(a1, zero);
  443. rgb[2] = _mm_unpacklo_epi8(a2, zero);
  444. rgb[3] = _mm_unpackhi_epi8(a2, zero);
  445. rgb[4] = _mm_unpacklo_epi8(a3, zero);
  446. rgb[5] = _mm_unpackhi_epi8(a3, zero);
  447. }
  448. // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
  449. // It's a macro and not a function because we need to use immediate values with
  450. // srai_epi32, e.g.
  451. #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
  452. ROUNDER, DESCALE_FIX, OUT) do { \
  453. const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
  454. const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
  455. const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
  456. const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
  457. const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
  458. const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
  459. const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
  460. const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
  461. const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
  462. const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
  463. (OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
  464. } while (0)
  465. #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
// Convert eight 16b R/G/B triplets to eight 16b luma values:
//   Y = (16839*R + 33059*G + 6420*B + rounder) >> YUV_FIX, plus a +16 offset
// folded into the rounder. The G coefficient is split as
// (33059 - 16384) + 16384 between the RG and GB products because 33059
// does not fit in a signed 16b madd operand.
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
                                           const __m128i* const G,
                                           const __m128i* const B,
                                           __m128i* const Y) {
  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 6420);
  // rounding constant: +16 offset in YUV_FIX fixed-point, plus half an ulp.
  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);

  // Interleave the channels pairwise so _mm_madd_epi16 can form R*kr + G*kg
  // and G*kg' + B*kb per 32b lane.
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
// Convert eight 16b R/G/B triplets to eight 16b U and V values:
//   U = -9719*R - 19081*G + 28800*B, V = 28800*R - 24116*G - 4684*B
// plus a +128 offset. The rounder is pre-shifted by 2 and the descale uses
// YUV_FIX + 2 because callers pass inputs scaled by 4 (pair sums doubled,
// see HorizontalAddPack_SSE2 usage — confirm against callers).
static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            __m128i* const U,
                                            __m128i* const V) {
  const __m128i kRG_u = MK_CST_16(-9719, -19081);
  const __m128i kGB_u = MK_CST_16(0, 28800);
  const __m128i kRG_v = MK_CST_16(28800, 0);
  const __m128i kGB_v = MK_CST_16(-24116, -4684);
  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);

  // Interleave channels pairwise for the madd-based dot products.
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
            kHALF_UV, YUV_FIX + 2, *U);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
            kHALF_UV, YUV_FIX + 2, *V);
}
  498. #undef MK_CST_16
  499. #undef TRANSFORM
// Convert a row of packed 24b RGB to 8b luma.
// The SIMD path planarizes 32 pixels at a time (two groups of 16); the
// scalar loop handles the remaining (width % 32) pixels.
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
  const int max_width = width & ~31;   // SIMD part: multiples of 32 pixels
  int i;
  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
    __m128i rgb_plane[6];
    int j;
    RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
    for (j = 0; j < 2; ++j, i += 16) {   // two groups of 16 pixels
      const __m128i zero = _mm_setzero_si128();
      __m128i r, g, b, Y0, Y1;
      // Convert the lower 8 pixels of this group to 16-bit Y.
      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
      // Convert the upper 8 pixels to 16-bit Y.
      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
      // Cast to 8-bit and store.
      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    }
  }
  for (; i < width; ++i, rgb += 3) {   // left-over
    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
  }
}
// Convert a row of packed 24b BGR to 8b luma.
// Identical to ConvertRGB24ToY_SSE2 except the plane order is reversed:
// plane[0..1] holds B and plane[4..5] holds R.
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
  const int max_width = width & ~31;   // SIMD part: multiples of 32 pixels
  int i;
  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
    __m128i bgr_plane[6];
    int j;
    RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
    for (j = 0; j < 2; ++j, i += 16) {   // two groups of 16 pixels
      const __m128i zero = _mm_setzero_si128();
      __m128i r, g, b, Y0, Y1;
      // Convert the lower 8 pixels of this group to 16-bit Y.
      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
      // Convert the upper 8 pixels to 16-bit Y.
      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
      // Cast to 8-bit and store.
      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    }
  }
  for (; i < width; ++i, bgr += 3) {   // left-over
    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
  }
}
  556. static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
  557. const int max_width = width & ~15;
  558. int i;
  559. for (i = 0; i < max_width; i += 16) {
  560. __m128i Y0, Y1, rgb[6];
  561. RGB32PackedToPlanar_SSE2(&argb[i], rgb);
  562. ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
  563. ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
  564. STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
  565. }
  566. for (; i < width; ++i) { // left-over
  567. const uint32_t p = argb[i];
  568. y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
  569. YUV_HALF);
  570. }
  571. }
  572. // Horizontal add (doubled) of two 16b values, result is 16b.
  573. // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
  574. static void HorizontalAddPack_SSE2(const __m128i* const A,
  575. const __m128i* const B,
  576. __m128i* const out) {
  577. const __m128i k2 = _mm_set1_epi16(2);
  578. const __m128i C = _mm_madd_epi16(*A, k2);
  579. const __m128i D = _mm_madd_epi16(*B, k2);
  580. *out = _mm_packs_epi32(C, D);
  581. }
// Convert a row of packed ARGB pixels to horizontally-subsampled U/V:
// one U/V pair per two input pixels (32 pixels -> 16 u + 16 v per pass).
// When 'do_store' is false the freshly computed values are averaged with
// the values already present in u[]/v[] (presumably the previous row of the
// 4:2:0 pair — confirm against the caller).
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
                                 uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  const int max_width = src_width & ~31;
  int i;
  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    __m128i rgb[6], U0, V0, U1, V1;
    // First 16 pixels: planarize, sum each horizontal pair (result is 4x
    // scale: 2*(a+b)), then convert to U/V.
    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
    // Second 16 pixels, same treatment.
    RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
    // Saturate to 8b.
    U0 = _mm_packus_epi16(U0, U1);
    V0 = _mm_packus_epi16(V0, V1);
    if (!do_store) {
      // Average with the previously stored values instead of overwriting.
      const __m128i prev_u = LOAD_16(u);
      const __m128i prev_v = LOAD_16(v);
      U0 = _mm_avg_epu8(U0, prev_u);
      V0 = _mm_avg_epu8(V0, prev_v);
    }
    STORE_16(U0, u);
    STORE_16(V0, v);
  }
  if (i < src_width) {   // left-over: defer to the plain-C implementation
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
// Convert 8 packed 16b RGBX pixels (4 values each, the 'x' lane is dropped)
// into planar r[], g[], b[] registers of eight 16b samples each.
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
    const uint16_t* const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx + 0);   // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx + 8);   // r2 | g2 | b2 |x| r3 | g3 | b3 |x
  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
  // column-wise transpose (two rounds of 16b interleaving)
  const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
  const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
  const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
  const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
  const __m128i B0 = _mm_unpacklo_epi16(A0, A1);  // r0 r1 r2 r3 | g0 g1 ..
  const __m128i B1 = _mm_unpackhi_epi16(A0, A1);  // b0 b1 b2 b3 | x x x x
  const __m128i B2 = _mm_unpacklo_epi16(A2, A3);  // r4 r5 r6 r7 | g4 g5 ..
  const __m128i B3 = _mm_unpackhi_epi16(A2, A3);  // b4 b5 b6 b7 | x x x x
  // Gather matching 64b halves; the 'x' halves of B1/B3 are discarded.
  *r = _mm_unpacklo_epi64(B0, B2);
  *g = _mm_unpackhi_epi64(B0, B2);
  *b = _mm_unpacklo_epi64(B1, B3);
}
  635. static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
  636. uint8_t* u, uint8_t* v, int width) {
  637. const int max_width = width & ~15;
  638. const uint16_t* const last_rgb = rgb + 4 * max_width;
  639. while (rgb < last_rgb) {
  640. __m128i r, g, b, U0, V0, U1, V1;
  641. RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
  642. ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
  643. RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
  644. ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
  645. STORE_16(_mm_packus_epi16(U0, U1), u);
  646. STORE_16(_mm_packus_epi16(V0, V1), v);
  647. u += 16;
  648. v += 16;
  649. rgb += 2 * 32;
  650. }
  651. if (max_width < width) { // left-over
  652. WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
  653. }
  654. }
  655. //------------------------------------------------------------------------------
  656. extern void WebPInitConvertARGBToYUVSSE2(void);
  657. WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
  658. WebPConvertARGBToY = ConvertARGBToY_SSE2;
  659. WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
  660. WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
  661. WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
  662. WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
  663. }
  664. //------------------------------------------------------------------------------
  665. #define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
  666. static uint16_t clip_y(int v) {
  667. return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
  668. }
// Moves the 'dst' luma row towards 'ref', given the current approximation
// 'src': dst[i] = clip(dst[i] + (ref[i] - src[i]), 0, MAX_Y).
// Returns the accumulated absolute error sum(|ref[i] - src[i]|).
static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  uint64_t diff = 0;
  uint32_t tmp[4];
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16(MAX_Y);
  const __m128i one = _mm_set1_epi16(1);
  __m128i sum = zero;
  // Vectorized path: 8 samples per iteration.
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
    const __m128i F = _mm_add_epi16(C, D);       // new_y
    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);  // clip new_y
    // madd multiplies each diff by its own sign (+/-1) and sums adjacent
    // pairs, yielding pairwise |diff| sums in four 32-bit lanes.
    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
    _mm_storeu_si128((__m128i*)(dst + i), H);
    sum = _mm_add_epi32(sum, I);
  }
  // Fold the four 32-bit partial sums into 'diff'.
  _mm_storeu_si128((__m128i*)tmp, sum);
  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
  // Scalar tail for the remaining (len % 8) samples.
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)dst[i] + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}
  701. static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
  702. int16_t* dst, int len) {
  703. int i = 0;
  704. for (i = 0; i + 8 <= len; i += 8) {
  705. const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
  706. const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
  707. const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
  708. const __m128i D = _mm_sub_epi16(A, B); // diff_uv
  709. const __m128i E = _mm_add_epi16(C, D); // new_uv
  710. _mm_storeu_si128((__m128i*)(dst + i), E);
  711. }
  712. for (; i < len; ++i) {
  713. const int diff_uv = ref[i] - src[i];
  714. dst[i] += diff_uv;
  715. }
  716. }
// Horizontal 2x upsampling filter applied as a correction on top of
// 'best_y'. For each input position i, two output samples are produced
// ('out' holds 2 * len entries):
//   out[2*i+0] = clip(best_y[2*i+0] + (9*A[i] + 3*A[i+1] + 3*B[i] + B[i+1] + 8) >> 4)
//   out[2*i+1] = clip(best_y[2*i+1] + (9*A[i+1] + 3*A[i] + 3*B[i+1] + B[i] + 8) >> 4)
// NOTE(review): A[] and B[] are read at index i+1 up to i = len-1, so both
// must hold at least len + 1 samples — confirm against callers.
static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out) {
  int i;
  const __m128i kCst8 = _mm_set1_epi16(8);
  const __m128i max = _mm_set1_epi16(MAX_Y);
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 8 <= len; i += 8) {
    // Load 8 samples at i and 8 (overlapping, unaligned) at i+1, both rows.
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
    const __m128i a0b1 = _mm_add_epi16(a0, b1);
    const __m128i a1b0 = _mm_add_epi16(a1, b0);
    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
    // The overall >>4 is split into >>3 here and >>1 below, presumably to
    // keep the intermediate sums within the signed 16-bit lanes — confirm.
    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
    const __m128i d0 = _mm_add_epi16(c1, a0);
    const __m128i d1 = _mm_add_epi16(c0, a1);
    const __m128i e0 = _mm_srai_epi16(d0, 1);
    const __m128i e1 = _mm_srai_epi16(d1, 1);
    // Interleave even (e0) and odd (e1) corrections into output order.
    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
    const __m128i h0 = _mm_add_epi16(g0, f0);
    const __m128i h1 = _mm_add_epi16(g1, f1);
    // Clamp to [0, MAX_Y].
    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
  }
  // Scalar tail for the remaining (len % 8) positions.
  for (; i < len; ++i) {
    // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    // We reuse the common sub-expressions.
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}
  764. #undef MAX_Y
  765. //------------------------------------------------------------------------------
// Declared here rather than in a header, as is done for the other dsp
// init entry points.
extern void WebPInitSharpYUVSSE2(void);

// Installs the SSE2 implementations of the sharp-YUV helpers into the
// global function pointers. WEBP_TSAN_IGNORE_FUNCTION: presumably the
// unsynchronized pointer writes are a benign race when several threads
// initialize concurrently — confirm against the dsp init framework.
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
  WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
  WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
}
#else  // !WEBP_USE_SSE2

// SSE2 support compiled out: emit no-op init stubs so the function-pointer
// setup code can still call these entry points unconditionally.
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)

#endif  // WEBP_USE_SSE2