lossless_sse41.c 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. // Copyright 2021 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // SSE41 variant of methods for lossless decoder
  11. #include "./dsp.h"
  12. #if defined(WEBP_USE_SSE41)
  13. #include "./common_sse41.h"
  14. #include "./lossless.h"
  15. #include "./lossless_common.h"
  16. //------------------------------------------------------------------------------
  17. // Color-space conversion functions
  18. static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
  19. const uint32_t* const src,
  20. int num_pixels, uint32_t* dst) {
  21. // sign-extended multiplying constants, pre-shifted by 5.
  22. #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
  23. const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
  24. (CST(green_to_blue_) & 0xffff));
  25. const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
  26. #undef CST
  27. const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
  28. const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
  29. -1, 9, -1, 9, -1, 13, -1, 13);
  30. const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
  31. -1, 10, -1, -1, -1, 14, -1, -1);
  32. int i;
  33. for (i = 0; i + 4 <= num_pixels; i += 4) {
  34. const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
  35. const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
  36. const __m128i C = _mm_mulhi_epi16(B, mults_rb);
  37. const __m128i D = _mm_add_epi8(A, C);
  38. const __m128i E = _mm_shuffle_epi8(D, perm2);
  39. const __m128i F = _mm_mulhi_epi16(E, mults_b2);
  40. const __m128i G = _mm_add_epi8(D, F);
  41. const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
  42. _mm_storeu_si128((__m128i*)&dst[i], out);
  43. }
  44. // Fall-back to C-version for left-overs.
  45. if (i != num_pixels) {
  46. VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  47. }
  48. }
  49. //------------------------------------------------------------------------------
  50. #define ARGB_TO_RGB_SSE41 do { \
  51. while (num_pixels >= 16) { \
  52. const __m128i in0 = _mm_loadu_si128(in + 0); \
  53. const __m128i in1 = _mm_loadu_si128(in + 1); \
  54. const __m128i in2 = _mm_loadu_si128(in + 2); \
  55. const __m128i in3 = _mm_loadu_si128(in + 3); \
  56. const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \
  57. const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \
  58. const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \
  59. const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \
  60. const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
  61. const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
  62. const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
  63. _mm_storeu_si128(out + 0, b0); \
  64. _mm_storeu_si128(out + 1, b1); \
  65. _mm_storeu_si128(out + 2, b2); \
  66. in += 4; \
  67. out += 3; \
  68. num_pixels -= 16; \
  69. } \
  70. } while (0)
  71. static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
  72. uint8_t* dst) {
  73. const __m128i* in = (const __m128i*)src;
  74. __m128i* out = (__m128i*)dst;
  75. const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
  76. 8, 14, 13, 12, -1, -1, -1, -1);
  77. const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
  78. const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
  79. const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
  80. ARGB_TO_RGB_SSE41;
  81. // left-overs
  82. if (num_pixels > 0) {
  83. VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  84. }
  85. }
  86. static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
  87. int num_pixels, uint8_t* dst) {
  88. const __m128i* in = (const __m128i*)src;
  89. __m128i* out = (__m128i*)dst;
  90. const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
  91. 12, 13, 14, -1, -1, -1, -1);
  92. const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
  93. const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
  94. const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
  95. ARGB_TO_RGB_SSE41;
  96. // left-overs
  97. if (num_pixels > 0) {
  98. VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  99. }
  100. }
  101. #undef ARGB_TO_RGB_SSE41
  102. //------------------------------------------------------------------------------
  103. // Entry point
  104. extern void VP8LDspInitSSE41(void);
  105. WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
  106. VP8LTransformColorInverse = TransformColorInverse_SSE41;
  107. VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
  108. VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
  109. }
  110. #else // !WEBP_USE_SSE41
// WEBP_USE_SSE41 is not defined: provide VP8LDspInitSSE41 through the stub
// macro instead of the implementation above.
// NOTE(review): WEBP_DSP_INIT_STUB is not defined in this file (it is expected
// to come in via ./dsp.h); presumably it expands to an empty function, confirm.
WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
  112. #endif // WEBP_USE_SSE41