ssim_sse2.c 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. // Copyright 2017 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // SSE2 version of distortion calculation
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
  13. #include "./dsp.h"
  14. #if defined(WEBP_USE_SSE2)
  15. #include <assert.h>
  16. #include <emmintrin.h>
  17. #include "./common_sse2.h"
  18. #if !defined(WEBP_DISABLE_STATS)
  19. // Helper function
  20. static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
  21. __m128i* const sum) {
  22. // take abs(a-b) in 8b
  23. const __m128i a_b = _mm_subs_epu8(a, b);
  24. const __m128i b_a = _mm_subs_epu8(b, a);
  25. const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
  26. // zero-extend to 16b
  27. const __m128i zero = _mm_setzero_si128();
  28. const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
  29. const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
  30. // multiply with self
  31. const __m128i sum1 = _mm_madd_epi16(C0, C0);
  32. const __m128i sum2 = _mm_madd_epi16(C1, C1);
  33. *sum = _mm_add_epi32(sum1, sum2);
  34. }
  35. //------------------------------------------------------------------------------
  36. // SSIM / PSNR entry point
  // Returns the sum of squared differences between the first 'len' bytes of
  // 'src1' and 'src2'. Blocks of 16 bytes go through SSE2; whatever remains
  // (or everything, when len < 16) is handled by the scalar loop at the end.
  // The total is accumulated modulo 2^32.
  static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                     const uint8_t* src2, int len) {
    int i = 0;
    uint32_t sse2 = 0;
    if (len >= 16) {
      // The loop below consumes 32 bytes per iteration.
      const int limit = len - 32;
      // Lanes are reduced as unsigned values: adding them as int32_t could
      // overflow (undefined behavior) for large inputs, whereas the result
      // type is uint32_t and is expected to simply wrap around.
      uint32_t tmp[4];
      __m128i sum1;
      __m128i sum = _mm_setzero_si128();
      // Software pipelining: the next 16 bytes are always loaded before the
      // previously loaded ones are processed.
      __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      while (i <= limit) {
        const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
        const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
        __m128i sum2;
        i += 16;
        SubtractAndSquare_SSE2(a0, b0, &sum1);
        sum = _mm_add_epi32(sum, sum1);
        a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
        b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
        i += 16;
        SubtractAndSquare_SSE2(a1, b1, &sum2);
        sum = _mm_add_epi32(sum, sum2);
      }
      // Flush the block that is still pending in a0/b0.
      SubtractAndSquare_SSE2(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      _mm_storeu_si128((__m128i*)tmp, sum);
      sse2 += tmp[3] + tmp[2] + tmp[1] + tmp[0];
    }
    // Scalar tail.
    for (; i < len; ++i) {
      const int32_t diff = src1[i] - src2[i];
      sse2 += (uint32_t)(diff * diff);
    }
    return sse2;
  }
  73. #endif // !defined(WEBP_DISABLE_STATS)
  74. #if !defined(WEBP_REDUCE_SIZE)
  // Returns the sum of the eight 16b lanes of '*m'. Lane k and lane k + 4 are
  // first added with 16b wrap-around; the four partial sums are then added up
  // as 32b values.
  static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
    uint16_t lanes[8];
    uint32_t total = 0;
    int k;
    const __m128i v = *m;
    // Fold the upper 64 bits onto the lower 64 bits.
    const __m128i folded = _mm_add_epi16(v, _mm_srli_si128(v, 8));
    _mm_storeu_si128((__m128i*)lanes, folded);
    // Only the four low lanes hold meaningful partial sums.
    for (k = 0; k < 4; ++k) {
      total += lanes[k];
    }
    return total;
  }
  // Returns the sum (modulo 2^32) of the four 32b lanes of '*m'.
  static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
    const __m128i v = *m;
    // Add neighbouring lanes: lane 0 <- v0 + v1, lane 2 <- v2 + v3.
    const __m128i pairs = _mm_add_epi32(v, _mm_srli_si128(v, 4));
    // Bring lane 2 down onto lane 0 and add: lane 0 <- v0 + v1 + v2 + v3.
    const __m128i all = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));
    return (uint32_t)_mm_cvtsi128_si32(all);
  }
  // Per-column weights of the kernel. The row load below fetches eight bytes,
  // so the eighth weight is zero and the extra byte contributes nothing.
  static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };

  // Accumulates one row of both sources, weighted by Wx * WEIGHT, and advances
  // the source pointers to the next row.
  // Names taken from the expanding scope: 'zero' and 'Wx' (read), the
  // accumulators 'xm', 'ym' (16b lanes) and 'xxm', 'xym', 'yym' (32b lanes),
  // the pointers 'src1' / 'src2' and their strides 'stride1' / 'stride2'.
  #define ACCUMULATE_ROW(WEIGHT) do { \
    /* combined weight of every column for this row */ \
    const __m128i w16 = _mm_mullo_epi16(Wx, _mm_set1_epi16((WEIGHT))); \
    /* fetch 8 bytes per source (only 7 carry a non-zero weight) */ \
    /* and zero-extend them to 16b lanes */ \
    const __m128i px1 = \
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)src1), zero); \
    const __m128i px2 = \
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)src2), zero); \
    /* weighted samples */ \
    const __m128i wpx1 = _mm_mullo_epi16(px1, w16); \
    const __m128i wpx2 = _mm_mullo_epi16(px2, w16); \
    /* first-order sums, kept in 16b lanes */ \
    xm = _mm_add_epi16(xm, wpx1); \
    ym = _mm_add_epi16(ym, wpx2); \
    /* second-order sums, pairwise-added into 32b lanes */ \
    xxm = _mm_add_epi32(xxm, _mm_madd_epi16(px1, wpx1)); \
    yym = _mm_add_epi32(yym, _mm_madd_epi16(px2, wpx2)); \
    xym = _mm_add_epi32(xym, _mm_madd_epi16(px1, wpx2)); \
    /* step both sources to their next row */ \
    src1 += stride1; \
    src2 += stride2; \
  } while (0)
  110. static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
  111. const uint8_t* src2, int stride2) {
  112. VP8DistoStats stats;
  113. const __m128i zero = _mm_setzero_si128();
  114. __m128i xm = zero, ym = zero; // 16b accums
  115. __m128i xxm = zero, yym = zero, xym = zero; // 32b accum
  116. const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
  117. assert(2 * VP8_SSIM_KERNEL + 1 == 7);
  118. ACCUMULATE_ROW(1);
  119. ACCUMULATE_ROW(2);
  120. ACCUMULATE_ROW(3);
  121. ACCUMULATE_ROW(4);
  122. ACCUMULATE_ROW(3);
  123. ACCUMULATE_ROW(2);
  124. ACCUMULATE_ROW(1);
  125. stats.xm = HorizontalAdd16b_SSE2(&xm);
  126. stats.ym = HorizontalAdd16b_SSE2(&ym);
  127. stats.xxm = HorizontalAdd32b_SSE2(&xxm);
  128. stats.xym = HorizontalAdd32b_SSE2(&xym);
  129. stats.yym = HorizontalAdd32b_SSE2(&yym);
  130. return VP8SSIMFromStats(&stats);
  131. }
  132. #endif // !defined(WEBP_REDUCE_SIZE)
  133. extern void VP8SSIMDspInitSSE2(void);
  134. WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
  135. #if !defined(WEBP_DISABLE_STATS)
  136. VP8AccumulateSSE = AccumulateSSE_SSE2;
  137. #endif
  138. #if !defined(WEBP_REDUCE_SIZE)
  139. VP8SSIMGet = SSIMGet_SSE2;
  140. #endif
  141. }
  142. #else // !WEBP_USE_SSE2
  // Built without WEBP_USE_SSE2: none of the code above is compiled.
  // NOTE(review): WEBP_DSP_INIT_STUB presumably defines an empty
  // VP8SSIMDspInitSSE2() so the symbol still exists -- confirm in dsp.h.
  143. WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
  144. #endif // WEBP_USE_SSE2