// File: multiword_64_64_intrinsic_i386_mmx.cc
  1. // Copyright 2010 Google Inc. All rights reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Implements 64-bit multiword CRC using MMX built-in functions.
  15. #include "generic_crc.h"
  16. #if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX
  17. namespace crcutil {
  18. template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
  19. const void *data, size_t bytes, const uint64 &start)
  20. const GCC_OMIT_FRAME_POINTER;
  21. #if !defined(_MSC_VER)
  22. template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
  23. const void *data,
  24. size_t bytes,
  25. const uint64 &start) const {
  26. if (bytes <= 7) {
  27. const uint8 *src = static_cast<const uint8 *>(data);
  28. uint64 crc = start ^ Base().Canonize();
  29. for (const uint8 *end = src + bytes; src < end; ++src) {
  30. CRC_BYTE(this, crc, *src);
  31. }
  32. return (crc ^ Base().Canonize());
  33. }
  34. return CrcMultiwordI386Mmx(data, bytes, start);
  35. }
  36. #else
  37. #pragma warning(push)
  38. // CL: uninitialized local variable 'crc1' used
  39. // Wrong: crc1 = XOR(crc1, crc1) sets it to 0.
  40. #pragma warning(disable: 4700)
  41. #pragma warning(disable: 4619) // there is no warning number '592'
  42. // ICL: variable "crc1" is used before its value is set
  43. // Wrong: crc1 = XOR(crc1, crc1) sets it to 0.
  44. #pragma warning(disable: 592)
  45. #endif // !defined(_MSC_VER)
  46. #define MM64(adr) reinterpret_cast<const __m64 *>(adr)
  47. #define MM64_TABLE(byte) MM64(crc_word_interleaved_[byte])
  48. #define CRC_WORD_MMX(this, crc, buf) do { \
  49. buf = _mm_xor_si64(buf, crc); \
  50. uint32 tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \
  51. buf = _mm_srli_si64(buf, 32); \
  52. crc = MM64(crc_word_[0])[TO_BYTE(tmp)]; \
  53. tmp >>= 8; \
  54. crc = _mm_xor_si64(crc, MM64(crc_word_[1])[TO_BYTE(tmp)]); \
  55. tmp >>= 8; \
  56. crc = _mm_xor_si64(crc, MM64(crc_word_[2])[TO_BYTE(tmp)]); \
  57. tmp >>= 8; \
  58. crc = _mm_xor_si64(crc, MM64(crc_word_[3])[tmp]); \
  59. tmp = static_cast<uint32>(_mm_cvtsi64_si32(buf)); \
  60. crc = _mm_xor_si64(crc, MM64(crc_word_[4])[TO_BYTE(tmp)]); \
  61. tmp >>= 8; \
  62. crc = _mm_xor_si64(crc, MM64(crc_word_[5])[TO_BYTE(tmp)]); \
  63. tmp >>= 8; \
  64. crc = _mm_xor_si64(crc, MM64(crc_word_[6])[TO_BYTE(tmp)]); \
  65. tmp >>= 8; \
  66. crc = _mm_xor_si64(crc, MM64(crc_word_[7])[tmp]); \
  67. } while (0)
  68. template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
  69. const void *data, size_t bytes, const uint64 &start) const {
  70. const uint8 *src = static_cast<const uint8 *>(data);
  71. const uint8 *end = src + bytes;
  72. uint64 crc = start ^ Base().Canonize();
  73. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc, uint64);
  74. if (src >= end) {
  75. return (crc ^ Base().Canonize());
  76. }
  77. // Process 4 registers of sizeof(uint64) bytes at once.
  78. bytes = static_cast<size_t>(end - src) & ~(4*8 - 1);
  79. if (bytes > 4*8) {
  80. const uint8 *stop = src + bytes - 4*8;
  81. union {
  82. __m64 m64;
  83. uint64 u64;
  84. } temp;
  85. __m64 crc0;
  86. __m64 crc1;
  87. __m64 crc2;
  88. __m64 crc3;
  89. __m64 buf0 = MM64(src)[0];
  90. __m64 buf1 = MM64(src)[1];
  91. __m64 buf2 = MM64(src)[2];
  92. __m64 buf3 = MM64(src)[3];
  93. temp.u64 = crc;
  94. crc0 = temp.m64;
  95. #if defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4)
  96. // There is no way to suppress a warning in GCC;
  97. // generate extra assignments.
  98. temp.u64 = 0;
  99. crc1 = temp.m64;
  100. crc2 = temp.m64;
  101. crc3 = temp.m64;
  102. #else
  103. crc1 = _mm_xor_si64(crc1, crc1);
  104. crc2 = _mm_xor_si64(crc2, crc2);
  105. crc3 = _mm_xor_si64(crc3, crc3);
  106. #endif // defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 4)
  107. do {
  108. PREFETCH(src);
  109. src += 4*8;
  110. buf0 = _mm_xor_si64(buf0, crc0);
  111. buf1 = _mm_xor_si64(buf1, crc1);
  112. buf2 = _mm_xor_si64(buf2, crc2);
  113. buf3 = _mm_xor_si64(buf3, crc3);
  114. uint32 tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0));
  115. uint32 tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1));
  116. uint32 tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2));
  117. uint32 tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3));
  118. buf0 = _mm_srli_si64(buf0, 32);
  119. buf1 = _mm_srli_si64(buf1, 32);
  120. buf2 = _mm_srli_si64(buf2, 32);
  121. buf3 = _mm_srli_si64(buf3, 32);
  122. crc0 = MM64_TABLE(0)[TO_BYTE(tmp0)];
  123. tmp0 >>= 8;
  124. crc1 = MM64_TABLE(0)[TO_BYTE(tmp1)];
  125. tmp1 >>= 8;
  126. crc2 = MM64_TABLE(0)[TO_BYTE(tmp2)];
  127. tmp2 >>= 8;
  128. crc3 = MM64_TABLE(0)[TO_BYTE(tmp3)];
  129. tmp3 >>= 8;
  130. #define XOR(byte) do { \
  131. crc0 = _mm_xor_si64(crc0, MM64_TABLE(byte)[TO_BYTE(tmp0)]); \
  132. tmp0 >>= 8; \
  133. crc1 = _mm_xor_si64(crc1, MM64_TABLE(byte)[TO_BYTE(tmp1)]); \
  134. tmp1 >>= 8; \
  135. crc2 = _mm_xor_si64(crc2, MM64_TABLE(byte)[TO_BYTE(tmp2)]); \
  136. tmp2 >>= 8; \
  137. crc3 = _mm_xor_si64(crc3, MM64_TABLE(byte)[TO_BYTE(tmp3)]); \
  138. tmp3 >>= 8; \
  139. } while (0)
  140. XOR(1);
  141. XOR(2);
  142. crc0 = _mm_xor_si64(crc0, MM64_TABLE(3)[tmp0]);
  143. tmp0 = static_cast<uint32>(_mm_cvtsi64_si32(buf0));
  144. crc1 = _mm_xor_si64(crc1, MM64_TABLE(3)[tmp1]);
  145. tmp1 = static_cast<uint32>(_mm_cvtsi64_si32(buf1));
  146. crc2 = _mm_xor_si64(crc2, MM64_TABLE(3)[tmp2]);
  147. tmp2 = static_cast<uint32>(_mm_cvtsi64_si32(buf2));
  148. crc3 = _mm_xor_si64(crc3, MM64_TABLE(3)[tmp3]);
  149. tmp3 = static_cast<uint32>(_mm_cvtsi64_si32(buf3));
  150. XOR(4);
  151. XOR(5);
  152. XOR(6);
  153. #undef XOR
  154. crc0 = _mm_xor_si64(crc0, MM64_TABLE(sizeof(uint64) - 1)[tmp0]);
  155. buf0 = MM64(src)[0];
  156. crc1 = _mm_xor_si64(crc1, MM64_TABLE(sizeof(uint64) - 1)[tmp1]);
  157. buf1 = MM64(src)[1];
  158. crc2 = _mm_xor_si64(crc2, MM64_TABLE(sizeof(uint64) - 1)[tmp2]);
  159. buf2 = MM64(src)[2];
  160. crc3 = _mm_xor_si64(crc3, MM64_TABLE(sizeof(uint64) - 1)[tmp3]);
  161. buf3 = MM64(src)[3];
  162. }
  163. while (src < stop);
  164. CRC_WORD_MMX(this, crc0, buf0);
  165. buf1 = _mm_xor_si64(buf1, crc1);
  166. CRC_WORD_MMX(this, crc0, buf1);
  167. buf2 = _mm_xor_si64(buf2, crc2);
  168. CRC_WORD_MMX(this, crc0, buf2);
  169. buf3 = _mm_xor_si64(buf3, crc3);
  170. CRC_WORD_MMX(this, crc0, buf3);
  171. temp.m64 = crc0;
  172. crc = temp.u64;
  173. _mm_empty();
  174. src += 4*8;
  175. }
  176. // Process sizeof(uint64) bytes at once.
  177. bytes = static_cast<size_t>(end - src) & ~(sizeof(uint64) - 1);
  178. if (bytes > 0) {
  179. union {
  180. __m64 m64;
  181. uint64 u64;
  182. } temp;
  183. __m64 crc0;
  184. temp.u64 = crc;
  185. crc0 = temp.m64;
  186. for (const uint8 *stop = src + bytes; src < stop; src += sizeof(uint64)) {
  187. __m64 buf0 = MM64(src)[0];
  188. CRC_WORD_MMX(this, crc0, buf0);
  189. }
  190. temp.m64 = crc0;
  191. crc = temp.u64;
  192. _mm_empty();
  193. }
  194. // Compute CRC of remaining bytes.
  195. for (;src < end; ++src) {
  196. CRC_BYTE(this, crc, *src);
  197. }
  198. return (crc ^ Base().Canonize());
  199. }
  200. #if defined(_MSC_VER)
  201. #pragma warning(pop)
  202. #endif // defined(_MSC_VER)
  203. } // namespace crcutil
  204. #endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX