crc32c_sse4.cc

// Copyright 2010 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Implements CRC32C using Intel's SSE4 crc32 instruction.
// Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
// emulates the intrinsics via CRC_WORD/CRC_BYTE otherwise.

#include "crc32c_sse4.h"

#include <util/system/compiler.h>

#if HAVE_I386 || HAVE_AMD64

namespace crcutil {
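
// Consumes one machine word from each of the parallel stripes and feeds it
// into a separate CRC accumulator (crc0..crc3).  Keeping the accumulators
// independent lets the CPU overlap several crc32 instructions instead of
// serializing them on a single dependency chain.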
#define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \
  CRC_UPDATE_WORD(crc0, \
      reinterpret_cast<const size_t *>(src + \
          0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
  CRC_UPDATE_WORD(crc1, \
      reinterpret_cast<const size_t *>(src + \
          1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
  CRC_UPDATE_WORD(crc2, \
      reinterpret_cast<const size_t *>(src + \
          2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
  if (num_stripes > 3) { \
    CRC_UPDATE_WORD(crc3, \
        reinterpret_cast<const size_t *>(src + \
            3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
  } \
} while (0)

// Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size))"
// using appropriate multiplication table(s).
//
#if 0
// This variant is for illustration purposes only.
// The actual implementation below:
// 1. Splits the computation into 2 data-independent paths
//    by independently multiplying lower and upper halves
//    of "crc0" in interleaved manner, and combining the
//    results in the end.
// 2. Removes the redundant "crc0 = 0" etc. in the beginning.
// 3. Removes the redundant shifts of "tmp0" and "tmp1" in the last round.
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
  size_t tmp0 = crc0; \
  crc0 = 0; \
  for (size_t i = 0; i < kNumTables; ++i) { \
    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
        [i][tmp0 & (kTableEntries - 1)]; \
    tmp0 >>= kTableEntryBits; \
  } \
} while (0)
#else
#define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
  size_t tmp0 = crc0; \
  size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \
  crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
      [0][tmp0 & (kTableEntries - 1)]; \
  tmp0 >>= kTableEntryBits; \
  size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
      [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
  tmp1 >>= kTableEntryBits; \
  for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \
    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
        [i][tmp0 & (kTableEntries - 1)]; \
    tmp0 >>= kTableEntryBits; \
    crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
        [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
    tmp1 >>= kTableEntryBits; \
  } \
  crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
      [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \
  if (kNumTables & 1) { \
    tmp0 >>= kTableEntryBits; \
  } \
  crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
      [kNumTables - 1][tmp1]; \
  if (kNumTables & 1) { \
    crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
        [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \
  } \
  crc0 ^= crc1; \
} while (0)
#endif
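
// Why a table-driven multiplication works here: over GF(2), the CRC of two
// concatenated stripes A and B (without the Canonize() pre/post inversion,
// which Crc32c() strips off before entering the block loop) satisfies
//
//   CRC(A || B) = CRC(A) * x**(8 * len(B)) + CRC(B)   (addition is XOR),
//
// so shifting a stripe CRC past the following stripe is multiplication by a
// per-stripe-size constant, which the tables evaluate kTableEntryBits bits
// of the input CRC at a time.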

// Given CRCs (crc0, crc1, etc.) of consecutive
// stripes of STRIPE_SIZE(block_size) bytes each,
// produces the CRC of the concatenated stripes.
#define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \
  MULTIPLY_CRC(crc0, block_size, num_stripes); \
  crc0 ^= crc1; \
  MULTIPLY_CRC(crc0, block_size, num_stripes); \
  crc0 ^= crc2; \
  if (num_stripes > 3) { \
    MULTIPLY_CRC(crc0, block_size, num_stripes); \
    crc0 ^= crc3; \
  } \
} while (0)

// Processes CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes) input bytes per
// iteration by splitting each block into "num_stripes" equally-sized stripes
// of STRIPE_SIZE(block_size) bytes, computing the CRC of each stripe, and
// combining the stripe CRCs into the CRC of the whole block.
#define PROCESS_BLOCK(block_size, num_stripes) do { \
  while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
    Crc crc1 = 0; \
    Crc crc2 = 0; \
    Crc crc3; \
    if (num_stripes > 3) crc3 = 0; \
    { \
      const uint8 *stripe_end = src + \
          (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \
              kUnrolledLoopBytes) * kUnrolledLoopBytes; \
      do { \
        UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \
        UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \
        src += kUnrolledLoopBytes; \
      } while (src < stripe_end); \
      if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
              kUnrolledLoopBytes) != 0) { \
        stripe_end += \
            CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
            kUnrolledLoopBytes; \
        do { \
          UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
          src += sizeof(size_t); \
        } while (src < stripe_end); \
      } \
    } \
    COMBINE_STRIPE_CRCS(block_size, num_stripes); \
    src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \
        ((num_stripes) - 1); \
    bytes = static_cast<size_t>(end - src); \
  } \
  no_more_##block_size##_##num_stripes:; \
} while (0)
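
// Each PROCESS_BLOCK expansion ends with a "no_more_<block_size>_<num_stripes>"
// label; the size checks in Crc32c() below (PROCESS_TAIL_IF_SMALL and
// SKIP_BLOCK_IF_NEEDED) jump forward to that label to skip block sizes that
// are larger than the remaining input.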

Y_NO_SANITIZE("undefined")
size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const {
  const uint8 *src = static_cast<const uint8 *>(data);
  const uint8 *end = src + bytes;
  crc0 ^= Base().Canonize();

  // If we don't have too much data to process,
  // do not waste time trying to align input etc.
  // Noticeably improves performance on small inputs.
  if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t;
  if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t;
  if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t;

#define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \
  if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
    goto no_more_##block_size##_##num_stripes; \
  } \
} while (0)
#define NOOP(block_size, num_stripes)

  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL,
                                             NOOP,
                                             NOOP);
#undef PROCESS_TAIL_IF_SMALL

  // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because:
  // 1. It uses CRC_BYTE(), which won't work here.
  // 2. Its threshold may be incorrect because a Crc32 that uses the
  //    native CPU crc32 instruction is much faster than
  //    generic table-based CRC computation.
  //
  // On an X5550 CPU the break-even point is at exactly 2KB.
  if (bytes >= 2 * 1024) {
    while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) {
      if (src >= end) {
        return (crc0 ^ Base().Canonize());
      }
      CRC_UPDATE_BYTE(crc0, src[0]);
      src += 1;
    }
    bytes = static_cast<size_t>(end - src);
  }

  if (src >= end) {
    return (crc0 ^ Base().Canonize());
  }

  // Quickly skip processing of too large blocks.
  // Noticeably improves performance on small inputs.
#define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \
  if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
    goto no_more_##block_size##_##num_stripes; \
  } \
} while (0)

  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP,
                                             SKIP_BLOCK_IF_NEEDED,
                                             SKIP_BLOCK_IF_NEEDED);
#undef SKIP_BLOCK_IF_NEEDED

  // Process data in all blocks.
  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK,
                                              PROCESS_BLOCK,
                                              PROCESS_BLOCK);

  // Finish the tail word-by-word and then byte-by-byte.
#define CRC_UPDATE_WORD_4(index) do { \
  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \
  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \
  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \
  CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \
} while (0)

  if (bytes >= 4 * 4 * sizeof(size_t)) {
    end -= 4 * 4 * sizeof(size_t);
    do {
      CRC_UPDATE_WORD_4(4 * 0);
      CRC_UPDATE_WORD_4(4 * 1);
      CRC_UPDATE_WORD_4(4 * 2);
      CRC_UPDATE_WORD_4(4 * 3);
      src += 4 * 4 * sizeof(size_t);
    } while (src <= end);
    end += 4 * 4 * sizeof(size_t);
    bytes = static_cast<size_t>(end - src);
  }

 less_than_16_size_t:
  if (bytes >= 4 * 2 * sizeof(size_t)) {
    CRC_UPDATE_WORD_4(4 * 0);
    CRC_UPDATE_WORD_4(4 * 1);
    src += 4 * 2 * sizeof(size_t);
    bytes -= 4 * 2 * sizeof(size_t);
  }

 less_than_8_size_t:
  if (bytes >= 4 * sizeof(size_t)) {
    CRC_UPDATE_WORD_4(0);
    src += 4 * sizeof(size_t);
    bytes -= 4 * sizeof(size_t);
  }

 less_than_4_size_t:
  if (bytes >= 1 * sizeof(size_t)) {
    end -= 1 * sizeof(size_t);
    do {
      CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]);
      src += 1 * sizeof(size_t);
    } while (src <= end);
    end += 1 * sizeof(size_t);
  }

  while (src < end) {
    CRC_UPDATE_BYTE(crc0, src[0]);
    src += 1;
  }

  return (crc0 ^ Base().Canonize());
}
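
#if 0
// Illustration only (not compiled), following the file's own #if 0 convention:
// a minimal sketch of how this class is meant to be driven, assuming the
// caller checks for hardware support first and that a zero starting value is
// acceptable.  The bool passed to Init() is simply forwarded to base_.Init()
// below; the fallback path is left to the caller.
static size_t ExampleCrc32c(const void *data, size_t bytes) {
  if (!crcutil::Crc32cSSE4::IsSSE42Available()) {
    return 0;  // A real caller would fall back to a table-based CRC here.
  }
  crcutil::Crc32cSSE4 crc;
  crc.Init(false);
  return crc.Crc32c(data, bytes, 0);
}
#endif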

void Crc32cSSE4::Init(bool constant) {
  base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant);

#define INIT_MUL_TABLE(block_size, num_stripes) do { \
  size_t multiplier = \
      Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \
  for (size_t table = 0; table < kNumTables; ++table) { \
    for (size_t entry = 0; entry < kTableEntries; ++entry) { \
      size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \
      CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \
          static_cast<Entry>(Base().Multiply(value, multiplier)); \
    } \
  } \
} while (0)

  CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE);
#undef INIT_MUL_TABLE

#if !CRCUTIL_USE_MM_CRC32
  for (size_t j = 0; j < sizeof(Word); ++j) {
    Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32);
    for (size_t i = 0; i < 256; ++i) {
      crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k);
    }
  }
#endif  // !CRCUTIL_USE_MM_CRC32
}
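
// CPUID leaf 1 reports SSE4.2 support in bit 20 of ECX; both the MSVC path
// and the GCC inline-assembly path below test that bit.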
bool Crc32cSSE4::IsSSE42Available() {
#if defined(_MSC_VER)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  return ((cpu_info[2] & (1 << 20)) != 0);
#elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386)
  // Not using "cpuid.h" intentionally: it is missing from
  // too many installations.
  uint32 eax;
  uint32 ecx;
  uint32 edx;
  __asm__ volatile(
#if HAVE_I386 && defined(__PIC__)
    "push %%ebx\n"
    "cpuid\n"
    "pop %%ebx\n"
#else
    "cpuid\n"
#endif  // HAVE_I386 && defined(__PIC__)
    : "=a" (eax), "=c" (ecx), "=d" (edx)
    : "a" (1), "2" (0)
    : "%ebx"
  );
  return ((ecx & (1 << 20)) != 0);
#else
  return false;
#endif
}
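
// RollingCrc32cSSE4 maintains a CRC over a fixed-size sliding window.
// Roughly speaking, out_[b] pre-computes the contribution that leaves the
// window when byte "b" falls out of it (folded together with the constant
// terms introduced by Canonize() and the starting value), so rolling the
// window by one byte costs one table lookup plus one byte CRC update.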
void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc,
                             size_t roll_window_bytes,
                             const Crc &start_value) {
  crc_ = &crc;
  roll_window_bytes_ = roll_window_bytes;
  start_value_ = start_value;

  Crc add = crc.Base().Canonize() ^ start_value;
  add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
  add ^= crc.Base().Canonize();
  Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
  add = crc.Base().Multiply(add, mul);

  mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
  for (size_t i = 0; i < 256; ++i) {
    out_[i] = static_cast<Entry>(
        crc.Base().MultiplyUnnormalized(
            static_cast<Crc>(i), 8, mul) ^ add);
  }

#if !CRCUTIL_USE_MM_CRC32
  memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_));
#endif  // !CRCUTIL_USE_MM_CRC32
}

}  // namespace crcutil

#endif  // HAVE_I386 || HAVE_AMD64