crc.cc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. // Copyright 2022 The Abseil Authors.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Implementation of CRCs (aka Rabin Fingerprints).
  15. // Treats the input as a polynomial with coefficients in Z(2),
  16. // and finds the remainder when divided by an irreducible polynomial
  17. // of the appropriate length.
  18. // It handles all CRC sizes from 8 to 128 bits.
  19. // It's somewhat complicated by having separate implementations optimized for
  20. // CRC's <=32 bits, <= 64 bits, and <= 128 bits.
  21. // The input string is prefixed with a "1" bit, and has "degree" "0" bits
  22. // appended to it before the remainder is found. This ensures that
  23. // short strings are scrambled somewhat and that strings consisting
  24. // of all nulls have a non-zero CRC.
  25. //
  26. // Uses the "interleaved word-by-word" method from
  27. // "Everything we know about CRC but afraid to forget" by Andrew Kadatch
  28. // and Bob Jenkins,
  29. // http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
  30. //
  31. // The idea is to compute kStride CRCs simultaneously, allowing the
  32. // processor to more effectively use multiple execution units. Each of
  33. // the CRCs is calculated on one word of data followed by kStride - 1
  34. // words of zeroes; the CRC starting points are staggered by one word.
  35. // Assuming a stride of 4 with data words "ABCDABCDABCD", the first
  36. // CRC is over A000A000A, the second over 0B000B000B, and so on.
  37. // The CRC of the whole data is then calculated by properly aligning the
  38. // CRCs by appending zeroes until the data lengths agree then XORing
  39. // the CRCs.
  40. #include "y_absl/crc/internal/crc.h"
  41. #include <cstdint>
  42. #include "y_absl/base/internal/endian.h"
  43. #include "y_absl/base/internal/raw_logging.h"
  44. #include "y_absl/base/prefetch.h"
  45. #include "y_absl/crc/internal/crc_internal.h"
  46. namespace y_absl {
  47. Y_ABSL_NAMESPACE_BEGIN
  48. namespace crc_internal {
  49. namespace {
  50. // Constants
  51. #if defined(__i386__) || defined(__x86_64__)
  52. constexpr bool kNeedAlignedLoads = false;
  53. #else
  54. constexpr bool kNeedAlignedLoads = true;
  55. #endif
  56. // We express the number of zeroes as a number in base ZEROES_BASE. By
  57. // pre-computing the zero extensions for all possible components of such an
  58. // expression (numbers in a form a*ZEROES_BASE**b), we can calculate the
  59. // resulting extension by multiplying the extensions for individual components
  60. // using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of
  61. // zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(64)) entries.
  62. constexpr int ZEROES_BASE_LG = 4; // log_2(ZEROES_BASE)
  63. constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG); // must be a power of 2
  64. constexpr uint32_t kCrc32cPoly = 0x82f63b78;
  65. uint32_t ReverseBits(uint32_t bits) {
  66. bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1;
  67. bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2;
  68. bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4;
  69. return y_absl::gbswap_32(bits);
  70. }
  71. // Polynomial long multiplication mod the polynomial of degree 32.
  72. void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) {
  73. uint32_t l = *val;
  74. uint32_t result = 0;
  75. auto onebit = uint32_t{0x80000000u};
  76. for (uint32_t one = onebit; one != 0; one >>= 1) {
  77. if ((l & one) != 0) {
  78. result ^= m;
  79. }
  80. if (m & 1) {
  81. m = (m >> 1) ^ poly;
  82. } else {
  83. m >>= 1;
  84. }
  85. }
  86. *val = result;
  87. }
  88. } // namespace
  89. void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size,
  90. Uint32By256* t) {
  91. for (int j = 0; j != word_size; j++) { // for each byte of extension....
  92. t[j][0] = 0; // a zero has no effect
  93. for (int i = 128; i != 0; i >>= 1) { // fill in entries for powers of 2
  94. if (j == 0 && i == 128) {
  95. t[j][i] = last; // top bit in last byte is given
  96. } else {
  97. // each successive power of two is derived from the previous
  98. // one, either in this table, or the last table
  99. uint32_t pred;
  100. if (i == 128) {
  101. pred = t[j - 1][1];
  102. } else {
  103. pred = t[j][i << 1];
  104. }
  105. // Advance the CRC by one bit (multiply by X, and take remainder
  106. // through one step of polynomial long division)
  107. if (pred & 1) {
  108. t[j][i] = (pred >> 1) ^ poly;
  109. } else {
  110. t[j][i] = pred >> 1;
  111. }
  112. }
  113. }
  114. // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b)
  115. // so we can make all the tables for non-powers of two by
  116. // xoring previously created entries.
  117. for (int i = 2; i != 256; i <<= 1) {
  118. for (int k = i + 1; k != (i << 1); k++) {
  119. t[j][k] = t[j][i] ^ t[j][k - i];
  120. }
  121. }
  122. }
  123. }
  124. int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) {
  125. uint32_t inc = 1;
  126. inc <<= 31;
  127. // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0.
  128. inc >>= 1;
  129. // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte.
  130. for (int i = 0; i < 3; ++i) {
  131. PolyMultiply(&inc, inc, poly);
  132. }
  133. int j = 0;
  134. for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) {
  135. // Every entry in the table adds an additional inc_len zeroes.
  136. uint32_t v = inc;
  137. for (int a = 1; a != ZEROES_BASE; a++) {
  138. t[0][j] = v;
  139. PolyMultiply(&v, inc, poly);
  140. j++;
  141. }
  142. inc = v;
  143. }
  144. Y_ABSL_RAW_CHECK(j <= 256, "");
  145. return j;
  146. }
  147. // Internal version of the "constructor".
  148. CRCImpl* CRCImpl::NewInternal() {
  149. // Find an accelearated implementation first.
  150. CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined();
  151. // Fall back to generic implementions if no acceleration is available.
  152. if (result == nullptr) {
  153. result = new CRC32();
  154. }
  155. result->InitTables();
  156. return result;
  157. }
  158. // The 32-bit implementation
  159. void CRC32::InitTables() {
  160. // Compute the table for extending a CRC by one byte.
  161. Uint32By256* t = new Uint32By256[4];
  162. FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t);
  163. for (int i = 0; i != 256; i++) {
  164. this->table0_[i] = t[0][i];
  165. }
  166. // Construct a table for updating the CRC by 4 bytes data followed by
  167. // 12 bytes of zeroes.
  168. //
  169. // Note: the data word size could be larger than the CRC size; it might
  170. // be slightly faster to use a 64-bit data word, but doing so doubles the
  171. // table size.
  172. uint32_t last = kCrc32cPoly;
  173. const size_t size = 12;
  174. for (size_t i = 0; i < size; ++i) {
  175. last = (last >> 8) ^ this->table0_[last & 0xff];
  176. }
  177. FillWordTable(kCrc32cPoly, last, 4, t);
  178. for (size_t b = 0; b < 4; ++b) {
  179. for (int i = 0; i < 256; ++i) {
  180. this->table_[b][i] = t[b][i];
  181. }
  182. }
  183. int j = FillZeroesTable(kCrc32cPoly, t);
  184. Y_ABSL_RAW_CHECK(j <= static_cast<int>(Y_ABSL_ARRAYSIZE(this->zeroes_)), "");
  185. for (int i = 0; i < j; i++) {
  186. this->zeroes_[i] = t[0][i];
  187. }
  188. delete[] t;
  189. // Build up tables for _reversing_ the operation of doing CRC operations on
  190. // zero bytes.
  191. // In C++, extending `crc` by a single zero bit is done by the following:
  192. // (A) bool low_bit_set = (crc & 1);
  193. // crc >>= 1;
  194. // if (low_bit_set) crc ^= kCrc32cPoly;
  195. //
  196. // In particular note that the high bit of `crc` after this operation will be
  197. // set if and only if the low bit of `crc` was set before it. This means that
  198. // no information is lost, and the operation can be reversed, as follows:
  199. // (B) bool high_bit_set = (crc & 0x80000000u);
  200. // if (high_bit_set) crc ^= kCrc32cPoly;
  201. // crc <<= 1;
  202. // if (high_bit_set) crc ^= 1;
  203. //
  204. // Or, equivalently:
  205. // (C) bool high_bit_set = (crc & 0x80000000u);
  206. // crc <<= 1;
  207. // if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1);
  208. //
  209. // The last observation is, if we store our checksums in variable `rcrc`,
  210. // with order of the bits reversed, the inverse operation becomes:
  211. // (D) bool low_bit_set = (rcrc & 1);
  212. // rcrc >>= 1;
  213. // if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1)
  214. //
  215. // This is the same algorithm (A) that we started with, only with a different
  216. // polynomial bit pattern. This means that by building up our tables with
  217. // this alternate polynomial, we can apply the CRC algorithms to a
  218. // bit-reversed CRC checksum to perform inverse zero-extension.
  219. const uint32_t kCrc32cUnextendPoly =
  220. ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1));
  221. FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_);
  222. j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_);
  223. Y_ABSL_RAW_CHECK(j <= static_cast<int>(Y_ABSL_ARRAYSIZE(this->reverse_zeroes_)),
  224. "");
  225. }
  226. void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const {
  227. const uint8_t* p = static_cast<const uint8_t*>(bytes);
  228. const uint8_t* e = p + length;
  229. uint32_t l = *crc;
  230. auto step_one_byte = [this, &p, &l]() {
  231. int c = (l & 0xff) ^ *p++;
  232. l = this->table0_[c] ^ (l >> 8);
  233. };
  234. if (kNeedAlignedLoads) {
  235. // point x at first 4-byte aligned byte in string. this might be past the
  236. // end of the string.
  237. const uint8_t* x = RoundUp<4>(p);
  238. if (x <= e) {
  239. // Process bytes until finished or p is 4-byte aligned
  240. while (p != x) {
  241. step_one_byte();
  242. }
  243. }
  244. }
  245. const size_t kSwathSize = 16;
  246. if (static_cast<size_t>(e - p) >= kSwathSize) {
  247. // Load one swath of data into the operating buffers.
  248. uint32_t buf0 = y_absl::little_endian::Load32(p) ^ l;
  249. uint32_t buf1 = y_absl::little_endian::Load32(p + 4);
  250. uint32_t buf2 = y_absl::little_endian::Load32(p + 8);
  251. uint32_t buf3 = y_absl::little_endian::Load32(p + 12);
  252. p += kSwathSize;
  253. // Increment a CRC value by a "swath"; this combines the four bytes
  254. // starting at `ptr` and twelve zero bytes, so that four CRCs can be
  255. // built incrementally and combined at the end.
  256. const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) {
  257. return y_absl::little_endian::Load32(ptr) ^
  258. this->table_[3][crc_in & 0xff] ^
  259. this->table_[2][(crc_in >> 8) & 0xff] ^
  260. this->table_[1][(crc_in >> 16) & 0xff] ^
  261. this->table_[0][crc_in >> 24];
  262. };
  263. // Run one CRC calculation step over all swaths in one 16-byte stride
  264. const auto step_stride = [&]() {
  265. buf0 = step_swath(buf0, p);
  266. buf1 = step_swath(buf1, p + 4);
  267. buf2 = step_swath(buf2, p + 8);
  268. buf3 = step_swath(buf3, p + 12);
  269. p += 16;
  270. };
  271. // Process kStride interleaved swaths through the data in parallel.
  272. while ((e - p) > kPrefetchHorizon) {
  273. PrefetchToLocalCacheNta(
  274. reinterpret_cast<const void*>(p + kPrefetchHorizon));
  275. // Process 64 bytes at a time
  276. step_stride();
  277. step_stride();
  278. step_stride();
  279. step_stride();
  280. }
  281. while (static_cast<size_t>(e - p) >= kSwathSize) {
  282. step_stride();
  283. }
  284. // Now advance one word at a time as far as possible. This isn't worth
  285. // doing if we have word-advance tables.
  286. while (static_cast<size_t>(e - p) >= 4) {
  287. buf0 = step_swath(buf0, p);
  288. uint32_t tmp = buf0;
  289. buf0 = buf1;
  290. buf1 = buf2;
  291. buf2 = buf3;
  292. buf3 = tmp;
  293. p += 4;
  294. }
  295. // Combine the results from the different swaths. This is just a CRC
  296. // on the data values in the bufX words.
  297. auto combine_one_word = [this](uint32_t crc_in, uint32_t w) {
  298. w ^= crc_in;
  299. for (size_t i = 0; i < 4; ++i) {
  300. w = (w >> 8) ^ this->table0_[w & 0xff];
  301. }
  302. return w;
  303. };
  304. l = combine_one_word(0, buf0);
  305. l = combine_one_word(l, buf1);
  306. l = combine_one_word(l, buf2);
  307. l = combine_one_word(l, buf3);
  308. }
  309. // Process the last few bytes
  310. while (p != e) {
  311. step_one_byte();
  312. }
  313. *crc = l;
  314. }
  315. void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length,
  316. const uint32_t zeroes_table[256],
  317. const uint32_t poly_table[256]) {
  318. if (length != 0) {
  319. uint32_t l = *crc;
  320. // For each ZEROES_BASE_LG bits in length
  321. // (after the low-order bits have been removed)
  322. // we lookup the appropriate polynomial in the zeroes_ array
  323. // and do a polynomial long multiplication (mod the CRC polynomial)
  324. // to extend the CRC by the appropriate number of bits.
  325. for (int i = 0; length != 0;
  326. i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) {
  327. int c = length & (ZEROES_BASE - 1); // pick next ZEROES_BASE_LG bits
  328. if (c != 0) { // if they are not zero,
  329. // multiply by entry in table
  330. // Build a table to aid in multiplying 2 bits at a time.
  331. // It takes too long to build tables for more bits.
  332. uint64_t m = zeroes_table[c + i - 1];
  333. m <<= 1;
  334. uint64_t m2 = m << 1;
  335. uint64_t mtab[4] = {0, m, m2, m2 ^ m};
  336. // Do the multiply one byte at a time.
  337. uint64_t result = 0;
  338. for (int x = 0; x < 32; x += 8) {
  339. // The carry-less multiply.
  340. result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^
  341. (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6);
  342. l >>= 8;
  343. // Reduce modulo the polynomial
  344. result = (result >> 8) ^ poly_table[result & 0xff];
  345. }
  346. l = static_cast<uint32_t>(result);
  347. }
  348. }
  349. *crc = l;
  350. }
  351. }
  352. void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const {
  353. return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_);
  354. }
  355. void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const {
  356. // See the comment in CRC32::InitTables() for an explanation of the algorithm
  357. // below.
  358. *crc = ReverseBits(*crc);
  359. ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_);
  360. *crc = ReverseBits(*crc);
  361. }
  362. void CRC32::Scramble(uint32_t* crc) const {
  363. // Rotate by near half the word size plus 1. See the scramble comment in
  364. // crc_internal.h for an explanation.
  365. constexpr int scramble_rotate = (32 / 2) + 1;
  366. *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo),
  367. 32, scramble_rotate) &
  368. MaskOfLength<uint32_t>(32);
  369. }
  370. void CRC32::Unscramble(uint32_t* crc) const {
  371. constexpr int scramble_rotate = (32 / 2) + 1;
  372. uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32,
  373. 32 - scramble_rotate);
  374. *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32);
  375. }
  376. // Constructor and destructor for base class CRC.
  377. CRC::~CRC() {}
  378. CRC::CRC() {}
  379. // The "constructor" for a CRC32C with a standard polynomial.
  380. CRC* CRC::Crc32c() {
  381. static CRC* singleton = CRCImpl::NewInternal();
  382. return singleton;
  383. }
  384. } // namespace crc_internal
  385. Y_ABSL_NAMESPACE_END
  386. } // namespace y_absl