// Copyright 2010 Google Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Defines GenericCrc class which implements arbitrary CRCs. // // Please read crc.pdf to understand how it all works. #ifndef CRCUTIL_GENERIC_CRC_H_ #define CRCUTIL_GENERIC_CRC_H_ #include "base_types.h" // uint8 #include "crc_casts.h" // TO_BYTE(), Downcast<>. #include "gf_util.h" // GfUtil class. #include "platform.h" // GCC_ALIGN_ATTRIBUTE(16) #include "uint128_sse2.h" // uint128_sse2 type (if necessary) namespace crcutil { #pragma pack(push, 16) // Extends CRC by one byte. // Technically, if degree of a polynomial does not exceed 8, // right shift by 8 bits is not required, but who cares about CRC-8? #define CRC_BYTE(table, crc, byte) do { \ crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \ table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \ } while (0) #define TABLE_ENTRY(table, byte, buf) \ table[byte][Downcast(buf)] #define TABLE_ENTRY_LAST(table, buf) \ table[sizeof(Word) - 1][buf] // Extends CRC by one word. #define CRC_WORD(table, crc, buf) do { \ buf ^= Downcast(crc); \ if (sizeof(crc) > sizeof(buf)) { \ crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \ crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \ } else { \ crc = TABLE_ENTRY(table->crc_word_, 0, buf); \ } \ buf >>= 8; \ for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \ crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \ buf >>= 8; \ } \ crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \ } while (0) // Process beginning of data block byte by byte until source pointer // becomes perfectly aligned on Word boundary. #define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \ while ((reinterpret_cast(src) & (sizeof(Word) - 1)) != 0) { \ if (src >= end) { \ return (crc ^ table->Base().Canonize()); \ } \ CRC_BYTE(table, crc, *src); \ src += 1; \ } \ } while (0) // On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks // but 6-10% faster on larger blocks (>= 2KB). // Break-even point (+-1%) is around 1KB (Q9650, E6600). // #define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \ do { \ if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \ ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \ } \ } while (0) #if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable: 4127) // conditional expression is constant #endif // defined(_MSC_VER) // Forward declarations. template class RollingCrc; // Crc is the type used internally and to return values of N-bit CRC. // It should be at least as large as "TableEntry" and "Word" but // may be larger (e.g. for 16-bit CRC, TableEntry and Word may be // set to uint16 but Crc may be set to uint32). // // TableEntry is the type of values stored in the tables. // To implement N-bit CRC, TableEntry should be large enough // to store N bits. // // Word is the type used to read data sizeof(Word) at a time. // Ideally, it shoulde be "most suitable for given architecture" // integer type -- typically "size_t". // // kStride is the number of words processed in interleaved manner by // CrcMultiword() and CrcWordblock(). Shall be either 3 or 4. // Optimal value depends on hardware architecture (AMD64, ARM, etc). // template class GenericCrc { public: // Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.) typedef _Crc Crc; typedef _TableEntry TableEntry; typedef _Word Word; GenericCrc() {} // Initializes the tables given generating polynomial of degree. // If "canonical" is true, crc value will be XOR'ed with (-1) before and // after actual CRC computation. GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) { Init(generating_polynomial, degree, canonical); } void Init(const Crc &generating_polynomial, size_t degree, bool canonical) { base_.Init(generating_polynomial, degree, canonical); // Instead of computing // table[j][i] = MultiplyUnnormalized(i, 8, k), // for all i = 0...255, we may notice that // if i = 2**n then for all m = 1...(i-1) // MultiplyUnnormalized(i + m, 8, k) = // MultiplyUnnormalized(i ^ m, 8, k) = // MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) = // MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] = // table[i] ^ table[m]. #if 0 for (size_t j = 0; j < sizeof(Word); ++j) { Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); for (size_t i = 0; i < 256; ++i) { Crc temp = Base().MultiplyUnnormalized(static_cast(i), 8, k); this->crc_word_interleaved_[j][i] = Downcast(temp); } } #else for (size_t j = 0; j < sizeof(Word); ++j) { Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree); TableEntry *table = this->crc_word_interleaved_[j]; table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0. for (size_t i = 1; i < 256; i <<= 1) { TableEntry value = Downcast( Base().MultiplyUnnormalized(static_cast(i), 8, k)); table[i] = value; for (size_t m = 1; m < i; ++m) { table[i + m] = value ^ table[m]; } } } #endif #if 0 for (size_t j = 0; j < sizeof(Word); ++j) { Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); for (size_t i = 0; i < 256; ++i) { Crc temp = Base().MultiplyUnnormalized(static_cast(i), 8, k); this->crc_word_[j][i] = Downcast(temp); } } #else for (size_t j = 0; j < sizeof(Word); ++j) { Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree); TableEntry *table = this->crc_word_[j]; table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0. for (size_t i = 1; i < 256; i <<= 1) { TableEntry value = Downcast( Base().MultiplyUnnormalized(static_cast(i), 8, k)); table[i] = value; for (size_t m = 1; m < i; ++m) { table[i + m] = value ^ table[m]; } } } #endif } // Default CRC implementation Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const { #if HAVE_AMD64 || HAVE_I386 || defined(__aarch64__) return CrcMultiword(data, bytes, start); #else // Very few CPUs have multiple ALUs and speculative execution // (Itanium is an exception) so sophisticated algorithms will // not perform better than good old Sarwate algorithm. return CrcByteUnrolled(data, bytes, start); #endif // HAVE_AMD64 || HAVE_I386 } // Returns base class. const GfUtil &Base() const { return base_; } protected: // Canonical, byte-by-byte CRC computation. Crc CrcByte(const void *data, size_t bytes, const Crc &start) const { const uint8 *src = static_cast(data); Crc crc = start ^ Base().Canonize(); for (const uint8 *end = src + bytes; src < end; ++src) { CRC_BYTE(this, crc, *src); } return (crc ^ Base().Canonize()); } // Byte-by-byte CRC with main loop unrolled. Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const { if (bytes == 0) { return start; } const uint8 *src = static_cast(data); const uint8 *end = src + bytes; Crc crc = start ^ Base().Canonize(); // Unroll loop 4 times. end -= 3; for (; src < end; src += 4) { PREFETCH(src); CRC_BYTE(this, crc, src[0]); CRC_BYTE(this, crc, src[1]); CRC_BYTE(this, crc, src[2]); CRC_BYTE(this, crc, src[3]); } end += 3; // Compute CRC of remaining bytes. for (; src < end; ++src) { CRC_BYTE(this, crc, *src); } return (crc ^ Base().Canonize()); } // Canonical, byte-by-byte CRC computation. Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const { const uint8 *src = static_cast(data); const uint8 *end = src + bytes; Crc crc0 = start ^ Base().Canonize(); ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc); if (src >= end) { return (crc0 ^ Base().Canonize()); } // Process 4*sizeof(Crc) bytes at a time. end -= 4 * sizeof(Crc) - 1; for (; src < end; src += 4 * sizeof(Crc)) { for (size_t i = 0; i < 4; ++i) { crc0 ^= reinterpret_cast(src)[i]; if (i == 0) { PREFETCH(src); } for (size_t byte = 0; byte < sizeof(crc0); ++byte) { CRC_BYTE(this, crc0, 0); } } } end += 4 * sizeof(Crc) - 1; // Process sizeof(Crc) bytes at a time. end -= sizeof(Crc) - 1; for (; src < end; src += sizeof(Crc)) { crc0 ^= reinterpret_cast(src)[0]; for (size_t byte = 0; byte < sizeof(crc0); ++byte) { CRC_BYTE(this, crc0, 0); } } end += sizeof(Crc) - 1; // Compute CRC of remaining bytes. for (;src < end; ++src) { CRC_BYTE(this, crc0, *src); } return (crc0 ^ Base().Canonize()); } // Faster, word-by-word CRC. Crc CrcWord(const void *data, size_t bytes, const Crc &start) const { const uint8 *src = static_cast(data); const uint8 *end = src + bytes; Crc crc0 = start ^ Base().Canonize(); ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); if (src >= end) { return (crc0 ^ Base().Canonize()); } // Process 4 sizeof(Word) bytes at once. end -= 4 * sizeof(Word) - 1; for (; src < end; src += 4 * sizeof(Word)) { Word buf0 = reinterpret_cast(src)[0]; PREFETCH(src); CRC_WORD(this, crc0, buf0); buf0 = reinterpret_cast(src)[1]; CRC_WORD(this, crc0, buf0); buf0 = reinterpret_cast(src)[2]; CRC_WORD(this, crc0, buf0); buf0 = reinterpret_cast(src)[3]; CRC_WORD(this, crc0, buf0); } end += 4 * sizeof(Word) - 1; // Process sizeof(Word) bytes at a time. end -= sizeof(Word) - 1; for (; src < end; src += sizeof(Word)) { Word buf0 = reinterpret_cast(src)[0]; CRC_WORD(this, crc0, buf0); } end += sizeof(Word) - 1; // Compute CRC of remaining bytes. for (;src < end; ++src) { CRC_BYTE(this, crc0, *src); } return (crc0 ^ Base().Canonize()); } #define REPEAT_FROM_1(macro) \ macro(1); \ macro(2); \ macro(3); \ macro(4); \ macro(5); \ macro(6); \ macro(7); #define REPEAT_FROM_0(macro) \ macro(0); \ REPEAT_FROM_1(macro) // Faster, process adjusent blocks in parallel and concatenate CRCs. Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const { if (kStride < 2 || kStride > 8) { // Unsupported configuration; // fall back to something sensible. return CrcWord(data, bytes, start); } const uint8 *src = static_cast(data); const uint8 *end = src + bytes; Crc crc0 = start ^ Base().Canonize(); enum { // Add 16 to avoid false L1 cache collisions. kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1), }; ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); if (src >= end) { return (crc0 ^ Base().Canonize()); } end -= kStride * kStripe - 1; if (src < end) { Crc x_pow_8kStripe = Base().Xpow8N(kStripe); do { const uint8 *stripe_end = src + kStripe; #define INIT_CRC(reg) \ Crc crc##reg; \ if (kStride >= reg) { \ crc##reg = 0; \ } REPEAT_FROM_1(INIT_CRC); #undef INIT_CRC do { #define FIRST(reg) \ Word buf##reg; \ if (kStride > reg) { \ buf##reg = reinterpret_cast(src + reg * kStripe)[0]; \ buf##reg ^= Downcast(crc##reg); \ if (sizeof(crc##reg) > sizeof(buf##reg)) { \ crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \ crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ } else { \ crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \ } \ buf##reg >>= 8; \ } REPEAT_FROM_0(FIRST); #undef FIRST for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) { #define NEXT(reg) do { \ if (kStride > reg) { \ crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \ buf##reg >>= 8; \ } \ } while (0) REPEAT_FROM_0(NEXT); #undef NEXT } #define LAST(reg) do { \ if (kStride > reg) { \ crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \ } \ } while (0) REPEAT_FROM_0(LAST); #undef LAST src += sizeof(Word); } while (src < stripe_end); #if 0 // The code is left for illustrational purposes only. #define COMBINE(reg) do { \ if (reg > 0 && kStride > reg) { \ crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \ } \ } while (0) #else #define COMBINE(reg) do { \ if (reg > 0 && kStride > reg) { \ crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \ } \ } while (0) #endif REPEAT_FROM_0(COMBINE); #undef COMBINE src += (kStride - 1) * kStripe; } while (src < end); } end += kStride * kStripe - 1; // Process sizeof(Word) bytes at a time. end -= sizeof(Word) - 1; for (; src < end; src += sizeof(Word)) { Word buf0 = reinterpret_cast(src)[0]; CRC_WORD(this, crc0, buf0); } end += sizeof(Word) - 1; // Compute CRC of remaining bytes. for (;src < end; ++src) { CRC_BYTE(this, crc0, *src); } return (crc0 ^ Base().Canonize()); } // Fastest, interleaved multi-byte CRC. Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const { if (kStride < 2 || kStride > 8) { // Unsupported configuration; // fall back to something sensible. return CrcWord(data, bytes, start); } const uint8 *src = static_cast(data); const uint8 *end = src + bytes; Crc crc0 = start ^ Base().Canonize(); ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word); if (src >= end) { return (crc0 ^ Base().Canonize()); } // Process kStride Word registers at once; // should have have at least 2*kInterleaveBytes of data to start. end -= 2*kInterleaveBytes - 1; if (src < end) { Crc crc_carryover; if (sizeof(Crc) > sizeof(Word)) { // crc_carryover is used if and only if Crc is wider than Word. crc_carryover = 0; } #define INIT_CRC(reg) \ Crc crc##reg; \ if (reg > 0 && kStride > reg) { \ crc##reg = 0; \ } REPEAT_FROM_1(INIT_CRC); #undef INIT_CRC #define INIT_BUF(reg) \ Word buf##reg; \ if (kStride > reg) { \ buf##reg = reinterpret_cast(src)[reg]; \ } REPEAT_FROM_0(INIT_BUF); #undef INIT_BUF do { PREFETCH(src); src += kInterleaveBytes; if (sizeof(Crc) > sizeof(Word)) { crc0 ^= crc_carryover; } #define FIRST(reg, next_reg) do { \ if (kStride > reg) { \ buf##reg ^= Downcast(crc##reg); \ if (sizeof(Crc) > sizeof(Word)) { \ if (reg < kStride - 1) { \ crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ } else { \ crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \ } \ } \ crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \ buf##reg >>= 8; \ } \ } while (0) FIRST(0, 1); FIRST(1, 2); FIRST(2, 3); FIRST(3, 4); FIRST(4, 5); FIRST(5, 6); FIRST(6, 7); FIRST(7, 0); #undef FIRST for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) { #define NEXT(reg) do { \ if (kStride > reg) { \ crc##reg ^= \ TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \ buf##reg >>= 8; \ } \ } while(0) REPEAT_FROM_0(NEXT); #undef NEXT } #define LAST(reg) do { \ if (kStride > reg) { \ crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \ buf##reg = reinterpret_cast(src)[reg]; \ } \ } while(0) REPEAT_FROM_0(LAST); #undef LAST } while (src < end); if (sizeof(Crc) > sizeof(Word)) { crc0 ^= crc_carryover; } #define COMBINE(reg) do { \ if (kStride > reg) { \ if (reg != 0) { \ crc0 ^= crc##reg; \ } \ CRC_WORD(this, crc0, buf##reg); \ } \ } while (0) REPEAT_FROM_0(COMBINE); #undef COMBINE src += kInterleaveBytes; } end += 2*kInterleaveBytes - 1; // Process sizeof(Word) bytes at once. end -= sizeof(Word) - 1; for (; src < end; src += sizeof(Word)) { Word buf0 = reinterpret_cast(src)[0]; CRC_WORD(this, crc0, buf0); } end += sizeof(Word) - 1; // Compute CRC of remaining bytes. for (;src < end; ++src) { CRC_BYTE(this, crc0, *src); } return (crc0 ^ Base().Canonize()); } protected: enum { kInterleaveBytes = sizeof(Word) * kStride, }; // Multiplication tables used by CRCs. TableEntry crc_word_interleaved_[sizeof(Word)][256]; TableEntry crc_word_[sizeof(Word)][256]; // Base class stored after CRC tables so that the most frequently // used table is at offset 0 and may be accessed faster. GfUtil base_; friend class RollingCrc< GenericCrc >; private: // CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is // noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on // hardware and compiler). However, there are problems with compilers. // // Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory. // // 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads: // CL 15.00.307291.1 C++ >1.2< CPU cycles/byte // ICL 11.1.051 -O3 C++ 1.5 CPU cycles/byte // GCC 4.5 -O3 C++ 2.0 CPU cycles/byte // GCC 4.x -O3 ASM >1.2< CPU cycles/byte // // 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads // CL 15.00.307291.1 C++ 2.0 CPU cycles/byte // GCC 4.5 -O3 C++ 1.9 CPU cycles/byte // ICL 11.1.051 -S C++ 1.6 CPU cycles/byte // GCC 4.x -O3 ASM >1.3< CPU cycles/byte // // So, use inline ASM code for GCC for both i386 and amd64. Crc CrcMultiwordI386Mmx( const void *data, size_t bytes, const Crc &start) const; Crc CrcMultiwordGccAmd64( const void *data, size_t bytes, const Crc &start) const; Crc CrcMultiwordGccAmd64Sse2( const uint8 *src, const uint8 *end, const Crc &start) const; } GCC_ALIGN_ATTRIBUTE(16); #undef REPEAT_FROM_0 #undef REPEAT_FROM_1 // Specialized variants. #if CRCUTIL_USE_ASM #if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) // Declare specialized functions. template<> uint64 GenericCrc::CrcMultiword( const void *data, size_t bytes, const uint64 &start) const; #if HAVE_AMD64 && HAVE_SSE2 template<> uint128_sse2 GenericCrc::CrcMultiword( const void *data, size_t bytes, const uint128_sse2 &start) const; #endif // HAVE_AMD64 && HAVE_SSE2 #elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \ (HAVE_I386 && HAVE_MMX) // Work around bug in MSC (present at least in v. 15.00.30729.1) template<> uint64 GenericCrc::CrcMultiwordI386Mmx( const void *data, size_t bytes, const uint64 &start) const; template<> __forceinline uint64 GenericCrc::CrcMultiword( const void *data, size_t bytes, const uint64 &start) const { typedef uint64 Word; typedef uint64 Crc; if (bytes <= 12) { const uint8 *src = static_cast(data); uint64 crc = start ^ Base().Canonize(); for (const uint8 *end = src + bytes; src < end; ++src) { CRC_BYTE(this, crc, *src); } return (crc ^ Base().Canonize()); } return CrcMultiwordI386Mmx(data, bytes, start); } #endif // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX))) #endif // CRCUTIL_USE_ASM #pragma pack(pop) } // namespace crcutil #endif // CRCUTIL_GENERIC_CRC_H_