123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369 |
- // Copyright 2010 Google Inc. All rights reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- // Implements CRC32C using Intel's SSE4 crc32 instruction.
- // Uses _mm_crc32_u64/32/8 intrinsics if CRCUTIL_USE_MM_CRC32 is not zero,
- // emilates intrinsics via CRC_WORD/CRC_BYTE otherwise.
- #include "crc32c_sse4.h"
- #include <util/system/compiler.h>
- #if HAVE_I386 || HAVE_AMD64
- namespace crcutil {
- #define UPDATE_STRIPE_CRCS(index, block_size, num_stripes) do { \
- CRC_UPDATE_WORD(crc0, \
- reinterpret_cast<const size_t *>(src + \
- 0 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
- CRC_UPDATE_WORD(crc1, \
- reinterpret_cast<const size_t *>(src + \
- 1 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
- CRC_UPDATE_WORD(crc2, \
- reinterpret_cast<const size_t *>(src + \
- 2 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
- if (num_stripes > 3) { \
- CRC_UPDATE_WORD(crc3, \
- reinterpret_cast<const size_t *>(src + \
- 3 * CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes))[index]); \
- } \
- } while (0)
- // Multiplies "crc" by "x**(8 * STRIPE_SIZE(block_size)"
- // using appropriate multiplication table(s).
- //
- #if 0
- // This variant is for illustration purposes only.
- // Actual implementation below:
- // 1. Splits the computation into 2 data-independent paths
- // by independently multiplying lower and upper halves
- // of "crc0" in interleaved manner, and combining the
- // results in the end.
- // 2. Removing redundant "crc0 = 0" etc. in the beginning.
- // 3. Removing redundant shifts of "tmp0" and "tmp1" in the last round.
- #define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
- size_t tmp0 = crc0; \
- crc0 = 0; \
- for (size_t i = 0; i < kNumTables; ++i) { \
- crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [i][tmp0 & (kTableEntries - 1)]; \
- tmp0 >>= kTableEntryBits; \
- } \
- } while (0)
- #else
- #define MULTIPLY_CRC(crc0, block_size, num_stripes) do { \
- size_t tmp0 = crc0; \
- size_t tmp1 = crc0 >> (kTableEntryBits * kNumTablesHalfHi); \
- crc0 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [0][tmp0 & (kTableEntries - 1)]; \
- tmp0 >>= kTableEntryBits; \
- size_t crc1 = CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
- tmp1 >>= kTableEntryBits; \
- for (size_t i = 1; i < kNumTablesHalfLo - 1; ++i) { \
- crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [i][tmp0 & (kTableEntries - 1)]; \
- tmp0 >>= kTableEntryBits; \
- crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [i + kNumTablesHalfHi][tmp1 & (kTableEntries - 1)]; \
- tmp1 >>= kTableEntryBits; \
- } \
- crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [kNumTablesHalfLo - 1][tmp0 & (kTableEntries - 1)]; \
- if (kNumTables & 1) { \
- tmp0 >>= kTableEntryBits; \
- } \
- crc1 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [kNumTables - 1][tmp1]; \
- if (kNumTables & 1) { \
- crc0 ^= CRC32C_SSE4_MUL_TABLE(block_size, num_stripes) \
- [kNumTablesHalfLo][tmp0 & (kTableEntries - 1)]; \
- } \
- crc0 ^= crc1; \
- } while (0)
- #endif
- // Given CRCs (crc0, crc1, etc.) of consequitive
- // stripes of STRIPE_SIZE(block_size) bytes each,
- // produces CRC of concatenated stripes.
- #define COMBINE_STRIPE_CRCS(block_size, num_stripes) do { \
- MULTIPLY_CRC(crc0, block_size, num_stripes); \
- crc0 ^= crc1; \
- MULTIPLY_CRC(crc0, block_size, num_stripes); \
- crc0 ^= crc2; \
- if (num_stripes > 3) { \
- MULTIPLY_CRC(crc0, block_size, num_stripes); \
- crc0 ^= crc3; \
- } \
- } while (0)
- // Processes input BLOCK_SIZE(block) bytes per iteration
- // by splitting a block of BLOCK_SIZE(block) bytes into N
- // equally-sized stripes of STRIPE_SIZE(block_size) each,
- // computing CRC of each stripe, and concatenating stripe CRCs.
- #define PROCESS_BLOCK(block_size, num_stripes) do { \
- while (bytes >= CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
- Crc crc1 = 0; \
- Crc crc2 = 0; \
- Crc crc3; \
- if (num_stripes > 3) crc3 = 0; \
- { \
- const uint8 *stripe_end = src + \
- (CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) / \
- kUnrolledLoopBytes) * kUnrolledLoopBytes; \
- do { \
- UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(1, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(2, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(3, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(4, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(5, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(6, block_size, num_stripes); \
- UPDATE_STRIPE_CRCS(7, block_size, num_stripes); \
- src += kUnrolledLoopBytes; \
- } while (src < stripe_end); \
- if ((CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
- kUnrolledLoopBytes) != 0) { \
- stripe_end += \
- CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) % \
- kUnrolledLoopBytes; \
- do { \
- UPDATE_STRIPE_CRCS(0, block_size, num_stripes); \
- src += sizeof(size_t); \
- } while (src < stripe_end); \
- } \
- } \
- COMBINE_STRIPE_CRCS(block_size, num_stripes); \
- src += CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes) * \
- ((num_stripes) - 1); \
- bytes = static_cast<size_t>(end - src); \
- } \
- no_more_##block_size##_##num_stripes:; \
- } while (0)
- Y_NO_SANITIZE("undefined")
- size_t Crc32cSSE4::Crc32c(const void *data, size_t bytes, Crc crc0) const {
- const uint8 *src = static_cast<const uint8 *>(data);
- const uint8 *end = src + bytes;
- crc0 ^= Base().Canonize();
- // If we don't have too much data to process,
- // do not waste time trying to align input etc.
- // Noticeably improves performance on small inputs.
- if (bytes < 4 * sizeof(size_t)) goto less_than_4_size_t;
- if (bytes < 8 * sizeof(size_t)) goto less_than_8_size_t;
- if (bytes < 16 * sizeof(size_t)) goto less_than_16_size_t;
- #define PROCESS_TAIL_IF_SMALL(block_size, num_stripes) do { \
- if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
- goto no_more_##block_size##_##num_stripes; \
- } \
- } while (0)
- #define NOOP(block_size, num_stripes)
- CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(PROCESS_TAIL_IF_SMALL,
- NOOP,
- NOOP);
- #undef PROCESS_TAIL_IF_SMALL
- // Do not use ALIGN_ON_WORD_BOUNDARY_IF_NEEDED() here because:
- // 1. It uses CRC_BYTE() which won't work.
- // 2. Its threshold may be incorrect becuase Crc32 that uses
- // native CPU crc32 instruction is much faster than
- // generic table-based CRC computation.
- //
- // In case of X5550 CPU, break even point is at 2KB -- exactly.
- if (bytes >= 2 * 1024) {
- while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) {
- if (src >= end) {
- return (crc0 ^ Base().Canonize());
- }
- CRC_UPDATE_BYTE(crc0, src[0]);
- src += 1;
- }
- bytes = static_cast<size_t>(end - src);
- }
- if (src >= end) {
- return (crc0 ^ Base().Canonize());
- }
- // Quickly skip processing of too large blocks
- // Noticeably improves performance on small inputs.
- #define SKIP_BLOCK_IF_NEEDED(block_size, num_stripes) do { \
- if (bytes < CRC32C_SSE4_BLOCK_SIZE(block_size, num_stripes)) { \
- goto no_more_##block_size##_##num_stripes; \
- } \
- } while (0)
- CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_ASCENDING(NOOP,
- SKIP_BLOCK_IF_NEEDED,
- SKIP_BLOCK_IF_NEEDED);
- #undef SKIP_BLOCK_IF_NEEDED
- // Process data in all blocks.
- CRC32C_SSE4_ENUMERATE_ALL_BLOCKS_DESCENDING(PROCESS_BLOCK,
- PROCESS_BLOCK,
- PROCESS_BLOCK);
- // Finish the tail word-by-word and then byte-by-byte.
- #define CRC_UPDATE_WORD_4(index) do { \
- CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index]); \
- CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 1]); \
- CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 2]); \
- CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[index + 3]); \
- } while (0)
- if (bytes >= 4 * 4 * sizeof(size_t)) {
- end -= 4 * 4 * sizeof(size_t);
- do {
- CRC_UPDATE_WORD_4(4 * 0);
- CRC_UPDATE_WORD_4(4 * 1);
- CRC_UPDATE_WORD_4(4 * 2);
- CRC_UPDATE_WORD_4(4 * 3);
- src += 4 * 4 * sizeof(size_t);
- } while (src <= end);
- end += 4 * 4 * sizeof(size_t);
- bytes = static_cast<size_t>(end - src);
- }
- less_than_16_size_t:
- if (bytes >= 4 * 2 * sizeof(size_t)) {
- CRC_UPDATE_WORD_4(4 * 0);
- CRC_UPDATE_WORD_4(4 * 1);
- src += 4 * 2 * sizeof(size_t);
- bytes -= 4 * 2 * sizeof(size_t);
- }
- less_than_8_size_t:
- if (bytes >= 4 * sizeof(size_t)) {
- CRC_UPDATE_WORD_4(0);
- src += 4 * sizeof(size_t);
- bytes -= 4 * sizeof(size_t);
- }
- less_than_4_size_t:
- if (bytes >= 1 * sizeof(size_t)) {
- end -= 1 * sizeof(size_t);
- do {
- CRC_UPDATE_WORD(crc0, reinterpret_cast<const size_t *>(src)[0]);
- src += 1 * sizeof(size_t);
- } while (src <= end);
- end += 1 * sizeof(size_t);
- }
- while (src < end) {
- CRC_UPDATE_BYTE(crc0, src[0]);
- src += 1;
- }
- return (crc0 ^ Base().Canonize());
- }
- void Crc32cSSE4::Init(bool constant) {
- base_.Init(FixedGeneratingPolynomial(), FixedDegree(), constant);
- #define INIT_MUL_TABLE(block_size, num_stripes) do { \
- size_t multiplier = \
- Base().Xpow8N(CRC32C_SSE4_STRIPE_SIZE(block_size, num_stripes)); \
- for (size_t table = 0; table < kNumTables; ++table) { \
- for (size_t entry = 0; entry < kTableEntries; ++entry) { \
- size_t value = static_cast<uint32>(entry << (kTableEntryBits * table)); \
- CRC32C_SSE4_MUL_TABLE(block_size, num_stripes)[table][entry] = \
- static_cast<Entry>(Base().Multiply(value, multiplier)); \
- } \
- } \
- } while (0)
- CRC32C_SSE4_ENUMERATE_ALL_BLOCKS(INIT_MUL_TABLE);
- #undef INIT_MUL_TABLE
- #if !CRCUTIL_USE_MM_CRC32
- for (size_t j = 0; j < sizeof(Word); ++j) {
- Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + 32);
- for (size_t i = 0; i < 256; ++i) {
- crc_word_[j][i] = Base().MultiplyUnnormalized(i, 8, k);
- }
- }
- #endif // !CRCUTIL_USE_MM_CRC32
- }
- bool Crc32cSSE4::IsSSE42Available() {
- #if defined(_MSC_VER)
- int cpu_info[4];
- __cpuid(cpu_info, 1);
- return ((cpu_info[2] & (1 << 20)) != 0);
- #elif defined(__GNUC__) && (HAVE_AMD64 || HAVE_I386)
- // Not using "cpuid.h" intentionally: it is missing from
- // too many installations.
- uint32 eax;
- uint32 ecx;
- uint32 edx;
- __asm__ volatile(
- #if HAVE_I386 && defined(__PIC__)
- "push %%ebx\n"
- "cpuid\n"
- "pop %%ebx\n"
- #else
- "cpuid\n"
- #endif // HAVE_I386 && defined(__PIC__)
- : "=a" (eax), "=c" (ecx), "=d" (edx)
- : "a" (1), "2" (0)
- : "%ebx"
- );
- return ((ecx & (1 << 20)) != 0);
- #else
- return false;
- #endif
- }
- void RollingCrc32cSSE4::Init(const Crc32cSSE4 &crc,
- size_t roll_window_bytes,
- const Crc &start_value) {
- crc_ = &crc;
- roll_window_bytes_ = roll_window_bytes;
- start_value_ = start_value;
- Crc add = crc.Base().Canonize() ^ start_value;
- add = crc.Base().Multiply(add, crc.Base().Xpow8N(roll_window_bytes));
- add ^= crc.Base().Canonize();
- Crc mul = crc.Base().One() ^ crc.Base().Xpow8N(1);
- add = crc.Base().Multiply(add, mul);
- mul = crc.Base().XpowN(8 * roll_window_bytes + crc.Base().Degree());
- for (size_t i = 0; i < 256; ++i) {
- out_[i] = static_cast<Entry>(
- crc.Base().MultiplyUnnormalized(
- static_cast<Crc>(i), 8, mul) ^ add);
- }
- #if !CRCUTIL_USE_MM_CRC32
- memcpy(crc_word_, crc_->crc_word_, sizeof(crc_word_));
- #endif // !CRCUTIL_USE_MM_CRC32
- }
- } // namespace crcutil
- #endif // HAVE_I386 || HAVE_AMD64
|