/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/checksums/private/crc_priv.h>

#include <aws/common/cpuid.h>

/* this implementation is only for the x86_64 Intel architecture */
#if defined(__x86_64__)

#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wdollar-in-identifier-extension"
#    endif

/*
 * Factored-out common inline asm for folding the crc0, crc1, crc2 stripes held in rcx, r11, r10,
 * using the specified magic constants K1 and K2.
 * Assumes rcx, r11, r10 contain the crc0, crc1, crc2 values that need folding.
 * Uses xmm1, xmm2, xmm3, xmm4 and clobbers r8 and r9; r11 is reused for the folded crc1.
 * The result is placed in ecx.
 */
#define FOLD_K1K2(NAME, K1, K2)                                                 \
    "fold_k1k2_" #NAME "_%=: \n"                                                \
    "movl " #K1 ", %%r8d               # Magic K1 constant \n"                  \
    "movl " #K2 ", %%r9d               # Magic K2 constant \n"                  \
    "movq %%rcx, %%xmm1                # crc0 into low qword of xmm1 \n"        \
    "movq %%r8, %%xmm3                 # K1 into low qword of xmm3 \n"          \
    "movq %%r11, %%xmm2                # crc1 into low qword of xmm2 \n"        \
    "movq %%r9, %%xmm4                 # K2 into low qword of xmm4 \n"          \
    "pclmulqdq $0x00, %%xmm3, %%xmm1   # Multiply crc0 by K1 \n"                \
    "pclmulqdq $0x00, %%xmm4, %%xmm2   # Multiply crc1 by K2 \n"                \
    "xor %%rcx, %%rcx                  # zero rcx for the crc0 fold \n"         \
    "xor %%r11, %%r11                  # zero r11 for the crc1 fold \n"         \
    "movq %%xmm1, %%r8                 # crc0 product out of xmm1 \n"           \
    "movq %%xmm2, %%r9                 # crc1 product out of xmm2 \n"           \
    "crc32q %%r8, %%rcx                # fold (reduce) the crc0 product \n"     \
    "crc32q %%r9, %%r11                # fold (reduce) the crc1 product \n"     \
    "xor %%r10d, %%ecx                 # combine crc2 and crc0 \n"              \
    "xor %%r11d, %%ecx                 # combine crc1 and crc0 \n"

/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, processing three stripes of the buffer in parallel (24 bytes per step). The three partial results
 * are folded together using CLMUL. This function is optimized for exactly 256-byte blocks, ideally aligned on
 * 8-byte memory addresses. It MUST be passed a pointer to input data that is exactly 256 bytes in length.
 * Note: this function does NOT invert the bits of the input crc or of the return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_256_%=:"
        "xor %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"

        /* the three stripes are 88, 88, and 80 bytes long */
        "crc32q   0(%[in]), %%rcx    # crc0 \n"
        "crc32q  88(%[in]), %%r11    # crc1 \n"
        "crc32q 176(%[in]), %%r10    # crc2 \n"

        "crc32q   8(%[in]), %%rcx    # crc0 \n"
        "crc32q  96(%[in]), %%r11    # crc1 \n"
        "crc32q 184(%[in]), %%r10    # crc2 \n"

        "crc32q  16(%[in]), %%rcx    # crc0 \n"
        "crc32q 104(%[in]), %%r11    # crc1 \n"
        "crc32q 192(%[in]), %%r10    # crc2 \n"

        "crc32q  24(%[in]), %%rcx    # crc0 \n"
        "crc32q 112(%[in]), %%r11    # crc1 \n"
        "crc32q 200(%[in]), %%r10    # crc2 \n"

        "crc32q  32(%[in]), %%rcx    # crc0 \n"
        "crc32q 120(%[in]), %%r11    # crc1 \n"
        "crc32q 208(%[in]), %%r10    # crc2 \n"

        "crc32q  40(%[in]), %%rcx    # crc0 \n"
        "crc32q 128(%[in]), %%r11    # crc1 \n"
        "crc32q 216(%[in]), %%r10    # crc2 \n"

        "crc32q  48(%[in]), %%rcx    # crc0 \n"
        "crc32q 136(%[in]), %%r11    # crc1 \n"
        "crc32q 224(%[in]), %%r10    # crc2 \n"

        "crc32q  56(%[in]), %%rcx    # crc0 \n"
        "crc32q 144(%[in]), %%r11    # crc1 \n"
        "crc32q 232(%[in]), %%r10    # crc2 \n"

        "crc32q  64(%[in]), %%rcx    # crc0 \n"
        "crc32q 152(%[in]), %%r11    # crc1 \n"
        "crc32q 240(%[in]), %%r10    # crc2 \n"

        "crc32q  72(%[in]), %%rcx    # crc0 \n"
        "crc32q 160(%[in]), %%r11    # crc1 \n"
        "crc32q 248(%[in]), %%r10    # crc2 \n"

        "crc32q  80(%[in]), %%rcx    # crc0 \n"
        "crc32q 168(%[in]), %%r11    # crc1 \n"

        FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers:
           [crc] is an input and output, so it is marked read/write (i.e. "+c") */
        : [ crc ] "+c"(crc)
        /* input registers */
        : [ in ] "d"(input)
        /* additional clobbered registers */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}
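
/*
 * Illustrative usage (it mirrors the dispatch loop in aws_checksums_crc32c_hw()
 * below): 256-byte blocks are chained by passing each result back in as the
 * running crc.
 *
 *     while (length >= 256) {
 *         crc = s_crc32c_sse42_clmul_256(input, crc);
 *         input += 256;
 *         length -= 256;
 *     }
 */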

/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, processing three stripes of the buffer in parallel (24 bytes per step). The three partial results
 * are folded together using CLMUL. This function is optimized for exactly 1024-byte blocks, ideally aligned on
 * 8-byte memory addresses. It MUST be passed a pointer to input data that is exactly 1024 bytes in length.
 * Note: this function does NOT invert the bits of the input crc or of the return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_1024_%=:"
        "xor %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"

        "movl $5, %%r8d      # loop 5 times through 64-byte chunks in 3 parallel stripes \n"

        "loop_1024_%=:"
        "prefetcht0  128(%[in])    # prefetch ahead in stripe0 \n"
        "prefetcht0  472(%[in])    # prefetch ahead in stripe1 \n"
        "prefetcht0  808(%[in])    # prefetch ahead in stripe2 \n"

        "crc32q   0(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 344(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 680(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   8(%[in]), %%rcx    # crc0 \n"
        "crc32q 352(%[in]), %%r11    # crc1 \n"
        "crc32q 688(%[in]), %%r10    # crc2 \n"

        "crc32q  16(%[in]), %%rcx    # crc0 \n"
        "crc32q 360(%[in]), %%r11    # crc1 \n"
        "crc32q 696(%[in]), %%r10    # crc2 \n"

        "crc32q  24(%[in]), %%rcx    # crc0 \n"
        "crc32q 368(%[in]), %%r11    # crc1 \n"
        "crc32q 704(%[in]), %%r10    # crc2 \n"

        "crc32q  32(%[in]), %%rcx    # crc0 \n"
        "crc32q 376(%[in]), %%r11    # crc1 \n"
        "crc32q 712(%[in]), %%r10    # crc2 \n"

        "crc32q  40(%[in]), %%rcx    # crc0 \n"
        "crc32q 384(%[in]), %%r11    # crc1 \n"
        "crc32q 720(%[in]), %%r10    # crc2 \n"

        "crc32q  48(%[in]), %%rcx    # crc0 \n"
        "crc32q 392(%[in]), %%r11    # crc1 \n"
        "crc32q 728(%[in]), %%r10    # crc2 \n"

        "crc32q  56(%[in]), %%rcx    # crc0 \n"
        "crc32q 400(%[in]), %%r11    # crc1 \n"
        "crc32q 736(%[in]), %%r10    # crc2 \n"

        "add $64, %[in]      # advance the shared base pointer \n"
        "sub $1, %%r8d       # decrement the chunk counter \n"
        "jnz loop_1024_%=    # loop while chunks remain \n"

        /* tail: the remaining quads of each stripe (the stripes are 344, 336, and 344 bytes long) */
        "crc32q   0(%[in]), %%rcx    # crc0 \n"
        "crc32q 344(%[in]), %%r11    # crc1 \n"
        "crc32q 680(%[in]), %%r10    # crc2 \n"

        "crc32q   8(%[in]), %%rcx    # crc0 \n"
        "crc32q 352(%[in]), %%r11    # crc1 \n"
        "crc32q 688(%[in]), %%r10    # crc2 \n"

        "crc32q  16(%[in]), %%rcx    # crc0 \n"
        "crc32q 696(%[in]), %%r10    # crc2 \n"

        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers:
           [crc] is an input and output, so it is marked read/write (i.e. "+c").
           We clobber the register for [in] (via the add instruction), so it must also be
           tagged read/write (i.e. "+d") in the outputs to tell gcc about the clobber. */
        : [ crc ] "+c"(crc), [ in ] "+d"(input)
        :
        /* additional clobbered registers;
           "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}

/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, processing three stripes of the buffer in parallel (24 bytes per step). The three partial results
 * are folded together using CLMUL. This function is optimized for exactly 3072-byte blocks, ideally aligned on
 * 8-byte memory addresses. It MUST be passed a pointer to input data that is exactly 3072 bytes in length.
 * Note: this function does NOT invert the bits of the input crc or of the return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_3072_%=:"
        "xor %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"

        "movl $16, %%r8d     # loop 16 times through 64-byte chunks in 3 parallel stripes \n"

        "loop_3072_%=:"
        "prefetcht0  128(%[in])    # prefetch ahead in stripe0 \n"
        "prefetcht0 1152(%[in])    # prefetch ahead in stripe1 \n"
        "prefetcht0 2176(%[in])    # prefetch ahead in stripe2 \n"

        /* the three stripes are 1024 bytes each, so there is no tail after the loop */
        "crc32q    0(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1024(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2048(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q    8(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1032(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2056(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   16(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1040(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2064(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   24(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1048(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2072(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   32(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1056(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2080(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   40(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1064(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2088(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   48(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1072(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2096(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   56(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1080(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2104(%[in]), %%r10    # crc2: stripe2 \n"

        "add $64, %[in]      # advance the shared base pointer \n"
        "sub $1, %%r8d       # decrement the chunk counter \n"
        "jnz loop_3072_%=    # loop while chunks remain \n"

        FOLD_K1K2(3072, $0xa51b6135, $0x170076fa) /* Magic constants used to fold the crc stripes into ecx */

        /* output registers:
           [crc] is an input and output, so it is marked read/write (i.e. "+c").
           We clobber the register for [in] (via the add instruction), so it must also be
           tagged read/write (i.e. "+d") in the outputs to tell gcc about the clobber. */
        : [ crc ] "+c"(crc), [ in ] "+d"(input)
        :
        /* additional clobbered registers;
           "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}

static bool detection_performed = false;
static bool detected_clmul = false;

/*
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
 * PCLMULQDQ machine instructions (if present).
 * Handles data that isn't 8-byte aligned, as well as any trailing bytes, with the CRC32B (byte) instruction.
 * Pass 0 in the previousCrc32 parameter as an initial value, unless continuing to update a running CRC from a
 * previous call.
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

    if (AWS_UNLIKELY(!detection_performed)) {
        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);

        /* Simply set the flag to skip HW detection next time.
           We don't use memory barriers, since the worst that can happen
           is a harmless fallback to the non-HW-accelerated code. */
        detection_performed = true;
    }

    uint32_t crc = ~previousCrc32;

    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
    if (AWS_UNLIKELY(length < 8)) {
        while (length-- > 0) {
            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
            input++;
        }
        return ~crc;
    }

    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
    int input_alignment = (unsigned long int)input & 0x7;

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;
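    /* e.g., an input address ending in binary ...101 has input_alignment 5, so leading = (8 - 5) & 0x7 = 3 */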

    /* reduce the length by the number of leading unaligned bytes we are about to process */
    length -= leading;

    /* spin through the leading unaligned input bytes (if any) one at a time */
    while (leading-- > 0) {
        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
        input++;
    }

    /* Using likely to keep this code inlined */
    if (AWS_LIKELY(detected_clmul)) {

        while (AWS_LIKELY(length >= 3072)) {
            /* Compute crc32c on each block, chaining each result into the next */
            crc = s_crc32c_sse42_clmul_3072(input, crc);
            input += 3072;
            length -= 3072;
        }
        while (AWS_LIKELY(length >= 1024)) {
            /* Compute crc32c on each block, chaining each result into the next */
            crc = s_crc32c_sse42_clmul_1024(input, crc);
            input += 1024;
            length -= 1024;
        }
        while (AWS_LIKELY(length >= 256)) {
            /* Compute crc32c on each block, chaining each result into the next */
            crc = s_crc32c_sse42_clmul_256(input, crc);
            input += 256;
            length -= 256;
        }
    }

    /* Spin through the remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
    while (AWS_LIKELY(length >= 8)) {
        /* Hardcoding the %rcx register (i.e. "+c") to allow use of the qword instruction */
        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : [ crc ] "+c"(crc) : [ in ] "r"(input));
        input += 8;
        length -= 8;
    }

    /* Finish up any trailing bytes one at a time with the CRC32B single-byte instruction */
    while (length-- > 0) {
        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
        input++;
    }

    return ~crc;
}
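
/*
 * Illustrative usage: a CRC32c can be computed incrementally across chunks by
 * seeding each call with the previous result (pass 0 for the first call):
 *
 *     uint32_t crc = aws_checksums_crc32c_hw(buffer, 512, 0);
 *     crc = aws_checksums_crc32c_hw(buffer + 512, remaining_length, crc);
 */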

uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    /* the plain CRC32 has no hardware-accelerated path here; fall back to the software implementation */
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}

#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif

#else /* !defined(__x86_64__) */

uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    return aws_checksums_crc32c_sw(input, length, previousCrc32);
}

#endif /* defined(__x86_64__) */