crc32c_sse42_asm.c

/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/checksums/private/crc_priv.h>

#include <aws/common/cpuid.h>

/* this implementation is only for the x86_64 intel architecture */
#if defined(__x86_64__)

#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wdollar-in-identifier-extension"
#    endif
/*
 * Factored out common inline asm for folding crc0, crc1, crc2 stripes in rcx, r11, r10 using
 * the specified Magic Constants K1 and K2.
 * Assumes rcx, r11, r10 contain crc0, crc1, crc2 that need folding.
 * Utilizes xmm1, xmm2, xmm3, xmm4 as well as clobbering r8, r9, r11.
 * Result is placed in ecx.
 */
#    define FOLD_K1K2(NAME, K1, K2) \
        "fold_k1k2_" #NAME "_%=: \n" \
        "movl " #K1 ", %%r8d # Magic K1 constant \n" \
        "movl " #K2 ", %%r9d # Magic K2 constant \n" \
        "movq %%rcx, %%xmm1 # crc0 into lower dword of xmm1 \n" \
        "movq %%r8, %%xmm3 # K1 into lower dword of xmm3 \n" \
        "movq %%r11, %%xmm2 # crc1 into lower dword of xmm2 \n" \
        "movq %%r9, %%xmm4 # K2 into lower dword of xmm4 \n" \
        "pclmulqdq $0x00, %%xmm3, %%xmm1 # Multiply crc0 by K1 \n" \
        "pclmulqdq $0x00, %%xmm4, %%xmm2 # Multiply crc1 by K2 \n" \
        "xor %%rcx, %%rcx # \n" \
        "xor %%r11, %%r11 # \n" \
        "movq %%xmm1, %%r8 # \n" \
        "movq %%xmm2, %%r9 # \n" \
        "crc32q %%r8, %%rcx # folding crc0 \n" \
        "crc32q %%r9, %%r11 # folding crc1 \n" \
        "xor %%r10d, %%ecx # combine crc2 and crc0 \n" \
        "xor %%r11d, %%ecx # combine crc1 and crc0 \n"
/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, processing three stripes in parallel (one quad word per stripe, 24 bytes per step). The stripe results
 * are folded together using CLMUL. This function is optimized for exactly 256 byte blocks that are best aligned on
 * 8-byte memory addresses. It MUST be passed a pointer to input data that is exactly 256 bytes in length.
 * Note: this function does NOT invert bits of the input crc or return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_256(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_256_%=:"

        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

        "crc32q 0(%[in]), %%rcx # crc0 \n"
        "crc32q 88(%[in]), %%r11 # crc1 \n"
        "crc32q 176(%[in]), %%r10 # crc2 \n"

        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 96(%[in]), %%r11 # crc1 \n"
        "crc32q 184(%[in]), %%r10 # crc2 \n"

        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 104(%[in]), %%r11 # crc1 \n"
        "crc32q 192(%[in]), %%r10 # crc2 \n"

        "crc32q 24(%[in]), %%rcx # crc0 \n"
        "crc32q 112(%[in]), %%r11 # crc1 \n"
        "crc32q 200(%[in]), %%r10 # crc2 \n"

        "crc32q 32(%[in]), %%rcx # crc0 \n"
        "crc32q 120(%[in]), %%r11 # crc1 \n"
        "crc32q 208(%[in]), %%r10 # crc2 \n"

        "crc32q 40(%[in]), %%rcx # crc0 \n"
        "crc32q 128(%[in]), %%r11 # crc1 \n"
        "crc32q 216(%[in]), %%r10 # crc2 \n"

        "crc32q 48(%[in]), %%rcx # crc0 \n"
        "crc32q 136(%[in]), %%r11 # crc1 \n"
        "crc32q 224(%[in]), %%r10 # crc2 \n"

        "crc32q 56(%[in]), %%rcx # crc0 \n"
        "crc32q 144(%[in]), %%r11 # crc1 \n"
        "crc32q 232(%[in]), %%r10 # crc2 \n"

        "crc32q 64(%[in]), %%rcx # crc0 \n"
        "crc32q 152(%[in]), %%r11 # crc1 \n"
        "crc32q 240(%[in]), %%r10 # crc2 \n"

        "crc32q 72(%[in]), %%rcx # crc0 \n"
        "crc32q 160(%[in]), %%r11 # crc1 \n"
        "crc32q 248(%[in]), %%r10 # crc2 \n"

        "crc32q 80(%[in]), %%rcx # crc0 \n"
  82. "crc32q 168(%[in]), %%r11 # crc2 \n"
        FOLD_K1K2(256, $0x1b3d8f29, $0x39d3b296) /* Magic Constants used to fold crc stripes into ecx */

        /* output registers
           [crc] is an input and output so it is marked read/write (i.e. "+c") */
        : [ crc ] "+c"(crc)
        /* input registers */
        : [ in ] "d"(input)
        /* additional clobbered registers */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}
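/*
 * Again for reference only: the same 256-byte kernel rendered with intrinsics. The build
 * uses the asm version above; the name s_crc32c_256_reference and the use of memcpy for
 * unaligned loads are ours. The block is split into 88-, 88- and 80-byte stripes, each
 * reduced with CRC32Q, then folded with the same K1/K2 constants. The 1024- and 3072-byte
 * kernels below follow the same pattern with different stripe sizes and constants.
 */
#    include <string.h> /* memcpy, for the illustrative unaligned loads */

__attribute__((target("sse4.2,pclmul"))) static inline uint32_t s_crc32c_256_reference(
    const uint8_t *input, uint32_t crc) {
    uint64_t crc0 = crc, crc1 = 0, crc2 = 0, q;
    for (int i = 0; i < 11; ++i) {
        memcpy(&q, input + 8 * i, 8); /* stripe0: bytes 0-87 */
        crc0 = _mm_crc32_u64(crc0, q);
        memcpy(&q, input + 88 + 8 * i, 8); /* stripe1: bytes 88-175 */
        crc1 = _mm_crc32_u64(crc1, q);
    }
    for (int i = 0; i < 10; ++i) {
        memcpy(&q, input + 176 + 8 * i, 8); /* stripe2: bytes 176-255, ends the block so it is not shifted */
        crc2 = _mm_crc32_u64(crc2, q);
    }
    return s_fold_k1k2_reference((uint32_t)crc0, (uint32_t)crc1, (uint32_t)crc2, 0x1b3d8f29, 0x39d3b296);
}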
/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, processing three stripes in parallel (one quad word per stripe, 24 bytes per step). The stripe results
 * are folded together using CLMUL. This function is optimized for exactly 1024 byte blocks that are best aligned on
 * 8-byte memory addresses. It MUST be passed a pointer to input data that is exactly 1024 bytes in length.
 * Note: this function does NOT invert bits of the input crc or return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_1024_%=:"

        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

        "movl $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"

        "loop_1024_%=:"

        "prefetcht0 128(%[in]) # \n"
        "prefetcht0 472(%[in]) # \n"
        "prefetcht0 808(%[in]) # \n"

        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 352(%[in]), %%r11 # crc1 \n"
        "crc32q 688(%[in]), %%r10 # crc2 \n"

        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 360(%[in]), %%r11 # crc1 \n"
        "crc32q 696(%[in]), %%r10 # crc2 \n"

        "crc32q 24(%[in]), %%rcx # crc0 \n"
        "crc32q 368(%[in]), %%r11 # crc1 \n"
        "crc32q 704(%[in]), %%r10 # crc2 \n"

        "crc32q 32(%[in]), %%rcx # crc0 \n"
        "crc32q 376(%[in]), %%r11 # crc1 \n"
        "crc32q 712(%[in]), %%r10 # crc2 \n"

        "crc32q 40(%[in]), %%rcx # crc0 \n"
        "crc32q 384(%[in]), %%r11 # crc1 \n"
        "crc32q 720(%[in]), %%r10 # crc2 \n"

        "crc32q 48(%[in]), %%rcx # crc0 \n"
        "crc32q 392(%[in]), %%r11 # crc1 \n"
        "crc32q 728(%[in]), %%r10 # crc2 \n"

        "crc32q 56(%[in]), %%rcx # crc0 \n"
        "crc32q 400(%[in]), %%r11 # crc1 \n"
        "crc32q 736(%[in]), %%r10 # crc2 \n"

        "add $64, %[in] # \n"
        "sub $1, %%r8d # \n"
        "jnz loop_1024_%= # \n"

        "crc32q 0(%[in]), %%rcx # crc0 \n"
        "crc32q 344(%[in]), %%r11 # crc1 \n"
        "crc32q 680(%[in]), %%r10 # crc2 \n"

        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 352(%[in]), %%r11 # crc1 \n"
        "crc32q 688(%[in]), %%r10 # crc2 \n"

        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 696(%[in]), %%r10 # crc2 \n"
        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx

        output registers
        [crc] is an input and output so it is marked read/write (i.e. "+c")
        we clobber the register for [input] (via the add instruction) so we must also
        tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
        : [ crc ] "+c"(crc), [ in ] "+d"(input)
        :
        /* additional clobbered registers */
        /* "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}
/**
 * Private (static) function.
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (quad word) machine
 * instruction, processing three stripes in parallel (one quad word per stripe, 24 bytes per step). The stripe results
 * are folded together using CLMUL. This function is optimized for exactly 3072 byte blocks that are best aligned on
 * 8-byte memory addresses. It MUST be passed a pointer to input data that is exactly 3072 bytes in length.
 * Note: this function does NOT invert bits of the input crc or return value.
 */
static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_3072_%=:"

        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"

        "movl $16, %%r8d # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"

        "loop_3072_%=:"

        "prefetcht0 128(%[in]) # \n"
        "prefetcht0 1152(%[in]) # \n"
        "prefetcht0 2176(%[in]) # \n"

        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1024(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2048(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 8(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1032(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2056(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 16(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1040(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2064(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 24(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1048(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2072(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 32(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1056(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2080(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 40(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1064(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2088(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 48(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1072(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2096(%[in]), %%r10 # crc2: stripe2 \n"

        "crc32q 56(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 1080(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 2104(%[in]), %%r10 # crc2: stripe2 \n"

        "add $64, %[in] # \n"
        "sub $1, %%r8d # \n"
        "jnz loop_3072_%= # \n"
        FOLD_K1K2(3072, $0xa51b6135, $0x170076fa) /* Magic Constants used to fold crc stripes into ecx

        output registers
        [crc] is an input and output so it is marked read/write (i.e. "+c")
        we clobber the register for [input] (via the add instruction) so we must also
        tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
        : [ crc ] "+c"(crc), [ in ] "+d"(input)
        :
        /* additional clobbered registers
           "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}
static bool detection_performed = false;
static bool detected_clmul = false;

/*
 * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
 * PCLMULQDQ machine instructions (if present).
 * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
 * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
 * call.
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

    if (AWS_UNLIKELY(!detection_performed)) {
        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
        /* Simply set the flag to true to skip HW detection next time.
           Not using memory barriers since the worst that can
           happen is a fallback to the non-HW-accelerated code. */
        detection_performed = true;
    }
    uint32_t crc = ~previousCrc32;

    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
    if (AWS_UNLIKELY(length < 8)) {
        while (length-- > 0) {
            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
            input++;
        }
        return ~crc;
    }

    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
    int input_alignment = (unsigned long int)input & 0x7;

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;

    /* reduce the length by the leading unaligned bytes we are about to process */
    length -= leading;

    /* spin through the leading unaligned input bytes (if any) one-by-one */
    while (leading-- > 0) {
        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
        input++;
    }

    /* Using likely to keep this code inlined */
    if (AWS_LIKELY(detected_clmul)) {
        while (AWS_LIKELY(length >= 3072)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_3072(input, crc);
            input += 3072;
            length -= 3072;
        }
        while (AWS_LIKELY(length >= 1024)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_1024(input, crc);
            input += 1024;
            length -= 1024;
        }
        while (AWS_LIKELY(length >= 256)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_256(input, crc);
            input += 256;
            length -= 256;
        }
    }

    /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
    while (AWS_LIKELY(length >= 8)) {
        /* Hardcoding the %rcx register (i.e. "+c") to allow use of the qword instruction */
        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : [ crc ] "+c"(crc) : [ in ] "r"(input));
        input += 8;
        length -= 8;
    }

    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
    while (length-- > 0) {
        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input));
        input++;
    }

    return ~crc;
}
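/*
 * Usage sketch (illustrative only; the helper name below is ours, not the library's):
 * because each call's result can be fed back in as previousCrc32, checksumming a buffer
 * in two chunks yields the same value as a single pass over the whole buffer.
 */
static inline uint32_t s_crc32c_two_chunk_example(const uint8_t *buf, int len) {
    int half = len / 2;
    uint32_t crc = aws_checksums_crc32c_hw(buf, half, 0 /* initial value */);
    return aws_checksums_crc32c_hw(buf + half, len - half, crc /* running CRC */);
}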
/* CRC32 (the non-Castagnoli polynomial) is not hardware accelerated here; fall back to the software implementation */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}

#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
#else /* !defined(__x86_64__) */

uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    return aws_checksums_crc32c_sw(input, length, previousCrc32);
}

#endif /* defined(__x86_64__) */