// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Hardware-accelerated CRC32 computation on Intel and ARM architectures.

#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/base/config.h"
#include "absl/base/internal/endian.h"
#include "absl/base/prefetch.h"
#include "absl/crc/internal/cpu_detect.h"
#include "absl/crc/internal/crc32_x86_arm_combined_simd.h"
#include "absl/crc/internal/crc_internal.h"
#include "absl/memory/memory.h"
#include "absl/numeric/bits.h"

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)
#define ABSL_INTERNAL_CAN_USE_SIMD_CRC32C
#endif
namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_INTERNAL_CAN_USE_SIMD_CRC32C)

// Implementation details not exported outside of file
namespace {

// Some machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
class CRC32AcceleratedX86ARMCombined : public CRC32 {
 public:
  CRC32AcceleratedX86ARMCombined() {}
  ~CRC32AcceleratedX86ARMCombined() override {}
  void ExtendByZeroes(uint32_t* crc, size_t length) const override;
  uint32_t ComputeZeroConstant(size_t length) const;

 private:
  CRC32AcceleratedX86ARMCombined(const CRC32AcceleratedX86ARMCombined&) =
      delete;
  CRC32AcceleratedX86ARMCombined& operator=(
      const CRC32AcceleratedX86ARMCombined&) = delete;
};

// Constants for switching between algorithms.
// Chosen by comparing speed at different powers of 2.
constexpr size_t kSmallCutoff = 256;
constexpr size_t kMediumCutoff = 2048;
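
// Helper macros that feed 1, 2, 4, or 8 bytes into the hardware CRC32
// instruction and advance the data pointer; the BY2/BY3 variants run the
// 8-byte step on two or three independent (crc, pointer) pairs so their CRC
// instructions can overlap in the pipeline.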
#define ABSL_INTERNAL_STEP1(crc)                      \
  do {                                                \
    crc = CRC32_u8(static_cast<uint32_t>(crc), *p++); \
  } while (0)
#define ABSL_INTERNAL_STEP2(crc)                                                \
  do {                                                                          \
    crc =                                                                       \
        CRC32_u16(static_cast<uint32_t>(crc), absl::little_endian::Load16(p)); \
    p += 2;                                                                     \
  } while (0)
#define ABSL_INTERNAL_STEP4(crc)                                                \
  do {                                                                          \
    crc =                                                                       \
        CRC32_u32(static_cast<uint32_t>(crc), absl::little_endian::Load32(p)); \
    p += 4;                                                                     \
  } while (0)
#define ABSL_INTERNAL_STEP8(crc, data)                  \
  do {                                                  \
    crc = CRC32_u64(static_cast<uint32_t>(crc),         \
                    absl::little_endian::Load64(data)); \
    data += 8;                                          \
  } while (0)
#define ABSL_INTERNAL_STEP8BY2(crc0, crc1, p0, p1) \
  do {                                             \
    ABSL_INTERNAL_STEP8(crc0, p0);                 \
    ABSL_INTERNAL_STEP8(crc1, p1);                 \
  } while (0)
#define ABSL_INTERNAL_STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
  do {                                                       \
    ABSL_INTERNAL_STEP8(crc0, p0);                           \
    ABSL_INTERNAL_STEP8(crc1, p1);                           \
    ABSL_INTERNAL_STEP8(crc2, p2);                           \
  } while (0)

namespace {
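
// Combines two CRC32C values with a carryless multiply that is then reduced
// back to 32 bits using the CRC32 instruction. Together with the constants in
// kCRC32CPowers below, this is what lets a CRC be advanced over a run of
// zeros (ExtendByZeroes) or over another stream's data without touching the
// bytes again.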
uint32_t multiply(uint32_t a, uint32_t b) {
  V128 power = V128_From64WithZeroFill(a);
  V128 crc = V128_From64WithZeroFill(b);
  V128 res = V128_PMulLow(power, crc);

  // Combine crc values.
  //
  // Adding res to itself is equivalent to multiplying by 2,
  // or shifting left by 1. Addition is used as not all compilers
  // are able to generate optimal code without this hint.
  // https://godbolt.org/z/rr3fMnf39
  res = V128_Add64(res, res);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

// Powers of crc32c polynomial, for faster ExtendByZeros.
// Verified against folly:
// folly/hash/detail/Crc32CombineDetail.cpp
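// kCRC32CPowers[i] is the multiplier that, fed to multiply() above, advances
// a CRC32C value by 2**i four-byte blocks of zeros; ComputeZeroConstant()
// below picks one entry per set bit of the block count. The second half of
// the table repeats the first 31 entries so that every countr_zero() result
// produced below has an entry.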
constexpr uint32_t kCRC32CPowers[] = {
    0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7,
    0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564,
    0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3,
    0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc,
    0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000,
    0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955,
    0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62,
    0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f,
    0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe,
    0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000,
    0x00800000, 0x00008000,
};

}  // namespace

// Compute a magic constant, so that multiplying by it is the same as
// extending crc by length zeros.
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
  // Lowest 2 bits are handled separately in ExtendByZeroes
  length >>= 2;

  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
  length &= length - 1;

  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
    prev = multiply(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
}

void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
  switch (length & 3) {
    case 0:
      break;
    case 1:
      val = CRC32_u8(val, 0);
      break;
    case 2:
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
}
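
// Worked example for the two functions above: extending a CRC by 96 zero
// bytes. length & 3 == 0, so the switch does nothing. ComputeZeroConstant(96)
// sees length >> 2 == 24 == 0b11000, takes prev = kCRC32CPowers[3] for the
// lowest set bit, folds in kCRC32CPowers[4] with multiply(), and the
// resulting constant advances the CRC past all 96 zero bytes in a single
// multiply().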

// Taken from Intel paper "Fast CRC Computation for iSCSI Polynomial Using CRC32
// Instruction"
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
// We only need every 4th value, because we unroll loop by 4.
constexpr uint64_t kClmulConstants[] = {
    0x09e4addf8, 0x0ba4fc28e, 0x00d3b6092, 0x09e4addf8, 0x0ab7aff2a,
    0x102f9b8a2, 0x0b9e02b86, 0x00d3b6092, 0x1bf2e8b8a, 0x18266e456,
    0x0d270f1a2, 0x0ab7aff2a, 0x11eef4f8e, 0x083348832, 0x0dd7e3b0c,
    0x0b9e02b86, 0x0271d9844, 0x1b331e26a, 0x06b749fb2, 0x1bf2e8b8a,
    0x0e6fc4e6a, 0x0ce7f39f4, 0x0d7a4825c, 0x0d270f1a2, 0x026f6a60a,
    0x12ed0daac, 0x068bce87a, 0x11eef4f8e, 0x1329d9f7e, 0x0b3e32c28,
    0x0170076fa, 0x0dd7e3b0c, 0x1fae1cc66, 0x010746f3c, 0x086d8e4d2,
    0x0271d9844, 0x0b3af077a, 0x093a5f730, 0x1d88abd4a, 0x06b749fb2,
    0x0c9c8b782, 0x0cec3662e, 0x1ddffc5d4, 0x0e6fc4e6a, 0x168763fa6,
    0x0b0cd4768, 0x19b1afbc4, 0x0d7a4825c, 0x123888b7a, 0x00167d312,
    0x133d7a042, 0x026f6a60a, 0x000bcf5f6, 0x19d34af3a, 0x1af900c24,
    0x068bce87a, 0x06d390dec, 0x16cba8aca, 0x1f16a3418, 0x1329d9f7e,
    0x19fb2a8b0, 0x02178513a, 0x1a0f717c4, 0x0170076fa,
};

enum class CutoffStrategy {
  // Use 3 CRC streams to fold into 1.
  Fold3,
  // Unroll CRC instructions for 64 bytes.
  Unroll64CRC,
};

// Base class for CRC32AcceleratedX86ARMCombinedMultipleStreams containing the
// methods and data that don't need the template arguments.
class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
    : public CRC32AcceleratedX86ARMCombined {
 protected:
  // Update partialCRC with crc of 64 byte block. Calling FinalizePclmulStream
  // would produce a single crc checksum, but it is expensive. PCLMULQDQ has a
  // high latency, so we run 4 128-bit partial checksums that can be reduced to
  // a single value by FinalizePclmulStream later. Computing crc for arbitrary
  // polynomials with PCLMULQDQ is described in Intel paper "Fast CRC
  // Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  // We are applying it to CRC32C polynomial.
  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesPclmul(
      const uint8_t* p, V128* partialCRC) const {
    V128 loopMultiplicands = V128_Load(reinterpret_cast<const V128*>(k1k2));

    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    V128 tmp1 = V128_PMulHi(partialCRC1, loopMultiplicands);
    V128 tmp2 = V128_PMulHi(partialCRC2, loopMultiplicands);
    V128 tmp3 = V128_PMulHi(partialCRC3, loopMultiplicands);
    V128 tmp4 = V128_PMulHi(partialCRC4, loopMultiplicands);
    V128 data1 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 0));
    V128 data2 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 1));
    V128 data3 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 2));
    V128 data4 = V128_LoadU(reinterpret_cast<const V128*>(p + 16 * 3));
    partialCRC1 = V128_PMulLow(partialCRC1, loopMultiplicands);
    partialCRC2 = V128_PMulLow(partialCRC2, loopMultiplicands);
    partialCRC3 = V128_PMulLow(partialCRC3, loopMultiplicands);
    partialCRC4 = V128_PMulLow(partialCRC4, loopMultiplicands);
    partialCRC1 = V128_Xor(tmp1, partialCRC1);
    partialCRC2 = V128_Xor(tmp2, partialCRC2);
    partialCRC3 = V128_Xor(tmp3, partialCRC3);
    partialCRC4 = V128_Xor(tmp4, partialCRC4);
    partialCRC1 = V128_Xor(partialCRC1, data1);
    partialCRC2 = V128_Xor(partialCRC2, data2);
    partialCRC3 = V128_Xor(partialCRC3, data3);
    partialCRC4 = V128_Xor(partialCRC4, data4);
    partialCRC[0] = partialCRC1;
    partialCRC[1] = partialCRC2;
    partialCRC[2] = partialCRC3;
    partialCRC[3] = partialCRC4;
  }

  // Reduce partialCRC produced by Process64BytesPclmul into a single value
  // that represents the crc checksum of all the processed bytes.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t
  FinalizePclmulStream(V128* partialCRC) const {
    V128 partialCRC1 = partialCRC[0];
    V128 partialCRC2 = partialCRC[1];
    V128 partialCRC3 = partialCRC[2];
    V128 partialCRC4 = partialCRC[3];

    // Combine 4 vectors of partial crc into a single vector.
    V128 reductionMultiplicands =
        V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    V128 high = V128_PMulHi(reductionMultiplicands, partialCRC1);

    partialCRC1 = V128_Xor(low, high);
    partialCRC1 = V128_Xor(partialCRC1, partialCRC2);

    low = V128_PMulLow(reductionMultiplicands, partialCRC3);
    high = V128_PMulHi(reductionMultiplicands, partialCRC3);

    partialCRC3 = V128_Xor(low, high);
    partialCRC3 = V128_Xor(partialCRC3, partialCRC4);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k3k4));

    low = V128_PMulLow(reductionMultiplicands, partialCRC1);
    high = V128_PMulHi(reductionMultiplicands, partialCRC1);
    V128 fullCRC = V128_Xor(low, high);
    fullCRC = V128_Xor(fullCRC, partialCRC3);

    // Reduce fullCRC into scalar value.
    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k5k6));

    V128 mask = V128_Load(reinterpret_cast<const V128*>(kMask));

    V128 tmp = V128_PMul01(reductionMultiplicands, fullCRC);
    fullCRC = V128_ShiftRight<8>(fullCRC);
    fullCRC = V128_Xor(fullCRC, tmp);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(k7k0));

    tmp = V128_ShiftRight<4>(fullCRC);
    fullCRC = V128_And(fullCRC, mask);
    fullCRC = V128_PMulLow(reductionMultiplicands, fullCRC);
    fullCRC = V128_Xor(tmp, fullCRC);

    reductionMultiplicands = V128_Load(reinterpret_cast<const V128*>(kPoly));

    tmp = V128_And(fullCRC, mask);
    tmp = V128_PMul01(reductionMultiplicands, tmp);
    tmp = V128_And(tmp, mask);
    tmp = V128_PMulLow(reductionMultiplicands, tmp);

    fullCRC = V128_Xor(tmp, fullCRC);

    return static_cast<uint64_t>(V128_Extract32<1>(fullCRC));
  }

  // Update crc with 64 bytes of data from p.
  ABSL_ATTRIBUTE_ALWAYS_INLINE uint64_t Process64BytesCRC(const uint8_t* p,
                                                          uint64_t crc) const {
    for (int i = 0; i < 8; i++) {
      crc =
          CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
      p += 8;
    }
    return crc;
  }

  // Generated by crc32c_x86_test --crc32c_generate_constants=true
  // and verified against constants in linux kernel for S390:
  // https://github.com/torvalds/linux/blob/master/arch/s390/crypto/crc32le-vx.S
  alignas(16) static constexpr uint64_t k1k2[2] = {0x0740eef02, 0x09e4addf8};
  alignas(16) static constexpr uint64_t k3k4[2] = {0x1384aa63a, 0x0ba4fc28e};
  alignas(16) static constexpr uint64_t k5k6[2] = {0x0f20c0dfe, 0x14cd00bd6};
  alignas(16) static constexpr uint64_t k7k0[2] = {0x0dd45aab8, 0x000000000};
  alignas(16) static constexpr uint64_t kPoly[2] = {0x105ec76f0, 0x0dea713f1};
  alignas(16) static constexpr uint32_t kMask[4] = {~0u, 0u, ~0u, 0u};

  // Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
  // size. Each group is CRCed in parallel then combined at the end of the
  // block.
  static constexpr size_t kGroupsSmall = 3;
  // For large runs we use up to kMaxStreams blocks computed with CRC
  // instruction, and up to kMaxStreams blocks computed with PCLMULQDQ, which
  // are combined in the end.
  static constexpr size_t kMaxStreams = 3;
};

#ifdef ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k1k2[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k3k4[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k5k6[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::k7k0[2];
alignas(16) constexpr uint64_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kPoly[2];
alignas(16) constexpr uint32_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMask[4];
constexpr size_t
    CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kGroupsSmall;
constexpr size_t CRC32AcceleratedX86ARMCombinedMultipleStreamsBase::kMaxStreams;
#endif  // ABSL_INTERNAL_NEED_REDUNDANT_CONSTEXPR_DECL

template <size_t num_crc_streams, size_t num_pclmul_streams,
          CutoffStrategy strategy>
class CRC32AcceleratedX86ARMCombinedMultipleStreams
    : public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
  ABSL_ATTRIBUTE_HOT
  void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
    static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
                  "Invalid number of crc streams");
    static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
                  "Invalid number of pclmul streams");
    const uint8_t* p = static_cast<const uint8_t*>(bytes);
    const uint8_t* e = p + length;
    uint32_t l = *crc;
    uint64_t l64;

    // We have dedicated instructions for 1, 2, 4, and 8 bytes.
    if (length & 8) {
      ABSL_INTERNAL_STEP8(l, p);
      length &= ~size_t{8};
    }
    if (length & 4) {
      ABSL_INTERNAL_STEP4(l);
      length &= ~size_t{4};
    }
    if (length & 2) {
      ABSL_INTERNAL_STEP2(l);
      length &= ~size_t{2};
    }
    if (length & 1) {
      ABSL_INTERNAL_STEP1(l);
      length &= ~size_t{1};
    }
    if (length == 0) {
      *crc = l;
      return;
    }
    // length is now multiple of 16.

    // For small blocks just run simple loop, because cost of combining multiple
    // streams is significant.
    if (strategy != CutoffStrategy::Unroll64CRC) {
      if (length < kSmallCutoff) {
        while (length >= 16) {
          ABSL_INTERNAL_STEP8(l, p);
          ABSL_INTERNAL_STEP8(l, p);
          length -= 16;
        }
        *crc = l;
        return;
      }
    }

    // For medium blocks we run 3 crc streams and combine them as described in
    // Intel paper above. Running 4th stream doesn't help, because crc
    // instruction has latency 3 and throughput 1.
    if (length < kMediumCutoff) {
      l64 = l;
      if (strategy == CutoffStrategy::Fold3) {
        uint64_t l641 = 0;
        uint64_t l642 = 0;
        const size_t blockSize = 32;
        size_t bs = static_cast<size_t>(e - p) / kGroupsSmall / blockSize;
        const uint8_t* p1 = p + bs * blockSize;
        const uint8_t* p2 = p1 + bs * blockSize;

        for (size_t i = 0; i + 1 < bs; ++i) {
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
        }
        // Don't run crc on last 8 bytes.
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
        ABSL_INTERNAL_STEP8BY2(l64, l641, p, p1);
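
        // Fold the three partial CRCs into one. The pair of multipliers
        // loaded from kClmulConstants (one 128-bit entry per possible bs, per
        // the Intel iSCSI paper above) shifts the stream-0 and stream-1 sums
        // so they line up with stream 2, whose last 8 bytes were deliberately
        // left out above and are absorbed by the final CRC32_u64.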
        V128 magic =
            *(reinterpret_cast<const V128*>(kClmulConstants) + bs - 1);

        V128 tmp = V128_From64WithZeroFill(l64);
        V128 res1 = V128_PMulLow(tmp, magic);
        tmp = V128_From64WithZeroFill(l641);
        V128 res2 = V128_PMul10(tmp, magic);
        V128 x = V128_Xor(res1, res2);
        l64 = static_cast<uint64_t>(V128_Low64(x)) ^
              absl::little_endian::Load64(p2);
        l64 = CRC32_u64(static_cast<uint32_t>(l642), l64);

        p = p2 + 8;
      } else if (strategy == CutoffStrategy::Unroll64CRC) {
        while ((e - p) >= 64) {
          l64 = Process64BytesCRC(p, l64);
          p += 64;
        }
      }
    } else {
      // There is a lot of data, so we can ignore combine costs and run all
      // requested streams (num_crc_streams + num_pclmul_streams),
      // using prefetch. CRC and PCLMULQDQ use different cpu execution units,
      // so on some cpus it makes sense to execute both of them for different
      // streams.

      // Point x at first 8-byte aligned byte in string.
      const uint8_t* x = RoundUp<8>(p);
      // Process bytes until p is 8-byte aligned, if that isn't past the end.
      while (p != x) {
        ABSL_INTERNAL_STEP1(l);
      }

      size_t bs = static_cast<size_t>(e - p) /
                  (num_crc_streams + num_pclmul_streams) / 64;
      const uint8_t* crc_streams[kMaxStreams];
      const uint8_t* pclmul_streams[kMaxStreams];
      // We are guaranteed to have at least one crc stream.
      crc_streams[0] = p;
      for (size_t i = 1; i < num_crc_streams; i++) {
        crc_streams[i] = crc_streams[i - 1] + bs * 64;
      }
      pclmul_streams[0] = crc_streams[num_crc_streams - 1] + bs * 64;
      for (size_t i = 1; i < num_pclmul_streams; i++) {
        pclmul_streams[i] = pclmul_streams[i - 1] + bs * 64;
      }
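
      // Each of the (num_crc_streams + num_pclmul_streams) streams now covers
      // bs consecutive 64-byte blocks: the CRC-instruction streams first,
      // then the PCLMULQDQ streams, with any remainder handled by the scalar
      // tail loops at the end of Extend().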

      // Per stream crc sums.
      uint64_t l64_crc[kMaxStreams] = {l};
      uint64_t l64_pclmul[kMaxStreams] = {0};

      // Peel first iteration, because the PCLMULQDQ streams need setup.
      for (size_t i = 0; i < num_crc_streams; i++) {
        l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        crc_streams[i] += 16 * 4;
      }

      V128 partialCRC[kMaxStreams][4];
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        partialCRC[i][0] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 0));
        partialCRC[i][1] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 1));
        partialCRC[i][2] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 2));
        partialCRC[i][3] = V128_LoadU(
            reinterpret_cast<const V128*>(pclmul_streams[i] + 16 * 3));
        pclmul_streams[i] += 16 * 4;
      }

      for (size_t i = 1; i < bs; i++) {
        // Prefetch data for next iterations.
        for (size_t j = 0; j < num_crc_streams; j++) {
          PrefetchToLocalCache(
              reinterpret_cast<const char*>(crc_streams[j] + kPrefetchHorizon));
        }
        for (size_t j = 0; j < num_pclmul_streams; j++) {
          PrefetchToLocalCache(reinterpret_cast<const char*>(pclmul_streams[j] +
                                                             kPrefetchHorizon));
        }

        // We process each stream in 64 byte blocks. This can be written as
        // for (int i = 0; i < num_pclmul_streams; i++) {
        //   Process64BytesPclmul(pclmul_streams[i], partialCRC[i]);
        //   pclmul_streams[i] += 16 * 4;
        // }
        // for (int i = 0; i < num_crc_streams; i++) {
        //   l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
        //   crc_streams[i] += 16 * 4;
        // }
        // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
        // gives ~2% performance boost.
        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
        crc_streams[0] += 16 * 4;
        if (num_pclmul_streams > 0) {
          Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
          pclmul_streams[0] += 16 * 4;
        }
        if (num_crc_streams > 1) {
          l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
          crc_streams[1] += 16 * 4;
        }
        if (num_pclmul_streams > 1) {
          Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
          pclmul_streams[1] += 16 * 4;
        }
        if (num_crc_streams > 2) {
          l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
          crc_streams[2] += 16 * 4;
        }
        if (num_pclmul_streams > 2) {
          Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
          pclmul_streams[2] += 16 * 4;
        }
      }

      // PCLMULQDQ based streams require special final step;
      // CRC based don't.
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64_pclmul[i] = FinalizePclmulStream(partialCRC[i]);
      }

      // Combine all streams into single result.
      uint32_t magic = ComputeZeroConstant(bs * 64);
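      // magic advances a CRC by bs * 64 zero bytes, which is exactly how much
      // data each later stream covers, so "multiply by magic, then xor in the
      // next stream" chains the per-stream results in order.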
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
        l64 = multiply(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }

      // Update p.
      if (num_pclmul_streams > 0) {
        p = pclmul_streams[num_pclmul_streams - 1];
      } else {
        p = crc_streams[num_crc_streams - 1];
      }
    }
    l = static_cast<uint32_t>(l64);

    while ((e - p) >= 16) {
      ABSL_INTERNAL_STEP8(l, p);
      ABSL_INTERNAL_STEP8(l, p);
    }
    // Process the last few bytes
    while (p != e) {
      ABSL_INTERNAL_STEP1(l);
    }

#undef ABSL_INTERNAL_STEP8BY3
#undef ABSL_INTERNAL_STEP8BY2
#undef ABSL_INTERNAL_STEP8
#undef ABSL_INTERNAL_STEP4
#undef ABSL_INTERNAL_STEP2
#undef ABSL_INTERNAL_STEP1

    *crc = l;
  }
};

}  // namespace

// Intel processors with SSE4.2 have an instruction for one particular
// 32-bit CRC polynomial: crc32c
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() {
  CpuType type = GetCpuType();
  switch (type) {
    case CpuType::kIntelHaswell:
    case CpuType::kAmdRome:
    case CpuType::kAmdNaples:
    case CpuType::kAmdMilan:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 1, CutoffStrategy::Fold3>();
    // PCLMULQDQ is fast, use combined PCLMULQDQ + CRC implementation.
    case CpuType::kIntelCascadelakeXeon:
    case CpuType::kIntelSkylakeXeon:
    case CpuType::kIntelBroadwell:
    case CpuType::kIntelSkylake:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    // PCLMULQDQ is slow, don't use it.
    case CpuType::kIntelIvybridge:
    case CpuType::kIntelSandybridge:
    case CpuType::kIntelWestmere:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseN1:
    case CpuType::kArmNeoverseN2:
    case CpuType::kArmNeoverseV1:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 1, CutoffStrategy::Unroll64CRC>();
    case CpuType::kAmpereSiryn:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 2, CutoffStrategy::Fold3>();
    case CpuType::kArmNeoverseV2:
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          1, 2, CutoffStrategy::Unroll64CRC>();
#if defined(__aarch64__)
    default:
      // Not all ARM processors support the needed instructions, so check here
      // before trying to use an accelerated implementation.
      if (SupportsArmCRC32PMULL()) {
        return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
            1, 1, CutoffStrategy::Unroll64CRC>();
      } else {
        return nullptr;
      }
#else
    default:
      // Something else, play it safe and assume slow PCLMULQDQ.
      return new CRC32AcceleratedX86ARMCombinedMultipleStreams<
          3, 0, CutoffStrategy::Fold3>();
#endif
  }
}
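
// Illustrative caller-side sketch (not from this file; the real selection
// logic lives in the CRC implementation that consumes this factory): try the
// accelerated engine first and fall back to a table-based CRC32C if the CPU
// isn't supported.
//
//   CRCImpl* impl = TryNewCRC32AcceleratedX86ARMCombined();
//   if (impl == nullptr) {
//     impl = /* table-driven CRC32C implementation */;
//   }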

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  auto ret = std::vector<std::unique_ptr<CRCImpl>>();
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Fold3>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    1, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    2, 3, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 0, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 1, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 2, CutoffStrategy::Unroll64CRC>>());
  ret.push_back(absl::make_unique<CRC32AcceleratedX86ARMCombinedMultipleStreams<
                    3, 3, CutoffStrategy::Unroll64CRC>>());

  return ret;
}

#else  // !ABSL_INTERNAL_CAN_USE_SIMD_CRC32C

std::vector<std::unique_ptr<CRCImpl>> NewCRC32AcceleratedX86ARMCombinedAll() {
  return std::vector<std::unique_ptr<CRCImpl>>();
}

// no hardware acceleration available
CRCImpl* TryNewCRC32AcceleratedX86ARMCombined() { return nullptr; }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl