generic_crc.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687
  1. // Copyright 2010 Google Inc. All rights reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Defines GenericCrc class which implements arbitrary CRCs.
  15. //
  16. // Please read crc.pdf to understand how it all works.
  17. #ifndef CRCUTIL_GENERIC_CRC_H_
  18. #define CRCUTIL_GENERIC_CRC_H_
  19. #include "base_types.h" // uint8
  20. #include "crc_casts.h" // TO_BYTE(), Downcast<>.
  21. #include "gf_util.h" // GfUtil<Crc> class.
  22. #include "platform.h" // GCC_ALIGN_ATTRIBUTE(16)
  23. #include "uint128_sse2.h" // uint128_sse2 type (if necessary)
  24. namespace crcutil {
  25. #pragma pack(push, 16)
  26. // Extends CRC by one byte.
  27. // Technically, if degree of a polynomial does not exceed 8,
  28. // right shift by 8 bits is not required, but who cares about CRC-8?
  29. #define CRC_BYTE(table, crc, byte) do { \
  30. crc = ((sizeof(crc) > 1) ? SHIFT_RIGHT_SAFE(crc, 8) : 0) ^ \
  31. table->crc_word_[sizeof(Word) - 1][TO_BYTE(crc) ^ (byte)]; \
  32. } while (0)
  33. #define TABLE_ENTRY(table, byte, buf) \
  34. table[byte][Downcast<Word, uint8>(buf)]
  35. #define TABLE_ENTRY_LAST(table, buf) \
  36. table[sizeof(Word) - 1][buf]
  37. // Extends CRC by one word.
  38. #define CRC_WORD(table, crc, buf) do { \
  39. buf ^= Downcast<Crc, Word>(crc); \
  40. if (sizeof(crc) > sizeof(buf)) { \
  41. crc = SHIFT_RIGHT_SAFE(crc, sizeof(buf) * 8); \
  42. crc ^= TABLE_ENTRY(table->crc_word_, 0, buf); \
  43. } else { \
  44. crc = TABLE_ENTRY(table->crc_word_, 0, buf); \
  45. } \
  46. buf >>= 8; \
  47. for (size_t byte = 1; byte < sizeof(buf) - 1; ++byte) { \
  48. crc ^= TABLE_ENTRY(table->crc_word_, byte, buf); \
  49. buf >>= 8; \
  50. } \
  51. crc ^= TABLE_ENTRY_LAST(table->crc_word_, buf); \
  52. } while (0)
  53. // Process beginning of data block byte by byte until source pointer
  54. // becomes perfectly aligned on Word boundary.
  55. #define ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word) do { \
  56. while ((reinterpret_cast<size_t>(src) & (sizeof(Word) - 1)) != 0) { \
  57. if (src >= end) { \
  58. return (crc ^ table->Base().Canonize()); \
  59. } \
  60. CRC_BYTE(table, crc, *src); \
  61. src += 1; \
  62. } \
  63. } while (0)
  64. // On amd64, enforcing alignment is 2-4% slower on small (<= 64 bytes) blocks
  65. // but 6-10% faster on larger blocks (>= 2KB).
  66. // Break-even point (+-1%) is around 1KB (Q9650, E6600).
  67. //
  68. #define ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, table, src, end, crc, Word) \
  69. do { \
  70. if (sizeof(Word) > 8 || (bytes) > CRCUTIL_MIN_ALIGN_SIZE) { \
  71. ALIGN_ON_WORD_BOUNDARY(table, src, end, crc, Word); \
  72. } \
  73. } while (0)
  74. #if defined(_MSC_VER)
  75. #pragma warning(push)
  76. #pragma warning(disable: 4127) // conditional expression is constant
  77. #endif // defined(_MSC_VER)
  78. // Forward declarations.
  79. template<typename CrcImplementation> class RollingCrc;
  80. // Crc is the type used internally and to return values of N-bit CRC.
  81. // It should be at least as large as "TableEntry" and "Word" but
  82. // may be larger (e.g. for 16-bit CRC, TableEntry and Word may be
  83. // set to uint16 but Crc may be set to uint32).
  84. //
  85. // TableEntry is the type of values stored in the tables.
  86. // To implement N-bit CRC, TableEntry should be large enough
  87. // to store N bits.
  88. //
  89. // Word is the type used to read data sizeof(Word) at a time.
  90. // Ideally, it shoulde be "most suitable for given architecture"
  91. // integer type -- typically "size_t".
  92. //
  93. // kStride is the number of words processed in interleaved manner by
  94. // CrcMultiword() and CrcWordblock(). Shall be either 3 or 4.
  95. // Optimal value depends on hardware architecture (AMD64, ARM, etc).
  96. //
  97. template<typename _Crc, typename _TableEntry, typename _Word, int kStride>
  98. class GenericCrc {
  99. public:
  100. // Make Crc, TableEntry, and Word types visible (used by RollingCrc etc.)
  101. typedef _Crc Crc;
  102. typedef _TableEntry TableEntry;
  103. typedef _Word Word;
  104. GenericCrc() {}
  105. // Initializes the tables given generating polynomial of degree.
  106. // If "canonical" is true, crc value will be XOR'ed with (-1) before and
  107. // after actual CRC computation.
  108. GenericCrc(const Crc &generating_polynomial, size_t degree, bool canonical) {
  109. Init(generating_polynomial, degree, canonical);
  110. }
  111. void Init(const Crc &generating_polynomial, size_t degree, bool canonical) {
  112. base_.Init(generating_polynomial, degree, canonical);
  113. // Instead of computing
  114. // table[j][i] = MultiplyUnnormalized(i, 8, k),
  115. // for all i = 0...255, we may notice that
  116. // if i = 2**n then for all m = 1...(i-1)
  117. // MultiplyUnnormalized(i + m, 8, k) =
  118. // MultiplyUnnormalized(i ^ m, 8, k) =
  119. // MultiplyUnnormalized(i, 8, k) ^ MultiplyUnnormalized(m, 8, k) =
  120. // MultiplyUnnormalized(i, 8, k) ^ crc_word_interleaved[j][m] =
  121. // table[i] ^ table[m].
  122. #if 0
  123. for (size_t j = 0; j < sizeof(Word); ++j) {
  124. Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
  125. for (size_t i = 0; i < 256; ++i) {
  126. Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
  127. this->crc_word_interleaved_[j][i] = Downcast<Crc, TableEntry>(temp);
  128. }
  129. }
  130. #else
  131. for (size_t j = 0; j < sizeof(Word); ++j) {
  132. Crc k = Base().XpowN((sizeof(Word) * kStride - 1 - j) * 8 + degree);
  133. TableEntry *table = this->crc_word_interleaved_[j];
  134. table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0.
  135. for (size_t i = 1; i < 256; i <<= 1) {
  136. TableEntry value = Downcast<Crc, TableEntry>(
  137. Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
  138. table[i] = value;
  139. for (size_t m = 1; m < i; ++m) {
  140. table[i + m] = value ^ table[m];
  141. }
  142. }
  143. }
  144. #endif
  145. #if 0
  146. for (size_t j = 0; j < sizeof(Word); ++j) {
  147. Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
  148. for (size_t i = 0; i < 256; ++i) {
  149. Crc temp = Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k);
  150. this->crc_word_[j][i] = Downcast<Crc, TableEntry>(temp);
  151. }
  152. }
  153. #else
  154. for (size_t j = 0; j < sizeof(Word); ++j) {
  155. Crc k = Base().XpowN((sizeof(Word) - 1 - j) * 8 + degree);
  156. TableEntry *table = this->crc_word_[j];
  157. table[0] = 0; // Init 0s entry -- multiply 0 by anything yields 0.
  158. for (size_t i = 1; i < 256; i <<= 1) {
  159. TableEntry value = Downcast<Crc, TableEntry>(
  160. Base().MultiplyUnnormalized(static_cast<Crc>(i), 8, k));
  161. table[i] = value;
  162. for (size_t m = 1; m < i; ++m) {
  163. table[i + m] = value ^ table[m];
  164. }
  165. }
  166. }
  167. #endif
  168. }
  169. // Default CRC implementation
  170. Crc CrcDefault(const void *data, size_t bytes, const Crc &start) const {
  171. #if HAVE_AMD64 || HAVE_I386 || defined(__aarch64__)
  172. return CrcMultiword(data, bytes, start);
  173. #else
  174. // Very few CPUs have multiple ALUs and speculative execution
  175. // (Itanium is an exception) so sophisticated algorithms will
  176. // not perform better than good old Sarwate algorithm.
  177. return CrcByteUnrolled(data, bytes, start);
  178. #endif // HAVE_AMD64 || HAVE_I386
  179. }
  180. // Returns base class.
  181. const GfUtil<Crc> &Base() const { return base_; }
  182. protected:
  183. // Canonical, byte-by-byte CRC computation.
  184. Crc CrcByte(const void *data, size_t bytes, const Crc &start) const {
  185. const uint8 *src = static_cast<const uint8 *>(data);
  186. Crc crc = start ^ Base().Canonize();
  187. for (const uint8 *end = src + bytes; src < end; ++src) {
  188. CRC_BYTE(this, crc, *src);
  189. }
  190. return (crc ^ Base().Canonize());
  191. }
  192. // Byte-by-byte CRC with main loop unrolled.
  193. Crc CrcByteUnrolled(const void *data, size_t bytes, const Crc &start) const {
  194. if (bytes == 0) {
  195. return start;
  196. }
  197. const uint8 *src = static_cast<const uint8 *>(data);
  198. const uint8 *end = src + bytes;
  199. Crc crc = start ^ Base().Canonize();
  200. // Unroll loop 4 times.
  201. end -= 3;
  202. for (; src < end; src += 4) {
  203. PREFETCH(src);
  204. CRC_BYTE(this, crc, src[0]);
  205. CRC_BYTE(this, crc, src[1]);
  206. CRC_BYTE(this, crc, src[2]);
  207. CRC_BYTE(this, crc, src[3]);
  208. }
  209. end += 3;
  210. // Compute CRC of remaining bytes.
  211. for (; src < end; ++src) {
  212. CRC_BYTE(this, crc, *src);
  213. }
  214. return (crc ^ Base().Canonize());
  215. }
  216. // Canonical, byte-by-byte CRC computation.
  217. Crc CrcByteWord(const void *data, size_t bytes, const Crc &start) const {
  218. const uint8 *src = static_cast<const uint8 *>(data);
  219. const uint8 *end = src + bytes;
  220. Crc crc0 = start ^ Base().Canonize();
  221. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Crc);
  222. if (src >= end) {
  223. return (crc0 ^ Base().Canonize());
  224. }
  225. // Process 4*sizeof(Crc) bytes at a time.
  226. end -= 4 * sizeof(Crc) - 1;
  227. for (; src < end; src += 4 * sizeof(Crc)) {
  228. for (size_t i = 0; i < 4; ++i) {
  229. crc0 ^= reinterpret_cast<const Crc *>(src)[i];
  230. if (i == 0) {
  231. PREFETCH(src);
  232. }
  233. for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
  234. CRC_BYTE(this, crc0, 0);
  235. }
  236. }
  237. }
  238. end += 4 * sizeof(Crc) - 1;
  239. // Process sizeof(Crc) bytes at a time.
  240. end -= sizeof(Crc) - 1;
  241. for (; src < end; src += sizeof(Crc)) {
  242. crc0 ^= reinterpret_cast<const Crc *>(src)[0];
  243. for (size_t byte = 0; byte < sizeof(crc0); ++byte) {
  244. CRC_BYTE(this, crc0, 0);
  245. }
  246. }
  247. end += sizeof(Crc) - 1;
  248. // Compute CRC of remaining bytes.
  249. for (;src < end; ++src) {
  250. CRC_BYTE(this, crc0, *src);
  251. }
  252. return (crc0 ^ Base().Canonize());
  253. }
  254. // Faster, word-by-word CRC.
  255. Crc CrcWord(const void *data, size_t bytes, const Crc &start) const {
  256. const uint8 *src = static_cast<const uint8 *>(data);
  257. const uint8 *end = src + bytes;
  258. Crc crc0 = start ^ Base().Canonize();
  259. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
  260. if (src >= end) {
  261. return (crc0 ^ Base().Canonize());
  262. }
  263. // Process 4 sizeof(Word) bytes at once.
  264. end -= 4 * sizeof(Word) - 1;
  265. for (; src < end; src += 4 * sizeof(Word)) {
  266. Word buf0 = reinterpret_cast<const Word *>(src)[0];
  267. PREFETCH(src);
  268. CRC_WORD(this, crc0, buf0);
  269. buf0 = reinterpret_cast<const Word *>(src)[1];
  270. CRC_WORD(this, crc0, buf0);
  271. buf0 = reinterpret_cast<const Word *>(src)[2];
  272. CRC_WORD(this, crc0, buf0);
  273. buf0 = reinterpret_cast<const Word *>(src)[3];
  274. CRC_WORD(this, crc0, buf0);
  275. }
  276. end += 4 * sizeof(Word) - 1;
  277. // Process sizeof(Word) bytes at a time.
  278. end -= sizeof(Word) - 1;
  279. for (; src < end; src += sizeof(Word)) {
  280. Word buf0 = reinterpret_cast<const Word *>(src)[0];
  281. CRC_WORD(this, crc0, buf0);
  282. }
  283. end += sizeof(Word) - 1;
  284. // Compute CRC of remaining bytes.
  285. for (;src < end; ++src) {
  286. CRC_BYTE(this, crc0, *src);
  287. }
  288. return (crc0 ^ Base().Canonize());
  289. }
  290. #define REPEAT_FROM_1(macro) \
  291. macro(1); \
  292. macro(2); \
  293. macro(3); \
  294. macro(4); \
  295. macro(5); \
  296. macro(6); \
  297. macro(7);
  298. #define REPEAT_FROM_0(macro) \
  299. macro(0); \
  300. REPEAT_FROM_1(macro)
  301. // Faster, process adjusent blocks in parallel and concatenate CRCs.
  302. Crc CrcBlockword(const void *data, size_t bytes, const Crc &start) const {
  303. if (kStride < 2 || kStride > 8) {
  304. // Unsupported configuration;
  305. // fall back to something sensible.
  306. return CrcWord(data, bytes, start);
  307. }
  308. const uint8 *src = static_cast<const uint8 *>(data);
  309. const uint8 *end = src + bytes;
  310. Crc crc0 = start ^ Base().Canonize();
  311. enum {
  312. // Add 16 to avoid false L1 cache collisions.
  313. kStripe = (15*1024 + 16) & ~(sizeof(Word) - 1),
  314. };
  315. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
  316. if (src >= end) {
  317. return (crc0 ^ Base().Canonize());
  318. }
  319. end -= kStride * kStripe - 1;
  320. if (src < end) {
  321. Crc x_pow_8kStripe = Base().Xpow8N(kStripe);
  322. do {
  323. const uint8 *stripe_end = src + kStripe;
  324. #define INIT_CRC(reg) \
  325. Crc crc##reg; \
  326. if (kStride >= reg) { \
  327. crc##reg = 0; \
  328. }
  329. REPEAT_FROM_1(INIT_CRC);
  330. #undef INIT_CRC
  331. do {
  332. #define FIRST(reg) \
  333. Word buf##reg; \
  334. if (kStride > reg) { \
  335. buf##reg = reinterpret_cast<const Word *>(src + reg * kStripe)[0]; \
  336. buf##reg ^= Downcast<Crc, Word>(crc##reg); \
  337. if (sizeof(crc##reg) > sizeof(buf##reg)) { \
  338. crc##reg = SHIFT_RIGHT_SAFE(crc##reg, sizeof(buf##reg) * 8); \
  339. crc##reg ^= TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
  340. } else { \
  341. crc##reg = TABLE_ENTRY(this->crc_word_, 0, buf##reg); \
  342. } \
  343. buf##reg >>= 8; \
  344. }
  345. REPEAT_FROM_0(FIRST);
  346. #undef FIRST
  347. for (size_t byte = 1; byte < sizeof(buf0) - 1; ++byte) {
  348. #define NEXT(reg) do { \
  349. if (kStride > reg) { \
  350. crc##reg ^= TABLE_ENTRY(this->crc_word_, byte, buf##reg); \
  351. buf##reg >>= 8; \
  352. } \
  353. } while (0)
  354. REPEAT_FROM_0(NEXT);
  355. #undef NEXT
  356. }
  357. #define LAST(reg) do { \
  358. if (kStride > reg) { \
  359. crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_, buf##reg); \
  360. } \
  361. } while (0)
  362. REPEAT_FROM_0(LAST);
  363. #undef LAST
  364. src += sizeof(Word);
  365. } while (src < stripe_end);
  366. #if 0
  367. // The code is left for illustrational purposes only.
  368. #define COMBINE(reg) do { \
  369. if (reg > 0 && kStride > reg) { \
  370. crc0 = Base().ChangeStartValue(crc##reg, kStripe, 0, crc0); \
  371. } \
  372. } while (0)
  373. #else
  374. #define COMBINE(reg) do { \
  375. if (reg > 0 && kStride > reg) { \
  376. crc0 = crc##reg ^ Base().Multiply(crc0, x_pow_8kStripe); \
  377. } \
  378. } while (0)
  379. #endif
  380. REPEAT_FROM_0(COMBINE);
  381. #undef COMBINE
  382. src += (kStride - 1) * kStripe;
  383. }
  384. while (src < end);
  385. }
  386. end += kStride * kStripe - 1;
  387. // Process sizeof(Word) bytes at a time.
  388. end -= sizeof(Word) - 1;
  389. for (; src < end; src += sizeof(Word)) {
  390. Word buf0 = reinterpret_cast<const Word *>(src)[0];
  391. CRC_WORD(this, crc0, buf0);
  392. }
  393. end += sizeof(Word) - 1;
  394. // Compute CRC of remaining bytes.
  395. for (;src < end; ++src) {
  396. CRC_BYTE(this, crc0, *src);
  397. }
  398. return (crc0 ^ Base().Canonize());
  399. }
  400. // Fastest, interleaved multi-byte CRC.
  401. Crc CrcMultiword(const void *data, size_t bytes, const Crc &start) const {
  402. if (kStride < 2 || kStride > 8) {
  403. // Unsupported configuration;
  404. // fall back to something sensible.
  405. return CrcWord(data, bytes, start);
  406. }
  407. const uint8 *src = static_cast<const uint8 *>(data);
  408. const uint8 *end = src + bytes;
  409. Crc crc0 = start ^ Base().Canonize();
  410. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, Word);
  411. if (src >= end) {
  412. return (crc0 ^ Base().Canonize());
  413. }
  414. // Process kStride Word registers at once;
  415. // should have have at least 2*kInterleaveBytes of data to start.
  416. end -= 2*kInterleaveBytes - 1;
  417. if (src < end) {
  418. Crc crc_carryover;
  419. if (sizeof(Crc) > sizeof(Word)) {
  420. // crc_carryover is used if and only if Crc is wider than Word.
  421. crc_carryover = 0;
  422. }
  423. #define INIT_CRC(reg) \
  424. Crc crc##reg; \
  425. if (reg > 0 && kStride > reg) { \
  426. crc##reg = 0; \
  427. }
  428. REPEAT_FROM_1(INIT_CRC);
  429. #undef INIT_CRC
  430. #define INIT_BUF(reg) \
  431. Word buf##reg; \
  432. if (kStride > reg) { \
  433. buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
  434. }
  435. REPEAT_FROM_0(INIT_BUF);
  436. #undef INIT_BUF
  437. do {
  438. PREFETCH(src);
  439. src += kInterleaveBytes;
  440. if (sizeof(Crc) > sizeof(Word)) {
  441. crc0 ^= crc_carryover;
  442. }
  443. #define FIRST(reg, next_reg) do { \
  444. if (kStride > reg) { \
  445. buf##reg ^= Downcast<Crc, Word>(crc##reg); \
  446. if (sizeof(Crc) > sizeof(Word)) { \
  447. if (reg < kStride - 1) { \
  448. crc##next_reg ^= SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
  449. } else { \
  450. crc_carryover = SHIFT_RIGHT_SAFE(crc##reg, 8 * sizeof(buf0)); \
  451. } \
  452. } \
  453. crc##reg = TABLE_ENTRY(this->crc_word_interleaved_, 0, buf##reg); \
  454. buf##reg >>= 8; \
  455. } \
  456. } while (0)
  457. FIRST(0, 1);
  458. FIRST(1, 2);
  459. FIRST(2, 3);
  460. FIRST(3, 4);
  461. FIRST(4, 5);
  462. FIRST(5, 6);
  463. FIRST(6, 7);
  464. FIRST(7, 0);
  465. #undef FIRST
  466. for (size_t byte = 1; byte < sizeof(Word) - 1; ++byte) {
  467. #define NEXT(reg) do { \
  468. if (kStride > reg) { \
  469. crc##reg ^= \
  470. TABLE_ENTRY(this->crc_word_interleaved_, byte, buf##reg); \
  471. buf##reg >>= 8; \
  472. } \
  473. } while(0)
  474. REPEAT_FROM_0(NEXT);
  475. #undef NEXT
  476. }
  477. #define LAST(reg) do { \
  478. if (kStride > reg) { \
  479. crc##reg ^= TABLE_ENTRY_LAST(this->crc_word_interleaved_, buf##reg); \
  480. buf##reg = reinterpret_cast<const Word *>(src)[reg]; \
  481. } \
  482. } while(0)
  483. REPEAT_FROM_0(LAST);
  484. #undef LAST
  485. }
  486. while (src < end);
  487. if (sizeof(Crc) > sizeof(Word)) {
  488. crc0 ^= crc_carryover;
  489. }
  490. #define COMBINE(reg) do { \
  491. if (kStride > reg) { \
  492. if (reg != 0) { \
  493. crc0 ^= crc##reg; \
  494. } \
  495. CRC_WORD(this, crc0, buf##reg); \
  496. } \
  497. } while (0)
  498. REPEAT_FROM_0(COMBINE);
  499. #undef COMBINE
  500. src += kInterleaveBytes;
  501. }
  502. end += 2*kInterleaveBytes - 1;
  503. // Process sizeof(Word) bytes at once.
  504. end -= sizeof(Word) - 1;
  505. for (; src < end; src += sizeof(Word)) {
  506. Word buf0 = reinterpret_cast<const Word *>(src)[0];
  507. CRC_WORD(this, crc0, buf0);
  508. }
  509. end += sizeof(Word) - 1;
  510. // Compute CRC of remaining bytes.
  511. for (;src < end; ++src) {
  512. CRC_BYTE(this, crc0, *src);
  513. }
  514. return (crc0 ^ Base().Canonize());
  515. }
  516. protected:
  517. enum {
  518. kInterleaveBytes = sizeof(Word) * kStride,
  519. };
  520. // Multiplication tables used by CRCs.
  521. TableEntry crc_word_interleaved_[sizeof(Word)][256];
  522. TableEntry crc_word_[sizeof(Word)][256];
  523. // Base class stored after CRC tables so that the most frequently
  524. // used table is at offset 0 and may be accessed faster.
  525. GfUtil<Crc> base_;
  526. friend class RollingCrc< GenericCrc<Crc, TableEntry, Word, kStride> >;
  527. private:
  528. // CrcMultiword on amd64 may run at 1.2 CPU cycles per byte which is
  529. // noticeably faster than CrcWord (2.2-2.6 cycles/byte depending on
  530. // hardware and compiler). However, there are problems with compilers.
  531. //
  532. // Test system: P45 chipset, Intel Q9650 CPU, 800MHz 4-4-4-12 memory.
  533. //
  534. // 64-bit compiler, <= 64-bit CRC, 64-bit tables, 64-bit reads:
  535. // CL 15.00.307291.1 C++ >1.2< CPU cycles/byte
  536. // ICL 11.1.051 -O3 C++ 1.5 CPU cycles/byte
  537. // GCC 4.5 -O3 C++ 2.0 CPU cycles/byte
  538. // GCC 4.x -O3 ASM >1.2< CPU cycles/byte
  539. //
  540. // 32-bit compiler, MMX used, <= 64-bit CRC, 64-bit tables, 64-bit reads
  541. // CL 15.00.307291.1 C++ 2.0 CPU cycles/byte
  542. // GCC 4.5 -O3 C++ 1.9 CPU cycles/byte
  543. // ICL 11.1.051 -S C++ 1.6 CPU cycles/byte
  544. // GCC 4.x -O3 ASM >1.3< CPU cycles/byte
  545. //
  546. // So, use inline ASM code for GCC for both i386 and amd64.
  547. Crc CrcMultiwordI386Mmx(
  548. const void *data, size_t bytes, const Crc &start) const;
  549. Crc CrcMultiwordGccAmd64(
  550. const void *data, size_t bytes, const Crc &start) const;
  551. Crc CrcMultiwordGccAmd64Sse2(
  552. const uint8 *src, const uint8 *end, const Crc &start) const;
  553. } GCC_ALIGN_ATTRIBUTE(16);
  554. #undef REPEAT_FROM_0
  555. #undef REPEAT_FROM_1
  556. // Specialized variants.
  557. #if CRCUTIL_USE_ASM
  558. #if (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
  559. // Declare specialized functions.
  560. template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
  561. const void *data, size_t bytes, const uint64 &start) const;
  562. #if HAVE_AMD64 && HAVE_SSE2
  563. template<>
  564. uint128_sse2
  565. GenericCrc<uint128_sse2, uint128_sse2, uint64, 4>::CrcMultiword(
  566. const void *data, size_t bytes, const uint128_sse2 &start) const;
  567. #endif // HAVE_AMD64 && HAVE_SSE2
  568. #elif defined(_MSC_FULL_VER) && _MSC_FULL_VER <= 150030729 && \
  569. (HAVE_I386 && HAVE_MMX)
  570. // Work around bug in MSC (present at least in v. 15.00.30729.1)
  571. template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
  572. const void *data,
  573. size_t bytes,
  574. const uint64 &start) const;
  575. template<> __forceinline
  576. uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiword(
  577. const void *data,
  578. size_t bytes,
  579. const uint64 &start) const {
  580. typedef uint64 Word;
  581. typedef uint64 Crc;
  582. if (bytes <= 12) {
  583. const uint8 *src = static_cast<const uint8 *>(data);
  584. uint64 crc = start ^ Base().Canonize();
  585. for (const uint8 *end = src + bytes; src < end; ++src) {
  586. CRC_BYTE(this, crc, *src);
  587. }
  588. return (crc ^ Base().Canonize());
  589. }
  590. return CrcMultiwordI386Mmx(data, bytes, start);
  591. }
  592. #endif // (defined(__GNUC__) && (HAVE_AMD64 || (HAVE_I386 && HAVE_MMX)))
  593. #endif // CRCUTIL_USE_ASM
  594. #pragma pack(pop)
  595. } // namespace crcutil
  596. #endif // CRCUTIL_GENERIC_CRC_H_