multiword_64_64_cl_i386_mmx.cc 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. // Copyright 2010 Google Inc. All rights reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Implements 64-bit multiword CRC for Microsoft and Intel compilers
  15. // using MMX instructions (i386).
  16. #include "generic_crc.h"
  17. #if CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)
  18. namespace crcutil {
  // CRC_WORD_MMX(): folds the 64-bit word held in BUF0 into the CRC held in
  // CRC0 using eight table lookups (one per byte).
  //
  // The macro expands to MSVC "__asm" statements and relies on the register
  // aliases CRC0, BUF0, TMP0, TMP0L, TEMP and TABLE being #defined at the
  // point of expansion (it is meant to be used inside an __asm block).
  //
  // Steps:
  //   1. BUF0 ^= CRC0.
  //   2. The low 32 bits of BUF0 are moved to TMP0 and BUF0 is shifted right
  //      by 32 so its high half can be fetched later with a second movd.
  //   3. Bytes 0..2 are peeled off via TMP0L (shifting TMP0 right by 8 each
  //      time); after three shifts TMP0 itself holds byte 3 and is used
  //      directly as the index.
  //   4. The same is repeated for the upper 32 bits.
  // Byte k indexes sub-table k at [TABLE + index * 8 + k * 256 * 8], i.e.
  // each sub-table is addressed as 256 entries of 8 bytes. The first lookup
  // is a movq (it overwrites CRC0); the remaining seven are XORed in.
  //
  // On exit CRC0 holds the new value; BUF0, TMP0 and TEMP are clobbered.
  19. #define CRC_WORD_MMX() \
  20. __asm pxor BUF0, CRC0 \
  21. __asm movd TMP0, BUF0 \
  22. __asm psrlq BUF0, 32 \
  23. __asm movzx TEMP, TMP0L \
  24. __asm shr TMP0, 8 \
  25. __asm movq CRC0, [TABLE + TEMP * 8] \
  26. __asm movzx TEMP, TMP0L \
  27. __asm shr TMP0, 8 \
  28. __asm pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8] \
  29. __asm movzx TEMP, TMP0L \
  30. __asm shr TMP0, 8 \
  31. __asm pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8] \
  32. __asm pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8] \
  33. __asm movd TMP0, BUF0 \
  34. __asm movzx TEMP, TMP0L \
  35. __asm shr TMP0, 8 \
  36. __asm pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8] \
  37. __asm movzx TEMP, TMP0L \
  38. __asm shr TMP0, 8 \
  39. __asm pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8] \
  40. __asm movzx TEMP, TMP0L \
  41. __asm shr TMP0, 8 \
  42. __asm pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8] \
  43. __asm pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
  44. // frame pointer register 'ebp' modified by inline assembly code
  // (That is the text of MSVC warning C4731. The assembly below uses ebp on
  // purpose as the table base register and saves/restores it with push/pop,
  // so the warning is silenced.)
  45. #pragma warning(disable: 4731)
  // Computes the CRC of "bytes" bytes starting at "data", continuing from the
  // CRC value "start", for the GenericCrc<uint64, uint64, uint64, 4>
  // specialization, using MSVC inline assembly with MMX registers.
  //
  //   data  - pointer to the first input byte.
  //   bytes - number of input bytes.
  //   start - starting CRC value; it is XORed with Base().Canonize() before
  //           processing, and the result is XORed with it again on return.
  //
  // Processing stages:
  //   1. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED (macro defined elsewhere;
  //      presumably consumes leading bytes until src is word-aligned --
  //      TODO confirm against its definition).
  //   2. Main loop: 32 bytes per iteration as four independent 64-bit
  //      streams (CRC0..CRC3), using the crc_word_interleaved_ table.
  //   3. The four streams are folded into CRC0 using the crc_word_ table.
  //   4. Word loop: 8 bytes per iteration using the crc_word_ table.
  //   5. Remaining bytes are handled in C++ with CRC_BYTE.
  46. template<> uint64 GenericCrc<uint64, uint64, uint64, 4>::CrcMultiwordI386Mmx(
  47. const void *data,
  48. size_t bytes,
  49. const uint64 &start) const {
  50. const uint8 *src = static_cast<const uint8 *>(data);
  51. const uint8 *end = src + bytes;
  52. uint64 crc0 = start ^ this->Base().Canonize();
  53. ALIGN_ON_WORD_BOUNDARY_IF_NEEDED(bytes, this, src, end, crc0, uint64);
  // Nothing left to process after the alignment step.
  54. if (src >= end) {
  55. return (crc0 ^ this->Base().Canonize());
  56. }
  // Register aliases used by the assembly below and by CRC_WORD_MMX():
  //   CRC0..CRC3 - MMX registers holding the four running CRC values.
  //   BUF0..BUF3 - MMX registers holding the four input words being processed.
  //   TMP0..TMP3 - general-purpose registers (with their low/high byte
  //                aliases) used to extract individual bytes.
  //   TEMP       - table index scratch register.
  //   SRC        - current read pointer.
  //   END        - loop limit; kept on top of the stack during the main loop
  //                and redefined as TMP1 after it.
  //   TABLE      - base address of the active lookup table (ebp).
  57. #define CRC0 mm0
  58. #define CRC1 mm1
  59. #define CRC2 mm2
  60. #define CRC3 mm3
  61. #define BUF0 mm4
  62. #define BUF1 mm5
  63. #define BUF2 mm6
  64. #define BUF3 mm7
  65. #define TMP0 eax
  66. #define TMP0L al
  67. #define TMP0H ah
  68. #define TMP1 ebx
  69. #define TMP1L bl
  70. #define TMP1H bh
  71. #define TMP2 ecx
  72. #define TMP2L cl
  73. #define TMP2H ch
  74. #define TMP3 edx
  75. #define TMP3L dl
  76. #define TMP3H dh
  77. #define TEMP edi
  78. #define SRC esi
  79. #define END [esp]
  80. #define TABLE ebp
  // Table addresses are copied into locals so the assembly can load them
  // with plain "mov" instructions.
  81. const uint64 *interleaved_table_address =
  82. &this->crc_word_interleaved_[0][0];
  83. const uint64 *word_table_address = &this->crc_word_[0][0];
  84. __asm {
  // Save ebp; it is reused as TABLE and restored by "pop ebp" at the end.
  85. push ebp
  // All locals are read before TABLE (ebp) is overwritten below.
  86. mov TMP0, interleaved_table_address
  87. movq CRC0, crc0
  88. mov SRC, src
  89. mov TMP1, end
  // TMP1 = end - 63: the main loop is entered only while SRC < end - 63,
  // i.e. while at least 64 bytes remain.
  90. sub TMP1, 2*4*8 - 1
  91. cmp SRC, TMP1
  // mov does not alter the flags, so the jae below still tests the cmp above.
  // TABLE = word table, which is what the code at end_main_loop expects.
  92. mov TABLE, word_table_address
  93. jae end_main_loop
  // Keep the word table address on the stack and switch TABLE to the
  // interleaved table for the main loop.
  94. push TABLE
  95. mov TABLE, TMP0
  // The loop limit now lives at [esp], which is what END expands to here.
  96. push TMP1
  // Streams 1..3 start from zero; stream 0 carries the incoming CRC.
  97. pxor CRC1, CRC1
  98. pxor CRC2, CRC2
  99. pxor CRC3, CRC3
  // Preload the first four 64-bit words.
  100. movq BUF0, [SRC]
  101. movq BUF1, [SRC + 1 * 8]
  102. movq BUF2, [SRC + 2 * 8]
  103. movq BUF3, [SRC + 3 * 8]
  104. main_loop:
  105. #if HAVE_SSE && CRCUTIL_PREFETCH_WIDTH > 0
  106. prefetcht0 [SRC + CRCUTIL_PREFETCH_WIDTH]
  107. #endif
  // The four words in BUF0..BUF3 were loaded already; advance past them.
  108. add SRC, 32
  // XOR each running CRC into its input word.
  109. pxor BUF0, CRC0
  110. pxor BUF1, CRC1
  111. pxor BUF2, CRC2
  112. pxor BUF3, CRC3
  // Low 32 bits of each word go to TMP0..TMP3; the high 32 bits are shifted
  // down so they can be fetched with a second movd later.
  113. movd TMP0, BUF0
  114. psrlq BUF0, 32
  115. movd TMP1, BUF1
  116. psrlq BUF1, 32
  117. movd TMP2, BUF2
  118. psrlq BUF2, 32
  119. movd TMP3, BUF3
  120. psrlq BUF3, 32
  // Byte 0 of each stream: sub-table 0. movq overwrites the old CRC value.
  121. movzx TEMP, TMP0L
  122. movq CRC0, [TABLE + TEMP * 8]
  123. movzx TEMP, TMP1L
  124. movq CRC1, [TABLE + TEMP * 8]
  125. movzx TEMP, TMP2L
  126. movq CRC2, [TABLE + TEMP * 8]
  127. movzx TEMP, TMP3L
  128. movq CRC3, [TABLE + TEMP * 8]
  // Byte 1 (high byte of the low 16 bits): sub-table 1. The register is then
  // shifted right by 16 so bytes 2 and 3 become the low 16 bits.
  129. movzx TEMP, TMP0H
  130. shr TMP0, 16
  131. pxor CRC0, [TABLE + TEMP * 8 + 1 * 256 * 8]
  132. movzx TEMP, TMP1H
  133. shr TMP1, 16
  134. pxor CRC1, [TABLE + TEMP * 8 + 1 * 256 * 8]
  135. movzx TEMP, TMP2H
  136. shr TMP2, 16
  137. pxor CRC2, [TABLE + TEMP * 8 + 1 * 256 * 8]
  138. movzx TEMP, TMP3H
  139. shr TMP3, 16
  140. pxor CRC3, [TABLE + TEMP * 8 + 1 * 256 * 8]
  // Byte 2: sub-table 2. After the shift only byte 3 remains in the register.
  141. movzx TEMP, TMP0L
  142. shr TMP0, 8
  143. pxor CRC0, [TABLE + TEMP * 8 + 2 * 256 * 8]
  144. movzx TEMP, TMP1L
  145. shr TMP1, 8
  146. pxor CRC1, [TABLE + TEMP * 8 + 2 * 256 * 8]
  147. movzx TEMP, TMP2L
  148. shr TMP2, 8
  149. pxor CRC2, [TABLE + TEMP * 8 + 2 * 256 * 8]
  150. movzx TEMP, TMP3L
  151. shr TMP3, 8
  152. pxor CRC3, [TABLE + TEMP * 8 + 2 * 256 * 8]
  // Byte 3: sub-table 3, indexed directly by TMPn. Each TMPn is then reloaded
  // with the upper 32 bits of its word.
  153. pxor CRC0, [TABLE + TMP0 * 8 + 3 * 256 * 8]
  154. movd TMP0, BUF0
  155. pxor CRC1, [TABLE + TMP1 * 8 + 3 * 256 * 8]
  156. movd TMP1, BUF1
  157. pxor CRC2, [TABLE + TMP2 * 8 + 3 * 256 * 8]
  158. movd TMP2, BUF2
  159. pxor CRC3, [TABLE + TMP3 * 8 + 3 * 256 * 8]
  160. movd TMP3, BUF3
  // Byte 4: sub-table 4.
  161. movzx TEMP, TMP0L
  162. pxor CRC0, [TABLE + TEMP * 8 + 4 * 256 * 8]
  163. movzx TEMP, TMP1L
  164. pxor CRC1, [TABLE + TEMP * 8 + 4 * 256 * 8]
  165. movzx TEMP, TMP2L
  166. pxor CRC2, [TABLE + TEMP * 8 + 4 * 256 * 8]
  167. movzx TEMP, TMP3L
  168. pxor CRC3, [TABLE + TEMP * 8 + 4 * 256 * 8]
  // Byte 5: sub-table 5.
  169. movzx TEMP, TMP0H
  170. shr TMP0, 16
  171. pxor CRC0, [TABLE + TEMP * 8 + 5 * 256 * 8]
  172. movzx TEMP, TMP1H
  173. shr TMP1, 16
  174. pxor CRC1, [TABLE + TEMP * 8 + 5 * 256 * 8]
  175. movzx TEMP, TMP2H
  176. shr TMP2, 16
  177. pxor CRC2, [TABLE + TEMP * 8 + 5 * 256 * 8]
  178. movzx TEMP, TMP3H
  179. shr TMP3, 16
  180. pxor CRC3, [TABLE + TEMP * 8 + 5 * 256 * 8]
  // Byte 6: sub-table 6.
  181. movzx TEMP, TMP0L
  182. shr TMP0, 8
  183. pxor CRC0, [TABLE + TEMP * 8 + 6 * 256 * 8]
  184. movzx TEMP, TMP1L
  185. shr TMP1, 8
  186. pxor CRC1, [TABLE + TEMP * 8 + 6 * 256 * 8]
  187. movzx TEMP, TMP2L
  188. shr TMP2, 8
  189. pxor CRC2, [TABLE + TEMP * 8 + 6 * 256 * 8]
  190. movzx TEMP, TMP3L
  191. shr TMP3, 8
  192. pxor CRC3, [TABLE + TEMP * 8 + 6 * 256 * 8]
  // Byte 7: sub-table 7, interleaved with loading the next four words.
  193. pxor CRC0, [TABLE + TMP0 * 8 + 7 * 256 * 8]
  194. movq BUF0, [SRC]
  195. pxor CRC1, [TABLE + TMP1 * 8 + 7 * 256 * 8]
  196. movq BUF1, [SRC + 1 * 8]
  197. pxor CRC2, [TABLE + TMP2 * 8 + 7 * 256 * 8]
  198. movq BUF2, [SRC + 2 * 8]
  199. pxor CRC3, [TABLE + TMP3 * 8 + 7 * 256 * 8]
  200. movq BUF3, [SRC + 3 * 8]
  // Continue while the limit stored at [esp] (end - 63) is above SRC.
  201. cmp END, SRC
  202. ja main_loop
  // From here on END names TMP1 instead of the stack slot. The redefinition
  // is textual, so it also applies to the code at end_main_loop when that
  // label is reached directly from the jae above (TMP1 still holds end - 63
  // on that path).
  203. #undef END
  204. #define END TMP1
  // Restore the loop limit into TMP1 and the word table address into TABLE.
  205. pop END
  206. pop TABLE
  // BUF0..BUF3 already hold the next 32 bytes; account for them in SRC.
  207. add SRC, 32
  // Fold the four streams into CRC0 using the word table: each pending word
  // is XORed with its own stream's CRC, moved to BUF0, and then processed by
  // CRC_WORD_MMX(), which XORs in CRC0 and produces the new CRC0.
  208. CRC_WORD_MMX()
  209. pxor BUF1, CRC1
  210. movq BUF0, BUF1
  211. CRC_WORD_MMX()
  212. pxor BUF2, CRC2
  213. movq BUF0, BUF2
  214. CRC_WORD_MMX()
  215. pxor BUF3, CRC3
  216. movq BUF0, BUF3
  217. CRC_WORD_MMX()
  218. end_main_loop:
  // END was end - 63; adding 56 makes it end - 7, so the word loop runs only
  // while at least 8 bytes remain.
  219. add END, 2*4*8 - 8
  220. cmp SRC, END
  221. jae end_word_loop
  222. word_loop:
  // One 64-bit word per iteration.
  223. movq BUF0, [SRC]
  224. add SRC, 8
  225. CRC_WORD_MMX()
  226. cmp END, SRC
  227. ja word_loop
  228. end_word_loop:
  // Disabled assembly byte loop; the C++ loop after the __asm block handles
  // the remaining bytes instead.
  229. #if 0 // Plain C version is faster?
  230. add END, 7
  231. cmp SRC, END
  232. jae end_byte_loop
  233. byte_loop:
  234. movd TMP0, CRC0
  235. movzx TEMP, byte ptr [SRC]
  236. movzx TMP0, TMP0L
  237. psrlq CRC0, 8
  238. xor TEMP, TMP0
  239. add SRC, 1
  240. pxor CRC0, [TABLE + TEMP*8 + 7*256*8]
  241. cmp END, SRC
  242. ja byte_loop
  243. end_byte_loop:
  244. #endif
  // Restore the ebp saved at entry, then store the results back into the
  // C++ locals src and crc0.
  245. pop ebp
  246. mov src, SRC
  247. movq crc0, CRC0
  // Leave MMX state.
  248. emms
  249. }
  250. #if 1
  251. // Compute CRC of remaining bytes.
  252. for (;src < end; ++src) {
  253. CRC_BYTE(this, crc0, *src);
  254. }
  255. #endif
  256. return (crc0 ^ this->Base().Canonize());
  257. }
  258. } // namespace crcutil
  259. #endif // CRCUTIL_USE_ASM && HAVE_I386 && HAVE_MMX && defined(_MSC_VER)