// uint128_sse2.h
  1. // Copyright 2010 Google Inc. All rights reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Implements a limited set of 128-bit arithmetic operations
  15. // (the ones that are used by CRC) using SSE2 intrinsics.
  16. #ifndef CRCUTIL_UINT128_SSE2_H_
  17. #define CRCUTIL_UINT128_SSE2_H_
  18. #include "base_types.h"
  19. #include "crc_casts.h" // Downcast, CrcFromUint64, Uint64FromCrc
  20. #include "platform.h"
  21. #if HAVE_SSE2
  22. namespace crcutil {
// Specialized functions handling __m128i.
// Extracts the 64 least significant bits of a 128-bit SSE2 value.
// Three code paths, chosen per compiler/target quirks (see comments below).
template<> __forceinline uint64 Downcast(const __m128i &value) {
#if HAVE_AMD64 && defined(__GNUC__)
  // GCC 4.4.x is too smart and, instead of MOVQ, generates SSE4 PEXTRQ
  // instruction when the code is compiled with -msse4.
  // Fixed in 4.5 which generates conversion through memory (why?).
  // And -- yes, it makes quite measurable difference.
  uint64 temp;
  asm(SSE2_MOVQ " %[i128], %[u64]\n" : [u64] "=r" (temp) : [i128] "x" (value));
  return temp;
#elif HAVE_AMD64 && (!defined(_MSC_FULL_VER) || _MSC_FULL_VER > 150030729)
  return static_cast<uint64>(_mm_cvtsi128_si64(value));
#else
  // 64-bit CL 15.00.30729.1 -O2 generates incorrect code (tests fail).
  // _mm_cvtsi128_si64() is not available on i386.
  // Spill the low 64 bits through memory instead.
  uint64 temp;
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&temp), value);
  return temp;
#endif
}
// Unsigned 128-bit value stored in one SSE2 register.
// u64[0] of the underlying representation holds the 64 low bits,
// u64[1] the 64 high bits (as established by CrcFromUint64 /
// Uint64FromCrc below and by the carry handling in operator+).
// Implements only the operations CRC computation needs.
class uint128_sse2 {
 public:
  uint128_sse2() {}
  ~uint128_sse2() {}

  // Default casts to uint128_sse2 and assignment operator.
  // Loads a 64-bit value into the low half; the high half becomes zero.
  __forceinline void operator =(uint64 value) {
#if HAVE_AMD64 && defined(__GNUC__) && !GCC_VERSION_AVAILABLE(4, 5)
    // Prevent generation of SSE4 pinsrq instruction when
    // compiling with GCC 4.4.x with -msse4 flag.
    asm(SSE2_MOVQ " %[u64], %[i128]\n" : [i128] "=x" (x_) : [u64] "r" (value));
#elif HAVE_AMD64
    x_ = _mm_cvtsi64_si128(static_cast<int64>(value));
#else
    // i386: no 64-bit GPR<->XMM moves; go through memory.
    x_ = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&value));
#endif
  }
  __forceinline uint128_sse2(uint64 x) {
    *this = x;
  }
  __forceinline uint128_sse2(const __m128i x) : x_(x) {
  }
  __forceinline operator __m128i() const {
    return x_;
  }
  __forceinline void operator =(const uint128_sse2 &x) {
    x_ = x.x_;
  }

  // Extracts 64 less significant bits.
  __forceinline uint64 to_uint64() const {
    return Downcast<__m128i, uint64>(x_);
  }

  // Comparisons.
  // All comparisons go through a union to read the two 64-bit halves;
  // equality XORs the operands so both halves can be checked with one OR.
  __forceinline bool operator ==(const uint128_sse2 &y) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } t;
    t.i128 = _mm_xor_si128(x_, y.x_);
    return (t.u64[0] | t.u64[1]) == 0;
  }
  __forceinline bool operator ==(uint64 value) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } t;
    t.i128 = x_;
    return (t.u64[0] == value && t.u64[1] == 0);
  }
  __forceinline bool operator !=(const uint128_sse2 &y) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } t;
    t.i128 = _mm_xor_si128(x_, y.x_);
    return (t.u64[0] | t.u64[1]) != 0;
  }
  __forceinline bool operator !=(uint64 value) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } t;
    t.i128 = x_;
    return (t.u64[0] != value || t.u64[1] != 0);
  }
  // NOTE(review): this compares the LOW halves (u64[0]) first and uses the
  // HIGH halves (u64[1]) only as a tie-breaker -- the opposite of natural
  // 128-bit ordering, where the high half dominates. Confirm against the
  // call sites whether this ordering is intentional before relying on it
  // as a numeric less-than.
  __forceinline bool operator <(const uint128_sse2 &y) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } xx, yy;
    xx.i128 = x_;
    yy.i128 = y.x_;
    return (xx.u64[0] < yy.u64[0] ||
            (xx.u64[0] == yy.u64[0] && xx.u64[1] < yy.u64[1]));
  }

  // Bitwise logic operators.
  __forceinline uint128_sse2 operator ^(const uint128_sse2 &y) const {
    return _mm_xor_si128(x_, y.x_);
  }
  __forceinline uint128_sse2 operator &(const uint128_sse2 &y) const {
    return _mm_and_si128(x_, y.x_);
  }
  __forceinline uint128_sse2 operator |(const uint128_sse2 &y) const {
    return _mm_or_si128(x_, y.x_);
  }
  __forceinline void operator ^=(const uint128_sse2 &y) {
    *this = *this ^ y.x_;
  }
  __forceinline void operator &=(const uint128_sse2 &y) {
    *this = *this & y.x_;
  }
  __forceinline void operator |=(const uint128_sse2 &y) {
    *this = *this | y.x_;
  }

  // Arithmetic operators.
  // 128-bit + 64-bit with manual carry propagation into the high half.
  __forceinline uint128_sse2 operator +(uint64 y) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } temp;
    temp.i128 = x_;
    // a + b >= 2**64 iff
    // a + b > (2**64 - 1) iff
    // a > (2**64 - 1) - b iff
    // a > ~b
    if (temp.u64[0] > ~y) {
      temp.u64[1] += 1;
    }
    temp.u64[0] += y;
    return temp.i128;
  }
  __forceinline void operator +=(uint64 x) {
    *this = *this + x;
  }
  // 128-bit - 64-bit with manual borrow from the high half.
  __forceinline uint128_sse2 operator -(uint64 y) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } temp;
    temp.i128 = x_;
    if (temp.u64[0] < y) {
      temp.u64[1] -= 1;
    }
    temp.u64[0] -= y;
    return temp.i128;
  }
  __forceinline void operator -=(uint64 x) {
    *this = *this - x;
  }

  // Bitwise logical shifts.
  // _mm_srli_si128/_mm_slli_si128 shift by whole BYTES, so the multiples
  // of 8 bits map to single instructions; all other shift counts fall
  // back to the bit-by-bit long_shift_* helpers.
  __forceinline uint128_sse2 operator >>(const int bits) const {
    if (bits == 8) {
      return _mm_srli_si128(x_, 1);
    } else if (bits == 16) {
      return _mm_srli_si128(x_, 2);
    } else if (bits == 32) {
      return _mm_srli_si128(x_, 4);
    } else if (bits == 64) {
      return _mm_srli_si128(x_, 8);
    } else {
      return long_shift_right(bits);
    }
  }
  __forceinline uint128_sse2 operator >>(const size_t bits) const {
    return *this >> static_cast<int>(bits);
  }
  __forceinline void operator >>=(const int bits) {
    *this = *this >> bits;
  }
  __forceinline void operator >>=(const size_t bits) {
    *this = *this >> static_cast<int>(bits);
  }
  __forceinline uint128_sse2 operator <<(int bits) const {
    if (bits == 8) {
      return _mm_slli_si128(x_, 1);
    } else if (bits == 16) {
      return _mm_slli_si128(x_, 2);
    } else if (bits == 32) {
      return _mm_slli_si128(x_, 4);
    } else if (bits == 64) {
      return _mm_slli_si128(x_, 8);
    } else {
      return long_shift_left(bits);
    }
  }
  __forceinline uint128_sse2 operator <<(size_t bits) const {
    return *this << static_cast<int>(bits);
  }
  __forceinline void operator <<=(int bits) {
    *this = *this << bits;
  }
  __forceinline void operator <<=(size_t bits) {
    *this = *this << static_cast<int>(bits);
  }

 protected:
  // Logical right shift by an arbitrary bit count, one bit per iteration:
  // each step shifts both halves and moves the low bit of the high half
  // into the top bit of the low half.
  __forceinline uint128_sse2 long_shift_right(int bits) const {
    union {
      __m128i i128;
      uint64 u64[2];
    } x;
    x.i128 = x_;
    for (; bits > 0; --bits) {
      x.u64[0] >>= 1;
      if (x.u64[1] & 1) {
        x.u64[0] |= static_cast<uint64>(1) << 63;
      }
      x.u64[1] >>= 1;
    }
    return x.i128;
  }
  // Left shift by an arbitrary bit count, one bit per iteration.
  // Uses SIGNED halves so that "i64[0] < 0" tests the top bit of the low
  // half, which is carried into bit 0 of the high half before shifting.
  __forceinline uint128_sse2 long_shift_left(int bits) const {
    union {
      __m128i i128;
      int64 i64[2];
    } x;
    x.i128 = x_;
    for (; bits > 0; --bits) {
      x.i64[1] <<= 1;
      if (x.i64[0] < 0) {
        x.i64[1] |= 1;
      }
      x.i64[0] <<= 1;
    }
    return x.i128;
  }

  // The 128-bit value; kept 16-byte aligned for SSE2 loads/stores.
  __m128i x_;
} GCC_ALIGN_ATTRIBUTE(16);
  249. // Specialized versions.
  250. template<> __forceinline uint64 Downcast(const uint128_sse2 &x) {
  251. return x.to_uint64();
  252. }
  253. template<> __forceinline uint32 Downcast(const uint128_sse2 &x) {
  254. return static_cast<uint32>(x.to_uint64());
  255. }
  256. template<> __forceinline uint16 Downcast(const uint128_sse2 &x) {
  257. return static_cast<uint16>(x.to_uint64());
  258. }
  259. template<> __forceinline uint8 Downcast(const uint128_sse2 &x) {
  260. return static_cast<uint8>(x.to_uint64());
  261. }
  262. template<> __forceinline uint128_sse2 CrcFromUint64(uint64 lo, uint64 hi) {
  263. union {
  264. __m128i i128;
  265. uint64 u64[2];
  266. } temp;
  267. temp.u64[0] = lo;
  268. temp.u64[1] = hi;
  269. return temp.i128;
  270. }
  271. template<> __forceinline void Uint64FromCrc(const uint128_sse2 &crc,
  272. uint64 *lo, uint64 *hi) {
  273. union {
  274. __m128i i128;
  275. uint64 u64[2];
  276. } temp;
  277. temp.i128 = crc;
  278. *lo = temp.u64[0];
  279. *hi = temp.u64[1];
  280. }
  281. } // namespace crcutil
  282. #endif // HAVE_SSE2
  283. #endif // CRCUTIL_UINT128_SSE2_H_