wide_sse41.cpp

#include <util/charset/wide.h>
#include <util/system/types.h>

#ifdef SSE41_STUB

namespace NDetail {
    void UTF8ToWideImplSSE41(const unsigned char*&, const unsigned char*, wchar16*&) noexcept {
    }
    void UTF8ToWideImplSSE41(const unsigned char*&, const unsigned char*, wchar32*&) noexcept {
    }
} // namespace NDetail

#else

#include <util/system/compiler.h>

#include <cstring>
#include <emmintrin.h>
#include <smmintrin.h>
// Processes input up to the first error, or until fewer than 16 bytes remain.
// Most of the code is taken from https://woboq.com/blog/utf-8-processing-using-simd.html
// Returns a dstAdvance of 0 in case of problems.
static Y_FORCE_INLINE ui32 Unpack16BytesIntoUtf16IfNoSurrogats(const unsigned char*& cur, __m128i& utf16Low, __m128i& utf16High) {
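    // Stage the next 16 input bytes in a local buffer so the aligned load below
    // is valid no matter how `cur` is aligned.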
    unsigned char curAligned[16];

    memcpy(curAligned, cur, sizeof(__m128i));
    __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i*>(curAligned));
    // Only ASCII characters - simple copy
    if (!_mm_movemask_epi8(chunk)) {
        utf16Low = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
        utf16High = _mm_unpackhi_epi8(chunk, _mm_setzero_si128());
        cur += 16;
        return 16;
    }
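
    // Non-ASCII bytes present: classify every byte with signed comparisons.
    // chunkSigned maps the unsigned byte range onto the signed one, so a check
    // like "byte >= 0xc2" becomes "chunkSigned > 0xc2 - 1 - 0x80".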
    __m128i chunkSigned = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    __m128i isAsciiMask = _mm_cmpgt_epi8(chunk, _mm_set1_epi8(0));
    __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunkSigned); // lead byte of a 2+ byte sequence (>= 0xc2)
    __m128i state = _mm_set1_epi8(0x0 | (char)0x80);
    __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunkSigned); // lead byte of a 3+ byte sequence (>= 0xe0)

    // The low 3 bits of state hold the sequence length, the high bits hold the
    // prefix mask that will be stripped from the lead byte.
    state = _mm_blendv_epi8(state, _mm_set1_epi8(0x2 | (char)0xc0), cond2);

    int sourceAdvance;
    __m128i shifts;
    __m128i chunkLow, chunkHigh;
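
    // Two vectorized paths: the common one, where the chunk holds only 1- and
    // 2-byte sequences, and a slower one that also handles 3-byte sequences.
    // 4-byte sequences (surrogate pairs in UTF-16) always bail out to scalar code.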
    if (Y_LIKELY(!_mm_movemask_epi8(cond3))) {
        // Main case: no blocks of size 3 or 4
        // Rune length for the start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
        __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
        __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
        shifts = countSub1;
        __m128i continuation1 = _mm_slli_si128(countSub1, 1);
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
        __m128i counts = _mm_or_si128(count, continuation1);
        __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
        __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuation1, _mm_set1_epi8(0));
        __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);

        // Each symbol should be exactly one of: ASCII, continuation, or begin.
        __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
        if (_mm_movemask_epi8(okMask) != 0xFFFF) {
            return 0;
        }
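
        // "shifts" is a running prefix sum of the byte slots dropped to the
        // left; it is finished below and turned into the shuffle control that
        // compacts the decoded bytes.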
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
        __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
        chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have useful bits
        shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <= 1
        __m128i chunk_right = _mm_slli_si128(chunk, 1);
        shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                                 _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));
        chunkLow = _mm_blendv_epi8(chunk,
                                   _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))),
                                   _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));
        chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
        shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                                 _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
        chunkHigh = _mm_srli_epi32(chunkHigh, 2);
        shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                                 _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));

        // If the last byte starts an unfinished sequence, leave it for the next chunk.
        int c = _mm_extract_epi16(counts, 7);
        sourceAdvance = !(c & 0x0200) ? 16 : 15;
    } else {
        __m128i mask3 = _mm_slli_si128(cond3, 1);
        __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunkSigned);
        state = _mm_blendv_epi8(state, _mm_set1_epi8(0x3 | (char)0xe0), cond3);

        // 4-byte sequences are not vectorized; fall back to scalar processing.
        if (Y_UNLIKELY(_mm_movemask_epi8(cond4))) {
            return 0;
        }

        // Rune length for the start of multi-byte sequences (0 for b0... and b10..., 2 for b110..., etc.)
        __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));
        __m128i countSub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));
        __m128i continuation2 = _mm_slli_si128(_mm_subs_epu8(count, _mm_set1_epi8(0x2)), 2);
        shifts = countSub1;
        __m128i continuation1 = _mm_slli_si128(countSub1, 1);
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
        __m128i continuationsRunelen = _mm_or_si128(continuation1, continuation2);
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));
        __m128i counts = _mm_or_si128(count, continuationsRunelen);
        __m128i isBeginMultibyteMask = _mm_cmpgt_epi8(count, _mm_set1_epi8(0));
        __m128i needNoContinuationMask = _mm_cmpeq_epi8(continuationsRunelen, _mm_set1_epi8(0));
        __m128i isBeginMask = _mm_add_epi8(isBeginMultibyteMask, isAsciiMask);

        // Each symbol should be exactly one of: ASCII, continuation, or begin.
        __m128i okMask = _mm_cmpeq_epi8(isBeginMask, needNoContinuationMask);
        if (_mm_movemask_epi8(okMask) != 0xFFFF) {
            return 0;
        }
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));
        __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
        shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));
        chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have useful bits
        shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <= 1
        __m128i chunk_right = _mm_slli_si128(chunk, 1);
        shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                                 _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));
        chunkLow = _mm_blendv_epi8(chunk,
                                   _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))),
                                   _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));
        chunkHigh = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));
        shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                                 _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
        chunkHigh = _mm_srli_epi32(chunkHigh, 2);
        shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                                 _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));

        // Merge in the 4 payload bits of the 3-byte lead (at positions right after such a lead).
        chunkHigh = _mm_or_si128(chunkHigh,
                                 _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)),
                                               mask3));

        // Back off 1 or 2 bytes if the chunk ends with an unfinished sequence.
        int c = _mm_extract_epi16(counts, 7);
        sourceAdvance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;
    }
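
    // Shared tail for both paths: finish propagating the shift prefix sum,
    // then shuffle the low/high bytes so that lead-byte slots are squeezed out,
    // and interleave them into UTF-16 code units.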
    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));
    chunkHigh = _mm_slli_si128(chunkHigh, 1);
    __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
    chunkLow = _mm_shuffle_epi8(chunkLow, shuf);
    chunkHigh = _mm_shuffle_epi8(chunkHigh, shuf);
    utf16Low = _mm_unpacklo_epi8(chunkLow, chunkHigh);
    utf16High = _mm_unpackhi_epi8(chunkLow, chunkHigh);

    // The shift recorded at the last consumed byte is the number of byte slots
    // dropped during compaction, i.e. sourceAdvance - destAdvance.
    ui32 s = _mm_extract_epi32(shifts, 3);
    ui32 destAdvance = sourceAdvance - (0xff & (s >> (8 * (3 - 16 + sourceAdvance))));
    cur += sourceAdvance;
    return destAdvance;
}
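
// The two overloads below drive the unpacker chunk by chunk, stopping at the
// first chunk it cannot handle (a malformed byte or a 4-byte sequence); the
// caller is expected to finish the remaining [cur, last) range with a scalar
// decoder.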

namespace NDetail {
    void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar16*& dest) noexcept {
        alignas(16) wchar16 destAligned[16];

        while (cur + 16 <= last) {
            __m128i utf16Low;
            __m128i utf16High;
            ui32 dstAdvance = Unpack16BytesIntoUtf16IfNoSurrogats(cur, utf16Low, utf16High);
            if (dstAdvance == 0) {
                break;
            }
            // All 16 code units are always stored, but dest only advances by
            // dstAdvance; the garbage tail is overwritten on the next iteration.
            _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf16Low);
            _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf16High);
            memcpy(dest, destAligned, sizeof(__m128i) * 2);
            dest += dstAdvance;
        }
        // The rest will be handled sequentially.
        // Possible improvement: go back to vectorized processing after an error or a 4-byte sequence.
    }
    void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept {
        alignas(16) wchar32 destAligned[16];

        while (cur + 16 <= last) {
            __m128i utf16Low;
            __m128i utf16High;
            ui32 dstAdvance = Unpack16BytesIntoUtf16IfNoSurrogats(cur, utf16Low, utf16High);
            if (dstAdvance == 0) {
                break;
            }

            // NOTE: we only get here when there are no surrogate pairs, so each
            // UTF-16 unit can simply be zero-extended into the 2 high bytes.
            __m128i utf32_lowlow = _mm_unpacklo_epi16(utf16Low, _mm_set1_epi8(0));
            __m128i utf32_lowhigh = _mm_unpackhi_epi16(utf16Low, _mm_set1_epi8(0));
            __m128i utf32_highlow = _mm_unpacklo_epi16(utf16High, _mm_set1_epi8(0));
            __m128i utf32_highhigh = _mm_unpackhi_epi16(utf16High, _mm_set1_epi8(0));

            _mm_store_si128(reinterpret_cast<__m128i*>(destAligned), utf32_lowlow);
            _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 1, utf32_lowhigh);
            _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 2, utf32_highlow);
            _mm_store_si128(reinterpret_cast<__m128i*>(destAligned) + 3, utf32_highhigh);
            memcpy(dest, destAligned, sizeof(__m128i) * 4);
            dest += dstAdvance;
        }
        // The rest will be handled sequentially.
        // Possible improvement: go back to vectorized processing after an error or a 4-byte sequence.
    }
} // namespace NDetail

#endif