wide.h 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896
  1. #pragma once
  2. #include "recode_result.h"
  3. #include "unidata.h"
  4. #include "utf8.h"
  5. #include "wide_specific.h"
  6. #include <util/generic/algorithm.h>
  7. #include <util/generic/string.h>
  8. #include <util/generic/yexception.h>
  9. #include <util/memory/tempbuf.h>
  10. #include <util/system/compiler.h>
  11. #include <util/system/cpu_id.h>
  12. #include <util/system/yassert.h>
  13. #include <cstring>
  14. #ifdef _sse2_
  15. #include <emmintrin.h>
  16. #endif
  17. template <class T>
  18. class TTempArray;
  19. using TCharTemp = TTempArray<wchar16>;
  20. namespace NDetail {
  21. inline TString InStringMsg(const char* s, size_t len) {
  22. return (len <= 50) ? " in string " + TString(s, len).Quote() : TString();
  23. }
  24. template <bool isPointer>
  25. struct TSelector;
  26. template <>
  27. struct TSelector<false> {
  28. template <class T>
  29. static inline void WriteSymbol(wchar16 s, T& dest) noexcept {
  30. dest.push_back(s);
  31. }
  32. };
  33. template <>
  34. struct TSelector<true> {
  35. template <class T>
  36. static inline void WriteSymbol(wchar16 s, T& dest) noexcept {
  37. *(dest++) = s;
  38. }
  39. };
  40. inline wchar32 ReadSurrogatePair(const wchar16* chars) noexcept {
  41. const wchar32 SURROGATE_OFFSET = static_cast<wchar32>(0x10000 - (0xD800 << 10) - 0xDC00);
  42. wchar32 lead = chars[0];
  43. wchar32 tail = chars[1];
  44. Y_ASSERT(IsW16SurrogateLead(lead));
  45. Y_ASSERT(IsW16SurrogateTail(tail));
  46. return (static_cast<wchar32>(lead) << 10) + tail + SURROGATE_OFFSET;
  47. }
  48. template <class T>
  49. inline void WriteSurrogatePair(wchar32 s, T& dest) noexcept;
  50. } // namespace NDetail
  51. inline wchar16* SkipSymbol(wchar16* begin, const wchar16* end) noexcept {
  52. return begin + W16SymbolSize(begin, end);
  53. }
  54. inline const wchar16* SkipSymbol(const wchar16* begin, const wchar16* end) noexcept {
  55. return begin + W16SymbolSize(begin, end);
  56. }
  57. inline wchar32* SkipSymbol(wchar32* begin, const wchar32* end) noexcept {
  58. Y_ASSERT(begin < end);
  59. return begin + 1;
  60. }
  61. inline const wchar32* SkipSymbol(const wchar32* begin, const wchar32* end) noexcept {
  62. Y_ASSERT(begin < end);
  63. return begin + 1;
  64. }
  65. inline wchar32 ReadSymbol(const wchar16* begin, const wchar16* end) noexcept {
  66. Y_ASSERT(begin < end);
  67. if (IsW16SurrogateLead(*begin)) {
  68. if (begin + 1 < end && IsW16SurrogateTail(*(begin + 1))) {
  69. return ::NDetail::ReadSurrogatePair(begin);
  70. }
  71. return BROKEN_RUNE;
  72. } else if (IsW16SurrogateTail(*begin)) {
  73. return BROKEN_RUNE;
  74. }
  75. return *begin;
  76. }
  77. inline wchar32 ReadSymbol(const wchar32* begin, const wchar32* end) noexcept {
  78. Y_ASSERT(begin < end);
  79. return *begin;
  80. }
  81. //! presuming input data is either big enought of null terminated
  82. inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin) noexcept {
  83. Y_ASSERT(*begin);
  84. if (IsW16SurrogateLead(begin[0])) {
  85. if (IsW16SurrogateTail(begin[1])) {
  86. Y_ASSERT(begin[1] != 0);
  87. const wchar32 c = ::NDetail::ReadSurrogatePair(begin);
  88. begin += 2;
  89. return c;
  90. }
  91. ++begin;
  92. return BROKEN_RUNE;
  93. } else if (IsW16SurrogateTail(begin[0])) {
  94. ++begin;
  95. return BROKEN_RUNE;
  96. }
  97. return *(begin++);
  98. }
  99. //! presuming input data is either big enought of null terminated
  100. inline wchar32 ReadSymbolAndAdvance(const char32_t*& begin) noexcept {
  101. Y_ASSERT(*begin);
  102. return *(begin++);
  103. }
  104. inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin) noexcept {
  105. // According to
  106. // https://en.cppreference.com/w/cpp/language/types
  107. // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
  108. //
  109. // Apply reinterpret cast and dispatch to a proper type
  110. #ifdef _win_
  111. using TDistinctChar = char16_t;
  112. #else
  113. using TDistinctChar = char32_t;
  114. #endif
  115. const TDistinctChar*& distinctBegin = reinterpret_cast<const TDistinctChar*&>(begin);
  116. wchar32 result = ReadSymbolAndAdvance(distinctBegin);
  117. begin = reinterpret_cast<const wchar_t*&>(distinctBegin);
  118. return result;
  119. }
  120. inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin, const char16_t* end) noexcept {
  121. Y_ASSERT(begin < end);
  122. if (IsW16SurrogateLead(begin[0])) {
  123. if (begin + 1 != end && IsW16SurrogateTail(begin[1])) {
  124. const wchar32 c = ::NDetail::ReadSurrogatePair(begin);
  125. begin += 2;
  126. return c;
  127. }
  128. ++begin;
  129. return BROKEN_RUNE;
  130. } else if (IsW16SurrogateTail(begin[0])) {
  131. ++begin;
  132. return BROKEN_RUNE;
  133. }
  134. return *(begin++);
  135. }
  136. inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin, const wchar32* end) noexcept {
  137. Y_ASSERT(begin < end);
  138. return *(begin++);
  139. }
  140. inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin, const wchar_t* end) noexcept {
  141. // According to
  142. // https://en.cppreference.com/w/cpp/language/types
  143. // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
  144. //
  145. // Apply reinterpret cast and dispatch to a proper type
  146. #ifdef _win_
  147. using TDistinctChar = char16_t;
  148. #else
  149. using TDistinctChar = char32_t;
  150. #endif
  151. const TDistinctChar* distinctBegin = reinterpret_cast<const TDistinctChar*>(begin);
  152. const TDistinctChar* distinctEnd = reinterpret_cast<const TDistinctChar*>(end);
  153. wchar32 result = ::ReadSymbolAndAdvance(distinctBegin, distinctEnd);
  154. begin = reinterpret_cast<const wchar_t*>(distinctBegin);
  155. return result;
  156. }
  157. template <class T>
  158. inline size_t WriteSymbol(wchar16 s, T& dest) noexcept {
  159. ::NDetail::TSelector<std::is_pointer<T>::value>::WriteSymbol(s, dest);
  160. return 1;
  161. }
  162. template <class T>
  163. inline size_t WriteSymbol(wchar32 s, T& dest) noexcept {
  164. if (s > 0xFFFF) {
  165. if (s >= ::NUnicode::UnicodeInstancesLimit()) {
  166. return WriteSymbol(static_cast<wchar16>(BROKEN_RUNE), dest);
  167. }
  168. ::NDetail::WriteSurrogatePair(s, dest);
  169. return 2;
  170. }
  171. return WriteSymbol(static_cast<wchar16>(s), dest);
  172. }
  173. inline bool WriteSymbol(wchar32 s, wchar16*& dest, const wchar16* destEnd) noexcept {
  174. Y_ASSERT(dest < destEnd);
  175. if (s > 0xFFFF) {
  176. if (s >= NUnicode::UnicodeInstancesLimit()) {
  177. *(dest++) = static_cast<wchar16>(BROKEN_RUNE);
  178. return true;
  179. }
  180. if (dest + 2 > destEnd) {
  181. return false;
  182. }
  183. ::NDetail::WriteSurrogatePair(s, dest);
  184. } else {
  185. *(dest++) = static_cast<wchar16>(s);
  186. }
  187. return true;
  188. }
  189. inline size_t WriteSymbol(wchar32 s, wchar32*& dest) noexcept {
  190. *(dest++) = s;
  191. return 1;
  192. }
  193. inline bool WriteSymbol(wchar32 s, wchar32*& dest, const wchar32* destEnd) noexcept {
  194. Y_ASSERT(dest < destEnd);
  195. *(dest++) = s;
  196. return true;
  197. }
  198. template <class T>
  199. inline void ::NDetail::WriteSurrogatePair(wchar32 s, T& dest) noexcept {
  200. const wchar32 LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
  201. Y_ASSERT(s > 0xFFFF && s < ::NUnicode::UnicodeInstancesLimit());
  202. wchar16 lead = LEAD_OFFSET + (static_cast<wchar16>(s >> 10));
  203. wchar16 tail = 0xDC00 + static_cast<wchar16>(s & 0x3FF);
  204. Y_ASSERT(IsW16SurrogateLead(lead));
  205. Y_ASSERT(IsW16SurrogateTail(tail));
  206. WriteSymbol(lead, dest);
  207. WriteSymbol(tail, dest);
  208. }
  209. class TCharIterator {
  210. private:
  211. const wchar16* Begin;
  212. const wchar16* End;
  213. public:
  214. inline explicit TCharIterator(const wchar16* end)
  215. : Begin(end)
  216. , End(end)
  217. {
  218. }
  219. inline TCharIterator(const wchar16* begin, const wchar16* end)
  220. : Begin(begin)
  221. , End(end)
  222. {
  223. }
  224. inline TCharIterator& operator++() {
  225. Begin = SkipSymbol(Begin, End);
  226. return *this;
  227. }
  228. inline bool operator==(const wchar16* other) const {
  229. return Begin == other;
  230. }
  231. inline bool operator!=(const wchar16* other) const {
  232. return !(*this == other);
  233. }
  234. inline bool operator==(const TCharIterator& other) const {
  235. return *this == other.Begin;
  236. }
  237. inline bool operator!=(const TCharIterator& other) const {
  238. return *this != other.Begin;
  239. }
  240. inline wchar32 operator*() const {
  241. return ReadSymbol(Begin, End);
  242. }
  243. inline const wchar16* Get() const {
  244. return Begin;
  245. }
  246. };
  247. namespace NDetail {
  248. template <bool robust, typename TCharType>
  249. inline void UTF8ToWideImplScalar(const unsigned char*& cur, const unsigned char* last, TCharType*& dest) noexcept {
  250. wchar32 rune = BROKEN_RUNE;
  251. while (cur != last) {
  252. if (ReadUTF8CharAndAdvance(rune, cur, last) != RECODE_OK) {
  253. if (robust) {
  254. rune = BROKEN_RUNE;
  255. ++cur;
  256. } else {
  257. break;
  258. }
  259. }
  260. Y_ASSERT(cur <= last);
  261. WriteSymbol(rune, dest);
  262. }
  263. }
  264. template <typename TCharType>
  265. inline void UTF16ToUTF32ImplScalar(const wchar16* cur, const wchar16* last, TCharType*& dest) noexcept {
  266. wchar32 rune = BROKEN_RUNE;
  267. while (cur != last) {
  268. rune = ReadSymbolAndAdvance(cur, last);
  269. Y_ASSERT(cur <= last);
  270. WriteSymbol(rune, dest);
  271. }
  272. }
  273. template <class TCharType>
  274. inline void UTF8ToWideImplSSE41(const unsigned char*& /*cur*/, const unsigned char* /*last*/, TCharType*& /*dest*/) noexcept {
  275. }
  276. void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar16*& dest) noexcept;
  277. void UTF8ToWideImplSSE41(const unsigned char*& cur, const unsigned char* last, wchar32*& dest) noexcept;
  278. } // namespace NDetail
  279. //! @return len if robust and position where encoding stopped if not
  280. template <bool robust, typename TCharType>
  281. inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
  282. const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
  283. const unsigned char* last = cur + len;
  284. TCharType* p = dest;
  285. #ifdef _sse_ // can't check for sse4, as we build most of arcadia without sse4 support even on platforms that support it
  286. if (cur + 16 <= last && NX86::CachedHaveSSE41()) {
  287. ::NDetail::UTF8ToWideImplSSE41(cur, last, p);
  288. }
  289. #endif
  290. ::NDetail::UTF8ToWideImplScalar<robust>(cur, last, p);
  291. written = p - dest;
  292. return cur - reinterpret_cast<const unsigned char*>(text);
  293. }
  //! Non-robust shortcut: conversion stops at the first decoding error.
  //! @return position in `text` where conversion stopped
  template <typename TCharType>
  inline size_t UTF8ToWideImpl(const char* text, size_t len, TCharType* dest, size_t& written) {
      const size_t stoppedAt = UTF8ToWideImpl<false>(text, len, dest, written);
      return stoppedAt;
  }
  298. template <bool robust>
  299. inline TUtf16String UTF8ToWide(const char* text, size_t len) {
  300. TUtf16String w = TUtf16String::Uninitialized(len);
  301. size_t written;
  302. size_t pos = UTF8ToWideImpl<robust>(text, len, w.begin(), written);
  303. if (pos != len) {
  304. ythrow yexception() << "failed to decode UTF-8 string at pos " << pos << ::NDetail::InStringMsg(text, len);
  305. }
  306. Y_ASSERT(w.size() >= written);
  307. w.remove(written);
  308. return w;
  309. }
  310. template <bool robust, typename TCharType>
  311. inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
  312. return UTF8ToWideImpl<robust>(text, len, dest, written) == len;
  313. }
  //! Converts text from UTF-8 to unicode and stops immediately if a UTF-8 byte sequence is wrong.
  //! @attention destination buffer must be long enough to fit all characters of the text;
  //!            conversion stops if a broken symbol is met
  //! @return @c true if all the text was converted successfully, @c false if a broken symbol was found
  template <typename TCharType>
  inline bool UTF8ToWide(const char* text, size_t len, TCharType* dest, size_t& written) noexcept {
      const bool converted = UTF8ToWide<false>(text, len, dest, written);
      return converted;
  }
  322. template <bool robust>
  323. inline TWtringBuf UTF8ToWide(const TStringBuf src, TUtf16String& dst) {
  324. dst.ReserveAndResize(src.size());
  325. size_t written = 0;
  326. UTF8ToWideImpl<robust>(src.data(), src.size(), dst.begin(), written);
  327. dst.resize(written);
  328. return dst;
  329. }
  330. //! if not robust will stop at first error position
  331. template <bool robust>
  332. inline TUtf32StringBuf UTF8ToUTF32(const TStringBuf src, TUtf32String& dst) {
  333. dst.ReserveAndResize(src.size());
  334. size_t written = 0;
  335. UTF8ToWideImpl<robust>(src.data(), src.size(), dst.begin(), written);
  336. dst.resize(written);
  337. return dst;
  338. }
  339. inline TWtringBuf UTF8ToWide(const TStringBuf src, TUtf16String& dst) {
  340. return UTF8ToWide<false>(src, dst);
  341. }
  342. inline TUtf16String UTF8ToWide(const char* text, size_t len) {
  343. return UTF8ToWide<false>(text, len);
  344. }
  345. template <bool robust>
  346. inline TUtf16String UTF8ToWide(const TStringBuf s) {
  347. return UTF8ToWide<robust>(s.data(), s.size());
  348. }
  349. template <bool robust>
  350. inline TUtf32String UTF8ToUTF32(const TStringBuf s) {
  351. TUtf32String r;
  352. UTF8ToUTF32<robust>(s, r);
  353. return r;
  354. }
  355. inline TUtf16String UTF8ToWide(const TStringBuf s) {
  356. return UTF8ToWide<false>(s.data(), s.size());
  357. }
  358. //! converts text from unicode to UTF8
  359. //! @attention destination buffer must be long enough to fit all characters of the text,
  360. //! @c WriteUTF8Char converts @c wchar32 into maximum 4 bytes of UTF8 so
  361. //! destination buffer must have length equal to <tt> len * 4 </tt>
  362. template <typename TCharType>
  363. inline void WideToUTF8(const TCharType* text, size_t len, char* dest, size_t& written) {
  364. const TCharType* const last = text + len;
  365. unsigned char* p = reinterpret_cast<unsigned char*>(dest);
  366. size_t runeLen;
  367. for (const TCharType* cur = text; cur != last;) {
  368. WriteUTF8Char(ReadSymbolAndAdvance(cur, last), runeLen, p);
  369. Y_ASSERT(runeLen <= 4);
  370. p += runeLen;
  371. }
  372. written = p - reinterpret_cast<unsigned char*>(dest);
  373. }
  //! Returns the buffer size used for converting `inputStringSize` unicode characters to UTF-8.
  constexpr size_t WideToUTF8BufferSize(const size_t inputStringSize) noexcept {
      // The conversion functions turn one unicode character into at most 4 bytes of UTF-8.
      return 4 * inputStringSize;
  }
  377. inline TStringBuf WideToUTF8(const TWtringBuf src, TString& dst) {
  378. dst.ReserveAndResize(WideToUTF8BufferSize(src.size()));
  379. size_t written = 0;
  380. WideToUTF8(src.data(), src.size(), dst.begin(), written);
  381. Y_ASSERT(dst.size() >= written);
  382. dst.remove(written);
  383. return dst;
  384. }
  385. inline TString WideToUTF8(const wchar16* text, size_t len) {
  386. TString s = TString::Uninitialized(WideToUTF8BufferSize(len));
  387. size_t written = 0;
  388. WideToUTF8(text, len, s.begin(), written);
  389. Y_ASSERT(s.size() >= written);
  390. s.remove(written);
  391. return s;
  392. }
  #if defined(_win_)
  //! Windows only: treats the wchar_t data as wchar16 and converts it to UTF-8.
  inline TString WideToUTF8(const wchar_t* text, size_t len) {
      const wchar16* const units = reinterpret_cast<const wchar16*>(text);
      return WideToUTF8(units, len);
  }

  //! Windows only: converts a std::wstring_view to a UTF-8 std::string.
  inline std::string WideToUTF8(std::wstring_view text) {
      const wchar_t* const data = text.data();
      const size_t len = text.size();
      return WideToUTF8(data, len).ConstRef();
  }
  #endif
  401. inline TString WideToUTF8(const wchar32* text, size_t len) {
  402. TString s = TString::Uninitialized(WideToUTF8BufferSize(len));
  403. size_t written = 0;
  404. WideToUTF8(text, len, s.begin(), written);
  405. Y_ASSERT(s.size() >= written);
  406. s.remove(written);
  407. return s;
  408. }
  409. inline TString WideToUTF8(const TWtringBuf w) {
  410. return WideToUTF8(w.data(), w.size());
  411. }
  412. inline TString WideToUTF8(const TUtf32StringBuf w) {
  413. return WideToUTF8(w.data(), w.size());
  414. }
  415. inline TUtf16String UTF32ToWide(const wchar32* begin, size_t len) {
  416. TUtf16String res;
  417. res.reserve(len);
  418. const wchar32* end = begin + len;
  419. for (const wchar32* i = begin; i != end; ++i) {
  420. WriteSymbol(*i, res);
  421. }
  422. return res;
  423. }
  424. // adopted from https://chromium.googlesource.com/chromium/src/+/master/base/strings/string_util.cc
  425. // Assuming that a pointer is the size of a "machine word", then
  426. // uintptr_t is an integer type that is also a machine word.
  427. namespace NDetail {
  428. using TMachineWord = uintptr_t;
  429. const uintptr_t kMachineWordAlignmentMask = sizeof(TMachineWord) - 1;
  430. inline bool IsAlignedToMachineWord(const void* pointer) {
  431. return !(reinterpret_cast<TMachineWord>(pointer) & kMachineWordAlignmentMask);
  432. }
  433. template <typename T>
  434. inline T* AlignToMachineWord(T* pointer) {
  435. return reinterpret_cast<T*>(reinterpret_cast<TMachineWord>(pointer) & ~kMachineWordAlignmentMask);
  436. }
  437. template <size_t size, typename CharacterType>
  438. struct NonASCIIMask;
  439. template <>
  440. struct
  441. NonASCIIMask<4, wchar16> {
  442. static constexpr ui32 Value() {
  443. return 0xFF80FF80U;
  444. }
  445. };
  446. template <>
  447. struct
  448. NonASCIIMask<4, char> {
  449. static constexpr ui32 Value() {
  450. return 0x80808080U;
  451. }
  452. };
  453. template <>
  454. struct
  455. NonASCIIMask<8, wchar16> {
  456. static constexpr ui64 Value() {
  457. return 0xFF80FF80FF80FF80ULL;
  458. }
  459. };
  460. template <>
  461. struct
  462. NonASCIIMask<8, char> {
  463. static constexpr ui64 Value() {
  464. return 0x8080808080808080ULL;
  465. }
  466. };
  467. template <typename TChar>
  468. inline bool DoIsStringASCIISlow(const TChar* first, const TChar* last) {
  469. using TUnsignedChar = std::make_unsigned_t<TChar>;
  470. Y_ASSERT(first <= last);
  471. for (; first != last; ++first) {
  472. if (static_cast<TUnsignedChar>(*first) > 0x7F) {
  473. return false;
  474. }
  475. }
  476. return true;
  477. }
  478. template <typename TChar>
  479. inline bool DoIsStringASCII(const TChar* first, const TChar* last) {
  480. if (last - first < 10) {
  481. return DoIsStringASCIISlow(first, last);
  482. }
  483. TMachineWord allCharBits = 0;
  484. TMachineWord nonAsciiBitMask = NonASCIIMask<sizeof(TMachineWord), TChar>::Value();
  485. // Prologue: align the input.
  486. while (!IsAlignedToMachineWord(first) && first != last) {
  487. allCharBits |= *first;
  488. ++first;
  489. }
  490. // Compare the values of CPU word size.
  491. const TChar* word_end = AlignToMachineWord(last);
  492. const size_t loopIncrement = sizeof(TMachineWord) / sizeof(TChar);
  493. while (first < word_end) {
  494. allCharBits |= *(reinterpret_cast<const TMachineWord*>(first));
  495. first += loopIncrement;
  496. // fast exit
  497. if (allCharBits & nonAsciiBitMask) {
  498. return false;
  499. }
  500. }
  501. // Process the remaining bytes.
  502. while (first != last) {
  503. allCharBits |= *first;
  504. ++first;
  505. }
  506. return !(allCharBits & nonAsciiBitMask);
  507. }
  508. #ifdef _sse2_
  509. inline bool DoIsStringASCIISSE(const unsigned char* first, const unsigned char* last) {
  510. // scalar version for short strings
  511. if (first + 8 > last) {
  512. return ::NDetail::DoIsStringASCIISlow(first, last);
  513. }
  514. alignas(16) unsigned char buf[16];
  515. while (first + 16 <= last) {
  516. memcpy(buf, first, 16);
  517. __m128i chunk = _mm_load_si128(reinterpret_cast<__m128i*>(buf));
  518. int asciiMask = _mm_movemask_epi8(chunk);
  519. if (asciiMask) {
  520. return false;
  521. }
  522. first += 16;
  523. }
  524. if (first + 8 <= last) {
  525. memcpy(buf, first, 8);
  526. __m128i chunk = _mm_loadl_epi64(reinterpret_cast<__m128i*>(buf));
  527. int asciiMask = _mm_movemask_epi8(chunk);
  528. if (asciiMask) {
  529. return false;
  530. }
  531. first += 8;
  532. }
  533. return ::NDetail::DoIsStringASCIISlow(first, last);
  534. }
  535. #endif // _sse2_
  536. } // namespace NDetail
  537. //! returns @c true if character sequence has no symbols with value greater than 0x7F
  538. template <typename TChar>
  539. inline bool IsStringASCII(const TChar* first, const TChar* last) {
  540. return ::NDetail::DoIsStringASCII(first, last);
  541. }
  #ifdef _sse2_
  //! Byte strings use the SSE2 implementation when _sse2_ is defined.
  template <>
  inline bool IsStringASCII<unsigned char>(const unsigned char* first, const unsigned char* last) {
      const bool asciiOnly = ::NDetail::DoIsStringASCIISSE(first, last);
      return asciiOnly;
  }

  //! Same as the unsigned char specialization; the pointers are reinterpreted as unsigned char.
  template <>
  inline bool IsStringASCII<char>(const char* first, const char* last) {
      const unsigned char* const bytesBegin = reinterpret_cast<const unsigned char*>(first);
      const unsigned char* const bytesEnd = reinterpret_cast<const unsigned char*>(last);
      return ::NDetail::DoIsStringASCIISSE(bytesBegin, bytesEnd);
  }
  #endif
  //! Copies `len` elements from one character sequence to another using memcpy.
  //! For compatibility only.
  template <typename TChar>
  inline void Copy(const TChar* first, size_t len, TChar* result) {
      const size_t byteCount = sizeof(TChar) * len;
      memcpy(result, first, byteCount);
  }
  //! Copies `len` elements between sequences of different character types by
  //! forwarding the range [first, first + len) to the three-argument Copy.
  template <typename TChar1, typename TChar2>
  inline void Copy(const TChar1* first, size_t len, TChar2* result) {
      const TChar1* const last = first + len;
      Copy(first, last, result);
  }
  //! Copies symbols from one character sequence into a new string without any conversion.
  //! @note this function can be used instead of the template constructor of @c std::basic_string:
  //!       template <typename InputIterator>
  //!       basic_string(InputIterator begin, InputIterator end, const Allocator& a = Allocator());
  //!       and the family of template member functions: append, assign, insert, replace.
  template <typename TStringType, typename TChar>
  inline TStringType CopyTo(const TChar* first, const TChar* last) {
      Y_ASSERT(first <= last);

      const auto length = last - first;
      TStringType copied = TStringType::Uninitialized(length);
      Copy(first, last, copied.begin());
      return copied;
  }
  //! Copies `n` symbols starting at `s` into a new string without any conversion.
  template <typename TStringType, typename TChar>
  inline TStringType CopyTo(const TChar* s, size_t n) {
      TStringType copied = TStringType::Uninitialized(n);
      Copy(s, n, copied.begin());
      return copied;
  }
  580. inline TString WideToASCII(const TWtringBuf w) {
  581. Y_ASSERT(IsStringASCII(w.begin(), w.end()));
  582. return CopyTo<TString>(w.begin(), w.end());
  583. }
  584. inline TUtf16String ASCIIToWide(const TStringBuf s) {
  585. Y_ASSERT(IsStringASCII(s.begin(), s.end()));
  586. return CopyTo<TUtf16String>(s.begin(), s.end());
  587. }
  588. inline TUtf32String ASCIIToUTF32(const TStringBuf s) {
  589. Y_ASSERT(IsStringASCII(s.begin(), s.end()));
  590. return CopyTo<TUtf32String>(s.begin(), s.end());
  591. }
  592. //! returns @c true if string contains whitespace characters only
  593. inline bool IsSpace(const wchar16* s, size_t n) {
  594. if (n == 0) {
  595. return false;
  596. }
  597. Y_ASSERT(s);
  598. const wchar16* const e = s + n;
  599. for (const wchar16* p = s; p != e; ++p) {
  600. if (!IsWhitespace(*p)) {
  601. return false;
  602. }
  603. }
  604. return true;
  605. }
  606. //! returns @c true if string contains whitespace characters only
  607. inline bool IsSpace(const TWtringBuf s) {
  608. return IsSpace(s.data(), s.length());
  609. }
  610. //! replaces multiple sequential whitespace characters with a single space character
  611. void Collapse(TUtf16String& w);
  612. //! @return new length
  613. size_t Collapse(wchar16* s, size_t n);
  614. //! Removes leading whitespace characters
  615. TWtringBuf StripLeft(const TWtringBuf text) noexcept Y_WARN_UNUSED_RESULT;
  616. void StripLeft(TUtf16String& text);
  617. //! Removes trailing whitespace characters
  618. TWtringBuf StripRight(const TWtringBuf text) noexcept Y_WARN_UNUSED_RESULT;
  619. void StripRight(TUtf16String& text);
  620. //! Removes leading and trailing whitespace characters
  621. TWtringBuf Strip(const TWtringBuf text) noexcept Y_WARN_UNUSED_RESULT;
  622. void Strip(TUtf16String& text);
  623. /* Check if given word is lowercase/uppercase. Will return false if string contains any
  624. * non-alphabetical symbols. It is expected that `text` is a correct UTF-16 string.
  625. *
  626. * For example `IsLowerWord("hello")` will return `true`, when `IsLowerWord("hello there")` will
  627. * return false because of the space in the middle of the string. Empty string is also considered
  628. * lowercase.
  629. */
  630. bool IsLowerWord(const TWtringBuf text) noexcept;
  631. bool IsUpperWord(const TWtringBuf text) noexcept;
  632. /* Will check if given word starts with capital letter and the rest of the word is lowercase. Will
  633. * return `false` for empty string. See also `IsLowerWord`.
  634. */
  635. bool IsTitleWord(const TWtringBuf text) noexcept;
  636. /* Check if given string is lowercase/uppercase. Will return `true` if all alphabetic symbols are
  637. * in proper case, all other symbols are ignored. It is expected that `text` is a correct UTF-16
  638. * string.
  639. *
  640. * For example `IsLowerWord("hello")` will return `true` and `IsLowerWord("hello there")` will
  641. * also return true because. Empty string is also considered lowercase.
  642. *
  643. * NOTE: for any case where `IsLowerWord` returns `true` `IsLower` will also return `true`.
  644. */
  645. bool IsLower(const TWtringBuf text) noexcept;
  646. bool IsUpper(const TWtringBuf text) noexcept;
  647. /* Lowercase/uppercase given string inplace. Any alphabetic symbol will be converted to a proper
  648. * case, the rest of the symbols will be kept the same. It is expected that `text` is a correct
  649. * UTF-16 string.
  650. *
  651. * For example `ToLower("heLLo")` will return `"hello"`.
  652. *
  653. * @param text String to modify
  654. * @param pos Position of the first character to modify
  655. * @param count Length of the substring
  656. * @returns `true` if `text` was changed
  657. *
  658. * NOTE: `pos` and `count` are measured in `wchar16`, not in codepoints.
  659. */
  660. bool ToLower(TUtf16String& text, size_t pos = 0, size_t count = TUtf16String::npos);
  661. bool ToUpper(TUtf16String& text, size_t pos = 0, size_t count = TUtf16String::npos);
  662. /* Lowercase/uppercase given string inplace. Any alphabetic symbol will be converted to a proper
  663. * case, the rest of the symbols will be kept the same. It is expected that `text` is a correct
  664. * UTF-32 string.
  665. *
  666. * For example `ToLower("heLLo")` will return `"hello"`.
  667. *
  668. * @param text String to modify
  669. * @param pos Position of the first character to modify
  670. * @param count Length of the substring
  671. * @returns `true` if `text` was changed
  672. *
  673. * NOTE: `pos` and `count` are measured in `wchar16`, not in codepoints.
  674. */
  675. bool ToLower(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
  676. bool ToUpper(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
  677. /* Titlecase first symbol and lowercase the rest, see `ToLower` for more details.
  678. */
  679. bool ToTitle(TUtf16String& text, size_t pos = 0, size_t count = TUtf16String::npos);
  680. /* Titlecase first symbol and lowercase the rest, see `ToLower` for more details.
  681. */
  682. bool ToTitle(TUtf32String& /*text*/, size_t /*pos*/ = 0, size_t /*count*/ = TUtf16String::npos);
  683. /* @param text Pointer to the string to modify
  684. * @param length Length of the string to modify
  685. * @param out Pointer to the character array to write to
  686. *
  687. * NOTE: [text, text+length) and [out, out+length) should not interleave.
  688. *
  689. * TODO(yazevnul): replace these functions with `bool(const TWtringBuf, const TArrayRef<wchar16>)`
  690. * overload.
  691. */
  692. bool ToLower(const wchar16* text, size_t length, wchar16* out) noexcept;
  693. bool ToUpper(const wchar16* text, size_t length, wchar16* out) noexcept;
  694. bool ToTitle(const wchar16* text, size_t length, wchar16* out) noexcept;
  695. bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept;
  696. bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept;
  697. bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept;
  698. /* @param text Pointer to the string to modify
  699. * @param length Length of the string to modify
  700. *
  701. * TODO(yazevnul): replace these functions with `bool(const TArrayRef<wchar16>)` overload.
  702. */
  703. bool ToLower(wchar16* text, size_t length) noexcept;
  704. bool ToUpper(wchar16* text, size_t length) noexcept;
  705. bool ToTitle(wchar16* text, size_t length) noexcept;
  706. bool ToLower(wchar32* text, size_t length) noexcept;
  707. bool ToUpper(wchar32* text, size_t length) noexcept;
  708. bool ToTitle(wchar32* text, size_t length) noexcept;
  709. /* Convenience wrappers for `ToLower`, `ToUpper` and `ToTitle`.
  710. */
  711. TUtf16String ToLowerRet(TUtf16String text, size_t pos = 0, size_t count = TUtf16String::npos) Y_WARN_UNUSED_RESULT;
  712. TUtf16String ToUpperRet(TUtf16String text, size_t pos = 0, size_t count = TUtf16String::npos) Y_WARN_UNUSED_RESULT;
  713. TUtf16String ToTitleRet(TUtf16String text, size_t pos = 0, size_t count = TUtf16String::npos) Y_WARN_UNUSED_RESULT;
  714. TUtf16String ToLowerRet(const TWtringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
  715. TUtf16String ToUpperRet(const TWtringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
  716. TUtf16String ToTitleRet(const TWtringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
  717. TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
  718. TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
  719. TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos = 0, size_t count = TWtringBuf::npos) Y_WARN_UNUSED_RESULT;
  720. //! replaces the '<', '>' and '&' characters in string with '&lt;', '&gt;' and '&amp;' respectively
  721. // insertBr=true - replace '\r' and '\n' with "<BR>"
  722. template <bool insertBr>
  723. void EscapeHtmlChars(TUtf16String& str);
  724. //! returns number of characters in range. Handle surrogate pairs as one character.
  725. inline size_t CountWideChars(const wchar16* b, const wchar16* e) {
  726. size_t count = 0;
  727. Y_ENSURE(b <= e, TStringBuf("invalid iterators"));
  728. while (b < e) {
  729. b = SkipSymbol(b, e);
  730. ++count;
  731. }
  732. return count;
  733. }
  734. inline size_t CountWideChars(const TWtringBuf str) {
  735. return CountWideChars(str.begin(), str.end());
  736. }
  737. //! checks whether the range is valid UTF-16 sequence
  738. inline bool IsValidUTF16(const wchar16* b, const wchar16* e) {
  739. Y_ENSURE(b <= e, TStringBuf("invalid iterators"));
  740. while (b < e) {
  741. wchar32 symbol = ReadSymbolAndAdvance(b, e);
  742. if (symbol == BROKEN_RUNE) {
  743. return false;
  744. }
  745. }
  746. return true;
  747. }
  748. inline bool IsValidUTF16(const TWtringBuf str) {
  749. return IsValidUTF16(str.begin(), str.end());
  750. }