// wide.cpp (18 KB)
  1. #include "wide.h"
  2. #include <util/generic/mem_copy.h>
  3. #include <util/string/strip.h>
  4. namespace {
  5. //! the constants are not zero-terminated
  6. const wchar16 LT[] = {'&', 'l', 't', ';'};
  7. const wchar16 GT[] = {'&', 'g', 't', ';'};
  8. const wchar16 AMP[] = {'&', 'a', 'm', 'p', ';'};
  9. const wchar16 BR[] = {'<', 'B', 'R', '>'};
  10. const wchar16 QUOT[] = {'&', 'q', 'u', 'o', 't', ';'};
  11. template <bool insertBr>
  12. inline size_t EscapedLen(wchar16 c) {
  13. switch (c) {
  14. case '<':
  15. return Y_ARRAY_SIZE(LT);
  16. case '>':
  17. return Y_ARRAY_SIZE(GT);
  18. case '&':
  19. return Y_ARRAY_SIZE(AMP);
  20. case '\"':
  21. return Y_ARRAY_SIZE(QUOT);
  22. default:
  23. if (insertBr && (c == '\r' || c == '\n'))
  24. return Y_ARRAY_SIZE(BR);
  25. else
  26. return 1;
  27. }
  28. }
  29. }
  30. void Collapse(TUtf16String& w) {
  31. CollapseImpl(w, w, 0, IsWhitespace);
  32. }
  33. size_t Collapse(wchar16* s, size_t n) {
  34. return CollapseImpl(s, n, IsWhitespace);
  35. }
  36. TWtringBuf StripLeft(const TWtringBuf text) noexcept {
  37. const auto* p = text.data();
  38. const auto* const pe = text.data() + text.size();
  39. for (; p != pe && IsWhitespace(*p); ++p) {
  40. }
  41. return {p, pe};
  42. }
  43. void StripLeft(TUtf16String& text) {
  44. const auto stripped = StripLeft(TWtringBuf(text));
  45. if (stripped.size() == text.size()) {
  46. return;
  47. }
  48. text = stripped;
  49. }
  50. TWtringBuf StripRight(const TWtringBuf text) noexcept {
  51. if (!text) {
  52. return {};
  53. }
  54. const auto* const pe = text.data() - 1;
  55. const auto* p = text.data() + text.size() - 1;
  56. for (; p != pe && IsWhitespace(*p); --p) {
  57. }
  58. return {pe + 1, p + 1};
  59. }
  60. void StripRight(TUtf16String& text) {
  61. const auto stripped = StripRight(TWtringBuf(text));
  62. if (stripped.size() == text.size()) {
  63. return;
  64. }
  65. text.resize(stripped.size());
  66. }
  67. TWtringBuf Strip(const TWtringBuf text) noexcept {
  68. return StripRight(StripLeft(text));
  69. }
  70. void Strip(TUtf16String& text) {
  71. StripLeft(text);
  72. StripRight(text);
  73. }
  74. template <typename T>
  75. static bool IsReductionOnSymbolsTrue(const TWtringBuf text, T&& f) {
  76. const auto* p = text.data();
  77. const auto* const pe = text.data() + text.length();
  78. while (p != pe) {
  79. const auto symbol = ReadSymbolAndAdvance(p, pe);
  80. if (!f(symbol)) {
  81. return false;
  82. }
  83. }
  84. return true;
  85. }
  86. bool IsLowerWord(const TWtringBuf text) noexcept {
  87. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) { return IsLower(s); });
  88. }
  89. bool IsUpperWord(const TWtringBuf text) noexcept {
  90. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) { return IsUpper(s); });
  91. }
  92. bool IsLower(const TWtringBuf text) noexcept {
  93. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) {
  94. if (IsAlpha(s)) {
  95. return IsLower(s);
  96. }
  97. return true;
  98. });
  99. }
  100. bool IsUpper(const TWtringBuf text) noexcept {
  101. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) {
  102. if (IsAlpha(s)) {
  103. return IsUpper(s);
  104. }
  105. return true;
  106. });
  107. }
  108. bool IsTitleWord(const TWtringBuf text) noexcept {
  109. if (!text) {
  110. return false;
  111. }
  112. const auto* p = text.data();
  113. const auto* pe = text.data() + text.size();
  114. const auto firstSymbol = ReadSymbolAndAdvance(p, pe);
  115. if (firstSymbol != ToTitle(firstSymbol)) {
  116. return false;
  117. }
  118. return IsLowerWord({p, pe});
  119. }
  //! Applies @c f to every symbol of [p, pe) in place.
  //! @tparam stopOnFirstModification  when true nothing is written: the function returns
  //!         true as soon as @c f would change a symbol, leaving @c p at that symbol
  //! @param p   current position, advanced as symbols are processed
  //! @param pe  end of the sequence
  //! @return true only in the "stop" mode when a symbol that needs changing was found;
  //!         false once the end of the sequence is reached
  template <bool stopOnFirstModification, typename TCharType, typename F>
  static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
      for (; p != pe;) {
          const auto original = ReadSymbol(p, pe);
          const auto replacement = f(original);

          if (original == replacement) {
              // nothing to write, just step over the symbol
              p = SkipSymbol(p, pe);
              continue;
          }

          if (stopOnFirstModification) {
              return true;
          }

          WriteSymbol(replacement, p); // also moves `p` forward
      }

      return false;
  }
  //! Applies @c f to every symbol of [p, pe) and writes the results through @c out.
  //! @tparam stopOnFirstModification  when true, unchanged symbols are copied to @c out and
  //!         the function returns true at the first symbol that @c f would change, leaving
  //!         @c p at that symbol and not writing it
  //! @param p    current read position, advanced as symbols are processed
  //! @param pe   end of the input sequence
  //! @param out  current write position, passed to `WriteSymbol` for every emitted symbol
  //! @return true only in the "stop" mode when a symbol that needs changing was found;
  //!         false once the end of the input is reached
  template <bool stopOnFirstModification, typename TCharType, typename F>
  static bool ModifySequence(const TCharType*& p, const TCharType* const pe, TCharType*& out, F&& f) {
      while (p != pe) {
          // in the "stop" mode we only peek, so that `p` still points to the symbol if we bail out
          const auto source = stopOnFirstModification ? ReadSymbol(p, pe) : ReadSymbolAndAdvance(p, pe);
          const auto target = f(source);

          if (stopOnFirstModification && source != target) {
              return true;
          }
          if (stopOnFirstModification) {
              p = SkipSymbol(p, pe);
          }

          WriteSymbol(target, out);
      }

      return false;
  }
  //! Calls `text.Detach()` and re-bases @c p and @c pe onto the pointer it returns,
  //! keeping the offset of @c p from the string start and the distance between @c p and @c pe.
  template <class TStringType>
  static void DetachAndFixPointers(TStringType& text, typename TStringType::value_type*& p, const typename TStringType::value_type*& pe) {
      // both values must be taken before `Detach()`
      const auto offset = p - text.data();
      const auto remaining = pe - p;

      auto* const writable = text.Detach();
      p = writable + offset;
      pe = p + remaining;
  }
  158. template <class TStringType, typename F>
  159. static bool ModifyStringSymbolwise(TStringType& text, size_t pos, size_t count, F&& f) {
  160. // TODO(yazevnul): this is done for consistency with `TUtf16String::to_lower` and friends
  161. // at r2914050, maybe worth replacing them with asserts. Also see the same code in `ToTitle`.
  162. pos = pos < text.size() ? pos : text.size();
  163. count = count < text.size() - pos ? count : text.size() - pos;
  164. // TUtf16String is refcounted and it's `data` method return pointer to the constant memory.
  165. // To simplify the code we do a `const_cast`, though first write to the memory will be done only
  166. // after we call `Detach()` and get pointer to a writable piece of memory.
  167. auto* p = const_cast<typename TStringType::value_type*>(text.data() + pos);
  168. const auto* pe = text.data() + pos + count;
  169. if (ModifySequence<true>(p, pe, f)) {
  170. DetachAndFixPointers(text, p, pe);
  171. ModifySequence<false>(p, pe, f);
  172. return true;
  173. }
  174. return false;
  175. }
  176. bool ToLower(TUtf16String& text, size_t pos, size_t count) {
  177. const auto f = [](const wchar32 s) { return ToLower(s); };
  178. return ModifyStringSymbolwise(text, pos, count, f);
  179. }
  180. bool ToUpper(TUtf16String& text, size_t pos, size_t count) {
  181. const auto f = [](const wchar32 s) { return ToUpper(s); };
  182. return ModifyStringSymbolwise(text, pos, count, f);
  183. }
  184. bool ToLower(TUtf32String& text, size_t pos, size_t count) {
  185. const auto f = [](const wchar32 s) { return ToLower(s); };
  186. return ModifyStringSymbolwise(text, pos, count, f);
  187. }
  188. bool ToUpper(TUtf32String& text, size_t pos, size_t count) {
  189. const auto f = [](const wchar32 s) { return ToUpper(s); };
  190. return ModifyStringSymbolwise(text, pos, count, f);
  191. }
  192. bool ToTitle(TUtf16String& text, size_t pos, size_t count) {
  193. if (!text) {
  194. return false;
  195. }
  196. pos = pos < text.size() ? pos : text.size();
  197. count = count < text.size() - pos ? count : text.size() - pos;
  198. const auto toLower = [](const wchar32 s) { return ToLower(s); };
  199. auto* p = const_cast<wchar16*>(text.data() + pos);
  200. const auto* pe = text.data() + pos + count;
  201. const auto firstSymbol = ReadSymbol(p, pe);
  202. if (firstSymbol == ToTitle(firstSymbol)) {
  203. p = SkipSymbol(p, pe);
  204. if (ModifySequence<true>(p, pe, toLower)) {
  205. DetachAndFixPointers(text, p, pe);
  206. ModifySequence<false>(p, pe, toLower);
  207. return true;
  208. }
  209. } else {
  210. DetachAndFixPointers(text, p, pe);
  211. WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
  212. ModifySequence<false>(p, pe, toLower);
  213. return true;
  214. }
  215. return false;
  216. }
  217. bool ToTitle(TUtf32String& text, size_t pos, size_t count) {
  218. if (!text) {
  219. return false;
  220. }
  221. pos = pos < text.size() ? pos : text.size();
  222. count = count < text.size() - pos ? count : text.size() - pos;
  223. const auto toLower = [](const wchar32 s) { return ToLower(s); };
  224. auto* p = const_cast<wchar32*>(text.data() + pos);
  225. const auto* pe = text.data() + pos + count;
  226. const auto firstSymbol = *p;
  227. if (firstSymbol == ToTitle(firstSymbol)) {
  228. p += 1;
  229. if (ModifySequence<true>(p, pe, toLower)) {
  230. DetachAndFixPointers(text, p, pe);
  231. ModifySequence<false>(p, pe, toLower);
  232. return true;
  233. }
  234. } else {
  235. DetachAndFixPointers(text, p, pe);
  236. WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
  237. ModifySequence<false>(p, pe, toLower);
  238. return true;
  239. }
  240. return false;
  241. }
  242. TUtf16String ToLowerRet(TUtf16String text, size_t pos, size_t count) {
  243. ToLower(text, pos, count);
  244. return text;
  245. }
  246. TUtf16String ToUpperRet(TUtf16String text, size_t pos, size_t count) {
  247. ToUpper(text, pos, count);
  248. return text;
  249. }
  250. TUtf16String ToTitleRet(TUtf16String text, size_t pos, size_t count) {
  251. ToTitle(text, pos, count);
  252. return text;
  253. }
  254. TUtf32String ToLowerRet(TUtf32String text, size_t pos, size_t count) {
  255. ToLower(text, pos, count);
  256. return text;
  257. }
  258. TUtf32String ToUpperRet(TUtf32String text, size_t pos, size_t count) {
  259. ToUpper(text, pos, count);
  260. return text;
  261. }
  262. TUtf32String ToTitleRet(TUtf32String text, size_t pos, size_t count) {
  263. ToTitle(text, pos, count);
  264. return text;
  265. }
  266. bool ToLower(const wchar16* text, size_t length, wchar16* out) noexcept {
  267. // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
  268. // we can declare text and out as `__restrict__`
  269. Y_ASSERT(text == out || !(out >= text && out < text + length));
  270. const auto f = [](const wchar32 s) { return ToLower(s); };
  271. const auto* p = text;
  272. const auto* const pe = text + length;
  273. if (ModifySequence<true>(p, pe, out, f)) {
  274. ModifySequence<false>(p, pe, out, f);
  275. return true;
  276. }
  277. return false;
  278. }
  279. bool ToUpper(const wchar16* text, size_t length, wchar16* out) noexcept {
  280. Y_ASSERT(text == out || !(out >= text && out < text + length));
  281. const auto f = [](const wchar32 s) { return ToUpper(s); };
  282. const auto* p = text;
  283. const auto* const pe = text + length;
  284. if (ModifySequence<true>(p, pe, out, f)) {
  285. ModifySequence<false>(p, pe, out, f);
  286. return true;
  287. }
  288. return false;
  289. }
  290. bool ToTitle(const wchar16* text, size_t length, wchar16* out) noexcept {
  291. if (!length) {
  292. return false;
  293. }
  294. Y_ASSERT(text == out || !(out >= text && out < text + length));
  295. const auto* const textEnd = text + length;
  296. const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
  297. const auto firstSymbolTitle = ToTitle(firstSymbol);
  298. WriteSymbol(firstSymbolTitle, out);
  299. return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
  300. }
  301. bool ToLower(wchar16* text, size_t length) noexcept {
  302. const auto f = [](const wchar32 s) { return ToLower(s); };
  303. const auto* const textEnd = text + length;
  304. if (ModifySequence<true>(text, textEnd, f)) {
  305. ModifySequence<false>(text, textEnd, f);
  306. return true;
  307. }
  308. return false;
  309. }
  310. bool ToUpper(wchar16* text, size_t length) noexcept {
  311. const auto f = [](const wchar32 s) { return ToUpper(s); };
  312. const auto* const textEnd = text + length;
  313. if (ModifySequence<true>(text, textEnd, f)) {
  314. ModifySequence<false>(text, textEnd, f);
  315. return true;
  316. }
  317. return false;
  318. }
  319. bool ToTitle(wchar16* text, size_t length) noexcept {
  320. if (!length) {
  321. return false;
  322. }
  323. const auto* textEnd = text + length;
  324. const auto firstSymbol = ReadSymbol(text, textEnd);
  325. const auto firstSymbolTitle = ToTitle(firstSymbol);
  326. // avoid unnacessary writes to the memory
  327. if (firstSymbol != firstSymbolTitle) {
  328. WriteSymbol(firstSymbolTitle, text);
  329. } else {
  330. text = SkipSymbol(text, textEnd);
  331. }
  332. return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
  333. }
  334. bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept {
  335. // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
  336. // we can declare text and out as `__restrict__`
  337. Y_ASSERT(text == out || !(out >= text && out < text + length));
  338. const auto f = [](const wchar32 s) { return ToLower(s); };
  339. const auto* p = text;
  340. const auto* const pe = text + length;
  341. if (ModifySequence<true>(p, pe, out, f)) {
  342. ModifySequence<false>(p, pe, out, f);
  343. return true;
  344. }
  345. return false;
  346. }
  347. bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept {
  348. Y_ASSERT(text == out || !(out >= text && out < text + length));
  349. const auto f = [](const wchar32 s) { return ToUpper(s); };
  350. const auto* p = text;
  351. const auto* const pe = text + length;
  352. if (ModifySequence<true>(p, pe, out, f)) {
  353. ModifySequence<false>(p, pe, out, f);
  354. return true;
  355. }
  356. return false;
  357. }
  358. bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept {
  359. if (!length) {
  360. return false;
  361. }
  362. Y_ASSERT(text == out || !(out >= text && out < text + length));
  363. const auto* const textEnd = text + length;
  364. const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
  365. const auto firstSymbolTitle = ToTitle(firstSymbol);
  366. WriteSymbol(firstSymbolTitle, out);
  367. return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
  368. }
  369. bool ToLower(wchar32* text, size_t length) noexcept {
  370. const auto f = [](const wchar32 s) { return ToLower(s); };
  371. const auto* const textEnd = text + length;
  372. if (ModifySequence<true>(text, textEnd, f)) {
  373. ModifySequence<false>(text, textEnd, f);
  374. return true;
  375. }
  376. return false;
  377. }
  378. bool ToUpper(wchar32* text, size_t length) noexcept {
  379. const auto f = [](const wchar32 s) { return ToUpper(s); };
  380. const auto* const textEnd = text + length;
  381. if (ModifySequence<true>(text, textEnd, f)) {
  382. ModifySequence<false>(text, textEnd, f);
  383. return true;
  384. }
  385. return false;
  386. }
  387. bool ToTitle(wchar32* text, size_t length) noexcept {
  388. if (!length) {
  389. return false;
  390. }
  391. const auto* textEnd = text + length;
  392. const auto firstSymbol = ReadSymbol(text, textEnd);
  393. const auto firstSymbolTitle = ToTitle(firstSymbol);
  394. // avoid unnacessary writes to the memory
  395. if (firstSymbol != firstSymbolTitle) {
  396. WriteSymbol(firstSymbolTitle, text);
  397. } else {
  398. text = SkipSymbol(text, textEnd);
  399. }
  400. return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
  401. }
  402. template <typename F>
  403. static TUtf16String ToSmthRet(const TWtringBuf text, size_t pos, size_t count, F&& f) {
  404. pos = pos < text.size() ? pos : text.size();
  405. count = count < text.size() - pos ? count : text.size() - pos;
  406. auto res = TUtf16String::Uninitialized(text.size());
  407. auto* const resBegin = res.Detach();
  408. if (pos) {
  409. MemCopy(resBegin, text.data(), pos);
  410. }
  411. f(text.data() + pos, count, resBegin + pos);
  412. if (count - pos != text.size()) {
  413. MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
  414. }
  415. return res;
  416. }
  417. template <typename F>
  418. static TUtf32String ToSmthRet(const TUtf32StringBuf text, size_t pos, size_t count, F&& f) {
  419. pos = pos < text.size() ? pos : text.size();
  420. count = count < text.size() - pos ? count : text.size() - pos;
  421. auto res = TUtf32String::Uninitialized(text.size());
  422. auto* const resBegin = res.Detach();
  423. if (pos) {
  424. MemCopy(resBegin, text.data(), pos);
  425. }
  426. f(text.data() + pos, count, resBegin + pos);
  427. if (count - pos != text.size()) {
  428. MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
  429. }
  430. return res;
  431. }
  432. TUtf16String ToLowerRet(const TWtringBuf text, size_t pos, size_t count) {
  433. return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
  434. ToLower(theText, length, out);
  435. });
  436. }
  437. TUtf16String ToUpperRet(const TWtringBuf text, size_t pos, size_t count) {
  438. return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
  439. ToUpper(theText, length, out);
  440. });
  441. }
  442. TUtf16String ToTitleRet(const TWtringBuf text, size_t pos, size_t count) {
  443. return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
  444. ToTitle(theText, length, out);
  445. });
  446. }
  447. TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos, size_t count) {
  448. return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
  449. ToLower(theText, length, out);
  450. });
  451. }
  452. TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos, size_t count) {
  453. return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
  454. ToUpper(theText, length, out);
  455. });
  456. }
  457. TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos, size_t count) {
  458. return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
  459. ToTitle(theText, length, out);
  460. });
  461. }
  462. template <bool insertBr>
  463. void EscapeHtmlChars(TUtf16String& str) {
  464. static const TUtf16String lt(LT, Y_ARRAY_SIZE(LT));
  465. static const TUtf16String gt(GT, Y_ARRAY_SIZE(GT));
  466. static const TUtf16String amp(AMP, Y_ARRAY_SIZE(AMP));
  467. static const TUtf16String br(BR, Y_ARRAY_SIZE(BR));
  468. static const TUtf16String quot(QUOT, Y_ARRAY_SIZE(QUOT));
  469. size_t escapedLen = 0;
  470. const TUtf16String& cs = str;
  471. for (size_t i = 0; i < cs.size(); ++i)
  472. escapedLen += EscapedLen<insertBr>(cs[i]);
  473. if (escapedLen == cs.size())
  474. return;
  475. TUtf16String res;
  476. res.reserve(escapedLen);
  477. size_t start = 0;
  478. for (size_t i = 0; i < cs.size(); ++i) {
  479. const TUtf16String* ent = nullptr;
  480. switch (cs[i]) {
  481. case '<':
  482. ent = &lt;
  483. break;
  484. case '>':
  485. ent = &gt;
  486. break;
  487. case '&':
  488. ent = &amp;
  489. break;
  490. case '\"':
  491. ent = &quot;
  492. break;
  493. default:
  494. if (insertBr && (cs[i] == '\r' || cs[i] == '\n')) {
  495. ent = &br;
  496. break;
  497. } else
  498. continue;
  499. }
  500. res.append(cs.begin() + start, cs.begin() + i);
  501. res.append(ent->begin(), ent->end());
  502. start = i + 1;
  503. }
  504. res.append(cs.begin() + start, cs.end());
  505. res.swap(str);
  506. }
  507. template void EscapeHtmlChars<false>(TUtf16String& str);
  508. template void EscapeHtmlChars<true>(TUtf16String& str);