wide.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. #include "wide.h"
  2. #include <util/generic/mem_copy.h>
  3. #include <util/string/strip.h>
  4. namespace {
  5. //! the constants are not zero-terminated
  6. const wchar16 LT[] = {'&', 'l', 't', ';'};
  7. const wchar16 GT[] = {'&', 'g', 't', ';'};
  8. const wchar16 AMP[] = {'&', 'a', 'm', 'p', ';'};
  9. const wchar16 BR[] = {'<', 'B', 'R', '>'};
  10. const wchar16 QUOT[] = {'&', 'q', 'u', 'o', 't', ';'};
  11. template <bool insertBr>
  12. inline size_t EscapedLen(wchar16 c) {
  13. switch (c) {
  14. case '<':
  15. return Y_ARRAY_SIZE(LT);
  16. case '>':
  17. return Y_ARRAY_SIZE(GT);
  18. case '&':
  19. return Y_ARRAY_SIZE(AMP);
  20. case '\"':
  21. return Y_ARRAY_SIZE(QUOT);
  22. default:
  23. if (insertBr && (c == '\r' || c == '\n')) {
  24. return Y_ARRAY_SIZE(BR);
  25. } else {
  26. return 1;
  27. }
  28. }
  29. }
  30. } // namespace
  31. void Collapse(TUtf16String& w) {
  32. CollapseImpl(w, w, 0, IsWhitespace);
  33. }
  34. size_t Collapse(wchar16* s, size_t n) {
  35. return CollapseImpl(s, n, IsWhitespace);
  36. }
  37. TWtringBuf StripLeft(const TWtringBuf text) noexcept {
  38. const auto* p = text.data();
  39. const auto* const pe = text.data() + text.size();
  40. for (; p != pe && IsWhitespace(*p); ++p) {
  41. }
  42. return {p, pe};
  43. }
  44. void StripLeft(TUtf16String& text) {
  45. const auto stripped = StripLeft(TWtringBuf(text));
  46. if (stripped.size() == text.size()) {
  47. return;
  48. }
  49. text = stripped;
  50. }
  51. TWtringBuf StripRight(const TWtringBuf text) noexcept {
  52. if (!text) {
  53. return {};
  54. }
  55. const auto* const pe = text.data() - 1;
  56. const auto* p = text.data() + text.size() - 1;
  57. for (; p != pe && IsWhitespace(*p); --p) {
  58. }
  59. return {pe + 1, p + 1};
  60. }
  61. void StripRight(TUtf16String& text) {
  62. const auto stripped = StripRight(TWtringBuf(text));
  63. if (stripped.size() == text.size()) {
  64. return;
  65. }
  66. text.resize(stripped.size());
  67. }
  68. TWtringBuf Strip(const TWtringBuf text) noexcept {
  69. return StripRight(StripLeft(text));
  70. }
  71. void Strip(TUtf16String& text) {
  72. StripLeft(text);
  73. StripRight(text);
  74. }
  75. template <typename T>
  76. static bool IsReductionOnSymbolsTrue(const TWtringBuf text, T&& f) {
  77. const auto* p = text.data();
  78. const auto* const pe = text.data() + text.length();
  79. while (p != pe) {
  80. const auto symbol = ReadSymbolAndAdvance(p, pe);
  81. if (!f(symbol)) {
  82. return false;
  83. }
  84. }
  85. return true;
  86. }
  87. bool IsLowerWord(const TWtringBuf text) noexcept {
  88. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) { return IsLower(s); });
  89. }
  90. bool IsUpperWord(const TWtringBuf text) noexcept {
  91. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) { return IsUpper(s); });
  92. }
  93. bool IsLower(const TWtringBuf text) noexcept {
  94. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) {
  95. if (IsAlpha(s)) {
  96. return IsLower(s);
  97. }
  98. return true;
  99. });
  100. }
  101. bool IsUpper(const TWtringBuf text) noexcept {
  102. return IsReductionOnSymbolsTrue(text, [](const wchar32 s) {
  103. if (IsAlpha(s)) {
  104. return IsUpper(s);
  105. }
  106. return true;
  107. });
  108. }
  109. bool IsTitleWord(const TWtringBuf text) noexcept {
  110. if (!text) {
  111. return false;
  112. }
  113. const auto* p = text.data();
  114. const auto* pe = text.data() + text.size();
  115. const auto firstSymbol = ReadSymbolAndAdvance(p, pe);
  116. if (firstSymbol != ToTitle(firstSymbol)) {
  117. return false;
  118. }
  119. return IsLowerWord({p, pe});
  120. }
  //! In-place transformation of the symbols in [p, pe) with `f`.
  //!
  //! With `stopOnFirstModification == true` nothing is written: the function returns
  //! true as soon as it meets a symbol that `f` would change, leaving `p` at that symbol,
  //! or false (with `p == pe`) when no symbol would change.
  //! With `stopOnFirstModification == false` every changed symbol is overwritten via
  //! `WriteSymbol`, the whole range is consumed, and the result is always false.
  template <bool stopOnFirstModification, typename TCharType, typename F>
  static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
      for (; p != pe;) {
          const auto original = ReadSymbol(p, pe);
          const auto replacement = f(original);

          if (original == replacement) {
              // nothing to write, just step over the symbol
              p = SkipSymbol(p, pe);
              continue;
          }

          if (stopOnFirstModification) {
              return true;
          }

          WriteSymbol(replacement, p); // also moves `p` forward
      }

      return false;
  }
  //! Transforms the symbols in [p, pe) with `f`, writing the results through `out`.
  //!
  //! With `stopOnFirstModification == true` symbols are copied to `out` as long as `f`
  //! leaves them unchanged; at the first symbol that `f` would change the function
  //! returns true with `p` still pointing at that symbol (it is neither consumed nor
  //! written). Returns false if the end is reached without such a symbol.
  //! With `stopOnFirstModification == false` every symbol is consumed, transformed and
  //! written; the result is always false.
  template <bool stopOnFirstModification, typename TCharType, typename F>
  static bool ModifySequence(const TCharType*& p, const TCharType* const pe, TCharType*& out, F&& f) {
      while (p != pe) {
          if (stopOnFirstModification) {
              const auto symbol = ReadSymbol(p, pe);
              const auto modified = f(symbol);
              if (symbol != modified) {
                  return true;
              }
              // advance the input before writing: input and output may be the same buffer
              p = SkipSymbol(p, pe);
              WriteSymbol(modified, out);
          } else {
              const auto symbol = ReadSymbolAndAdvance(p, pe);
              const auto modified = f(symbol);
              WriteSymbol(modified, out);
          }
      }

      return false;
  }
  //! Calls `text.Detach()` and re-bases `p` and `pe` onto the pointer it returns,
  //! preserving both the offset of `p` from the string start and the distance `pe - p`.
  template <class TStringType>
  static void DetachAndFixPointers(TStringType& text, typename TStringType::value_type*& p, const typename TStringType::value_type*& pe) {
      // Both values must be captured before Detach(), while `p`/`pe` still refer to the old buffer.
      const auto offset = p - text.data();
      const auto remaining = pe - p;

      auto* const writable = text.Detach();
      p = writable + offset;
      pe = p + remaining;
  }
  159. template <class TStringType, typename F>
  160. static bool ModifyStringSymbolwise(TStringType& text, size_t pos, size_t count, F&& f) {
  161. // TODO(yazevnul): this is done for consistency with `TUtf16String::to_lower` and friends
  162. // at r2914050, maybe worth replacing them with asserts. Also see the same code in `ToTitle`.
  163. pos = pos < text.size() ? pos : text.size();
  164. count = count < text.size() - pos ? count : text.size() - pos;
  165. // TUtf16String is refcounted and it's `data` method return pointer to the constant memory.
  166. // To simplify the code we do a `const_cast`, though first write to the memory will be done only
  167. // after we call `Detach()` and get pointer to a writable piece of memory.
  168. auto* p = const_cast<typename TStringType::value_type*>(text.data() + pos);
  169. const auto* pe = text.data() + pos + count;
  170. if (ModifySequence<true>(p, pe, f)) {
  171. DetachAndFixPointers(text, p, pe);
  172. ModifySequence<false>(p, pe, f);
  173. return true;
  174. }
  175. return false;
  176. }
  177. bool ToLower(TUtf16String& text, size_t pos, size_t count) {
  178. const auto f = [](const wchar32 s) { return ToLower(s); };
  179. return ModifyStringSymbolwise(text, pos, count, f);
  180. }
  181. bool ToUpper(TUtf16String& text, size_t pos, size_t count) {
  182. const auto f = [](const wchar32 s) { return ToUpper(s); };
  183. return ModifyStringSymbolwise(text, pos, count, f);
  184. }
  185. bool ToLower(TUtf32String& text, size_t pos, size_t count) {
  186. const auto f = [](const wchar32 s) { return ToLower(s); };
  187. return ModifyStringSymbolwise(text, pos, count, f);
  188. }
  189. bool ToUpper(TUtf32String& text, size_t pos, size_t count) {
  190. const auto f = [](const wchar32 s) { return ToUpper(s); };
  191. return ModifyStringSymbolwise(text, pos, count, f);
  192. }
  193. bool ToTitle(TUtf16String& text, size_t pos, size_t count) {
  194. if (!text) {
  195. return false;
  196. }
  197. pos = pos < text.size() ? pos : text.size();
  198. count = count < text.size() - pos ? count : text.size() - pos;
  199. const auto toLower = [](const wchar32 s) { return ToLower(s); };
  200. auto* p = const_cast<wchar16*>(text.data() + pos);
  201. const auto* pe = text.data() + pos + count;
  202. const auto firstSymbol = ReadSymbol(p, pe);
  203. if (firstSymbol == ToTitle(firstSymbol)) {
  204. p = SkipSymbol(p, pe);
  205. if (ModifySequence<true>(p, pe, toLower)) {
  206. DetachAndFixPointers(text, p, pe);
  207. ModifySequence<false>(p, pe, toLower);
  208. return true;
  209. }
  210. } else {
  211. DetachAndFixPointers(text, p, pe);
  212. WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
  213. ModifySequence<false>(p, pe, toLower);
  214. return true;
  215. }
  216. return false;
  217. }
  218. bool ToTitle(TUtf32String& text, size_t pos, size_t count) {
  219. if (!text) {
  220. return false;
  221. }
  222. pos = pos < text.size() ? pos : text.size();
  223. count = count < text.size() - pos ? count : text.size() - pos;
  224. const auto toLower = [](const wchar32 s) { return ToLower(s); };
  225. auto* p = const_cast<wchar32*>(text.data() + pos);
  226. const auto* pe = text.data() + pos + count;
  227. const auto firstSymbol = *p;
  228. if (firstSymbol == ToTitle(firstSymbol)) {
  229. p += 1;
  230. if (ModifySequence<true>(p, pe, toLower)) {
  231. DetachAndFixPointers(text, p, pe);
  232. ModifySequence<false>(p, pe, toLower);
  233. return true;
  234. }
  235. } else {
  236. DetachAndFixPointers(text, p, pe);
  237. WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
  238. ModifySequence<false>(p, pe, toLower);
  239. return true;
  240. }
  241. return false;
  242. }
  243. TUtf16String ToLowerRet(TUtf16String text, size_t pos, size_t count) {
  244. ToLower(text, pos, count);
  245. return text;
  246. }
  247. TUtf16String ToUpperRet(TUtf16String text, size_t pos, size_t count) {
  248. ToUpper(text, pos, count);
  249. return text;
  250. }
  251. TUtf16String ToTitleRet(TUtf16String text, size_t pos, size_t count) {
  252. ToTitle(text, pos, count);
  253. return text;
  254. }
  255. TUtf32String ToLowerRet(TUtf32String text, size_t pos, size_t count) {
  256. ToLower(text, pos, count);
  257. return text;
  258. }
  259. TUtf32String ToUpperRet(TUtf32String text, size_t pos, size_t count) {
  260. ToUpper(text, pos, count);
  261. return text;
  262. }
  263. TUtf32String ToTitleRet(TUtf32String text, size_t pos, size_t count) {
  264. ToTitle(text, pos, count);
  265. return text;
  266. }
  267. bool ToLower(const wchar16* text, size_t length, wchar16* out) noexcept {
  268. // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
  269. // we can declare text and out as `__restrict__`
  270. Y_ASSERT(text == out || !(out >= text && out < text + length));
  271. const auto f = [](const wchar32 s) { return ToLower(s); };
  272. const auto* p = text;
  273. const auto* const pe = text + length;
  274. if (ModifySequence<true>(p, pe, out, f)) {
  275. ModifySequence<false>(p, pe, out, f);
  276. return true;
  277. }
  278. return false;
  279. }
  280. bool ToUpper(const wchar16* text, size_t length, wchar16* out) noexcept {
  281. Y_ASSERT(text == out || !(out >= text && out < text + length));
  282. const auto f = [](const wchar32 s) { return ToUpper(s); };
  283. const auto* p = text;
  284. const auto* const pe = text + length;
  285. if (ModifySequence<true>(p, pe, out, f)) {
  286. ModifySequence<false>(p, pe, out, f);
  287. return true;
  288. }
  289. return false;
  290. }
  291. bool ToTitle(const wchar16* text, size_t length, wchar16* out) noexcept {
  292. if (!length) {
  293. return false;
  294. }
  295. Y_ASSERT(text == out || !(out >= text && out < text + length));
  296. const auto* const textEnd = text + length;
  297. const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
  298. const auto firstSymbolTitle = ToTitle(firstSymbol);
  299. WriteSymbol(firstSymbolTitle, out);
  300. return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
  301. }
  302. bool ToLower(wchar16* text, size_t length) noexcept {
  303. const auto f = [](const wchar32 s) { return ToLower(s); };
  304. const auto* const textEnd = text + length;
  305. if (ModifySequence<true>(text, textEnd, f)) {
  306. ModifySequence<false>(text, textEnd, f);
  307. return true;
  308. }
  309. return false;
  310. }
  311. bool ToUpper(wchar16* text, size_t length) noexcept {
  312. const auto f = [](const wchar32 s) { return ToUpper(s); };
  313. const auto* const textEnd = text + length;
  314. if (ModifySequence<true>(text, textEnd, f)) {
  315. ModifySequence<false>(text, textEnd, f);
  316. return true;
  317. }
  318. return false;
  319. }
  320. bool ToTitle(wchar16* text, size_t length) noexcept {
  321. if (!length) {
  322. return false;
  323. }
  324. const auto* textEnd = text + length;
  325. const auto firstSymbol = ReadSymbol(text, textEnd);
  326. const auto firstSymbolTitle = ToTitle(firstSymbol);
  327. // avoid unnacessary writes to the memory
  328. if (firstSymbol != firstSymbolTitle) {
  329. WriteSymbol(firstSymbolTitle, text);
  330. } else {
  331. text = SkipSymbol(text, textEnd);
  332. }
  333. return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
  334. }
  335. bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept {
  336. // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
  337. // we can declare text and out as `__restrict__`
  338. Y_ASSERT(text == out || !(out >= text && out < text + length));
  339. const auto f = [](const wchar32 s) { return ToLower(s); };
  340. const auto* p = text;
  341. const auto* const pe = text + length;
  342. if (ModifySequence<true>(p, pe, out, f)) {
  343. ModifySequence<false>(p, pe, out, f);
  344. return true;
  345. }
  346. return false;
  347. }
  348. bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept {
  349. Y_ASSERT(text == out || !(out >= text && out < text + length));
  350. const auto f = [](const wchar32 s) { return ToUpper(s); };
  351. const auto* p = text;
  352. const auto* const pe = text + length;
  353. if (ModifySequence<true>(p, pe, out, f)) {
  354. ModifySequence<false>(p, pe, out, f);
  355. return true;
  356. }
  357. return false;
  358. }
  359. bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept {
  360. if (!length) {
  361. return false;
  362. }
  363. Y_ASSERT(text == out || !(out >= text && out < text + length));
  364. const auto* const textEnd = text + length;
  365. const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
  366. const auto firstSymbolTitle = ToTitle(firstSymbol);
  367. WriteSymbol(firstSymbolTitle, out);
  368. return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
  369. }
  370. bool ToLower(wchar32* text, size_t length) noexcept {
  371. const auto f = [](const wchar32 s) { return ToLower(s); };
  372. const auto* const textEnd = text + length;
  373. if (ModifySequence<true>(text, textEnd, f)) {
  374. ModifySequence<false>(text, textEnd, f);
  375. return true;
  376. }
  377. return false;
  378. }
  379. bool ToUpper(wchar32* text, size_t length) noexcept {
  380. const auto f = [](const wchar32 s) { return ToUpper(s); };
  381. const auto* const textEnd = text + length;
  382. if (ModifySequence<true>(text, textEnd, f)) {
  383. ModifySequence<false>(text, textEnd, f);
  384. return true;
  385. }
  386. return false;
  387. }
  388. bool ToTitle(wchar32* text, size_t length) noexcept {
  389. if (!length) {
  390. return false;
  391. }
  392. const auto* textEnd = text + length;
  393. const auto firstSymbol = ReadSymbol(text, textEnd);
  394. const auto firstSymbolTitle = ToTitle(firstSymbol);
  395. // avoid unnacessary writes to the memory
  396. if (firstSymbol != firstSymbolTitle) {
  397. WriteSymbol(firstSymbolTitle, text);
  398. } else {
  399. text = SkipSymbol(text, textEnd);
  400. }
  401. return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
  402. }
  403. template <typename F>
  404. static TUtf16String ToSmthRet(const TWtringBuf text, size_t pos, size_t count, F&& f) {
  405. pos = pos < text.size() ? pos : text.size();
  406. count = count < text.size() - pos ? count : text.size() - pos;
  407. auto res = TUtf16String::Uninitialized(text.size());
  408. auto* const resBegin = res.Detach();
  409. if (pos) {
  410. MemCopy(resBegin, text.data(), pos);
  411. }
  412. f(text.data() + pos, count, resBegin + pos);
  413. if (count - pos != text.size()) {
  414. MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
  415. }
  416. return res;
  417. }
  418. template <typename F>
  419. static TUtf32String ToSmthRet(const TUtf32StringBuf text, size_t pos, size_t count, F&& f) {
  420. pos = pos < text.size() ? pos : text.size();
  421. count = count < text.size() - pos ? count : text.size() - pos;
  422. auto res = TUtf32String::Uninitialized(text.size());
  423. auto* const resBegin = res.Detach();
  424. if (pos) {
  425. MemCopy(resBegin, text.data(), pos);
  426. }
  427. f(text.data() + pos, count, resBegin + pos);
  428. if (count - pos != text.size()) {
  429. MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
  430. }
  431. return res;
  432. }
  433. TUtf16String ToLowerRet(const TWtringBuf text, size_t pos, size_t count) {
  434. return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
  435. ToLower(theText, length, out);
  436. });
  437. }
  438. TUtf16String ToUpperRet(const TWtringBuf text, size_t pos, size_t count) {
  439. return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
  440. ToUpper(theText, length, out);
  441. });
  442. }
  443. TUtf16String ToTitleRet(const TWtringBuf text, size_t pos, size_t count) {
  444. return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
  445. ToTitle(theText, length, out);
  446. });
  447. }
  448. TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos, size_t count) {
  449. return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
  450. ToLower(theText, length, out);
  451. });
  452. }
  453. TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos, size_t count) {
  454. return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
  455. ToUpper(theText, length, out);
  456. });
  457. }
  458. TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos, size_t count) {
  459. return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
  460. ToTitle(theText, length, out);
  461. });
  462. }
  463. template <bool insertBr>
  464. void EscapeHtmlChars(TUtf16String& str) {
  465. static const TUtf16String lt(LT, Y_ARRAY_SIZE(LT));
  466. static const TUtf16String gt(GT, Y_ARRAY_SIZE(GT));
  467. static const TUtf16String amp(AMP, Y_ARRAY_SIZE(AMP));
  468. static const TUtf16String br(BR, Y_ARRAY_SIZE(BR));
  469. static const TUtf16String quot(QUOT, Y_ARRAY_SIZE(QUOT));
  470. size_t escapedLen = 0;
  471. const TUtf16String& cs = str;
  472. for (size_t i = 0; i < cs.size(); ++i) {
  473. escapedLen += EscapedLen<insertBr>(cs[i]);
  474. }
  475. if (escapedLen == cs.size()) {
  476. return;
  477. }
  478. TUtf16String res;
  479. res.reserve(escapedLen);
  480. size_t start = 0;
  481. for (size_t i = 0; i < cs.size(); ++i) {
  482. const TUtf16String* ent = nullptr;
  483. switch (cs[i]) {
  484. case '<':
  485. ent = &lt;
  486. break;
  487. case '>':
  488. ent = &gt;
  489. break;
  490. case '&':
  491. ent = &amp;
  492. break;
  493. case '\"':
  494. ent = &quot;
  495. break;
  496. default:
  497. if (insertBr && (cs[i] == '\r' || cs[i] == '\n')) {
  498. ent = &br;
  499. break;
  500. } else {
  501. continue;
  502. }
  503. }
  504. res.append(cs.begin() + start, cs.begin() + i);
  505. res.append(ent->begin(), ent->end());
  506. start = i + 1;
  507. }
  508. res.append(cs.begin() + start, cs.end());
  509. res.swap(str);
  510. }
  511. template void EscapeHtmlChars<false>(TUtf16String& str);
  512. template void EscapeHtmlChars<true>(TUtf16String& str);