123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626 |
- #include "wide.h"
- #include <util/generic/mem_copy.h>
- #include <util/string/strip.h>
- namespace {
- //! the constants are not zero-terminated
- const wchar16 LT[] = {'&', 'l', 't', ';'};
- const wchar16 GT[] = {'&', 'g', 't', ';'};
- const wchar16 AMP[] = {'&', 'a', 'm', 'p', ';'};
- const wchar16 BR[] = {'<', 'B', 'R', '>'};
- const wchar16 QUOT[] = {'&', 'q', 'u', 'o', 't', ';'};
- template <bool insertBr>
- inline size_t EscapedLen(wchar16 c) {
- switch (c) {
- case '<':
- return Y_ARRAY_SIZE(LT);
- case '>':
- return Y_ARRAY_SIZE(GT);
- case '&':
- return Y_ARRAY_SIZE(AMP);
- case '\"':
- return Y_ARRAY_SIZE(QUOT);
- default:
- if (insertBr && (c == '\r' || c == '\n'))
- return Y_ARRAY_SIZE(BR);
- else
- return 1;
- }
- }
- }
- void Collapse(TUtf16String& w) {
- CollapseImpl(w, w, 0, IsWhitespace);
- }
- size_t Collapse(wchar16* s, size_t n) {
- return CollapseImpl(s, n, IsWhitespace);
- }
- TWtringBuf StripLeft(const TWtringBuf text) noexcept {
- const auto* p = text.data();
- const auto* const pe = text.data() + text.size();
- for (; p != pe && IsWhitespace(*p); ++p) {
- }
- return {p, pe};
- }
- void StripLeft(TUtf16String& text) {
- const auto stripped = StripLeft(TWtringBuf(text));
- if (stripped.size() == text.size()) {
- return;
- }
- text = stripped;
- }
- TWtringBuf StripRight(const TWtringBuf text) noexcept {
- if (!text) {
- return {};
- }
- const auto* const pe = text.data() - 1;
- const auto* p = text.data() + text.size() - 1;
- for (; p != pe && IsWhitespace(*p); --p) {
- }
- return {pe + 1, p + 1};
- }
- void StripRight(TUtf16String& text) {
- const auto stripped = StripRight(TWtringBuf(text));
- if (stripped.size() == text.size()) {
- return;
- }
- text.resize(stripped.size());
- }
- TWtringBuf Strip(const TWtringBuf text) noexcept {
- return StripRight(StripLeft(text));
- }
- void Strip(TUtf16String& text) {
- StripLeft(text);
- StripRight(text);
- }
- template <typename T>
- static bool IsReductionOnSymbolsTrue(const TWtringBuf text, T&& f) {
- const auto* p = text.data();
- const auto* const pe = text.data() + text.length();
- while (p != pe) {
- const auto symbol = ReadSymbolAndAdvance(p, pe);
- if (!f(symbol)) {
- return false;
- }
- }
- return true;
- }
- bool IsLowerWord(const TWtringBuf text) noexcept {
- return IsReductionOnSymbolsTrue(text, [](const wchar32 s) { return IsLower(s); });
- }
- bool IsUpperWord(const TWtringBuf text) noexcept {
- return IsReductionOnSymbolsTrue(text, [](const wchar32 s) { return IsUpper(s); });
- }
- bool IsLower(const TWtringBuf text) noexcept {
- return IsReductionOnSymbolsTrue(text, [](const wchar32 s) {
- if (IsAlpha(s)) {
- return IsLower(s);
- }
- return true;
- });
- }
- bool IsUpper(const TWtringBuf text) noexcept {
- return IsReductionOnSymbolsTrue(text, [](const wchar32 s) {
- if (IsAlpha(s)) {
- return IsUpper(s);
- }
- return true;
- });
- }
- bool IsTitleWord(const TWtringBuf text) noexcept {
- if (!text) {
- return false;
- }
- const auto* p = text.data();
- const auto* pe = text.data() + text.size();
- const auto firstSymbol = ReadSymbolAndAdvance(p, pe);
- if (firstSymbol != ToTitle(firstSymbol)) {
- return false;
- }
- return IsLowerWord({p, pe});
- }
//! Walks the symbols of [p, pe) and applies @c f to each of them, in place.
//!
//! With stopOnFirstModification == true nothing is written: the function returns true
//! as soon as it meets a symbol that @c f would change, leaving @c p on that symbol.
//! With stopOnFirstModification == false every changed symbol is overwritten with the
//! result of @c f and unchanged symbols are skipped.
//!
//! @return true only for the early stop described above; false once @c p reached @c pe.
template <bool stopOnFirstModification, typename TCharType, typename F>
static bool ModifySequence(TCharType*& p, const TCharType* const pe, F&& f) {
    while (p != pe) {
        const auto original = ReadSymbol(p, pe);
        const auto converted = f(original);

        if (original == converted) {
            // nothing to write, just step over the symbol
            p = SkipSymbol(p, pe);
            continue;
        }

        if (stopOnFirstModification) {
            return true;
        }

        WriteSymbol(converted, p); // also moves `p` forward
    }

    return false;
}
//! Walks the symbols of [p, pe), applies @c f to each of them and writes the result to @c out.
//!
//! With stopOnFirstModification == true symbols are copied to @c out only while @c f leaves
//! them unchanged; on the first symbol that @c f would change the function returns true,
//! with @c p still pointing at that symbol and nothing written for it.
//! With stopOnFirstModification == false every symbol is converted and written.
//!
//! Both @c p and @c out are advanced, so a second call can continue where the first stopped.
//!
//! @return true only for the early stop described above; false once @c p reached @c pe.
template <bool stopOnFirstModification, typename TCharType, typename F>
static bool ModifySequence(const TCharType*& p, const TCharType* const pe, TCharType*& out, F&& f) {
    for (; p != pe;) {
        // In "stop" mode the symbol is only peeked at; `p` moves after we know it is unchanged.
        const auto original = stopOnFirstModification ? ReadSymbol(p, pe) : ReadSymbolAndAdvance(p, pe);
        const auto converted = f(original);

        if (stopOnFirstModification) {
            const bool changed = (original != converted);
            if (changed) {
                return true;
            }
            p = SkipSymbol(p, pe);
        }

        WriteSymbol(converted, out);
    }

    return false;
}
//! Calls `text.Detach()` and re-points @c p and @c pe into the buffer it returns,
//! preserving the offset of @c p inside @c text and the distance between @c p and @c pe.
template <class TStringType>
static void DetachAndFixPointers(TStringType& text, typename TStringType::value_type*& p, const typename TStringType::value_type*& pe) {
    // Remember the position relative to the old buffer before it may be replaced.
    const auto offset = p - text.data();
    const auto length = pe - p;

    auto* const writable = text.Detach();
    p = writable + offset;
    pe = p + length;
}
- template <class TStringType, typename F>
- static bool ModifyStringSymbolwise(TStringType& text, size_t pos, size_t count, F&& f) {
- // TODO(yazevnul): this is done for consistency with `TUtf16String::to_lower` and friends
- // at r2914050, maybe worth replacing them with asserts. Also see the same code in `ToTitle`.
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
- // TUtf16String is refcounted and it's `data` method return pointer to the constant memory.
- // To simplify the code we do a `const_cast`, though first write to the memory will be done only
- // after we call `Detach()` and get pointer to a writable piece of memory.
- auto* p = const_cast<typename TStringType::value_type*>(text.data() + pos);
- const auto* pe = text.data() + pos + count;
- if (ModifySequence<true>(p, pe, f)) {
- DetachAndFixPointers(text, p, pe);
- ModifySequence<false>(p, pe, f);
- return true;
- }
- return false;
- }
- bool ToLower(TUtf16String& text, size_t pos, size_t count) {
- const auto f = [](const wchar32 s) { return ToLower(s); };
- return ModifyStringSymbolwise(text, pos, count, f);
- }
- bool ToUpper(TUtf16String& text, size_t pos, size_t count) {
- const auto f = [](const wchar32 s) { return ToUpper(s); };
- return ModifyStringSymbolwise(text, pos, count, f);
- }
- bool ToLower(TUtf32String& text, size_t pos, size_t count) {
- const auto f = [](const wchar32 s) { return ToLower(s); };
- return ModifyStringSymbolwise(text, pos, count, f);
- }
- bool ToUpper(TUtf32String& text, size_t pos, size_t count) {
- const auto f = [](const wchar32 s) { return ToUpper(s); };
- return ModifyStringSymbolwise(text, pos, count, f);
- }
- bool ToTitle(TUtf16String& text, size_t pos, size_t count) {
- if (!text) {
- return false;
- }
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
- const auto toLower = [](const wchar32 s) { return ToLower(s); };
- auto* p = const_cast<wchar16*>(text.data() + pos);
- const auto* pe = text.data() + pos + count;
- const auto firstSymbol = ReadSymbol(p, pe);
- if (firstSymbol == ToTitle(firstSymbol)) {
- p = SkipSymbol(p, pe);
- if (ModifySequence<true>(p, pe, toLower)) {
- DetachAndFixPointers(text, p, pe);
- ModifySequence<false>(p, pe, toLower);
- return true;
- }
- } else {
- DetachAndFixPointers(text, p, pe);
- WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
- ModifySequence<false>(p, pe, toLower);
- return true;
- }
- return false;
- }
- bool ToTitle(TUtf32String& text, size_t pos, size_t count) {
- if (!text) {
- return false;
- }
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
- const auto toLower = [](const wchar32 s) { return ToLower(s); };
- auto* p = const_cast<wchar32*>(text.data() + pos);
- const auto* pe = text.data() + pos + count;
- const auto firstSymbol = *p;
- if (firstSymbol == ToTitle(firstSymbol)) {
- p += 1;
- if (ModifySequence<true>(p, pe, toLower)) {
- DetachAndFixPointers(text, p, pe);
- ModifySequence<false>(p, pe, toLower);
- return true;
- }
- } else {
- DetachAndFixPointers(text, p, pe);
- WriteSymbol(ToTitle(ReadSymbol(p, pe)), p); // also moves `p` forward
- ModifySequence<false>(p, pe, toLower);
- return true;
- }
- return false;
- }
- TUtf16String ToLowerRet(TUtf16String text, size_t pos, size_t count) {
- ToLower(text, pos, count);
- return text;
- }
- TUtf16String ToUpperRet(TUtf16String text, size_t pos, size_t count) {
- ToUpper(text, pos, count);
- return text;
- }
- TUtf16String ToTitleRet(TUtf16String text, size_t pos, size_t count) {
- ToTitle(text, pos, count);
- return text;
- }
- TUtf32String ToLowerRet(TUtf32String text, size_t pos, size_t count) {
- ToLower(text, pos, count);
- return text;
- }
- TUtf32String ToUpperRet(TUtf32String text, size_t pos, size_t count) {
- ToUpper(text, pos, count);
- return text;
- }
- TUtf32String ToTitleRet(TUtf32String text, size_t pos, size_t count) {
- ToTitle(text, pos, count);
- return text;
- }
- bool ToLower(const wchar16* text, size_t length, wchar16* out) noexcept {
- // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
- // we can declare text and out as `__restrict__`
- Y_ASSERT(text == out || !(out >= text && out < text + length));
- const auto f = [](const wchar32 s) { return ToLower(s); };
- const auto* p = text;
- const auto* const pe = text + length;
- if (ModifySequence<true>(p, pe, out, f)) {
- ModifySequence<false>(p, pe, out, f);
- return true;
- }
- return false;
- }
- bool ToUpper(const wchar16* text, size_t length, wchar16* out) noexcept {
- Y_ASSERT(text == out || !(out >= text && out < text + length));
- const auto f = [](const wchar32 s) { return ToUpper(s); };
- const auto* p = text;
- const auto* const pe = text + length;
- if (ModifySequence<true>(p, pe, out, f)) {
- ModifySequence<false>(p, pe, out, f);
- return true;
- }
- return false;
- }
- bool ToTitle(const wchar16* text, size_t length, wchar16* out) noexcept {
- if (!length) {
- return false;
- }
- Y_ASSERT(text == out || !(out >= text && out < text + length));
- const auto* const textEnd = text + length;
- const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
- const auto firstSymbolTitle = ToTitle(firstSymbol);
- WriteSymbol(firstSymbolTitle, out);
- return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
- }
- bool ToLower(wchar16* text, size_t length) noexcept {
- const auto f = [](const wchar32 s) { return ToLower(s); };
- const auto* const textEnd = text + length;
- if (ModifySequence<true>(text, textEnd, f)) {
- ModifySequence<false>(text, textEnd, f);
- return true;
- }
- return false;
- }
- bool ToUpper(wchar16* text, size_t length) noexcept {
- const auto f = [](const wchar32 s) { return ToUpper(s); };
- const auto* const textEnd = text + length;
- if (ModifySequence<true>(text, textEnd, f)) {
- ModifySequence<false>(text, textEnd, f);
- return true;
- }
- return false;
- }
- bool ToTitle(wchar16* text, size_t length) noexcept {
- if (!length) {
- return false;
- }
- const auto* textEnd = text + length;
- const auto firstSymbol = ReadSymbol(text, textEnd);
- const auto firstSymbolTitle = ToTitle(firstSymbol);
- // avoid unnacessary writes to the memory
- if (firstSymbol != firstSymbolTitle) {
- WriteSymbol(firstSymbolTitle, text);
- } else {
- text = SkipSymbol(text, textEnd);
- }
- return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
- }
- bool ToLower(const wchar32* text, size_t length, wchar32* out) noexcept {
- // TODO(yazevnul): get rid of `text == out` case (it is probably used only in lemmer) and then
- // we can declare text and out as `__restrict__`
- Y_ASSERT(text == out || !(out >= text && out < text + length));
- const auto f = [](const wchar32 s) { return ToLower(s); };
- const auto* p = text;
- const auto* const pe = text + length;
- if (ModifySequence<true>(p, pe, out, f)) {
- ModifySequence<false>(p, pe, out, f);
- return true;
- }
- return false;
- }
- bool ToUpper(const wchar32* text, size_t length, wchar32* out) noexcept {
- Y_ASSERT(text == out || !(out >= text && out < text + length));
- const auto f = [](const wchar32 s) { return ToUpper(s); };
- const auto* p = text;
- const auto* const pe = text + length;
- if (ModifySequence<true>(p, pe, out, f)) {
- ModifySequence<false>(p, pe, out, f);
- return true;
- }
- return false;
- }
- bool ToTitle(const wchar32* text, size_t length, wchar32* out) noexcept {
- if (!length) {
- return false;
- }
- Y_ASSERT(text == out || !(out >= text && out < text + length));
- const auto* const textEnd = text + length;
- const auto firstSymbol = ReadSymbolAndAdvance(text, textEnd);
- const auto firstSymbolTitle = ToTitle(firstSymbol);
- WriteSymbol(firstSymbolTitle, out);
- return ToLower(text, textEnd - text, out) || firstSymbol != firstSymbolTitle;
- }
- bool ToLower(wchar32* text, size_t length) noexcept {
- const auto f = [](const wchar32 s) { return ToLower(s); };
- const auto* const textEnd = text + length;
- if (ModifySequence<true>(text, textEnd, f)) {
- ModifySequence<false>(text, textEnd, f);
- return true;
- }
- return false;
- }
- bool ToUpper(wchar32* text, size_t length) noexcept {
- const auto f = [](const wchar32 s) { return ToUpper(s); };
- const auto* const textEnd = text + length;
- if (ModifySequence<true>(text, textEnd, f)) {
- ModifySequence<false>(text, textEnd, f);
- return true;
- }
- return false;
- }
- bool ToTitle(wchar32* text, size_t length) noexcept {
- if (!length) {
- return false;
- }
- const auto* textEnd = text + length;
- const auto firstSymbol = ReadSymbol(text, textEnd);
- const auto firstSymbolTitle = ToTitle(firstSymbol);
- // avoid unnacessary writes to the memory
- if (firstSymbol != firstSymbolTitle) {
- WriteSymbol(firstSymbolTitle, text);
- } else {
- text = SkipSymbol(text, textEnd);
- }
- return ToLower(text, textEnd - text) || firstSymbol != firstSymbolTitle;
- }
- template <typename F>
- static TUtf16String ToSmthRet(const TWtringBuf text, size_t pos, size_t count, F&& f) {
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
- auto res = TUtf16String::Uninitialized(text.size());
- auto* const resBegin = res.Detach();
- if (pos) {
- MemCopy(resBegin, text.data(), pos);
- }
- f(text.data() + pos, count, resBegin + pos);
- if (count - pos != text.size()) {
- MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
- }
- return res;
- }
- template <typename F>
- static TUtf32String ToSmthRet(const TUtf32StringBuf text, size_t pos, size_t count, F&& f) {
- pos = pos < text.size() ? pos : text.size();
- count = count < text.size() - pos ? count : text.size() - pos;
- auto res = TUtf32String::Uninitialized(text.size());
- auto* const resBegin = res.Detach();
- if (pos) {
- MemCopy(resBegin, text.data(), pos);
- }
- f(text.data() + pos, count, resBegin + pos);
- if (count - pos != text.size()) {
- MemCopy(resBegin + pos + count, text.data() + pos + count, text.size() - pos - count);
- }
- return res;
- }
- TUtf16String ToLowerRet(const TWtringBuf text, size_t pos, size_t count) {
- return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
- ToLower(theText, length, out);
- });
- }
- TUtf16String ToUpperRet(const TWtringBuf text, size_t pos, size_t count) {
- return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
- ToUpper(theText, length, out);
- });
- }
- TUtf16String ToTitleRet(const TWtringBuf text, size_t pos, size_t count) {
- return ToSmthRet(text, pos, count, [](const wchar16* theText, size_t length, wchar16* out) {
- ToTitle(theText, length, out);
- });
- }
- TUtf32String ToLowerRet(const TUtf32StringBuf text, size_t pos, size_t count) {
- return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
- ToLower(theText, length, out);
- });
- }
- TUtf32String ToUpperRet(const TUtf32StringBuf text, size_t pos, size_t count) {
- return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
- ToUpper(theText, length, out);
- });
- }
- TUtf32String ToTitleRet(const TUtf32StringBuf text, size_t pos, size_t count) {
- return ToSmthRet(text, pos, count, [](const wchar32* theText, size_t length, wchar32* out) {
- ToTitle(theText, length, out);
- });
- }
- template <bool insertBr>
- void EscapeHtmlChars(TUtf16String& str) {
- static const TUtf16String lt(LT, Y_ARRAY_SIZE(LT));
- static const TUtf16String gt(GT, Y_ARRAY_SIZE(GT));
- static const TUtf16String amp(AMP, Y_ARRAY_SIZE(AMP));
- static const TUtf16String br(BR, Y_ARRAY_SIZE(BR));
- static const TUtf16String quot(QUOT, Y_ARRAY_SIZE(QUOT));
- size_t escapedLen = 0;
- const TUtf16String& cs = str;
- for (size_t i = 0; i < cs.size(); ++i)
- escapedLen += EscapedLen<insertBr>(cs[i]);
- if (escapedLen == cs.size())
- return;
- TUtf16String res;
- res.reserve(escapedLen);
- size_t start = 0;
- for (size_t i = 0; i < cs.size(); ++i) {
- const TUtf16String* ent = nullptr;
- switch (cs[i]) {
- case '<':
- ent = <
- break;
- case '>':
- ent = >
- break;
- case '&':
- ent = &
- break;
- case '\"':
- ent = "
- break;
- default:
- if (insertBr && (cs[i] == '\r' || cs[i] == '\n')) {
- ent = &br;
- break;
- } else
- continue;
- }
- res.append(cs.begin() + start, cs.begin() + i);
- res.append(ent->begin(), ent->end());
- start = i + 1;
- }
- res.append(cs.begin() + start, cs.end());
- res.swap(str);
- }
- template void EscapeHtmlChars<false>(TUtf16String& str);
- template void EscapeHtmlChars<true>(TUtf16String& str);
|