SMusatov
/
ydb
mirror of https://github.com/ydb-platform/ydb.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
							#include "utf8.h"

#include <util/charset/wide.h>

#include <ctype.h>
#include <vector>

namespace NYql {

namespace {

unsigned char GetRange(unsigned char c) {
    // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
    // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
    static const unsigned char type[] = {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
        0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
        0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
        0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
    };
    return type[c];
}

struct TByteRange {
    ui8 First = 0;
    ui8 Last = 0;
};

struct TUtf8Ranges {
    size_t BytesCount = 0;
    TByteRange Bytes[4] = {};
};

// see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8
inline static const std::vector<TUtf8Ranges> Utf8Ranges = {
    { 1, { {0x00, 0x7f}, {0x00, 0x00}, {0x00, 0x00}, {0x00, 0x00}, } },
    { 2, { {0xc2, 0xdf}, {0x80, 0xbf}, {0x00, 0x00}, {0x00, 0x00}, } },
    { 3, { {0xe0, 0xe0}, {0xa0, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
    { 3, { {0xe1, 0xec}, {0x80, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
    { 3, { {0xed, 0xed}, {0x80, 0x9f}, {0x80, 0xbf}, {0x00, 0x00}, } },
    { 3, { {0xee, 0xef}, {0x80, 0xbf}, {0x80, 0xbf}, {0x00, 0x00}, } },
    { 4, { {0xf0, 0xf0}, {0x90, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } },
    { 4, { {0xf1, 0xf3}, {0x80, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}, } },
    { 4, { {0xf4, 0xf4}, {0x80, 0x8f}, {0x80, 0xbf}, {0x80, 0xbf}, } },
};

std::optional<std::string> RoundBadUtf8(size_t range, std::string_view inputString, size_t pos,
    bool roundDown)
{
    Y_ENSURE(range > 0);
    Y_ENSURE(range < Utf8Ranges.size());

    const std::string prefix{inputString.substr(0, pos)};
    std::string_view suffix = inputString.substr(pos, Utf8Ranges[range].BytesCount);

    std::string newSuffix;
    if (roundDown) {
        std::optional<size_t> lastNonMin;
        for (size_t i = 0; i < suffix.size(); ++i) {
            if (Utf8Ranges[range].Bytes[i].First < ui8(suffix[i])) {
                lastNonMin = i;
            }
        }

        if (!lastNonMin) {
            for (size_t i = 0; i < Utf8Ranges[range - 1].BytesCount; ++i) {
                newSuffix.push_back(Utf8Ranges[range - 1].Bytes[i].Last);
            }
        } else {
            for (size_t i = 0; i < Utf8Ranges[range].BytesCount; ++i) {
                if (i < *lastNonMin) {
                    ui8 c = suffix[i];
                    newSuffix.push_back(c);
                } else if (i == *lastNonMin) {
                    ui8 c = suffix[i];
                    newSuffix.push_back(std::min<ui8>(c - 1, Utf8Ranges[range].Bytes[i].Last));
                } else {
                    newSuffix.push_back(Utf8Ranges[range].Bytes[i].Last);
                }
            }
        }
    } else {
        std::optional<size_t> lastNonMax;
        bool valid = true;
        for (size_t i = 0; i < suffix.size(); ++i) {
            ui8 last = Utf8Ranges[range].Bytes[i].Last;
            ui8 first = Utf8Ranges[range].Bytes[i].First;
            ui8 curr = ui8(suffix[i]);

            valid = valid && curr <= last && curr >= first;
            if (curr < last) {
                lastNonMax = i;
            }
        }

        if (valid) {
            newSuffix = suffix;
            for (size_t i = suffix.size(); i < Utf8Ranges[range].BytesCount; ++i) {
                newSuffix.push_back(Utf8Ranges[range].Bytes[i].First);
            }
        } else if (!lastNonMax) {
            return NextValidUtf8(prefix);
        } else {
            for (size_t i = 0; i < Utf8Ranges[range].BytesCount; ++i) {
                if (i < *lastNonMax) {
                    ui8 c = suffix[i];
                    newSuffix.push_back(c);
                } else if (i == *lastNonMax) {
                    ui8 c = suffix[i];
                    newSuffix.push_back(std::max<ui8>(c + 1, Utf8Ranges[range].Bytes[i].First));
                } else {
                    newSuffix.push_back(Utf8Ranges[range].Bytes[i].First);
                }
            }
        }

    }
    return prefix + newSuffix;
}

}

bool IsUtf8(const std::string_view& str) {
    for (auto it = str.cbegin(); str.cend() != it;) {
#define COPY() if (str.cend() != it) { c = *it++; } else { return false; }
#define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
#define TAIL() COPY(); TRANS(0x70)
        auto c = *it++;
        if (!(c & 0x80))
            continue;

        bool result = true;
        switch (GetRange(static_cast<unsigned char>(c))) {
        case 2: TAIL(); break;
        case 3: TAIL(); TAIL(); break;
        case 4: COPY(); TRANS(0x50); TAIL(); break;
        case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); break;
        case 6: TAIL(); TAIL(); TAIL(); break;
        case 10: COPY(); TRANS(0x20); TAIL(); break;
        case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); break;
        default: return false;
        }

        if (!result) return false;
#undef COPY
#undef TRANS
#undef TAIL
    }
    return true;
}

unsigned char WideCharSize(char head) {
    switch (GetRange(static_cast<unsigned char>(head))) {
        case 0: return 1;
        case 2: return 2;
        case 3: return 3;
        case 4: return 3;
        case 5: return 4;
        case 6: return 4;
        case 10: return 3;
        case 11: return 4;
        default: return 0;
    }
}

std::optional<std::string> RoundToNearestValidUtf8(const std::string_view& str, bool roundDown) {
    const size_t ss = str.size();
    for (size_t pos = 0; pos < ss; ) {
        ui8 c = str[pos];

        for (size_t i = 0; i < Utf8Ranges.size(); ++i) {
            auto& range = Utf8Ranges[i];

            if (c < range.Bytes[0].First) {
                return RoundBadUtf8(i, str, pos, roundDown);
            }

            if (c <= range.Bytes[0].Last) {
                // valid UTF8 code point start
                for (size_t j = 1; j < range.BytesCount; ++j) {
                    if (pos + j >= ss) {
                        return RoundBadUtf8(i, str, pos, roundDown);
                    }
                    ui8 cur = str[pos + j];
                    if (!(cur >= range.Bytes[j].First && cur <= range.Bytes[j].Last)) {
                        return RoundBadUtf8(i, str, pos, roundDown);
                    }
                }

                pos += range.BytesCount;
                break;
            } else if (i + 1 == Utf8Ranges.size()) {
                if (!roundDown) {
                    return NextValidUtf8(str.substr(0,  pos));
                }
                return RoundBadUtf8(i, str, pos, roundDown);
            }
        }
    }
    return std::string(str);
}

std::optional<std::string> NextValidUtf8(const std::string_view& str) {
    Y_ENSURE(IsUtf8(str));
    TUtf32String wide = UTF8ToUTF32<false>(str);
    bool incremented = false;
    size_t toDrop = 0;
    for (auto it = wide.rbegin(); it != wide.rend(); ++it) {
        auto& c = *it;
        if (c < 0x10ffff) {
            c = (c == 0xd7ff) ? 0xe000 : (c + 1);
            incremented = true;
            break;
        } else {
            ++toDrop;
        }
    }

    if (!incremented) {
        return {};
    }

    Y_ENSURE(toDrop < wide.size());
    wide.resize(wide.size() - toDrop);

    TString result = WideToUTF8(wide);
    return std::string(result.data(), result.size());
}

std::optional<std::string> NextLexicographicString(const std::string_view& str) {
    bool incremented = false;
    size_t toDrop = 0;
    std::string result{str};
    for (auto it = result.rbegin(); it != result.rend(); ++it) {
        auto& c = *it;
        if (ui8(c) < 0xff) {
            ++c;
            incremented = true;
            break;
        } else {
            ++toDrop;
        }
    }

    if (!incremented) {
        return {};
    }

    Y_ENSURE(toDrop < result.size());
    result.resize(result.size() - toDrop);
    return result;
}

}