123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431 |
- #pragma once
- #include "recode_result.h"
- #include <util/generic/strbuf.h>
- #include <util/generic/string.h>
- #include <util/generic/yexception.h>
- #include <util/system/defaults.h>
- #include <util/system/yassert.h>
- extern const wchar32 BROKEN_RUNE; // defined elsewhere; the UTF8 readers below store it into the output rune when decoding fails
//! returns the mask that selects the payload bits of the lead byte of a UTF8 sequence
//! @param utf8_rune_len length of the UTF8 sequence in bytes
//! @return 0x1F, 0x0F and 0x07 for lengths 2, 3 and 4; 0 for every other length
//! @note lengths 0, 1 and 5 yield 0 exactly as before; greater lengths used to index
//!       past the end of a string literal and now yield 0 as well
inline unsigned char UTF8LeadByteMask(size_t utf8_rune_len) {
    switch (utf8_rune_len) {
        case 2:
            return 0x1F; // b110XXXXX
        case 3:
            return 0x0F; // b1110XXXX
        case 4:
            return 0x07; // b11110XXX
        default:
            return 0x00;
    }
}
//! returns the length in bytes of the UTF8 sequence announced by its lead byte
//! @param lead_byte first byte of the sequence
//! @return 1..4 for a byte that can start a sequence; 0 otherwise
//!         (continuation byte b10XXXXXX or a byte of the form b11111XXX)
inline size_t UTF8RuneLen(const unsigned char lead_byte) {
    if (lead_byte < 0x80) {
        return 1; // b0XXXXXXX
    }
    if (lead_byte < 0xC0) {
        return 0; // b10XXXXXX
    }
    if (lead_byte < 0xE0) {
        return 2; // b110XXXXX
    }
    if (lead_byte < 0xF0) {
        return 3; // b1110XXXX
    }
    if (lead_byte < 0xF8) {
        return 4; // b11110XXX
    }
    return 0; // b11111XXX
}
- inline size_t UTF8RuneLenByUCS(wchar32 rune) {
- if (rune < 0x80)
- return 1U;
- else if (rune < 0x800)
- return 2U;
- else if (rune < 0x10000)
- return 3U;
- else if (rune < 0x200000)
- return 4U;
- else if (rune < 0x4000000)
- return 5U;
- else
- return 6U;
- }
- inline void PutUTF8LeadBits(wchar32& rune, unsigned char c, size_t len) {
- rune = c;
- rune &= UTF8LeadByteMask(len);
- }
- inline void PutUTF8SixBits(wchar32& rune, unsigned char c) {
- rune <<= 6;
- rune |= c & 0x3F;
- }
//! checks whether a byte has the form b10XXXXXX
//! @param c byte to test
inline bool IsUTF8ContinuationByte(unsigned char c) {
    // the two most significant bits must be exactly b10
    return (c >> 6) == 0x02;
}
- //! returns length of the current UTF8 character
- //! @param n length of the current character, it is assigned in case of valid UTF8 byte sequence
- //! @param p pointer to the current character
- //! @param e end of the character sequence
- inline RECODE_RESULT GetUTF8CharLen(size_t& n, const unsigned char* p, const unsigned char* e) {
- Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
- switch (UTF8RuneLen(*p)) {
- case 0:
- return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
- case 1:
- n = 1;
- return RECODE_OK;
- case 2:
- if (p + 2 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1])) {
- return RECODE_BROKENSYMBOL;
- } else {
- n = 2;
- return RECODE_OK;
- }
- case 3:
- if (p + 3 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
- return RECODE_BROKENSYMBOL;
- } else {
- n = 3;
- return RECODE_OK;
- }
- default: // actually 4
- if (p + 4 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
- return RECODE_BROKENSYMBOL;
- } else {
- n = 4;
- return RECODE_OK;
- }
- }
- }
- //! returns number of characters in UTF8 encoded text, stops immediately if UTF8 byte sequence is wrong
- //! @param text UTF8 encoded text
- //! @param len the length of the text in bytes
- //! @param number number of encoded symbols in the text
- inline bool GetNumberOfUTF8Chars(const char* text, size_t len, size_t& number) {
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
- const unsigned char* const last = cur + len;
- number = 0;
- size_t runeLen;
- bool res = true;
- while (cur != last) {
- if (GetUTF8CharLen(runeLen, cur, last) != RECODE_OK) { // actually it could be RECODE_BROKENSYMBOL only
- res = false;
- break;
- }
- cur += runeLen;
- Y_ASSERT(cur <= last);
- ++number;
- }
- return res;
- }
- inline size_t GetNumberOfUTF8Chars(TStringBuf text) {
- size_t number;
- if (!GetNumberOfUTF8Chars(text.data(), text.size(), number)) {
- ythrow yexception() << "GetNumberOfUTF8Chars failed on invalid utf-8 " << TString(text.substr(0, 50)).Quote();
- }
- return number;
- }
- enum class StrictUTF8 { // validation mode, used as the strictMode template argument of IsValidUTF8Rune and the UTF8 readers
- Yes, // 3-byte sequences decoding to 0xD800..0xDFFF (surrogates) are rejected
- No // surrogates are accepted; overlong encodings are rejected in both modes
- };
- template <size_t runeLen, StrictUTF8 strictMode>
- inline bool IsValidUTF8Rune(wchar32 rune);
- template <>
- inline bool IsValidUTF8Rune<2, StrictUTF8::Yes>(wchar32 rune) {
- // check for overlong encoding
- return rune >= 0x80;
- }
- template <>
- inline bool IsValidUTF8Rune<2, StrictUTF8::No>(wchar32 rune) {
- return IsValidUTF8Rune<2, StrictUTF8::Yes>(rune);
- }
- template <>
- inline bool IsValidUTF8Rune<3, StrictUTF8::Yes>(wchar32 rune) {
- // surrogates are forbidden by RFC3629 section 3
- return rune >= 0x800 && (rune < 0xD800 || rune > 0xDFFF);
- }
- template <>
- inline bool IsValidUTF8Rune<3, StrictUTF8::No>(wchar32 rune) {
- // check for overlong encoding
- return rune >= 0x800;
- }
- template <>
- inline bool IsValidUTF8Rune<4, StrictUTF8::Yes>(wchar32 rune) {
- // check if this is a valid sumbod without overlong encoding
- return rune <= 0x10FFFF && rune >= 0x10000;
- }
- template <>
- inline bool IsValidUTF8Rune<4, StrictUTF8::No>(wchar32 rune) {
- return IsValidUTF8Rune<4, StrictUTF8::Yes>(rune);
- }
- //! reads one unicode symbol from a character sequence encoded UTF8 and checks for overlong encoding
- //! @param rune value of the current character
- //! @param rune_len length of the UTF8 bytes sequence that has been read
- //! @param s pointer to the current character
- //! @param end the end of the character sequence
- template <StrictUTF8 strictMode = StrictUTF8::No>
- inline RECODE_RESULT SafeReadUTF8Char(wchar32& rune, size_t& rune_len, const unsigned char* s, const unsigned char* end) {
- rune = BROKEN_RUNE;
- rune_len = 0;
- wchar32 _rune;
- size_t _len = UTF8RuneLen(*s);
- if (s + _len > end)
- return RECODE_EOINPUT; // [EOINPUT]
- if (_len == 0)
- return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
- _rune = *s++; // [00000000 0XXXXXXX]
- if (_len > 1) {
- _rune &= UTF8LeadByteMask(_len);
- unsigned char ch = *s++;
- if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in second byte
- PutUTF8SixBits(_rune, ch); // [00000XXX XXYYYYYY]
- if (_len > 2) {
- ch = *s++;
- if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in third byte
- PutUTF8SixBits(_rune, ch); // [XXXXYYYY YYZZZZZZ]
- if (_len > 3) {
- ch = *s;
- if (!IsUTF8ContinuationByte(ch))
- return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in fourth byte
- PutUTF8SixBits(_rune, ch); // [XXXYY YYYYZZZZ ZZQQQQQQ]
- if (!IsValidUTF8Rune<4, strictMode>(_rune))
- return RECODE_BROKENSYMBOL;
- } else {
- if (!IsValidUTF8Rune<3, strictMode>(_rune))
- return RECODE_BROKENSYMBOL;
- }
- } else {
- if (!IsValidUTF8Rune<2, strictMode>(_rune))
- return RECODE_BROKENSYMBOL;
- }
- }
- rune_len = _len;
- rune = _rune;
- return RECODE_OK;
- }
- //! reads one unicode symbol from a character sequence encoded UTF8 and moves pointer to the next character
- //! @param c value of the current character
- //! @param p pointer to the current character, it will be changed in case of valid UTF8 byte sequence
- //! @param e the end of the character sequence
- template <StrictUTF8 strictMode = StrictUTF8::No>
- Y_FORCE_INLINE RECODE_RESULT ReadUTF8CharAndAdvance(wchar32& rune, const unsigned char*& p, const unsigned char* e) noexcept {
- Y_ASSERT(p < e); // since p < e then we will check RECODE_EOINPUT only for n > 1 (see calls of this functions)
- switch (UTF8RuneLen(*p)) {
- case 0:
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL; // [BROKENSYMBOL] in first byte
- case 1:
- rune = *p; // [00000000 0XXXXXXX]
- ++p;
- return RECODE_OK;
- case 2:
- if (p + 2 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1])) {
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- } else {
- PutUTF8LeadBits(rune, *p++, 2); // [00000000 000XXXXX]
- PutUTF8SixBits(rune, *p++); // [00000XXX XXYYYYYY]
- if (!IsValidUTF8Rune<2, strictMode>(rune)) {
- p -= 2;
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- return RECODE_OK;
- }
- case 3:
- if (p + 3 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2])) {
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- } else {
- PutUTF8LeadBits(rune, *p++, 3); // [00000000 0000XXXX]
- PutUTF8SixBits(rune, *p++); // [000000XX XXYYYYYY]
- PutUTF8SixBits(rune, *p++); // [XXXXYYYY YYZZZZZZ]
- // check for overlong encoding and surrogates
- if (!IsValidUTF8Rune<3, strictMode>(rune)) {
- p -= 3;
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- return RECODE_OK;
- }
- case 4:
- if (p + 4 > e) {
- return RECODE_EOINPUT;
- } else if (!IsUTF8ContinuationByte(p[1]) || !IsUTF8ContinuationByte(p[2]) || !IsUTF8ContinuationByte(p[3])) {
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- } else {
- PutUTF8LeadBits(rune, *p++, 4); // [00000000 00000000 00000XXX]
- PutUTF8SixBits(rune, *p++); // [00000000 0000000X XXYYYYYY]
- PutUTF8SixBits(rune, *p++); // [00000000 0XXXYYYY YYZZZZZZ]
- PutUTF8SixBits(rune, *p++); // [000XXXYY YYYYZZZZ ZZQQQQQQ]
- if (!IsValidUTF8Rune<4, strictMode>(rune)) {
- p -= 4;
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- return RECODE_OK;
- }
- default: // >4
- rune = BROKEN_RUNE;
- return RECODE_BROKENSYMBOL;
- }
- }
- //! writes one unicode symbol into a character sequence encoded UTF8
- //! checks for end of the buffer and returns the result of encoding
- //! @param rune value of the current character
- //! @param rune_len length of the UTF8 byte sequence that has been written
- //! @param s pointer to the output buffer
- //! @param tail available size of the buffer
- inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, size_t tail) {
- rune_len = 0;
- if (rune < 0x80) {
- if (tail <= 0)
- return RECODE_EOOUTPUT;
- *s = static_cast<unsigned char>(rune);
- rune_len = 1;
- return RECODE_OK;
- }
- if (rune < 0x800) {
- if (tail <= 1)
- return RECODE_EOOUTPUT;
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 2;
- return RECODE_OK;
- }
- if (rune < 0x10000) {
- if (tail <= 2)
- return RECODE_EOOUTPUT;
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 3;
- return RECODE_OK;
- }
- /*if (rune < 0x200000)*/ {
- if (tail <= 3)
- return RECODE_EOOUTPUT;
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 4;
- return RECODE_OK;
- }
- }
- inline RECODE_RESULT SafeWriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s, const unsigned char* end) {
- return SafeWriteUTF8Char(rune, rune_len, s, end - s);
- }
- //! writes one unicode symbol into a character sequence encoded UTF8
- //! @attention this function works as @c SafeWriteUTF8Char it does not check
- //! the size of the output buffer, it supposes that buffer is long enough
- //! @param rune value of the current character
- //! @param rune_len length of the UTF8 byte sequence that has been written
- //! @param s pointer to the output buffer
- inline void WriteUTF8Char(wchar32 rune, size_t& rune_len, unsigned char* s) {
- if (rune < 0x80) {
- *s = static_cast<unsigned char>(rune);
- rune_len = 1;
- return;
- }
- if (rune < 0x800) {
- *s++ = static_cast<unsigned char>(0xC0 | (rune >> 6));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 2;
- return;
- }
- if (rune < 0x10000) {
- *s++ = static_cast<unsigned char>(0xE0 | (rune >> 12));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 3;
- return;
- }
- /*if (rune < 0x200000)*/ {
- *s++ = static_cast<unsigned char>(0xF0 | ((rune >> 18) & 0x07));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 12) & 0x3F));
- *s++ = static_cast<unsigned char>(0x80 | ((rune >> 6) & 0x3F));
- *s = static_cast<unsigned char>(0x80 | (rune & 0x3F));
- rune_len = 4;
- }
- }
- TStringBuf SubstrUTF8(const TStringBuf str Y_LIFETIME_BOUND, size_t pos, size_t len); // declared here, defined elsewhere; NOTE(review): pos and len presumably count UTF8 characters rather than bytes - confirm against the definition
- enum EUTF8Detect { // result of UTF8Detect; IsUtf treats every value except NotUTF8 as UTF
- NotUTF8, // the only value for which IsUtf returns false
- UTF8, // NOTE(review): presumably valid UTF8 containing non-ASCII characters - confirm against the UTF8Detect definition
- ASCII // NOTE(review): presumably input made of ASCII characters only - confirm against the UTF8Detect definition
- };
- EUTF8Detect UTF8Detect(const char* s, size_t len); // declared here, defined elsewhere; classifies len bytes starting at s
- inline EUTF8Detect UTF8Detect(const TStringBuf input) {
- return UTF8Detect(input.data(), input.size());
- }
- inline bool IsUtf(const char* input, size_t len) {
- return UTF8Detect(input, len) != NotUTF8;
- }
- inline bool IsUtf(const TStringBuf input) {
- return IsUtf(input.data(), input.size());
- }
- //! returns true if the result differs from the input; the result is then put in newString
- //! returns false if the input would be left unmodified
- bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString); // declared here, defined elsewhere; NOTE(review): presumably lower-cases n bytes of UTF8 text starting at beg - confirm against the definition
- TString ToLowerUTF8(const TString& s); // declared here, defined elsewhere
- TString ToLowerUTF8(TStringBuf s); // declared here, defined elsewhere; the std::string overload below forwards to this one
- TString ToLowerUTF8(const char* s); // declared here, defined elsewhere; NOTE(review): presumably expects a NUL-terminated string - confirm against the definition
- inline TString ToLowerUTF8(const std::string& s) {
- return ToLowerUTF8(TStringBuf(s));
- }
- //! returns true if the result differs from the input; the result is then put in newString
- //! returns false if the input would be left unmodified
- bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString); // declared here, defined elsewhere; NOTE(review): presumably upper-cases n bytes of UTF8 text starting at beg - confirm against the definition
- TString ToUpperUTF8(const TString& s); // declared here, defined elsewhere
- TString ToUpperUTF8(TStringBuf s); // declared here, defined elsewhere
- TString ToUpperUTF8(const char* s); // declared here, defined elsewhere; NOTE(review): presumably expects a NUL-terminated string - confirm against the definition
|