123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170 |
- #include "unidata.h"
- #include "utf8.h"
- namespace {
- enum class ECaseConversion {
- ToUpper,
- ToLower,
- };
- wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
- switch (conversion) {
- case ECaseConversion::ToUpper:
- return ToUpper(ch);
- case ECaseConversion::ToLower:
- return ToLower(ch);
- }
- Y_ASSERT(false); // NOTREACHED
- return 0;
- }
- bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
- TString& newString) {
- const unsigned char* p = (const unsigned char*)beg;
- const unsigned char* const end = p + n;
- // first loop searches for the first character, which is changed by ConvertChar
- // if there is no changed character, we don't need reallocation/copy
- wchar32 cNew = 0;
- size_t cLen = 0;
- while (p < end) {
- wchar32 c;
- if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
- ythrow yexception()
- << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
- }
- cNew = ConvertChar(conversion, c);
- if (cNew != c)
- break;
- p += cLen;
- }
- if (p == end) {
- return false;
- }
- // some character changed after ToLower. Write new string to newString.
- newString.resize(n);
- size_t written = (char*)p - beg;
- char* writePtr = newString.begin();
- memcpy(writePtr, beg, written);
- writePtr += written;
- size_t destSpace = n - written;
- // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
- while (true) {
- size_t cNewLen;
- Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size());
- if (RECODE_EOOUTPUT ==
- SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
- destSpace += newString.size();
- newString.resize(newString.size() * 2);
- writePtr = newString.begin() + (newString.size() - destSpace);
- continue;
- }
- destSpace -= cNewLen;
- writePtr += cNewLen;
- p += cLen;
- if (p == end) {
- newString.resize(newString.size() - destSpace);
- return true;
- }
- wchar32 c = 0;
- if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
- ythrow yexception()
- << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
- }
- cNew = ConvertChar(conversion, c);
- }
- Y_ASSERT(false);
- return false;
- }
- } // namespace
- extern const wchar32 BROKEN_RUNE = 0xFFFD; // U+FFFD (Unicode REPLACEMENT CHARACTER); `extern` gives this const definition external linkage. NOTE(review): presumably the rune substituted for undecodable input - confirm against the declaring header.
- static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) {
- const unsigned char* uEnd = reinterpret_cast<const unsigned char*>(end);
- while (begin != end && numChars > 0) {
- const unsigned char* uBegin = reinterpret_cast<const unsigned char*>(begin);
- size_t runeLen;
- if (GetUTF8CharLen(runeLen, uBegin, uEnd) != RECODE_OK) {
- ythrow yexception() << "invalid UTF-8 char";
- }
- begin += runeLen;
- Y_ASSERT(begin <= end);
- --numChars;
- }
- return begin;
- }
- TStringBuf SubstrUTF8(const TStringBuf str Y_LIFETIME_BOUND, size_t pos, size_t len) {
- const char* start = SkipUTF8Chars(str.begin(), str.end(), pos);
- const char* end = SkipUTF8Chars(start, str.end(), len);
- return TStringBuf(start, end - start);
- }
- EUTF8Detect UTF8Detect(const char* s, size_t len) {
- const unsigned char* s0 = (const unsigned char*)s;
- const unsigned char* send = s0 + len;
- wchar32 rune;
- size_t rune_len;
- EUTF8Detect res = ASCII;
- while (s0 < send) {
- RECODE_RESULT rr = SafeReadUTF8Char(rune, rune_len, s0, send);
- if (rr != RECODE_OK) {
- return NotUTF8;
- }
- if (rune_len > 1) {
- res = UTF8;
- }
- s0 += rune_len;
- }
- return res;
- }
- bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) {
- return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
- }
- TString ToLowerUTF8(const TString& s) {
- TString newString;
- bool changed = ToLowerUTF8Impl(s.data(), s.size(), newString);
- return changed ? newString : s;
- }
- TString ToLowerUTF8(TStringBuf s) {
- TString newString;
- bool changed = ToLowerUTF8Impl(s.data(), s.size(), newString);
- return changed ? newString : TString(s.data(), s.size());
- }
- TString ToLowerUTF8(const char* s) {
- return ToLowerUTF8(TStringBuf(s));
- }
- bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
- return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
- }
- TString ToUpperUTF8(const TString& s) {
- TString newString;
- bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
- return changed ? newString : s;
- }
- TString ToUpperUTF8(TStringBuf s) {
- TString newString;
- bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
- return changed ? newString : TString(s.data(), s.size());
- }
- TString ToUpperUTF8(const char* s) {
- return ToUpperUTF8(TStringBuf(s));
- }
|