utf8.cpp 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. #include "unidata.h"
  2. #include "utf8.h"
  3. namespace {
  // Selects which per-code-point case mapping ConvertChar applies.
  enum class ECaseConversion {
      ToUpper, // map each code point through ToUpper()
      ToLower, // map each code point through ToLower()
  };
  8. wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
  9. switch (conversion) {
  10. case ECaseConversion::ToUpper:
  11. return ToUpper(ch);
  12. case ECaseConversion::ToLower:
  13. return ToLower(ch);
  14. }
  15. Y_ASSERT(false); // NOTREACHED
  16. return 0;
  17. }
  18. bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
  19. TString& newString) {
  20. const unsigned char* p = (const unsigned char*)beg;
  21. const unsigned char* const end = p + n;
  22. // first loop searches for the first character, which is changed by ConvertChar
  23. // if there is no changed character, we don't need reallocation/copy
  24. wchar32 cNew = 0;
  25. size_t cLen = 0;
  26. while (p < end) {
  27. wchar32 c;
  28. if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
  29. ythrow yexception()
  30. << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
  31. }
  32. cNew = ConvertChar(conversion, c);
  33. if (cNew != c) {
  34. break;
  35. }
  36. p += cLen;
  37. }
  38. if (p == end) {
  39. return false;
  40. }
  41. // some character changed after ToLower. Write new string to newString.
  42. newString.resize(n);
  43. size_t written = (char*)p - beg;
  44. char* writePtr = newString.begin();
  45. memcpy(writePtr, beg, written);
  46. writePtr += written;
  47. size_t destSpace = n - written;
  48. // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
  49. while (true) {
  50. size_t cNewLen;
  51. Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size());
  52. if (RECODE_EOOUTPUT ==
  53. SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
  54. destSpace += newString.size();
  55. newString.resize(newString.size() * 2);
  56. writePtr = newString.begin() + (newString.size() - destSpace);
  57. continue;
  58. }
  59. destSpace -= cNewLen;
  60. writePtr += cNewLen;
  61. p += cLen;
  62. if (p == end) {
  63. newString.resize(newString.size() - destSpace);
  64. return true;
  65. }
  66. wchar32 c = 0;
  67. if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
  68. ythrow yexception()
  69. << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
  70. }
  71. cNew = ConvertChar(conversion, c);
  72. }
  73. Y_ASSERT(false);
  74. return false;
  75. }
  76. } // namespace
  77. extern const wchar32 BROKEN_RUNE = 0xFFFD;
  78. static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) {
  79. const unsigned char* uEnd = reinterpret_cast<const unsigned char*>(end);
  80. while (begin != end && numChars > 0) {
  81. const unsigned char* uBegin = reinterpret_cast<const unsigned char*>(begin);
  82. size_t runeLen;
  83. if (GetUTF8CharLen(runeLen, uBegin, uEnd) != RECODE_OK) {
  84. ythrow yexception() << "invalid UTF-8 char";
  85. }
  86. begin += runeLen;
  87. Y_ASSERT(begin <= end);
  88. --numChars;
  89. }
  90. return begin;
  91. }
  92. TStringBuf SubstrUTF8(const TStringBuf str Y_LIFETIME_BOUND, size_t pos, size_t len) {
  93. const char* start = SkipUTF8Chars(str.begin(), str.end(), pos);
  94. const char* end = SkipUTF8Chars(start, str.end(), len);
  95. return TStringBuf(start, end - start);
  96. }
  97. EUTF8Detect UTF8Detect(const char* s, size_t len) {
  98. const unsigned char* s0 = (const unsigned char*)s;
  99. const unsigned char* send = s0 + len;
  100. wchar32 rune;
  101. size_t rune_len;
  102. EUTF8Detect res = ASCII;
  103. while (s0 < send) {
  104. RECODE_RESULT rr = SafeReadUTF8Char(rune, rune_len, s0, send);
  105. if (rr != RECODE_OK) {
  106. return NotUTF8;
  107. }
  108. if (rune_len > 1) {
  109. res = UTF8;
  110. }
  111. s0 += rune_len;
  112. }
  113. return res;
  114. }
  115. bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) {
  116. return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
  117. }
  118. TString ToLowerUTF8(const TString& s) {
  119. TString newString;
  120. bool changed = ToLowerUTF8Impl(s.data(), s.size(), newString);
  121. return changed ? newString : s;
  122. }
  123. TString ToLowerUTF8(TStringBuf s) {
  124. TString newString;
  125. bool changed = ToLowerUTF8Impl(s.data(), s.size(), newString);
  126. return changed ? newString : TString(s.data(), s.size());
  127. }
  128. TString ToLowerUTF8(const char* s) {
  129. return ToLowerUTF8(TStringBuf(s));
  130. }
  131. bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
  132. return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
  133. }
  134. TString ToUpperUTF8(const TString& s) {
  135. TString newString;
  136. bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
  137. return changed ? newString : s;
  138. }
  139. TString ToUpperUTF8(TStringBuf s) {
  140. TString newString;
  141. bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
  142. return changed ? newString : TString(s.data(), s.size());
  143. }
  144. TString ToUpperUTF8(const char* s) {
  145. return ToUpperUTF8(TStringBuf(s));
  146. }