utf8.cpp 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. #include "unidata.h"
  2. #include "utf8.h"
  3. namespace {
  4. enum class ECaseConversion {
  5. ToUpper,
  6. ToLower,
  7. };
  8. wchar32 ConvertChar(ECaseConversion conversion, wchar32 ch) {
  9. switch (conversion) {
  10. case ECaseConversion::ToUpper:
  11. return ToUpper(ch);
  12. case ECaseConversion::ToLower:
  13. return ToLower(ch);
  14. }
  15. Y_ASSERT(false); // NOTREACHED
  16. return 0;
  17. }
  18. bool ConvertCaseUTF8Impl(ECaseConversion conversion, const char* beg, size_t n,
  19. TString& newString) {
  20. const unsigned char* p = (const unsigned char*)beg;
  21. const unsigned char* const end = p + n;
  22. // first loop searches for the first character, which is changed by ConvertChar
  23. // if there is no changed character, we don't need reallocation/copy
  24. wchar32 cNew = 0;
  25. size_t cLen = 0;
  26. while (p < end) {
  27. wchar32 c;
  28. if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
  29. ythrow yexception()
  30. << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
  31. }
  32. cNew = ConvertChar(conversion, c);
  33. if (cNew != c)
  34. break;
  35. p += cLen;
  36. }
  37. if (p == end) {
  38. return false;
  39. }
  40. // some character changed after ToLower. Write new string to newString.
  41. newString.resize(n);
  42. size_t written = (char*)p - beg;
  43. char* writePtr = newString.begin();
  44. memcpy(writePtr, beg, written);
  45. writePtr += written;
  46. size_t destSpace = n - written;
  47. // before each iteration (including the first one) variable 'cNew' contains unwritten symbol
  48. while (true) {
  49. size_t cNewLen;
  50. Y_ASSERT((writePtr - newString.data()) + destSpace == newString.size());
  51. if (RECODE_EOOUTPUT ==
  52. SafeWriteUTF8Char(cNew, cNewLen, (unsigned char*)writePtr, destSpace)) {
  53. destSpace += newString.size();
  54. newString.resize(newString.size() * 2);
  55. writePtr = newString.begin() + (newString.size() - destSpace);
  56. continue;
  57. }
  58. destSpace -= cNewLen;
  59. writePtr += cNewLen;
  60. p += cLen;
  61. if (p == end) {
  62. newString.resize(newString.size() - destSpace);
  63. return true;
  64. }
  65. wchar32 c = 0;
  66. if (RECODE_OK != SafeReadUTF8Char(c, cLen, p, end)) {
  67. ythrow yexception()
  68. << "failed to decode UTF-8 string at pos " << ((const char*)p - beg);
  69. }
  70. cNew = ConvertChar(conversion, c);
  71. }
  72. Y_ASSERT(false);
  73. return false;
  74. }
  75. } // namespace
  76. extern const wchar32 BROKEN_RUNE = 0xFFFD;
  77. static const char* SkipUTF8Chars(const char* begin, const char* end, size_t numChars) {
  78. const unsigned char* uEnd = reinterpret_cast<const unsigned char*>(end);
  79. while (begin != end && numChars > 0) {
  80. const unsigned char* uBegin = reinterpret_cast<const unsigned char*>(begin);
  81. size_t runeLen;
  82. if (GetUTF8CharLen(runeLen, uBegin, uEnd) != RECODE_OK) {
  83. ythrow yexception() << "invalid UTF-8 char";
  84. }
  85. begin += runeLen;
  86. Y_ASSERT(begin <= end);
  87. --numChars;
  88. }
  89. return begin;
  90. }
  91. TStringBuf SubstrUTF8(const TStringBuf str, size_t pos, size_t len) {
  92. const char* start = SkipUTF8Chars(str.begin(), str.end(), pos);
  93. const char* end = SkipUTF8Chars(start, str.end(), len);
  94. return TStringBuf(start, end - start);
  95. }
  96. EUTF8Detect UTF8Detect(const char* s, size_t len) {
  97. const unsigned char* s0 = (const unsigned char*)s;
  98. const unsigned char* send = s0 + len;
  99. wchar32 rune;
  100. size_t rune_len;
  101. EUTF8Detect res = ASCII;
  102. while (s0 < send) {
  103. RECODE_RESULT rr = SafeReadUTF8Char(rune, rune_len, s0, send);
  104. if (rr != RECODE_OK) {
  105. return NotUTF8;
  106. }
  107. if (rune_len > 1) {
  108. res = UTF8;
  109. }
  110. s0 += rune_len;
  111. }
  112. return res;
  113. }
  114. bool ToLowerUTF8Impl(const char* beg, size_t n, TString& newString) {
  115. return ConvertCaseUTF8Impl(ECaseConversion::ToLower, beg, n, newString);
  116. }
  117. TString ToLowerUTF8(const TString& s) {
  118. TString newString;
  119. bool changed = ToLowerUTF8Impl(s.data(), s.size(), newString);
  120. return changed ? newString : s;
  121. }
  122. TString ToLowerUTF8(TStringBuf s) {
  123. TString newString;
  124. bool changed = ToLowerUTF8Impl(s.data(), s.size(), newString);
  125. return changed ? newString : TString(s.data(), s.size());
  126. }
  127. TString ToLowerUTF8(const char* s) {
  128. return ToLowerUTF8(TStringBuf(s));
  129. }
  130. bool ToUpperUTF8Impl(const char* beg, size_t n, TString& newString) {
  131. return ConvertCaseUTF8Impl(ECaseConversion::ToUpper, beg, n, newString);
  132. }
  133. TString ToUpperUTF8(const TString& s) {
  134. TString newString;
  135. bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
  136. return changed ? newString : s;
  137. }
  138. TString ToUpperUTF8(TStringBuf s) {
  139. TString newString;
  140. bool changed = ToUpperUTF8Impl(s.data(), s.size(), newString);
  141. return changed ? newString : TString(s.data(), s.size());
  142. }
  143. TString ToUpperUTF8(const char* s) {
  144. return ToUpperUTF8(TStringBuf(s));
  145. }