// decodeunknownplane.cpp
  1. #include "ci_string.h"
  2. #include "codepage.h"
  3. #include "recyr.hh"
  4. #include <util/system/hi_lo.h>
  5. #include <util/generic/vector.h>
  6. template <typename TxChar>
  7. static inline RECODE_RESULT utf8_read_rune_from_unknown_plane(TxChar& rune, size_t& rune_len, const TxChar* s, const TxChar* end) {
  8. if ((*s & 0xFF00) != 0xF000) {
  9. rune_len = 1;
  10. rune = *s;
  11. return RECODE_OK;
  12. }
  13. rune_len = 0;
  14. size_t _len = UTF8RuneLen((unsigned char)(*s));
  15. if (s + _len > end)
  16. return RECODE_EOINPUT; //[EOINPUT]
  17. if (_len == 0)
  18. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
  19. wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
  20. if (_len > 1) {
  21. _rune &= UTF8LeadByteMask(_len);
  22. wchar32 ch = *s++;
  23. if ((ch & 0xFFC0) != 0xF080)
  24. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
  25. _rune <<= 6;
  26. _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
  27. if (_len > 2) {
  28. ch = *s++;
  29. if ((ch & 0xFFC0) != 0xF080)
  30. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
  31. _rune <<= 6;
  32. _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
  33. if (_len > 3) {
  34. ch = *s;
  35. if ((ch & 0xFFC0) != 0xF080)
  36. return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
  37. _rune <<= 6;
  38. _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
  39. }
  40. }
  41. }
  42. rune_len = _len;
  43. if (_rune > Max<TxChar>())
  44. rune = ' '; // maybe put sequence
  45. else
  46. rune = TxChar(_rune);
  47. return RECODE_OK;
  48. }
  49. template <typename TxChar>
  50. void DoDecodeUnknownPlane(TxChar* str, TxChar*& ee, const ECharset enc) {
  51. TxChar* e = ee;
  52. if (SingleByteCodepage(enc)) {
  53. const CodePage* cp = CodePageByCharset(enc);
  54. for (TxChar* s = str; s < e; s++) {
  55. if (Hi8(Lo16(*s)) == 0xF0)
  56. *s = (TxChar)cp->unicode[Lo8(Lo16(*s))]; // NOT mb compliant
  57. }
  58. } else if (enc == CODES_UTF8) {
  59. TxChar* s;
  60. TxChar* d;
  61. for (s = d = str; s < e;) {
  62. size_t l = 0;
  63. if (utf8_read_rune_from_unknown_plane(*d, l, s, e) == RECODE_OK) {
  64. d++, s += l;
  65. } else {
  66. *d++ = BROKEN_RUNE;
  67. ++s;
  68. }
  69. }
  70. e = d;
  71. } else if (enc == CODES_UNKNOWN) {
  72. for (TxChar* s = str; s < e; s++) {
  73. if (Hi8(Lo16(*s)) == 0xF0)
  74. *s = Lo8(Lo16(*s));
  75. }
  76. } else {
  77. Y_ASSERT(!SingleByteCodepage(enc));
  78. TxChar* s = str;
  79. TxChar* d = str;
  80. TVector<char> buf;
  81. size_t read = 0;
  82. size_t written = 0;
  83. for (; s < e; ++s) {
  84. if (Hi8(Lo16(*s)) == 0xF0) {
  85. buf.push_back(Lo8(Lo16(*s)));
  86. } else {
  87. if (!buf.empty()) {
  88. if (RecodeToUnicode(enc, buf.data(), d, buf.size(), e - d, read, written) == RECODE_OK) {
  89. Y_ASSERT(read == buf.size());
  90. d += written;
  91. } else { // just copying broken symbols
  92. Y_ASSERT(buf.size() <= static_cast<size_t>(e - d));
  93. Copy(buf.data(), buf.size(), d);
  94. d += buf.size();
  95. }
  96. buf.clear();
  97. }
  98. *d++ = *s;
  99. }
  100. }
  101. }
  102. ee = e;
  103. }
  104. void DecodeUnknownPlane(wchar16* str, wchar16*& ee, const ECharset enc) {
  105. DoDecodeUnknownPlane(str, ee, enc);
  106. }
  107. void DecodeUnknownPlane(wchar32* str, wchar32*& ee, const ECharset enc) {
  108. DoDecodeUnknownPlane(str, ee, enc);
  109. }