wide_ut.cpp 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481
  1. #include "wide.h"
  2. #include "codepage.h"
  3. #include "recyr.hh"
  4. #include <library/cpp/testing/unittest/registar.h>
  5. #include <util/charset/utf8.h>
  6. #include <util/digest/numeric.h>
  7. #include <util/generic/hash_set.h>
  8. #include <algorithm>
  9. namespace {
  //! The 64 Russian letters (capital A - YA followed by small a - ya, without YO/yo)
  //! written as the single bytes 0xC0 - 0xFF, i.e. one byte per letter rather than UTF-8.
  //! TestYandexEncoding decodes this buffer with csYandex and expects wideCyrillicAlphabet.
  const char yandexCyrillicAlphabet[] =
      "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"  // capital A - P
      "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"  // capital R - YA
      "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"  // small a - p
      "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"; // small r - ya
  16. const wchar16 wideCyrillicAlphabet[] = {
  17. 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
  18. 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
  19. 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
  20. 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00};
  //! The same 64 letters encoded as UTF-8, two bytes per letter
  //! (eight letters per source row).
  const char utf8CyrillicAlphabet[] =
      // capital letters
      "\xd0\x90\xd0\x91\xd0\x92\xd0\x93\xd0\x94\xd0\x95\xd0\x96\xd0\x97"
      "\xd0\x98\xd0\x99\xd0\x9a\xd0\x9b\xd0\x9c\xd0\x9d\xd0\x9e\xd0\x9f"
      "\xd0\xa0\xd0\xa1\xd0\xa2\xd0\xa3\xd0\xa4\xd0\xa5\xd0\xa6\xd0\xa7"
      "\xd0\xa8\xd0\xa9\xd0\xaa\xd0\xab\xd0\xac\xd0\xad\xd0\xae\xd0\xaf"
      // small letters
      "\xd0\xb0\xd0\xb1\xd0\xb2\xd0\xb3\xd0\xb4\xd0\xb5\xd0\xb6\xd0\xb7"
      "\xd0\xb8\xd0\xb9\xd0\xba\xd0\xbb\xd0\xbc\xd0\xbd\xd0\xbe\xd0\xbf"
      "\xd1\x80\xd1\x81\xd1\x82\xd1\x83\xd1\x84\xd1\x85\xd1\x86\xd1\x87"
      "\xd1\x88\xd1\x89\xd1\x8a\xd1\x8b\xd1\x8c\xd1\x8d\xd1\x8e\xd1\x8f";
  30. TString CreateYandexText() {
  31. const int len = 256;
  32. char text[len] = {0};
  33. for (int i = 0; i < len; ++i) {
  34. text[i] = static_cast<char>(i);
  35. }
  36. return TString(text, len);
  37. }
  38. TUtf16String CreateUnicodeText() {
  39. const int len = 256;
  40. wchar16 text[len] = {
  41. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x00 - 0x0F
  42. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x10 - 0x1F
  43. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x20 - 0x2F
  44. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x30 - 0x3F
  45. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x40 - 0x4F
  46. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x50 - 0x5F
  47. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x60 - 0x6F
  48. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x70 - 0x7F
  49. 0x0301, 0x00C4, 0x00D6, 0x00DC, 0x0104, 0x0106, 0x0118, 0x0141, 0x00E0, 0x00E2, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x0490, 0x00AD, // 0x80 - 0x8F
  50. 0x00DF, 0x00E4, 0x00F6, 0x00FC, 0x0105, 0x0107, 0x0119, 0x0142, 0x00EB, 0x00EE, 0x00EF, 0x00F4, 0x00F9, 0x00FB, 0x0491, 0x92CF, // 0x90 - 0x9F
  51. 0x00A0, 0x0143, 0x00D3, 0x015A, 0x017B, 0x0179, 0x046C, 0x00A7, 0x0401, 0x0462, 0x0472, 0x0474, 0x040E, 0x0406, 0x0404, 0x0407, // 0xA0 - 0xAF
  52. 0x00B0, 0x0144, 0x00F3, 0x015B, 0x017C, 0x017A, 0x046D, 0x2116, 0x0451, 0x0463, 0x0473, 0x0475, 0x045E, 0x0456, 0x0454, 0x0457 // 0xB0 - 0xBF
  53. };
  54. for (int i = 0; i < len; ++i) {
  55. if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B
  56. text[i] = static_cast<wchar16>(i);
  57. } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo)
  58. text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F
  59. }
  60. }
  61. return TUtf16String(text, len);
  62. }
  63. TString CreateUTF8Text() {
  64. char text[] = {
  65. '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  66. '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  67. '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  68. '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  69. '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47', '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
  70. '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57', '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  71. '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67', '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
  72. '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77', '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  73. '\xcc', '\x81', '\xc3', '\x84', '\xc3', '\x96', '\xc3', '\x9c', '\xc4', '\x84', '\xc4', '\x86', '\xc4', '\x98', '\xc5', '\x81',
  74. '\xc3', '\xa0', '\xc3', '\xa2', '\xc3', '\xa7', '\xc3', '\xa8', '\xc3', '\xa9', '\xc3', '\xaa', '\xd2', '\x90', '\xc2', '\xad',
  75. '\xc3', '\x9f', '\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc', '\xc4', '\x85', '\xc4', '\x87', '\xc4', '\x99', '\xc5', '\x82',
  76. '\xc3', '\xab', '\xc3', '\xae', '\xc3', '\xaf', '\xc3', '\xb4', '\xc3', '\xb9', '\xc3', '\xbb', '\xd2', '\x91', '\xe9', '\x8b',
  77. '\x8f', '\xc2', '\xa0', '\xc5', '\x83', '\xc3', '\x93', '\xc5', '\x9a', '\xc5', '\xbb', '\xc5', '\xb9', '\xd1', '\xac', '\xc2',
  78. '\xa7', '\xd0', '\x81', '\xd1', '\xa2', '\xd1', '\xb2', '\xd1', '\xb4', '\xd0', '\x8e', '\xd0', '\x86', '\xd0', '\x84', '\xd0',
  79. '\x87', '\xc2', '\xb0', '\xc5', '\x84', '\xc3', '\xb3', '\xc5', '\x9b', '\xc5', '\xbc', '\xc5', '\xba', '\xd1', '\xad', '\xe2',
  80. '\x84', '\x96', '\xd1', '\x91', '\xd1', '\xa3', '\xd1', '\xb3', '\xd1', '\xb5', '\xd1', '\x9e', '\xd1', '\x96', '\xd1', '\x94',
  81. '\xd1', '\x97', '\xd0', '\x90', '\xd0', '\x91', '\xd0', '\x92', '\xd0', '\x93', '\xd0', '\x94', '\xd0', '\x95', '\xd0', '\x96',
  82. '\xd0', '\x97', '\xd0', '\x98', '\xd0', '\x99', '\xd0', '\x9a', '\xd0', '\x9b', '\xd0', '\x9c', '\xd0', '\x9d', '\xd0', '\x9e',
  83. '\xd0', '\x9f', '\xd0', '\xa0', '\xd0', '\xa1', '\xd0', '\xa2', '\xd0', '\xa3', '\xd0', '\xa4', '\xd0', '\xa5', '\xd0', '\xa6',
  84. '\xd0', '\xa7', '\xd0', '\xa8', '\xd0', '\xa9', '\xd0', '\xaa', '\xd0', '\xab', '\xd0', '\xac', '\xd0', '\xad', '\xd0', '\xae',
  85. '\xd0', '\xaf', '\xd0', '\xb0', '\xd0', '\xb1', '\xd0', '\xb2', '\xd0', '\xb3', '\xd0', '\xb4', '\xd0', '\xb5', '\xd0', '\xb6',
  86. '\xd0', '\xb7', '\xd0', '\xb8', '\xd0', '\xb9', '\xd0', '\xba', '\xd0', '\xbb', '\xd0', '\xbc', '\xd0', '\xbd', '\xd0', '\xbe',
  87. '\xd0', '\xbf', '\xd1', '\x80', '\xd1', '\x81', '\xd1', '\x82', '\xd1', '\x83', '\xd1', '\x84', '\xd1', '\x85', '\xd1', '\x86',
  88. '\xd1', '\x87', '\xd1', '\x88', '\xd1', '\x89', '\xd1', '\x8a', '\xd1', '\x8b', '\xd1', '\x8c', '\xd1', '\x8d', '\xd1', '\x8e',
  89. '\xd1', '\x8f'};
  90. return TString(text, Y_ARRAY_SIZE(text));
  91. }
  92. //! use this function to dump UTF8 text into a file in case of any changes
  93. // void DumpUTF8Text() {
  94. // TString s = WideToUTF8(UnicodeText);
  95. // std::ofstream f("utf8.txt");
  96. // f << std::hex;
  97. // for (int i = 0; i < (int)s.size(); ++i) {
  98. // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
  99. // if ((i + 1) % 16 == 0)
  100. // f << std::endl;
  101. // }
  102. // }
  103. }
  104. //! this unit tests ensure validity of Yandex-Unicode and UTF8-Unicode conversions
  105. //! @note only those conversions are verified because they are used in index
  106. class TConversionTest: public TTestBase {
  107. private:
  108. //! @note every of the text can have zeros in the middle
  109. const TString YandexText;
  110. const TUtf16String UnicodeText;
  111. const TString UTF8Text;
  112. private:
  113. UNIT_TEST_SUITE(TConversionTest);
  114. UNIT_TEST(TestCharToWide);
  115. UNIT_TEST(TestWideToChar);
  116. UNIT_TEST(TestYandexEncoding);
  117. UNIT_TEST(TestRecodeIntoString);
  118. UNIT_TEST(TestRecodeAppend);
  119. UNIT_TEST(TestRecode);
  120. UNIT_TEST(TestUnicodeLimit);
  121. UNIT_TEST(TestCanEncode);
  122. UNIT_TEST_SUITE_END();
  123. public:
  124. TConversionTest()
  125. : YandexText(CreateYandexText())
  126. , UnicodeText(CreateUnicodeText())
  127. , UTF8Text(CreateUTF8Text())
  128. {
  129. }
  130. void TestCharToWide();
  131. void TestWideToChar();
  132. void TestYandexEncoding();
  133. void TestRecodeIntoString();
  134. void TestRecodeAppend();
  135. void TestRecode();
  136. void TestUnicodeLimit();
  137. void TestCanEncode();
  138. };
  139. UNIT_TEST_SUITE_REGISTRATION(TConversionTest);
  // Round-trip check through wchar32 for the string sbuf in encoding enc:
  //   (char -> wchar32), (wchar32 -> char) and (wchar32 -> wchar16).
  // The char result must equal sbuf and the wchar16 result must equal wbuf.
  #define TEST_WCHAR32(sbuf, wbuf, enc) \
      do { \
          /* char -> wchar32: one output element is reserved per input char */ \
          TTempBuf utf32Storage(sbuf.length() * sizeof(wchar32)); \
          const TBasicStringBuf<wchar32> utf32View = NDetail::NBaseOps::Recode<char>(sbuf, reinterpret_cast<wchar32*>(utf32Storage.Data()), enc); \
          \
          /* wchar32 -> char: four output bytes are reserved per wchar32 */ \
          TTempBuf charStorage(utf32View.length() * 4); \
          const TStringBuf charView = NDetail::NBaseOps::Recode(utf32View, charStorage.Data(), enc); \
          \
          /* wchar32 -> wchar16 */ \
          const TUtf16String utf16Text = UTF32ToWide(utf32View.data(), utf32View.length()); \
          \
          /* both round trips must reproduce the inputs */ \
          UNIT_ASSERT_VALUES_EQUAL(sbuf, charView); \
          UNIT_ASSERT_VALUES_EQUAL(wbuf, utf16Text); \
      } while (false)
  158. void TConversionTest::TestCharToWide() {
  159. TUtf16String w = CharToWide(YandexText, CODES_YANDEX);
  160. UNIT_ASSERT(w.size() == 256);
  161. UNIT_ASSERT(w.size() == UnicodeText.size());
  162. for (int i = 0; i < 256; ++i) {
  163. UNIT_ASSERT_VALUES_EQUAL(w[i], UnicodeText[i]);
  164. }
  165. }
  166. void TConversionTest::TestWideToChar() {
  167. TString s = WideToChar(UnicodeText, CODES_YANDEX);
  168. UNIT_ASSERT(s.size() == 256);
  169. UNIT_ASSERT(s.size() == YandexText.size());
  170. for (int i = 0; i < 256; ++i) {
  171. UNIT_ASSERT_VALUES_EQUAL(s[i], YandexText[i]);
  172. }
  173. }
  174. static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize, ECharset enc) {
  175. TUtf16String w = UTF8ToWide(str);
  176. UNIT_ASSERT(w.size() == wideSize);
  177. UNIT_ASSERT(!memcmp(w.c_str(), wide, wideSize));
  178. TString s = WideToChar(w, enc);
  179. UNIT_ASSERT(s == str);
  180. }
  181. void TConversionTest::TestYandexEncoding() {
  182. TUtf16String w = UTF8ToWide(utf8CyrillicAlphabet, strlen(utf8CyrillicAlphabet), csYandex);
  183. UNIT_ASSERT(w == wideCyrillicAlphabet);
  184. w = UTF8ToWide(yandexCyrillicAlphabet, strlen(yandexCyrillicAlphabet), csYandex);
  185. UNIT_ASSERT(w == wideCyrillicAlphabet);
  186. const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
  187. wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
  188. TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2), CODES_UTF8);
  189. {
  190. const char* yandexNonBMP2 = "ab?n";
  191. UNIT_ASSERT(yandexNonBMP2 == WideToChar(wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2), CODES_YANDEX));
  192. TString temp;
  193. temp.resize(Y_ARRAY_SIZE(wNonBMPDummy2));
  194. size_t read = 0;
  195. size_t written = 0;
  196. RecodeFromUnicode(CODES_YANDEX, wNonBMPDummy2, temp.begin(), Y_ARRAY_SIZE(wNonBMPDummy2), temp.size(), read, written);
  197. temp.remove(written);
  198. UNIT_ASSERT(yandexNonBMP2 == temp);
  199. }
  200. }
  201. void TConversionTest::TestRecodeIntoString() {
  202. TString sYandex(UnicodeText.size() * 4, 'x');
  203. const char* sdata = sYandex.data();
  204. TStringBuf sres = NDetail::Recode<wchar16>(UnicodeText, sYandex, CODES_YANDEX);
  205. UNIT_ASSERT(sYandex == YandexText); // same content
  206. UNIT_ASSERT(sYandex.data() == sdata); // reserved buffer reused
  207. UNIT_ASSERT(sYandex.data() == sres.data()); // same buffer
  208. UNIT_ASSERT(sYandex.size() == sres.size()); // same size
  209. TEST_WCHAR32(sYandex, UnicodeText, CODES_YANDEX);
  210. TUtf16String sUnicode;
  211. sUnicode.reserve(YandexText.size() * 4);
  212. const wchar16* wdata = sUnicode.data();
  213. TWtringBuf wres = NDetail::Recode<char>(YandexText, sUnicode, CODES_YANDEX);
  214. UNIT_ASSERT(sUnicode == UnicodeText); // same content
  215. UNIT_ASSERT(sUnicode.data() == wdata); // reserved buffer reused
  216. UNIT_ASSERT(sUnicode.data() == wres.data()); // same buffer
  217. UNIT_ASSERT(sUnicode.size() == wres.size()); // same size
  218. TString sUtf8 = " ";
  219. size_t scap = sUtf8.capacity();
  220. sres = NDetail::Recode<wchar16>(UnicodeText, sUtf8, CODES_UTF8);
  221. UNIT_ASSERT(sUtf8 == UTF8Text); // same content
  222. UNIT_ASSERT(sUtf8.capacity() > scap); // increased buffer capacity (supplied was too small)
  223. UNIT_ASSERT(sUtf8.data() == sres.data()); // same buffer
  224. UNIT_ASSERT(sUtf8.size() == sres.size()); // same size
  225. TEST_WCHAR32(sUtf8, UnicodeText, CODES_UTF8);
  226. sUnicode.clear();
  227. wdata = sUnicode.data();
  228. TUtf16String copy = sUnicode; // increase ref-counter
  229. wres = NDetail::Recode<char>(UTF8Text, sUnicode, CODES_UTF8);
  230. UNIT_ASSERT(sUnicode == UnicodeText); // same content
  231. #ifndef TSTRING_IS_STD_STRING
  232. UNIT_ASSERT(sUnicode.data() != wdata); // re-allocated (shared buffer supplied)
  233. UNIT_ASSERT(sUnicode.data() == wres.data()); // same buffer
  234. #endif
  235. UNIT_ASSERT(sUnicode.size() == wres.size()); // same content
  236. }
  237. static TString GenerateJunk(size_t seed) {
  238. TString res;
  239. size_t hash = NumericHash(seed);
  240. size_t size = hash % 1024;
  241. res.reserve(size);
  242. for (size_t i = 0; i < size; ++i)
  243. res += static_cast<char>(NumericHash(hash + i) % 256);
  244. return res;
  245. }
  246. void TConversionTest::TestRecodeAppend() {
  247. {
  248. TString s1, s2;
  249. NDetail::RecodeAppend<wchar16>(TUtf16String(), s1, CODES_YANDEX);
  250. UNIT_ASSERT(s1.empty());
  251. NDetail::RecodeAppend<wchar16>(UnicodeText, s1, CODES_WIN);
  252. s2 += WideToChar(UnicodeText, CODES_WIN);
  253. UNIT_ASSERT_EQUAL(s1, s2);
  254. NDetail::RecodeAppend<wchar16>(UnicodeText, s1, CODES_YANDEX);
  255. s2 += WideToChar(UnicodeText, CODES_YANDEX);
  256. UNIT_ASSERT_EQUAL(s1, s2);
  257. NDetail::RecodeAppend<wchar16>(TUtf16String(), s1, CODES_YANDEX);
  258. UNIT_ASSERT_EQUAL(s1, s2);
  259. NDetail::RecodeAppend<wchar16>(UnicodeText, s1, CODES_UTF8);
  260. s2 += WideToUTF8(UnicodeText);
  261. UNIT_ASSERT_EQUAL(s1, s2);
  262. for (size_t i = 0; i < 100; ++i) {
  263. TUtf16String junk = CharToWide(GenerateJunk(i), CODES_YANDEX);
  264. NDetail::RecodeAppend<wchar16>(junk, s1, CODES_UTF8);
  265. s2 += WideToUTF8(junk);
  266. UNIT_ASSERT_EQUAL(s1, s2);
  267. }
  268. }
  269. {
  270. TUtf16String s1, s2;
  271. NDetail::RecodeAppend<char>(TString(), s1, CODES_YANDEX);
  272. UNIT_ASSERT(s1.empty());
  273. NDetail::RecodeAppend<char>(YandexText, s1, CODES_WIN);
  274. s2 += CharToWide(YandexText, CODES_WIN);
  275. UNIT_ASSERT_EQUAL(s1, s2);
  276. NDetail::RecodeAppend<char>(YandexText, s1, CODES_YANDEX);
  277. s2 += CharToWide(YandexText, CODES_YANDEX);
  278. UNIT_ASSERT_EQUAL(s1, s2);
  279. NDetail::RecodeAppend<char>(TString(), s1, CODES_YANDEX);
  280. UNIT_ASSERT_EQUAL(s1, s2);
  281. NDetail::RecodeAppend<char>(UTF8Text, s1, CODES_UTF8);
  282. s2 += UTF8ToWide(UTF8Text);
  283. UNIT_ASSERT_EQUAL(s1, s2);
  284. for (size_t i = 0; i < 100; ++i) {
  285. TString junk = GenerateJunk(i);
  286. NDetail::RecodeAppend<char>(junk, s1, CODES_YANDEX);
  287. s2 += CharToWide(junk, CODES_YANDEX);
  288. UNIT_ASSERT_EQUAL(s1, s2);
  289. }
  290. }
  291. }
  292. template <>
  293. void Out<RECODE_RESULT>(IOutputStream& out, RECODE_RESULT val) {
  294. out << int(val);
  295. }
  296. void TConversionTest::TestRecode() {
  297. for (int c = 0; c != CODES_MAX; ++c) {
  298. ECharset enc = static_cast<ECharset>(c);
  299. if (!SingleByteCodepage(enc))
  300. continue;
  301. using THash = THashSet<char>;
  302. THash hash;
  303. for (int i = 0; i != 256; ++i) {
  304. char ch = static_cast<char>(i);
  305. wchar32 wch;
  306. size_t read = 0;
  307. size_t written = 0;
  308. RECODE_RESULT res = RECODE_ERROR;
  309. res = RecodeToUnicode(enc, &ch, &wch, 1, 1, read, written);
  310. UNIT_ASSERT(res == RECODE_OK);
  311. if (wch == BROKEN_RUNE)
  312. continue;
  313. char rch = 0;
  314. res = RecodeFromUnicode(enc, &wch, &rch, 1, 1, read, written);
  315. UNIT_ASSERT(res == RECODE_OK);
  316. char rch2 = 0;
  317. UNIT_ASSERT_VALUES_EQUAL(RECODE_OK, RecodeFromUnicode(enc, wch, &rch2, 1, written));
  318. UNIT_ASSERT_VALUES_EQUAL(size_t(1), written);
  319. UNIT_ASSERT_VALUES_EQUAL(rch2, rch);
  320. if (hash.contains(rch)) { // there are some stupid encodings with duplicate characters
  321. continue;
  322. } else {
  323. hash.insert(rch);
  324. }
  325. UNIT_ASSERT(ch == rch);
  326. }
  327. }
  328. }
  329. void TConversionTest::TestUnicodeLimit() {
  330. for (int i = 0; i != CODES_MAX; ++i) {
  331. ECharset code = static_cast<ECharset>(i);
  332. if (!SingleByteCodepage(code))
  333. continue;
  334. const CodePage* page = CodePageByCharset(code);
  335. Y_ASSERT(page);
  336. for (int c = 0; c < 256; ++c) {
  337. UNIT_ASSERT(page->unicode[c] < 1 << 16);
  338. }
  339. }
  340. }
  341. static void TestCanEncodeEmpty() {
  342. TWtringBuf empty;
  343. UNIT_ASSERT(CanBeEncoded(empty, CODES_WIN));
  344. UNIT_ASSERT(CanBeEncoded(empty, CODES_YANDEX));
  345. UNIT_ASSERT(CanBeEncoded(empty, CODES_UTF8));
  346. }
  347. static void TestCanEncodeEach(const TWtringBuf& text, ECharset encoding, bool expectedResult) {
  348. // char by char
  349. for (size_t i = 0; i < text.size(); ++i) {
  350. if (CanBeEncoded(text.SubStr(i, 1), encoding) != expectedResult)
  351. ythrow yexception() << "assertion failed: encoding " << NameByCharset(encoding)
  352. << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")";
  353. }
  354. // whole text
  355. UNIT_ASSERT_EQUAL(CanBeEncoded(text, encoding), expectedResult);
  356. }
  357. void TConversionTest::TestCanEncode() {
  358. TestCanEncodeEmpty();
  359. const TUtf16String lat = u"AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
  360. TestCanEncodeEach(lat, CODES_WIN, true);
  361. TestCanEncodeEach(lat, CODES_YANDEX, true);
  362. TestCanEncodeEach(lat, CODES_UTF8, true);
  363. const TUtf16String rus = u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя";
  364. TestCanEncodeEach(rus, CODES_WIN, true);
  365. TestCanEncodeEach(rus, CODES_YANDEX, true);
  366. TestCanEncodeEach(rus, CODES_UTF8, true);
  367. const TUtf16String ukr = u"ҐґЄєІіЇї";
  368. TestCanEncodeEach(ukr, CODES_WIN, true);
  369. TestCanEncodeEach(ukr, CODES_YANDEX, true);
  370. TestCanEncodeEach(ukr, CODES_UTF8, true);
  371. const TUtf16String pol = u"ĄĆĘŁŃÓŚŹŻąćęłńóśźż";
  372. TestCanEncodeEach(pol, CODES_WIN, false);
  373. TestCanEncodeEach(pol, CODES_YANDEX, true);
  374. TestCanEncodeEach(pol, CODES_UTF_16BE, true);
  375. const TUtf16String ger = u"ÄäÖöÜüß";
  376. TestCanEncodeEach(ger, CODES_WIN, false);
  377. TestCanEncodeEach(ger, CODES_YANDEX, true);
  378. TestCanEncodeEach(ger, CODES_UTF_16LE, true);
  379. const TUtf16String fra1 = u"éàèùâêîôûëïç"; // supported in yandex cp
  380. const TUtf16String fra2 = u"ÉÀÈÙÂÊÎÔÛËÏŸÿÇ";
  381. const TUtf16String fra3 = u"Æ挜";
  382. TestCanEncodeEach(fra1 + fra2 + fra3, CODES_WIN, false);
  383. TestCanEncodeEach(fra1, CODES_YANDEX, true);
  384. TestCanEncodeEach(fra2 + fra3, CODES_YANDEX, false);
  385. TestCanEncodeEach(fra1 + fra2 + fra3, CODES_UTF8, true);
  386. const TUtf16String kaz = u"ӘәҒғҚқҢңӨөҰұҮүҺһ";
  387. TestCanEncodeEach(kaz, CODES_WIN, false);
  388. TestCanEncodeEach(kaz, CODES_YANDEX, false);
  389. TestCanEncodeEach(kaz, CODES_UTF8, true);
  390. TestCanEncodeEach(kaz, CODES_KAZWIN, true);
  391. const TUtf16String tur1 = u"ĞİŞğş";
  392. const TUtf16String tur = tur1 + u"ı";
  393. TestCanEncodeEach(tur, CODES_WIN, false);
  394. TestCanEncodeEach(tur, CODES_YANDEX, false);
  395. TestCanEncodeEach(tur, CODES_UTF8, true);
  396. const TUtf16String chi = u"新隶体新隸體";
  397. TestCanEncodeEach(chi, CODES_WIN, false);
  398. TestCanEncodeEach(chi, CODES_YANDEX, false);
  399. TestCanEncodeEach(chi, CODES_UTF8, true);
  400. TestCanEncodeEach(chi, CODES_UTF_16LE, true);
  401. const TUtf16String jap = u"漢字仮字交じり文";
  402. TestCanEncodeEach(jap, CODES_WIN, false);
  403. TestCanEncodeEach(jap, CODES_YANDEX, false);
  404. TestCanEncodeEach(jap, CODES_UTF8, true);
  405. TestCanEncodeEach(jap, CODES_UTF_16BE, true);
  406. }