// codepage_ut.cpp
  1. #include "codepage.h"
  2. #include "recyr.hh"
  3. #include "wide.h"
  4. #include <library/cpp/testing/unittest/registar.h>
  5. #include <util/charset/utf8.h>
  6. #include <util/system/yassert.h>
  7. #if defined(_MSC_VER)
  8. #pragma warning(disable : 4309) /*truncation of constant value*/
  9. #endif
  namespace {
      // Parallel byte tables consumed by TestToLower / TestToUpper: lowering
      // yandexUpperCase must produce yandexLowerCase and upper-casing the latter
      // must give the former back. Entry i of one table pairs with entry i of
      // the other.
      // NOTE(review): presumably these are bytes of the CODES_YANDEX code page
      // (going by the names) - confirm.
      constexpr char yandexUpperCase[] =
          "\x81\x82\x83\x84\x85\x86\x87"
          "\x8E"
          "\xA1\xA2\xA3\xA4\xA5\xA6"
          "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
          "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
          "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF";

      // Same positions as above; the first four rows differ from their
      // upper-case partner by 0x10, the last two rows by 0x20.
      constexpr char yandexLowerCase[] =
          "\x91\x92\x93\x94\x95\x96\x97"
          "\x9E"
          "\xB1\xB2\xB3\xB4\xB5\xB6"
          "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
          "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
          "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
  }
  26. class TCodepageTest: public TTestBase {
  27. private:
  28. UNIT_TEST_SUITE(TCodepageTest);
  29. UNIT_TEST(TestUTF);
  30. UNIT_TEST(TestUTFFromUnknownPlane);
  31. UNIT_TEST(TestBrokenMultibyte);
  32. UNIT_TEST(TestSurrogatePairs);
  33. UNIT_TEST(TestEncodingHints);
  34. UNIT_TEST(TestToLower);
  35. UNIT_TEST(TestToUpper);
  36. UNIT_TEST(TestUpperLower);
  37. UNIT_TEST(TestBrokenRune);
  38. UNIT_TEST(TestCanEncode);
  39. UNIT_TEST_SUITE_END();
  40. public:
  41. void TestUTF();
  42. void TestUTFFromUnknownPlane();
  43. void TestBrokenMultibyte();
  44. void TestSurrogatePairs();
  45. void TestEncodingHints();
  46. void TestToLower();
  47. void TestToUpper();
  48. void TestCanEncode();
  49. inline void TestUpperLower() {
  50. const CodePage* cp = CodePageByCharset(CODES_ASCII);
  51. char tmp[100];
  52. TStringBuf s = "abcde";
  53. TStringBuf upper(tmp, cp->ToUpper(s.begin(), s.end(), tmp));
  54. UNIT_ASSERT_VALUES_EQUAL(upper, TStringBuf("ABCDE"));
  55. TStringBuf lower(tmp, cp->ToLower(upper.begin(), upper.end(), tmp));
  56. UNIT_ASSERT_VALUES_EQUAL(lower, TStringBuf("abcde"));
  57. }
  58. void TestBrokenRune() {
  59. UNIT_ASSERT_VALUES_EQUAL(BROKEN_RUNE, 0xFFFDu);
  60. }
  61. };
  62. UNIT_TEST_SUITE_REGISTRATION(TCodepageTest);
  63. void TCodepageTest::TestUTF() {
  64. for (wchar32 i = 0; i <= 0x10FFFF; i++) {
  65. unsigned char buffer[32];
  66. Zero(buffer);
  67. size_t rune_len;
  68. size_t ref_len = 0;
  69. if (i < 0x80)
  70. ref_len = 1;
  71. else if (i < 0x800)
  72. ref_len = 2;
  73. else if (i < 0x10000)
  74. ref_len = 3;
  75. else
  76. ref_len = 4;
  77. RECODE_RESULT res = SafeWriteUTF8Char(i, rune_len, buffer, buffer + 32);
  78. UNIT_ASSERT(res == RECODE_OK);
  79. UNIT_ASSERT(rune_len == ref_len);
  80. res = SafeWriteUTF8Char(i, rune_len, buffer, buffer + ref_len - 1);
  81. UNIT_ASSERT(res == RECODE_EOOUTPUT);
  82. wchar32 rune;
  83. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + 32);
  84. UNIT_ASSERT(res == RECODE_OK);
  85. UNIT_ASSERT(rune == i);
  86. UNIT_ASSERT(rune_len == ref_len);
  87. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len - 1);
  88. UNIT_ASSERT(res == RECODE_EOINPUT);
  89. if (ref_len > 1) {
  90. res = SafeReadUTF8Char(rune, rune_len, buffer + 1, buffer + ref_len);
  91. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  92. buffer[1] |= 0xC0;
  93. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len);
  94. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  95. buffer[1] &= 0x3F;
  96. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len);
  97. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  98. }
  99. }
  100. const char* badStrings[] = {
  101. "\xfe",
  102. "\xff",
  103. "\xcc\xc0",
  104. "\xf4\x90\x80\x80",
  105. //overlong:
  106. "\xfe\xfe\xff\xff",
  107. "\xc0\xaf",
  108. "\xe0\x80\xaf",
  109. "\xf0\x80\x80\xaf",
  110. "\xf8\x80\x80\x80\xaf",
  111. "\xfc\x80\x80\x80\x80\xaf",
  112. "\xc1\xbf",
  113. "\xe0\x9f\xbf",
  114. "\xf0\x8f\xbf\xbf",
  115. "\xf8\x87\xbf\xbf\xbf",
  116. "\xfc\x83\xbf\xbf\xbf\xbf",
  117. "\xc0\x80",
  118. "\xe0\x80\x80",
  119. "\xf0\x80\x80\x80",
  120. "\xf8\x80\x80\x80\x80",
  121. "\xfc\x80\x80\x80\x80\x80",
  122. //UTF-16 surrogate (not covered):
  123. //"\xed\xa0\x80",
  124. //"\xed\xad\xbf",
  125. //"\xed\xae\x80",
  126. //"\xed\xaf\xbf",
  127. //"\xed\xb0\x80",
  128. //"\xed\xbe\x80",
  129. //"\xed\xbf\xbf",
  130. };
  131. for (size_t i = 0; i < Y_ARRAY_SIZE(badStrings); ++i) {
  132. wchar32 rune;
  133. const ui8* p = (const ui8*)badStrings[i];
  134. size_t len;
  135. RECODE_RESULT res = SafeReadUTF8Char(rune, len, p, p + strlen(badStrings[i]));
  136. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  137. }
  138. }
  139. void TCodepageTest::TestBrokenMultibyte() {
  140. const ECharset cp = CODES_EUC_JP;
  141. const char sampletext[] = {'\xe3'};
  142. wchar32 recodeResult[100];
  143. size_t nwritten = 0;
  144. size_t nread = 0;
  145. RECODE_RESULT res = RecodeToUnicode(cp, sampletext, recodeResult, Y_ARRAY_SIZE(sampletext), Y_ARRAY_SIZE(recodeResult), nread, nwritten);
  146. UNIT_ASSERT(res == RECODE_OK);
  147. UNIT_ASSERT(nread == 1);
  148. UNIT_ASSERT(nwritten == 0);
  149. const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'};
  150. res = RecodeToUnicode(cp, bigSample, recodeResult, Y_ARRAY_SIZE(bigSample), Y_ARRAY_SIZE(recodeResult), nread, nwritten);
  151. UNIT_ASSERT(res == RECODE_OK);
  152. UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample));
  153. }
  154. void TCodepageTest::TestUTFFromUnknownPlane() {
  155. static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20,
  156. 0x430, 0x431, 0x432, 0x20,
  157. 0x1001, 0x1002, 0x1003, 0x20,
  158. 0x10001, 0x10002, 0x10003};
  159. static const size_t BUFFER_SIZE = 1024;
  160. char bytebuffer[BUFFER_SIZE];
  161. size_t readchars = 0;
  162. size_t writtenbytes = 0;
  163. size_t samplelen = Y_ARRAY_SIZE(sampletext);
  164. RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes);
  165. UNIT_ASSERT(res == RECODE_OK);
  166. UNIT_ASSERT(samplelen == readchars);
  167. size_t writtenbytes2 = 0;
  168. char bytebuffer2[BUFFER_SIZE];
  169. for (size_t i = 0; i != samplelen; ++i) {
  170. size_t nwr = 0;
  171. const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr);
  172. UNIT_ASSERT_VALUES_EQUAL(res, int(RECODE_OK));
  173. writtenbytes2 += nwr;
  174. UNIT_ASSERT(BUFFER_SIZE > writtenbytes2);
  175. }
  176. UNIT_ASSERT_VALUES_EQUAL(TStringBuf(bytebuffer, writtenbytes), TStringBuf(bytebuffer2, writtenbytes2));
  177. wchar32 charbuffer[BUFFER_SIZE];
  178. size_t readbytes = 0;
  179. size_t writtenchars = 0;
  180. res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
  181. UNIT_ASSERT(res == RECODE_OK);
  182. UNIT_ASSERT(readbytes == writtenbytes);
  183. wchar32* charbufferend = charbuffer + writtenchars;
  184. DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
  185. UNIT_ASSERT(charbufferend == charbuffer + samplelen);
  186. for (size_t i = 0; i < samplelen; ++i)
  187. UNIT_ASSERT(sampletext[i] == charbuffer[i]);
  188. // Now, concatenate the thing with an explicit character and retest
  189. res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
  190. UNIT_ASSERT(res == RECODE_OK);
  191. UNIT_ASSERT(readbytes == writtenbytes);
  192. charbuffer[writtenchars] = 0x1234;
  193. size_t morewrittenchars = 0;
  194. res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars);
  195. UNIT_ASSERT(res == RECODE_OK);
  196. UNIT_ASSERT(readbytes == writtenbytes);
  197. UNIT_ASSERT(writtenchars == morewrittenchars);
  198. charbuffer[2 * writtenchars + 1] = 0x5678;
  199. charbufferend = charbuffer + 2 * writtenchars + 2;
  200. DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
  201. UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2);
  202. for (size_t i = 0; i < samplelen; ++i) {
  203. UNIT_ASSERT(sampletext[i] == charbuffer[i]);
  204. UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]);
  205. }
  206. UNIT_ASSERT(0x1234 == charbuffer[samplelen]);
  207. UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]);
  208. // test TChar version
  209. // bytebuffer of len writtenbytes contains sampletext of len samplelen chars in utf8
  210. TUtf16String wtr = CharToWide(TStringBuf(bytebuffer, writtenbytes), CODES_UNKNOWNPLANE);
  211. TChar* strend = wtr.begin() + wtr.size();
  212. DecodeUnknownPlane(wtr.begin(), strend, CODES_UTF8);
  213. wtr.resize(strend - wtr.data(), 'Q');
  214. UNIT_ASSERT_VALUES_EQUAL(wtr.size(), samplelen);
  215. for (size_t i = 0; i < wtr.size(); ++i) {
  216. if (sampletext[i] >= 0x10000) {
  217. UNIT_ASSERT_VALUES_EQUAL(wtr[i], ' ');
  218. } else {
  219. UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]);
  220. }
  221. }
  222. }
  223. static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
  224. size_t sSize = strlen(str);
  225. size_t wSize = sSize * 2;
  226. TArrayHolder<wchar16> w(new wchar16[wSize]);
  227. size_t read = 0;
  228. size_t written = 0;
  229. RECODE_RESULT res = RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written);
  230. UNIT_ASSERT(res == RECODE_OK);
  231. UNIT_ASSERT(read == sSize);
  232. UNIT_ASSERT(written == wideSize);
  233. UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize));
  234. TArrayHolder<char> s(new char[sSize]);
  235. res = RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written);
  236. UNIT_ASSERT(res == RECODE_OK);
  237. UNIT_ASSERT(read == wideSize);
  238. UNIT_ASSERT(written == sSize);
  239. UNIT_ASSERT(!memcmp(s.Get(), str, sSize));
  240. }
  241. void TCodepageTest::TestSurrogatePairs() {
  242. const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
  243. wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
  244. TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy));
  245. const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
  246. wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
  247. TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
  248. }
  249. void TCodepageTest::TestEncodingHints() {
  250. UNIT_ASSERT(CODES_WIN == EncodingHintByName("windows-1251"));
  251. UNIT_ASSERT(CODES_WIN == EncodingHintByName("Windows1251"));
  252. UNIT_ASSERT(CODES_WIN == EncodingHintByName("WIN1251"));
  253. UNIT_ASSERT(CODES_WIN == EncodingHintByName("window-cp1251"));
  254. UNIT_ASSERT(CODES_WIN == EncodingHintByName("!!!CP1251???"));
  255. UNIT_ASSERT(CODES_WIN == EncodingHintByName("'ansi-cp1251;'"));
  256. UNIT_ASSERT(CODES_WIN == EncodingHintByName("charset=Microsoft-CP1251;"));
  257. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-8859-2"));
  258. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-2"));
  259. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-latin-2"));
  260. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("charset=\"Latin2\";"));
  261. UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("widow1251"));
  262. UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("default"));
  263. UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("$phpcharset"));
  264. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ShiftJIS"));
  265. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("Shift_JIS"));
  266. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("Big5"));
  267. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("euc-kr"));
  268. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("EUC-JP"));
  269. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("charset='Shift_JIS';;"));
  270. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ISO-2022-KR"));
  271. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ISO-2022-jp"));
  272. }
  273. void TCodepageTest::TestToLower() {
  274. TTempBuf buf;
  275. char* data = buf.Data();
  276. const size_t n = Y_ARRAY_SIZE(yandexUpperCase); // including NTS
  277. memcpy(data, yandexUpperCase, n);
  278. ToLower(data, n - 1);
  279. UNIT_ASSERT(strcmp(data, yandexLowerCase) == 0);
  280. }
  281. void TCodepageTest::TestToUpper() {
  282. TTempBuf buf;
  283. char* data = buf.Data();
  284. const size_t n = Y_ARRAY_SIZE(yandexLowerCase); // including NTS
  285. memcpy(data, yandexLowerCase, n);
  286. ToUpper(data, n - 1);
  287. UNIT_ASSERT(strcmp(data, yandexUpperCase) == 0);
  288. }
  289. static void TestCanEncodeEmpty() {
  290. TWtringBuf empty;
  291. UNIT_ASSERT(CanBeEncoded(empty, CODES_WIN));
  292. UNIT_ASSERT(CanBeEncoded(empty, CODES_YANDEX));
  293. UNIT_ASSERT(CanBeEncoded(empty, CODES_UTF8));
  294. }
  295. static void TestCanEncodeEach(const TWtringBuf& text, ECharset encoding, bool expectedResult) {
  296. // char by char
  297. for (size_t i = 0; i < text.size(); ++i) {
  298. if (CanBeEncoded(text.SubStr(i, 1), encoding) != expectedResult)
  299. ythrow yexception() << "assertion failed: encoding " << NameByCharset(encoding)
  300. << " on '" << text.SubStr(i, 1) << "' (expected " << expectedResult << ")";
  301. }
  302. // whole text
  303. UNIT_ASSERT_EQUAL(CanBeEncoded(text, encoding), expectedResult);
  304. }
  305. void TCodepageTest::TestCanEncode() {
  306. TestCanEncodeEmpty();
  307. const TUtf16String lat = u"AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
  308. TestCanEncodeEach(lat, CODES_WIN, true);
  309. TestCanEncodeEach(lat, CODES_YANDEX, true);
  310. TestCanEncodeEach(lat, CODES_UTF8, true);
  311. const TUtf16String rus = u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя";
  312. TestCanEncodeEach(rus, CODES_WIN, true);
  313. TestCanEncodeEach(rus, CODES_YANDEX, true);
  314. TestCanEncodeEach(rus, CODES_UTF8, true);
  315. const TUtf16String ukr = u"ҐґЄєІіЇї";
  316. TestCanEncodeEach(ukr, CODES_WIN, true);
  317. TestCanEncodeEach(ukr, CODES_YANDEX, true);
  318. TestCanEncodeEach(ukr, CODES_UTF8, true);
  319. const TUtf16String pol = u"ĄĆĘŁŃÓŚŹŻąćęłńóśźż";
  320. TestCanEncodeEach(pol, CODES_WIN, false);
  321. TestCanEncodeEach(pol, CODES_YANDEX, true);
  322. TestCanEncodeEach(pol, CODES_UTF_16BE, true);
  323. const TUtf16String ger = u"ÄäÖöÜüß";
  324. TestCanEncodeEach(ger, CODES_WIN, false);
  325. TestCanEncodeEach(ger, CODES_YANDEX, true);
  326. TestCanEncodeEach(ger, CODES_UTF_16LE, true);
  327. const TUtf16String fra1 = u"éàèùâêîôûëïç"; // supported in yandex cp
  328. const TUtf16String fra2 = u"ÉÀÈÙÂÊÎÔÛËÏŸÿÇ";
  329. const TUtf16String fra3 = u"Æ挜";
  330. TestCanEncodeEach(fra1 + fra2 + fra3, CODES_WIN, false);
  331. TestCanEncodeEach(fra1, CODES_YANDEX, true);
  332. TestCanEncodeEach(fra2 + fra3, CODES_YANDEX, false);
  333. TestCanEncodeEach(fra1 + fra2 + fra3, CODES_UTF8, true);
  334. const TUtf16String kaz = u"ӘәҒғҚқҢңӨөҰұҮүҺһ";
  335. TestCanEncodeEach(kaz, CODES_WIN, false);
  336. TestCanEncodeEach(kaz, CODES_YANDEX, false);
  337. TestCanEncodeEach(kaz, CODES_UTF8, true);
  338. TestCanEncodeEach(kaz, CODES_KAZWIN, true);
  339. const TUtf16String tur1 = u"ĞİŞğş";
  340. const TUtf16String tur = tur1 + u"ı";
  341. TestCanEncodeEach(tur, CODES_WIN, false);
  342. TestCanEncodeEach(tur, CODES_YANDEX, false);
  343. TestCanEncodeEach(tur, CODES_UTF8, true);
  344. const TUtf16String chi = u"新隶体新隸體";
  345. TestCanEncodeEach(chi, CODES_WIN, false);
  346. TestCanEncodeEach(chi, CODES_YANDEX, false);
  347. TestCanEncodeEach(chi, CODES_UTF8, true);
  348. TestCanEncodeEach(chi, CODES_UTF_16LE, true);
  349. const TUtf16String jap = u"漢字仮字交じり文";
  350. TestCanEncodeEach(jap, CODES_WIN, false);
  351. TestCanEncodeEach(jap, CODES_YANDEX, false);
  352. TestCanEncodeEach(jap, CODES_UTF8, true);
  353. TestCanEncodeEach(jap, CODES_UTF_16BE, true);
  354. }