codepage_ut.cpp 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. #include "codepage.h"
  2. #include "wide.h"
  3. #include <library/cpp/testing/unittest/registar.h>
  4. #include <util/charset/utf8.h>
  5. #include <util/system/yassert.h>
  6. #if defined(_MSC_VER)
  7. #pragma warning(disable : 4309) /*truncation of constant value*/
  8. #endif
  namespace {
      // Fixture bytes for TestToLower / TestToUpper: position i of one array is
      // expected to be the case counterpart of position i of the other.
      // NOTE(review): the names suggest these are the cased letters of the
      // "yandex" code page -- confirm against the code page tables.
      constexpr char yandexUpperCase[] =
          "\x81\x82\x83\x84\x85\x86\x87\x8E"                                  // 0x8? row
          "\xA1\xA2\xA3\xA4\xA5\xA6"                                          // 0xA1..0xA6
          "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"                                  // 0xA8..0xAF
          "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"  // 0xC0..0xCF
          "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"; // 0xD0..0xDF

      // Same layout as yandexUpperCase: the first 22 bytes are shifted by 0x10,
      // the remaining 32 bytes by 0x20.
      constexpr char yandexLowerCase[] =
          "\x91\x92\x93\x94\x95\x96\x97\x9E"                                  // 0x9? row
          "\xB1\xB2\xB3\xB4\xB5\xB6"                                          // 0xB1..0xB6
          "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"                                  // 0xB8..0xBF
          "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"  // 0xE0..0xEF
          "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"; // 0xF0..0xFF
  }
  25. class TCodepageTest: public TTestBase {
  26. private:
  27. UNIT_TEST_SUITE(TCodepageTest);
  28. UNIT_TEST(TestUTF);
  29. UNIT_TEST(TestEncodingHints);
  30. UNIT_TEST(TestToLower);
  31. UNIT_TEST(TestToUpper);
  32. UNIT_TEST(TestUpperLower);
  33. UNIT_TEST(TestBrokenRune);
  34. UNIT_TEST_SUITE_END();
  35. public:
  36. void TestUTF();
  37. void TestEncodingHints();
  38. void TestToLower();
  39. void TestToUpper();
  40. inline void TestUpperLower() {
  41. const CodePage* cp = CodePageByCharset(CODES_ASCII);
  42. char tmp[100];
  43. TStringBuf s = "abcde";
  44. TStringBuf upper(tmp, cp->ToUpper(s.begin(), s.end(), tmp));
  45. UNIT_ASSERT_VALUES_EQUAL(upper, TStringBuf("ABCDE"));
  46. TStringBuf lower(tmp, cp->ToLower(upper.begin(), upper.end(), tmp));
  47. UNIT_ASSERT_VALUES_EQUAL(lower, TStringBuf("abcde"));
  48. }
  49. void TestBrokenRune() {
  50. UNIT_ASSERT_VALUES_EQUAL(BROKEN_RUNE, 0xFFFDu);
  51. }
  52. };
  53. UNIT_TEST_SUITE_REGISTRATION(TCodepageTest);
  54. void TCodepageTest::TestUTF() {
  55. for (wchar32 i = 0; i <= 0x10FFFF; i++) {
  56. unsigned char buffer[32];
  57. Zero(buffer);
  58. size_t rune_len;
  59. size_t ref_len = 0;
  60. if (i < 0x80)
  61. ref_len = 1;
  62. else if (i < 0x800)
  63. ref_len = 2;
  64. else if (i < 0x10000)
  65. ref_len = 3;
  66. else
  67. ref_len = 4;
  68. RECODE_RESULT res = SafeWriteUTF8Char(i, rune_len, buffer, buffer + 32);
  69. UNIT_ASSERT(res == RECODE_OK);
  70. UNIT_ASSERT(rune_len == ref_len);
  71. res = SafeWriteUTF8Char(i, rune_len, buffer, buffer + ref_len - 1);
  72. UNIT_ASSERT(res == RECODE_EOOUTPUT);
  73. wchar32 rune;
  74. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + 32);
  75. UNIT_ASSERT(res == RECODE_OK);
  76. UNIT_ASSERT(rune == i);
  77. UNIT_ASSERT(rune_len == ref_len);
  78. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len - 1);
  79. UNIT_ASSERT(res == RECODE_EOINPUT);
  80. if (ref_len > 1) {
  81. res = SafeReadUTF8Char(rune, rune_len, buffer + 1, buffer + ref_len);
  82. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  83. buffer[1] |= 0xC0;
  84. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len);
  85. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  86. buffer[1] &= 0x3F;
  87. res = SafeReadUTF8Char(rune, rune_len, buffer, buffer + ref_len);
  88. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  89. }
  90. }
  91. const char* badStrings[] = {
  92. "\xfe",
  93. "\xff",
  94. "\xcc\xc0",
  95. "\xf4\x90\x80\x80",
  96. //overlong:
  97. "\xfe\xfe\xff\xff",
  98. "\xc0\xaf",
  99. "\xe0\x80\xaf",
  100. "\xf0\x80\x80\xaf",
  101. "\xf8\x80\x80\x80\xaf",
  102. "\xfc\x80\x80\x80\x80\xaf",
  103. "\xc1\xbf",
  104. "\xe0\x9f\xbf",
  105. "\xf0\x8f\xbf\xbf",
  106. "\xf8\x87\xbf\xbf\xbf",
  107. "\xfc\x83\xbf\xbf\xbf\xbf",
  108. "\xc0\x80",
  109. "\xe0\x80\x80",
  110. "\xf0\x80\x80\x80",
  111. "\xf8\x80\x80\x80\x80",
  112. "\xfc\x80\x80\x80\x80\x80",
  113. //UTF-16 surrogate (not covered):
  114. //"\xed\xa0\x80",
  115. //"\xed\xad\xbf",
  116. //"\xed\xae\x80",
  117. //"\xed\xaf\xbf",
  118. //"\xed\xb0\x80",
  119. //"\xed\xbe\x80",
  120. //"\xed\xbf\xbf",
  121. };
  122. for (size_t i = 0; i < Y_ARRAY_SIZE(badStrings); ++i) {
  123. wchar32 rune;
  124. const ui8* p = (const ui8*)badStrings[i];
  125. size_t len;
  126. RECODE_RESULT res = SafeReadUTF8Char(rune, len, p, p + strlen(badStrings[i]));
  127. UNIT_ASSERT(res == RECODE_BROKENSYMBOL);
  128. }
  129. }
  130. void TCodepageTest::TestEncodingHints() {
  131. UNIT_ASSERT(CODES_WIN == EncodingHintByName("windows-1251"));
  132. UNIT_ASSERT(CODES_WIN == EncodingHintByName("Windows1251"));
  133. UNIT_ASSERT(CODES_WIN == EncodingHintByName("WIN1251"));
  134. UNIT_ASSERT(CODES_WIN == EncodingHintByName("window-cp1251"));
  135. UNIT_ASSERT(CODES_WIN == EncodingHintByName("!!!CP1251???"));
  136. UNIT_ASSERT(CODES_WIN == EncodingHintByName("'ansi-cp1251;'"));
  137. UNIT_ASSERT(CODES_WIN == EncodingHintByName("charset=Microsoft-CP1251;"));
  138. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-8859-2"));
  139. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-2"));
  140. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("iso-latin-2"));
  141. UNIT_ASSERT(CODES_ISO_EAST == EncodingHintByName("charset=\"Latin2\";"));
  142. UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("widow1251"));
  143. UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("default"));
  144. UNIT_ASSERT(CODES_UNKNOWN == EncodingHintByName("$phpcharset"));
  145. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ShiftJIS"));
  146. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("Shift_JIS"));
  147. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("Big5"));
  148. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("euc-kr"));
  149. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("EUC-JP"));
  150. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("charset='Shift_JIS';;"));
  151. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ISO-2022-KR"));
  152. UNIT_ASSERT(CODES_UNSUPPORTED != EncodingHintByName("ISO-2022-jp"));
  153. }
  154. void TCodepageTest::TestToLower() {
  155. TTempBuf buf;
  156. char* data = buf.Data();
  157. const size_t n = Y_ARRAY_SIZE(yandexUpperCase); // including NTS
  158. memcpy(data, yandexUpperCase, n);
  159. ToLower(data, n - 1);
  160. UNIT_ASSERT(strcmp(data, yandexLowerCase) == 0);
  161. }
  162. void TCodepageTest::TestToUpper() {
  163. TTempBuf buf;
  164. char* data = buf.Data();
  165. const size_t n = Y_ARRAY_SIZE(yandexLowerCase); // including NTS
  166. memcpy(data, yandexLowerCase, n);
  167. ToUpper(data, n - 1);
  168. UNIT_ASSERT(strcmp(data, yandexUpperCase) == 0);
  169. }