recyr_int_ut.cpp 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. #include "codepage.h"
  2. #include "recyr.hh"
  3. #include "wide.h"
  4. #include <library/cpp/testing/unittest/registar.h>
  5. #include <util/charset/utf8.h>
  6. #include <util/system/yassert.h>
  7. class TRecyr_intTest: public TTestBase {
  8. private:
  9. UNIT_TEST_SUITE(TRecyr_intTest);
  10. UNIT_TEST(TestUTFFromUnknownPlane);
  11. UNIT_TEST(TestBrokenMultibyte);
  12. UNIT_TEST(TestSurrogatePairs);
  13. UNIT_TEST_SUITE_END();
  14. public:
  15. void TestUTFFromUnknownPlane();
  16. void TestBrokenMultibyte();
  17. void TestSurrogatePairs();
  18. };
  19. void TRecyr_intTest::TestBrokenMultibyte() {
  20. const ECharset cp = CODES_EUC_JP;
  21. const char sampletext[] = {'\xe3'};
  22. wchar32 recodeResult[100];
  23. size_t nwritten = 0;
  24. size_t nread = 0;
  25. RECODE_RESULT res = RecodeToUnicode(cp, sampletext, recodeResult, Y_ARRAY_SIZE(sampletext), Y_ARRAY_SIZE(recodeResult), nread, nwritten);
  26. UNIT_ASSERT(res == RECODE_OK);
  27. UNIT_ASSERT(nread == 1);
  28. UNIT_ASSERT(nwritten == 0);
  29. const char bigSample[] = {'\xC3', '\x87', '\xC3', '\x8E', '\xC2', '\xB0', '\xC3', '\x85', '\xC3', '\x85', '\xC3', '\xB8'};
  30. res = RecodeToUnicode(cp, bigSample, recodeResult, Y_ARRAY_SIZE(bigSample), Y_ARRAY_SIZE(recodeResult), nread, nwritten);
  31. UNIT_ASSERT(res == RECODE_OK);
  32. UNIT_ASSERT(nread == Y_ARRAY_SIZE(bigSample));
  33. }
  34. void TRecyr_intTest::TestUTFFromUnknownPlane() {
  35. static const wchar32 sampletext[] = {0x61, 0x62, 0x63, 0x20,
  36. 0x430, 0x431, 0x432, 0x20,
  37. 0x1001, 0x1002, 0x1003, 0x20,
  38. 0x10001, 0x10002, 0x10003};
  39. static const size_t BUFFER_SIZE = 1024;
  40. char bytebuffer[BUFFER_SIZE];
  41. size_t readchars = 0;
  42. size_t writtenbytes = 0;
  43. size_t samplelen = Y_ARRAY_SIZE(sampletext);
  44. RECODE_RESULT res = RecodeFromUnicode(CODES_UTF8, sampletext, bytebuffer, samplelen, BUFFER_SIZE, readchars, writtenbytes);
  45. UNIT_ASSERT(res == RECODE_OK);
  46. UNIT_ASSERT(samplelen == readchars);
  47. size_t writtenbytes2 = 0;
  48. char bytebuffer2[BUFFER_SIZE];
  49. for (size_t i = 0; i != samplelen; ++i) {
  50. size_t nwr = 0;
  51. const int res = RecodeFromUnicode(CODES_UTF8, sampletext[i], bytebuffer2 + writtenbytes2, BUFFER_SIZE - writtenbytes2, nwr);
  52. UNIT_ASSERT_VALUES_EQUAL(res, int(RECODE_OK));
  53. writtenbytes2 += nwr;
  54. UNIT_ASSERT(BUFFER_SIZE > writtenbytes2);
  55. }
  56. UNIT_ASSERT_VALUES_EQUAL(TStringBuf(bytebuffer, writtenbytes), TStringBuf(bytebuffer2, writtenbytes2));
  57. wchar32 charbuffer[BUFFER_SIZE];
  58. size_t readbytes = 0;
  59. size_t writtenchars = 0;
  60. res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
  61. UNIT_ASSERT(res == RECODE_OK);
  62. UNIT_ASSERT(readbytes == writtenbytes);
  63. wchar32* charbufferend = charbuffer + writtenchars;
  64. DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
  65. UNIT_ASSERT(charbufferend == charbuffer + samplelen);
  66. for (size_t i = 0; i < samplelen; ++i)
  67. UNIT_ASSERT(sampletext[i] == charbuffer[i]);
  68. // Now, concatenate the thing with an explicit character and retest
  69. res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer, writtenbytes, BUFFER_SIZE, readbytes, writtenchars);
  70. UNIT_ASSERT(res == RECODE_OK);
  71. UNIT_ASSERT(readbytes == writtenbytes);
  72. charbuffer[writtenchars] = 0x1234;
  73. size_t morewrittenchars = 0;
  74. res = RecodeToUnicode(CODES_UNKNOWNPLANE, bytebuffer, charbuffer + writtenchars + 1, writtenbytes, BUFFER_SIZE, readbytes, morewrittenchars);
  75. UNIT_ASSERT(res == RECODE_OK);
  76. UNIT_ASSERT(readbytes == writtenbytes);
  77. UNIT_ASSERT(writtenchars == morewrittenchars);
  78. charbuffer[2 * writtenchars + 1] = 0x5678;
  79. charbufferend = charbuffer + 2 * writtenchars + 2;
  80. DecodeUnknownPlane(charbuffer, charbufferend, CODES_UTF8);
  81. UNIT_ASSERT(charbufferend == charbuffer + 2 * samplelen + 2);
  82. for (size_t i = 0; i < samplelen; ++i) {
  83. UNIT_ASSERT(sampletext[i] == charbuffer[i]);
  84. UNIT_ASSERT(sampletext[i] == charbuffer[samplelen + 1 + i]);
  85. }
  86. UNIT_ASSERT(0x1234 == charbuffer[samplelen]);
  87. UNIT_ASSERT(0x5678 == charbuffer[2 * samplelen + 1]);
  88. // test TChar version
  89. // bytebuffer of len writtenbytes contains sampletext of len samplelen chars in utf8
  90. TUtf16String wtr = CharToWide(TStringBuf(bytebuffer, writtenbytes), CODES_UNKNOWNPLANE);
  91. TChar* strend = wtr.begin() + wtr.size();
  92. DecodeUnknownPlane(wtr.begin(), strend, CODES_UTF8);
  93. wtr.resize(strend - wtr.data(), 'Q');
  94. UNIT_ASSERT_VALUES_EQUAL(wtr.size(), samplelen);
  95. for (size_t i = 0; i < wtr.size(); ++i) {
  96. if (sampletext[i] >= 0x10000) {
  97. UNIT_ASSERT_VALUES_EQUAL(wtr[i], ' ');
  98. } else {
  99. UNIT_ASSERT_VALUES_EQUAL(wtr[i], sampletext[i]);
  100. }
  101. }
  102. }
  103. static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
  104. size_t sSize = strlen(str);
  105. size_t wSize = sSize * 2;
  106. TArrayHolder<wchar16> w(new wchar16[wSize]);
  107. size_t read = 0;
  108. size_t written = 0;
  109. RECODE_RESULT res = RecodeToUnicode(CODES_UTF8, str, w.Get(), sSize, wSize, read, written);
  110. UNIT_ASSERT(res == RECODE_OK);
  111. UNIT_ASSERT(read == sSize);
  112. UNIT_ASSERT(written == wideSize);
  113. UNIT_ASSERT(!memcmp(w.Get(), wide, wideSize));
  114. TArrayHolder<char> s(new char[sSize]);
  115. res = RecodeFromUnicode(CODES_UTF8, w.Get(), s.Get(), wideSize, sSize, read, written);
  116. UNIT_ASSERT(res == RECODE_OK);
  117. UNIT_ASSERT(read == wideSize);
  118. UNIT_ASSERT(written == sSize);
  119. UNIT_ASSERT(!memcmp(s.Get(), str, sSize));
  120. }
  121. void TRecyr_intTest::TestSurrogatePairs() {
  122. const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
  123. wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
  124. TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy));
  125. const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
  126. wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
  127. TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
  128. }