wide_ut.cpp 63 KB


  1. #include "utf8.h"
  2. #include "wide.h"
  3. #include <library/cpp/testing/unittest/registar.h>
  4. #include <util/string/reverse.h>
  5. #include <algorithm>
  6. namespace {
  7. //! three UTF-8 encoded Russian letters (A, B, V)
  8. const char utext[] = "\xd0\x90\xd0\x91\xd0\x92";
  // ASCII fixture compared against wideLatinAlphabet in TestUTF8ToWide.
  // NOTE(review): both cases contain 'G'/'g' where 'J'/'j' would be expected ("HIGKL");
  // wideLatinAlphabet repeats the same sequence, so the two fixtures still agree - confirm whether intended.
  9. const char asciiLatinAlphabet[] = "ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz";
  // zero-terminated wide counterpart of asciiLatinAlphabet (same letters, same order)
  10. const wchar16 wideLatinAlphabet[] = {
  11. 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'G', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  12. 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'g', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0};
  // code units 0x0410 - 0x044F followed by a zero terminator;
  // TestToLower/TestToUpper treat the first 32 entries as upper case and the next 32 as their lower-case forms
  13. const wchar16 wideCyrillicAlphabet[] = {
  14. 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
  15. 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
  16. 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
  17. 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00};
  // UTF-8 text that TestUTF8ToWide expects to decode into wideCyrillicAlphabet
  18. const char utf8CyrillicAlphabet[] =
  19. "\xd0\x90\xd0\x91\xd0\x92\xd0\x93\xd0\x94\xd0\x95\xd0\x96\xd0\x97"
  20. "\xd0\x98\xd0\x99\xd0\x9a\xd0\x9b\xd0\x9c\xd0\x9d\xd0\x9e\xd0\x9f"
  21. "\xd0\xa0\xd0\xa1\xd0\xa2\xd0\xa3\xd0\xa4\xd0\xa5\xd0\xa6\xd0\xa7"
  22. "\xd0\xa8\xd0\xa9\xd0\xaa\xd0\xab\xd0\xac\xd0\xad\xd0\xae\xd0\xaf"
  23. "\xd0\xb0\xd0\xb1\xd0\xb2\xd0\xb3\xd0\xb4\xd0\xb5\xd0\xb6\xd0\xb7"
  24. "\xd0\xb8\xd0\xb9\xd0\xba\xd0\xbb\xd0\xbc\xd0\xbd\xd0\xbe\xd0\xbf"
  25. "\xd1\x80\xd1\x81\xd1\x82\xd1\x83\xd1\x84\xd1\x85\xd1\x86\xd1\x87"
  26. "\xd1\x88\xd1\x89\xd1\x8a\xd1\x8b\xd1\x8c\xd1\x8d\xd1\x8e\xd1\x8f";
  // masks applied to the lead byte of a 2-, 3- and 4-byte sequence in TestReadUTF8Char
  // before shifting it into the expected code point
  27. const wchar32 LEAD_BITS_MASK_2_BYTES = 0x1F;
  28. const wchar32 LEAD_BITS_MASK_3_BYTES = 0x0F;
  29. const wchar32 LEAD_BITS_MASK_4_BYTES = 0x07;
  // code units used as the whitespace fixture by TestCollapse, TestCollapseBuffer and TestIsSpace
  30. wchar16 ws[] = {
  31. 0x0009,
  32. 0x000A, 0x2028, 0x2029,
  33. 0x000B,
  34. 0x000C,
  35. 0x000D,
  36. 0x0020, 0x1680,
  37. 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B,
  38. 0x202F, 0x205F, 0x3000,
  39. 0x00A0};
  // number of UTF-32 code points in every row of WideStringTestData
  40. const size_t CaseTestDataSize = 10;
  // case-conversion fixture; TestWideString converts the rows with UTF32ToWide (row meaning is given per line)
  41. wchar32 WideStringTestData[][CaseTestDataSize] = {
  42. {0x01C4, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10400, 0x10415, 0x10437}, // original
  43. {0x01C6, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10428, 0x1043D, 0x10437}, // lower
  44. {0x01C4, 0x10400, 0x10401, 0x1041F, 0x10419, 0x1C03, 0x00A0, 0x10400, 0x10415, 0x1040F}, // upper
  45. {0x01C5, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10428, 0x1043D, 0x10437}, // title
  46. };
  47. TUtf16String CreateUnicodeText() {
  48. const int len = 256;
  49. wchar16 text[len] = {
  50. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x00 - 0x0F
  51. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x10 - 0x1F
  52. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x20 - 0x2F
  53. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x30 - 0x3F
  54. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x40 - 0x4F
  55. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x50 - 0x5F
  56. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x60 - 0x6F
  57. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x70 - 0x7F
  58. 0x0301, 0x00C4, 0x00D6, 0x00DC, 0x0104, 0x0106, 0x0118, 0x0141, 0x00E0, 0x00E2, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x0490, 0x00AD, // 0x80 - 0x8F
  59. 0x00DF, 0x00E4, 0x00F6, 0x00FC, 0x0105, 0x0107, 0x0119, 0x0142, 0x00EB, 0x00EE, 0x00EF, 0x00F4, 0x00F9, 0x00FB, 0x0491, 0x92CF, // 0x90 - 0x9F
  60. 0x00A0, 0x0143, 0x00D3, 0x015A, 0x017B, 0x0179, 0x046C, 0x00A7, 0x0401, 0x0462, 0x0472, 0x0474, 0x040E, 0x0406, 0x0404, 0x0407, // 0xA0 - 0xAF
  61. 0x00B0, 0x0144, 0x00F3, 0x015B, 0x017C, 0x017A, 0x046D, 0x2116, 0x0451, 0x0463, 0x0473, 0x0475, 0x045E, 0x0456, 0x0454, 0x0457 // 0xB0 - 0xBF
  62. };
  63. for (int i = 0; i < len; ++i) {
  64. if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B
  65. text[i] = static_cast<wchar16>(i);
  66. } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo)
  67. text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F
  68. }
  69. }
  70. return TUtf16String(text, len);
  71. }
  72. TString CreateUTF8Text() {
  73. char text[] = {
  74. '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  75. '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  76. '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  77. '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  78. '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47', '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
  79. '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57', '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  80. '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67', '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
  81. '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77', '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  82. '\xcc', '\x81', '\xc3', '\x84', '\xc3', '\x96', '\xc3', '\x9c', '\xc4', '\x84', '\xc4', '\x86', '\xc4', '\x98', '\xc5', '\x81',
  83. '\xc3', '\xa0', '\xc3', '\xa2', '\xc3', '\xa7', '\xc3', '\xa8', '\xc3', '\xa9', '\xc3', '\xaa', '\xd2', '\x90', '\xc2', '\xad',
  84. '\xc3', '\x9f', '\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc', '\xc4', '\x85', '\xc4', '\x87', '\xc4', '\x99', '\xc5', '\x82',
  85. '\xc3', '\xab', '\xc3', '\xae', '\xc3', '\xaf', '\xc3', '\xb4', '\xc3', '\xb9', '\xc3', '\xbb', '\xd2', '\x91', '\xe9', '\x8b',
  86. '\x8f', '\xc2', '\xa0', '\xc5', '\x83', '\xc3', '\x93', '\xc5', '\x9a', '\xc5', '\xbb', '\xc5', '\xb9', '\xd1', '\xac', '\xc2',
  87. '\xa7', '\xd0', '\x81', '\xd1', '\xa2', '\xd1', '\xb2', '\xd1', '\xb4', '\xd0', '\x8e', '\xd0', '\x86', '\xd0', '\x84', '\xd0',
  88. '\x87', '\xc2', '\xb0', '\xc5', '\x84', '\xc3', '\xb3', '\xc5', '\x9b', '\xc5', '\xbc', '\xc5', '\xba', '\xd1', '\xad', '\xe2',
  89. '\x84', '\x96', '\xd1', '\x91', '\xd1', '\xa3', '\xd1', '\xb3', '\xd1', '\xb5', '\xd1', '\x9e', '\xd1', '\x96', '\xd1', '\x94',
  90. '\xd1', '\x97', '\xd0', '\x90', '\xd0', '\x91', '\xd0', '\x92', '\xd0', '\x93', '\xd0', '\x94', '\xd0', '\x95', '\xd0', '\x96',
  91. '\xd0', '\x97', '\xd0', '\x98', '\xd0', '\x99', '\xd0', '\x9a', '\xd0', '\x9b', '\xd0', '\x9c', '\xd0', '\x9d', '\xd0', '\x9e',
  92. '\xd0', '\x9f', '\xd0', '\xa0', '\xd0', '\xa1', '\xd0', '\xa2', '\xd0', '\xa3', '\xd0', '\xa4', '\xd0', '\xa5', '\xd0', '\xa6',
  93. '\xd0', '\xa7', '\xd0', '\xa8', '\xd0', '\xa9', '\xd0', '\xaa', '\xd0', '\xab', '\xd0', '\xac', '\xd0', '\xad', '\xd0', '\xae',
  94. '\xd0', '\xaf', '\xd0', '\xb0', '\xd0', '\xb1', '\xd0', '\xb2', '\xd0', '\xb3', '\xd0', '\xb4', '\xd0', '\xb5', '\xd0', '\xb6',
  95. '\xd0', '\xb7', '\xd0', '\xb8', '\xd0', '\xb9', '\xd0', '\xba', '\xd0', '\xbb', '\xd0', '\xbc', '\xd0', '\xbd', '\xd0', '\xbe',
  96. '\xd0', '\xbf', '\xd1', '\x80', '\xd1', '\x81', '\xd1', '\x82', '\xd1', '\x83', '\xd1', '\x84', '\xd1', '\x85', '\xd1', '\x86',
  97. '\xd1', '\x87', '\xd1', '\x88', '\xd1', '\x89', '\xd1', '\x8a', '\xd1', '\x8b', '\xd1', '\x8c', '\xd1', '\x8d', '\xd1', '\x8e',
  98. '\xd1', '\x8f'};
  99. return TString(text, Y_ARRAY_SIZE(text));
  100. }
  101. //! use this function to dump UTF8 text into a file in case of any changes
  102. // void DumpUTF8Text() {
  103. // TString s = WideToUTF8(UnicodeText);
  104. // std::ofstream f("utf8.txt");
  105. // f << std::hex;
  106. // for (int i = 0; i < (int)s.size(); ++i) {
  107. // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
  108. // if ((i + 1) % 16 == 0)
  109. // f << std::endl;
  110. // }
  111. // }
  112. template <StrictUTF8 strictMode = StrictUTF8::No>
  113. void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) {
  114. wchar32 w = 0;
  115. const unsigned char* p = first;
  116. RECODE_RESULT r = ReadUTF8CharAndAdvance<strictMode>(w, p, first + n);
  117. UNIT_ASSERT(w == expected);
  118. UNIT_ASSERT(size_t(p - first) == n);
  119. UNIT_ASSERT(r == RECODE_OK);
  120. }
  121. template <StrictUTF8 strictMode = StrictUTF8::No>
  122. void CheckBrokenSymbol(unsigned char* first, unsigned char* last) {
  123. wchar32 w = 0;
  124. const unsigned char* p = first;
  125. RECODE_RESULT r = ReadUTF8CharAndAdvance<strictMode>(w, p, last);
  126. UNIT_ASSERT(w == BROKEN_RUNE);
  127. UNIT_ASSERT(p - first == 0);
  128. UNIT_ASSERT(r == RECODE_BROKENSYMBOL);
  129. }
  130. void CheckEndOfInput(unsigned char* first, size_t n) {
  131. wchar32 w = 0;
  132. const unsigned char* p = first;
  133. RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n);
  134. (void)w;
  135. UNIT_ASSERT(p - first == 0);
  136. UNIT_ASSERT(r == RECODE_EOINPUT);
  137. }
  138. void CheckCharLen(unsigned char* first, unsigned char* last, size_t len, RECODE_RESULT result) {
  139. size_t n = 0;
  140. RECODE_RESULT r = GetUTF8CharLen(n, first, last);
  141. UNIT_ASSERT(n == len);
  142. UNIT_ASSERT(r == result);
  143. }
  144. }
  145. class TConversionTest: public TTestBase {
  146. private:
  147. //! @note every of the text can have zeros in the middle
  148. const TUtf16String UnicodeText_;
  149. const TString Utf8Text_;
  150. private:
  151. UNIT_TEST_SUITE(TConversionTest);
  152. UNIT_TEST(TestReadUTF8Char);
  153. UNIT_TEST(TestGetUTF8CharLen);
  154. UNIT_TEST(TestWriteUTF8Char);
  155. UNIT_TEST(TestUTF8ToWide);
  156. UNIT_TEST(TestWideToUTF8);
  157. UNIT_TEST(TestGetNumOfUTF8Chars);
  158. UNIT_TEST(TestSubstrUTF8);
  159. UNIT_TEST(TestUnicodeCase);
  160. UNIT_TEST(TestUnicodeDetails);
  161. UNIT_TEST(TestHexConversion);
  162. UNIT_TEST_SUITE_END();
  163. public:
  164. TConversionTest()
  165. : UnicodeText_(CreateUnicodeText())
  166. , Utf8Text_(CreateUTF8Text())
  167. {
  168. }
  169. void TestReadUTF8Char();
  170. void TestGetUTF8CharLen();
  171. void TestWriteUTF8Char();
  172. void TestUTF8ToWide();
  173. void TestWideToUTF8();
  174. void TestGetNumOfUTF8Chars();
  175. void TestSubstrUTF8();
  176. void TestUnicodeCase();
  177. void TestUnicodeDetails();
  178. void TestHexConversion();
  179. };
  180. UNIT_TEST_SUITE_REGISTRATION(TConversionTest);
  181. void TConversionTest::TestHexConversion() {
  182. for (char ch = '0'; ch <= '9'; ++ch) {
  183. UNIT_ASSERT(isxdigit(ch));
  184. UNIT_ASSERT(IsHexdigit(ch));
  185. }
  186. for (char ch = 'a'; ch <= 'f'; ++ch) {
  187. UNIT_ASSERT(isxdigit(ch));
  188. UNIT_ASSERT(IsHexdigit(ch));
  189. }
  190. for (char ch = 'A'; ch <= 'F'; ++ch) {
  191. UNIT_ASSERT(isxdigit(ch));
  192. UNIT_ASSERT(IsHexdigit(ch));
  193. }
  194. for (wchar16 i = std::numeric_limits<wchar16>::min(); i < std::numeric_limits<wchar16>::max(); ++i) {
  195. if (IsHexdigit(i)) {
  196. UNIT_ASSERT(isxdigit(char(i)));
  197. }
  198. }
  199. }
  200. void TConversionTest::TestReadUTF8Char() {
  201. wchar32 e; // expected unicode char
  202. wchar32 c;
  203. unsigned long u; // single UTF8 encoded character
  204. unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
  205. unsigned char* const last = first + sizeof(u);
  206. // all ASCII characters are converted with no change (zero converted successfully as well)
  207. for (c = 0; c <= 0x7F; ++c) {
  208. u = c;
  209. CheckRecodeOK(c, first, 1);
  210. }
  211. // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
  212. for (c = 0x80; c <= 0xBF; ++c) {
  213. u = c;
  214. CheckBrokenSymbol(first, last);
  215. }
  216. // overlong encoding: leading byte of 2-byte symbol: 1100 0000 - 1100 0001
  217. for (c = 0xC0; c <= 0xC1; ++c) {
  218. u = c;
  219. CheckBrokenSymbol(first, last);
  220. u |= 0x8000;
  221. CheckBrokenSymbol(first, first + 2);
  222. CheckEndOfInput(first, 1);
  223. }
  224. // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
  225. for (c = 0xC2; c <= 0xDF; ++c) {
  226. u = c;
  227. CheckBrokenSymbol(first, last);
  228. u |= 0x8000;
  229. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  230. e = c & LEAD_BITS_MASK_2_BYTES;
  231. e <<= 6;
  232. CheckRecodeOK(e, first, 2);
  233. CheckEndOfInput(first, 1);
  234. }
  235. // possible overlong encoding with leading byte 1110 0000
  236. {
  237. u = c = 0xE0;
  238. CheckBrokenSymbol(first, last);
  239. u |= 0x808000;
  240. CheckBrokenSymbol(first, first + 3);
  241. u = c | 0x80A000;
  242. e = 0x800;
  243. CheckRecodeOK(e, first, 3);
  244. CheckEndOfInput(first, 2);
  245. CheckEndOfInput(first, 1);
  246. }
  247. // leading byte of 3-byte symbol: 1110 0001 - 1110 1111
  248. for (c = 0xE1; c <= 0xEF; ++c) {
  249. u = c;
  250. CheckBrokenSymbol(first, last);
  251. u |= 0x808000;
  252. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  253. e = c & LEAD_BITS_MASK_3_BYTES;
  254. e <<= 12;
  255. CheckRecodeOK(e, first, 3);
  256. CheckEndOfInput(first, 2);
  257. CheckEndOfInput(first, 1);
  258. }
  259. // leading byte of 3-byte symbol before surrogates: 1110 0001 - 1110 1100
  260. for (c = 0xE1; c <= 0xEC; ++c) {
  261. u = c;
  262. CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
  263. u |= 0x808000;
  264. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  265. e = c & LEAD_BITS_MASK_3_BYTES;
  266. e <<= 12;
  267. CheckRecodeOK<StrictUTF8::Yes>(e, first, 3);
  268. CheckEndOfInput(first, 2);
  269. CheckEndOfInput(first, 1);
  270. }
  271. // rest of allowed characters before surrogate block
  272. {
  273. u = 0xED;
  274. CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
  275. u |= 0xBF9F00;
  276. e = 0xD7FF;
  277. CheckRecodeOK<StrictUTF8::Yes>(e, first, 3);
  278. CheckEndOfInput(first, 2);
  279. CheckEndOfInput(first, 1);
  280. }
  281. // rfc3629 section 4 forbids characters 0xD800 - 0xDFFF
  282. {
  283. u = 0xED;
  284. CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
  285. u |= 0x80A000;
  286. CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
  287. CheckEndOfInput(first, 2);
  288. CheckEndOfInput(first, 1);
  289. }
  290. // leading byte of 3-byte symbol after surrogates: 1110 1110 - 1110 1111
  291. for (c = 0xEE; c <= 0xEF; ++c) {
  292. u = c;
  293. CheckBrokenSymbol<StrictUTF8::Yes>(first, last);
  294. u |= 0x808000;
  295. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  296. e = c & LEAD_BITS_MASK_3_BYTES;
  297. e <<= 12;
  298. CheckRecodeOK<StrictUTF8::Yes>(e, first, 3);
  299. CheckEndOfInput(first, 2);
  300. CheckEndOfInput(first, 1);
  301. }
  302. // possible overlong encoding with leading byte 1111 0000
  303. {
  304. u = c = 0xF0;
  305. CheckBrokenSymbol(first, last);
  306. u |= 0x80808000;
  307. CheckBrokenSymbol(first, first + 4);
  308. u = c | 0x80809000;
  309. e = 0x10000;
  310. CheckRecodeOK(e, first, 4);
  311. CheckEndOfInput(first, 3);
  312. CheckEndOfInput(first, 2);
  313. CheckEndOfInput(first, 1);
  314. }
  315. // leading byte of 4-byte symbol: 1111 0001 - 1111 0111
  316. for (c = 0xF1; c <= 0xF3; ++c) {
  317. u = c;
  318. CheckBrokenSymbol(first, last);
  319. u |= 0x80808000;
  320. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  321. e = c & LEAD_BITS_MASK_4_BYTES;
  322. e <<= 18;
  323. CheckRecodeOK(e, first, 4);
  324. CheckEndOfInput(first, 3);
  325. CheckEndOfInput(first, 2);
  326. CheckEndOfInput(first, 1);
  327. }
  328. // possible invalid code points with leading byte 1111 0100
  329. {
  330. c = 0xF4;
  331. u = 0x80808000 | c;
  332. e = c & LEAD_BITS_MASK_4_BYTES;
  333. e <<= 18;
  334. CheckRecodeOK(e, first, 4);
  335. // the largest possible Unicode code point
  336. u = 0xBFBF8F00 | c;
  337. e = 0x10FFFF;
  338. CheckRecodeOK(e, first, 4);
  339. u = 0x80809000 | c;
  340. CheckBrokenSymbol(first, last);
  341. }
  342. // broken symbols: 1111 0101 - 1111 1111
  343. for (c = 0xF5; c <= 0xFF; ++c) {
  344. u = c;
  345. CheckBrokenSymbol(first, last);
  346. }
  347. }
  348. void TConversionTest::TestGetUTF8CharLen() {
  349. wchar32 c;
  350. unsigned long u; // single UTF8 encoded character
  351. unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
  352. unsigned char* const last = first + sizeof(u);
  353. // all ASCII characters are converted with no change (zero converted successfully as well)
  354. for (c = 0; c <= 0x7F; ++c) {
  355. u = c;
  356. CheckCharLen(first, last, 1, RECODE_OK);
  357. }
  358. // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
  359. for (c = 0x80; c <= 0xBF; ++c) {
  360. u = c;
  361. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  362. }
  363. // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
  364. for (c = 0xC0; c <= 0xDF; ++c) {
  365. u = c;
  366. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  367. u |= 0x8000;
  368. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  369. CheckCharLen(first, last, 2, RECODE_OK);
  370. CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
  371. }
  372. // leading byte of 3-byte symbol: 1110 0000 - 1110 1111
  373. for (c = 0xE0; c <= 0xEF; ++c) {
  374. u = c;
  375. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  376. u |= 0x808000;
  377. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  378. CheckCharLen(first, last, 3, RECODE_OK);
  379. CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
  380. CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
  381. }
  382. // leading byte of 4-byte symbol: 1111 0000 - 1111 0111
  383. for (c = 0xF0; c <= 0xF3; ++c) {
  384. u = c;
  385. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  386. u |= 0x80808000;
  387. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  388. CheckCharLen(first, last, 4, RECODE_OK);
  389. CheckCharLen(first, first + 3, 0, RECODE_EOINPUT);
  390. CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
  391. CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
  392. }
  393. // broken symbols: 1111 1000 - 1111 1111
  394. for (c = 0xF8; c <= 0xFF; ++c) {
  395. u = c;
  396. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  397. }
  398. }
  399. void TConversionTest::TestWriteUTF8Char() {
  400. wchar32 w;
  401. unsigned long u; // single UTF8 encoded character
  402. size_t n;
  403. for (w = 0x00; w < 0x80; ++w) {
  404. u = 0;
  405. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  406. UNIT_ASSERT((u & 0xFFFFFF80) == 0x00000000);
  407. UNIT_ASSERT(n == 1);
  408. }
  409. for (w = 0x80; w < 0x800; ++w) {
  410. u = 0;
  411. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  412. UNIT_ASSERT((u & 0xFFFFC000) == 0x00008000); // see constants in ReadUTF8Char
  413. UNIT_ASSERT(n == 2);
  414. }
  415. for (w = 0x800; w < 0x10000; ++w) {
  416. u = 0;
  417. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  418. UNIT_ASSERT((u & 0xFFC0C000) == 0x00808000); // see constants in ReadUTF8Char
  419. UNIT_ASSERT(n == 3);
  420. }
  421. for (w = 0x10000; w < 0x80; ++w) {
  422. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  423. UNIT_ASSERT((u & 0xC0C0C000) == 0x80808000); // see constants in ReadUTF8Char
  424. UNIT_ASSERT(n == 4);
  425. }
  426. }
  427. static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
  428. TUtf16String w = UTF8ToWide(str);
  429. UNIT_ASSERT(w.size() == wideSize);
  430. UNIT_ASSERT(!memcmp(w.c_str(), wide, wideSize));
  431. TString s = WideToUTF8(w);
  432. UNIT_ASSERT(s == str);
  433. }
  434. void TConversionTest::TestUTF8ToWide() {
  435. TUtf16String w = UTF8ToWide(Utf8Text_);
  436. UNIT_ASSERT(w.size() == 256);
  437. UNIT_ASSERT(w.size() == UnicodeText_.size());
  438. for (int i = 0; i < 256; ++i) {
  439. UNIT_ASSERT_VALUES_EQUAL(w[i], UnicodeText_[i]);
  440. }
  441. wchar16 buffer[4] = {0};
  442. size_t written = 0;
  443. // the function must extract 2 symbols only
  444. bool result = UTF8ToWide(utext, 5, buffer, written);
  445. UNIT_ASSERT(!result);
  446. UNIT_ASSERT(buffer[0] == 0x0410);
  447. UNIT_ASSERT(buffer[1] == 0x0411);
  448. UNIT_ASSERT(buffer[2] == 0x0000);
  449. UNIT_ASSERT(buffer[3] == 0x0000);
  450. UNIT_ASSERT(written == 2);
  451. memset(buffer, 0, 4);
  452. written = 0;
  453. result = UTF8ToWide(utext, 1, buffer, written);
  454. UNIT_ASSERT(!result);
  455. UNIT_ASSERT(buffer[0] == 0x0000);
  456. UNIT_ASSERT(buffer[1] == 0x0000);
  457. UNIT_ASSERT(buffer[2] == 0x0000);
  458. UNIT_ASSERT(buffer[3] == 0x0000);
  459. UNIT_ASSERT(written == 0);
  460. w = UTF8ToWide(asciiLatinAlphabet, strlen(asciiLatinAlphabet));
  461. UNIT_ASSERT(w == wideLatinAlphabet);
  462. w = UTF8ToWide(utf8CyrillicAlphabet, strlen(utf8CyrillicAlphabet));
  463. UNIT_ASSERT(w == wideCyrillicAlphabet);
  464. const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
  465. wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
  466. TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy));
  467. const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
  468. wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
  469. TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
  470. UNIT_ASSERT_VALUES_EQUAL(WideToUTF8(UTF8ToWide(WideToUTF8(UTF8ToWide<true>(
  471. "m\xFB\xB2\xA5\xAA\xAFyeuse.sexwebcamz.com")))),
  472. TString(
  473. "m\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDyeuse.sexwebcamz.com"));
  474. }
  475. void TConversionTest::TestWideToUTF8() {
  476. TString s = WideToUTF8(UnicodeText_);
  477. size_t len = 0;
  478. for (TUtf16String::const_iterator i = UnicodeText_.begin(), ie = UnicodeText_.end(); i != ie; ++i) {
  479. len += UTF8RuneLenByUCS(*i);
  480. }
  481. UNIT_ASSERT(s.size() == Utf8Text_.size());
  482. UNIT_ASSERT(s.size() == len);
  483. for (int i = 0; i < static_cast<int>(s.size()); ++i) {
  484. UNIT_ASSERT_VALUES_EQUAL(s[i], Utf8Text_[i]);
  485. }
  486. }
  487. void TConversionTest::TestGetNumOfUTF8Chars() {
  488. size_t n = 0;
  489. bool result = GetNumberOfUTF8Chars(Utf8Text_.c_str(), Utf8Text_.size(), n);
  490. UNIT_ASSERT(result);
  491. UNIT_ASSERT(n == 256);
  492. n = 0;
  493. result = GetNumberOfUTF8Chars(utext, 5, n);
  494. UNIT_ASSERT(!result);
  495. UNIT_ASSERT(n == 2);
  496. n = 0;
  497. result = GetNumberOfUTF8Chars(utext, 1, n);
  498. UNIT_ASSERT(!result);
  499. UNIT_ASSERT(n == 0);
  500. UNIT_ASSERT_EQUAL(GetNumberOfUTF8Chars("привет!"), 7);
  501. }
  502. void TConversionTest::TestSubstrUTF8() {
  503. TStringBuf utextBuf(utext, sizeof(utext));
  504. UNIT_ASSERT(SubstrUTF8(utextBuf, 0, 2) == utextBuf.substr(0, 4));
  505. UNIT_ASSERT(SubstrUTF8(utextBuf, 1, 1) == utextBuf.substr(2, 2));
  506. UNIT_ASSERT(SubstrUTF8(utextBuf, 1, 2) == utextBuf.substr(2, 4));
  507. UNIT_ASSERT(SubstrUTF8(utextBuf, 1, 3) == utextBuf.substr(2, 6));
  508. }
  509. inline bool MustBeSurrogate(wchar32 ch) {
  510. return ch > 0xFFFF;
  511. }
  512. void TConversionTest::TestUnicodeCase() {
  513. // ToLower, ToUpper, ToTitle functions depend on equal size of both original and changed characters
  514. for (wchar32 i = 0; i != NUnicode::UnicodeInstancesLimit(); ++i) {
  515. UNIT_ASSERT(MustBeSurrogate(i) == MustBeSurrogate(ToLower(i)));
  516. UNIT_ASSERT(MustBeSurrogate(i) == MustBeSurrogate(ToUpper(i)));
  517. UNIT_ASSERT(MustBeSurrogate(i) == MustBeSurrogate(ToTitle(i)));
  518. }
  519. }
  520. void TConversionTest::TestUnicodeDetails() {
  521. TUtf16String temp;
  522. for (wchar32 i = 0; i != NUnicode::UnicodeInstancesLimit(); ++i) {
  523. temp.clear();
  524. WriteSymbol(i, temp);
  525. UNIT_ASSERT(temp.size() == W16SymbolSize(temp.c_str(), temp.c_str() + temp.size()));
  526. }
  527. }
  528. class TWideUtilTest: public TTestBase {
  529. UNIT_TEST_SUITE(TWideUtilTest);
  530. UNIT_TEST(TestCollapse);
  531. UNIT_TEST(TestCollapseBuffer);
  532. UNIT_TEST(TestStrip);
  533. UNIT_TEST(TestIsSpace);
  534. UNIT_TEST(TestEscapeHtmlChars);
  535. UNIT_TEST(TestToLower);
  536. UNIT_TEST(TestToUpper);
  537. UNIT_TEST(TestWideString);
  538. UNIT_TEST(TestCountWideChars);
  539. UNIT_TEST(TestIsValidUTF16);
  540. UNIT_TEST(TestIsStringASCII);
  541. UNIT_TEST(TestIsLowerWordStr);
  542. UNIT_TEST(TestIsUpperWordStr);
  543. UNIT_TEST(TestIsTitleStr);
  544. UNIT_TEST(TestIsLowerStr);
  545. UNIT_TEST(TestIsUpperStr);
  546. UNIT_TEST(TestToLowerStr);
  547. UNIT_TEST(TestToUpperStr);
  548. UNIT_TEST(TestToTitleStr);
  549. UNIT_TEST_SUITE_END();
  550. public:
  551. void TestCollapse() {
  552. TUtf16String s;
  553. s.append(ws, Y_ARRAY_SIZE(ws)).append(3, 'a').append(ws, Y_ARRAY_SIZE(ws)).append(3, 'b').append(ws, Y_ARRAY_SIZE(ws));
  554. Collapse(s);
  555. UNIT_ASSERT(s == ASCIIToWide(" aaa bbb "));
  556. {
  557. const TUtf16String w(ASCIIToWide(" a b c "));
  558. s = w;
  559. Collapse(s);
  560. UNIT_ASSERT(s == w);
  561. #ifndef TSTRING_IS_STD_STRING
  562. UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
  563. #endif
  564. }
  565. s = ASCIIToWide(" 123 456 ");
  566. Collapse(s);
  567. UNIT_ASSERT(s == ASCIIToWide(" 123 456 "));
  568. s = ASCIIToWide(" 1\n\n\n23\t 4\f\f56 ");
  569. Collapse(s);
  570. UNIT_ASSERT(s == ASCIIToWide(" 1 23 4 56 "));
  571. s = ASCIIToWide(" 1\n\n\n\f\f56 ");
  572. Collapse(s);
  573. UNIT_ASSERT(s == ASCIIToWide(" 1 56 "));
  574. s = ASCIIToWide(" 1\r\n,\n(\n23\t 4\f\f56 ");
  575. Collapse(s);
  576. UNIT_ASSERT(s == ASCIIToWide(" 1 , ( 23 4 56 "));
  577. s = ASCIIToWide("1 23 ");
  578. Collapse(s);
  579. UNIT_ASSERT(s == ASCIIToWide("1 23 "));
  580. {
  581. const TUtf16String w = ASCIIToWide(" ");
  582. s = w;
  583. Collapse(s);
  584. UNIT_ASSERT(s == w);
  585. #ifndef TSTRING_IS_STD_STRING
  586. UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
  587. #endif
  588. }
  589. s = ASCIIToWide(" ");
  590. Collapse(s);
  591. UNIT_ASSERT(s == ASCIIToWide(" "));
  592. s = ASCIIToWide(",\r\n\"");
  593. Collapse(s);
  594. UNIT_ASSERT(s == ASCIIToWide(", \""));
  595. s = ASCIIToWide("-");
  596. Collapse(s);
  597. UNIT_ASSERT(s == ASCIIToWide("-"));
  598. s.clear();
  599. Collapse(s);
  600. UNIT_ASSERT(s == TUtf16String());
  601. }
  602. void TestCollapseBuffer() {
  603. TUtf16String s;
  604. s.append(ws, Y_ARRAY_SIZE(ws)).append(3, 'a').append(ws, Y_ARRAY_SIZE(ws)).append(3, 'b').append(ws, Y_ARRAY_SIZE(ws));
  605. size_t n = Collapse(s.begin(), s.size());
  606. s.resize(n);
  607. UNIT_ASSERT(s == ASCIIToWide(" aaa bbb "));
  608. s = ASCIIToWide(" a b c ");
  609. n = Collapse(s.begin(), s.size());
  610. UNIT_ASSERT(n == s.size()); // length was not changed
  611. UNIT_ASSERT(s == ASCIIToWide(" a b c "));
  612. s = ASCIIToWide(" 123 456 ");
  613. n = Collapse(s.begin(), s.size());
  614. s.resize(n);
  615. UNIT_ASSERT(s == ASCIIToWide(" 123 456 "));
  616. s = ASCIIToWide(" 1\n\n\n23\t 4\f\f56 ");
  617. n = Collapse(s.begin(), s.size());
  618. s.resize(n);
  619. UNIT_ASSERT(s == ASCIIToWide(" 1 23 4 56 "));
  620. s = ASCIIToWide(" 1\n\n\n\f\f56 ");
  621. n = Collapse(s.begin(), s.size());
  622. s.resize(n);
  623. UNIT_ASSERT(s == ASCIIToWide(" 1 56 "));
  624. s = ASCIIToWide(" 1\r\n,\n(\n23\t 4\f\f56 ");
  625. n = Collapse(s.begin(), s.size());
  626. s.resize(n);
  627. UNIT_ASSERT(s == ASCIIToWide(" 1 , ( 23 4 56 "));
  628. s = ASCIIToWide("1 23 ");
  629. n = Collapse(s.begin(), s.size());
  630. s.resize(n);
  631. UNIT_ASSERT(s == ASCIIToWide("1 23 "));
  632. s = ASCIIToWide(" ");
  633. n = Collapse(s.begin(), s.size());
  634. UNIT_ASSERT(n == 1);
  635. UNIT_ASSERT(s == ASCIIToWide(" "));
  636. s = ASCIIToWide(" ");
  637. n = Collapse(s.begin(), s.size());
  638. s.resize(n);
  639. UNIT_ASSERT(s == ASCIIToWide(" "));
  640. s = ASCIIToWide(",\r\n\"");
  641. n = Collapse(s.begin(), s.size());
  642. s.resize(n);
  643. UNIT_ASSERT(s == ASCIIToWide(", \""));
  644. s = ASCIIToWide("-");
  645. n = Collapse(s.begin(), s.size());
  646. UNIT_ASSERT(n == 1);
  647. UNIT_ASSERT(s == ASCIIToWide("-"));
  648. s = ASCIIToWide("\t");
  649. n = Collapse(s.begin(), s.size());
  650. UNIT_ASSERT(n == 1);
  651. UNIT_ASSERT(s == ASCIIToWide(" "));
  652. s.clear();
  653. n = Collapse(s.begin(), s.size());
  654. UNIT_ASSERT(n == 0);
  655. UNIT_ASSERT(s == TUtf16String());
  656. }
  657. void TestStrip() {
  658. TUtf16String s;
  659. Strip(s);
  660. UNIT_ASSERT(s == TUtf16String());
  661. StripLeft(s);
  662. UNIT_ASSERT(s == TUtf16String());
  663. StripRight(s);
  664. UNIT_ASSERT(s == TUtf16String());
  665. s = ASCIIToWide(" \t\r\n");
  666. Strip(s);
  667. UNIT_ASSERT(s == TUtf16String());
  668. s = ASCIIToWide(" \t\r\n");
  669. StripLeft(s);
  670. UNIT_ASSERT(s == TUtf16String());
  671. s = ASCIIToWide(" \t\r\n");
  672. StripRight(s);
  673. UNIT_ASSERT(s == TUtf16String());
  674. s = ASCIIToWide("\t\f\va \r\n");
  675. Strip(s);
  676. UNIT_ASSERT(s == ASCIIToWide("a"));
  677. s = ASCIIToWide("\t\f\va \r\n");
  678. StripLeft(s);
  679. UNIT_ASSERT(s == ASCIIToWide("a \r\n"));
  680. s = ASCIIToWide("\t\f\va \r\n");
  681. StripRight(s);
  682. UNIT_ASSERT(s == ASCIIToWide("\t\f\va"));
  683. s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
  684. Strip(s);
  685. UNIT_ASSERT(s == ASCIIToWide("a\r\nb\t\tc"));
  686. s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
  687. StripLeft(s);
  688. UNIT_ASSERT(s == ASCIIToWide("a\r\nb\t\tc\r\n"));
  689. s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
  690. StripRight(s);
  691. UNIT_ASSERT(s == ASCIIToWide("\r\na\r\nb\t\tc"));
  692. const TUtf16String w(ASCIIToWide("a b"));
  693. s = w;
  694. Strip(s);
  695. UNIT_ASSERT(s == w);
  696. #ifndef TSTRING_IS_STD_STRING
  697. UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
  698. #endif
  699. s = w;
  700. StripLeft(s);
  701. UNIT_ASSERT(s == w);
  702. #ifndef TSTRING_IS_STD_STRING
  703. UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
  704. #endif
  705. s = w;
  706. StripRight(s);
  707. UNIT_ASSERT(s == w);
  708. #ifndef TSTRING_IS_STD_STRING
  709. UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
  710. #endif
  711. }
  712. void TestIsSpace() {
  713. UNIT_ASSERT(!IsSpace(TUtf16String()));
  714. UNIT_ASSERT(IsSpace(ws, Y_ARRAY_SIZE(ws)));
  715. TUtf16String w;
  716. w.assign(ws, Y_ARRAY_SIZE(ws)).append(TUtf16String(1, '!'));
  717. UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
  718. w.assign(TUtf16String(1, '_')).append(ws, Y_ARRAY_SIZE(ws));
  719. UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
  720. w.assign(ws, Y_ARRAY_SIZE(ws)).append(TUtf16String(1, '$')).append(ws, Y_ARRAY_SIZE(ws));
  721. UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
  722. }
  723. void TestEscapeHtmlChars() {
  724. // characters from the first half of the ASCII table
  725. for (wchar16 c = 1; c < 0x7F; ++c) {
  726. TUtf16String w(1, c);
  727. EscapeHtmlChars<false>(w);
  728. switch (c) {
  729. case '<':
  730. UNIT_ASSERT(w == ASCIIToWide("&lt;"));
  731. break;
  732. case '>':
  733. UNIT_ASSERT(w == ASCIIToWide("&gt;"));
  734. break;
  735. case '&':
  736. UNIT_ASSERT(w == ASCIIToWide("&amp;"));
  737. break;
  738. case '"':
  739. UNIT_ASSERT(w == ASCIIToWide("&quot;"));
  740. break;
  741. default:
  742. UNIT_ASSERT(w == TUtf16String(1, c));
  743. break;
  744. }
  745. }
  746. for (wchar16 c = 1; c < 0x7F; ++c) {
  747. TUtf16String w(1, c);
  748. EscapeHtmlChars<true>(w);
  749. switch (c) {
  750. case '<':
  751. UNIT_ASSERT(w == ASCIIToWide("&lt;"));
  752. break;
  753. case '>':
  754. UNIT_ASSERT(w == ASCIIToWide("&gt;"));
  755. break;
  756. case '&':
  757. UNIT_ASSERT(w == ASCIIToWide("&amp;"));
  758. break;
  759. case '"':
  760. UNIT_ASSERT(w == ASCIIToWide("&quot;"));
  761. break;
  762. case '\r':
  763. case '\n':
  764. UNIT_ASSERT(w == ASCIIToWide("<BR>"));
  765. break;
  766. default:
  767. UNIT_ASSERT(w == TUtf16String(1, c));
  768. break;
  769. }
  770. }
  771. }
  772. void TestToLower() {
  773. const size_t n = 32;
  774. wchar16 upperCase[n];
  775. std::copy(wideCyrillicAlphabet, wideCyrillicAlphabet + n, upperCase);
  776. ToLower(upperCase, n);
  777. UNIT_ASSERT(TWtringBuf(upperCase, n) == TWtringBuf(wideCyrillicAlphabet + n, n));
  778. }
  779. void TestToUpper() {
  780. const size_t n = 32;
  781. wchar16 lowerCase[n];
  782. std::copy(wideCyrillicAlphabet + n, wideCyrillicAlphabet + n * 2, lowerCase);
  783. ToUpper(lowerCase, n);
  784. UNIT_ASSERT(TWtringBuf(lowerCase, n) == TWtringBuf(wideCyrillicAlphabet, n));
  785. }
  786. void TestWideString() {
  787. const TUtf16String original = UTF32ToWide(WideStringTestData[0], CaseTestDataSize);
  788. const TUtf16String lower = UTF32ToWide(WideStringTestData[1], CaseTestDataSize);
  789. const TUtf16String upper = UTF32ToWide(WideStringTestData[2], CaseTestDataSize);
  790. const TUtf16String title = UTF32ToWide(WideStringTestData[3], CaseTestDataSize);
  791. TUtf16String temp;
  792. temp = original;
  793. temp.to_lower();
  794. UNIT_ASSERT(temp == lower);
  795. temp = original;
  796. ToLower(temp.begin(), temp.size());
  797. UNIT_ASSERT(temp == lower);
  798. temp = original;
  799. temp.to_upper();
  800. UNIT_ASSERT(temp == upper);
  801. temp = original;
  802. ToUpper(temp.begin(), temp.size());
  803. UNIT_ASSERT(temp == upper);
  804. temp = original;
  805. temp.to_title();
  806. UNIT_ASSERT(temp == title);
  807. temp = original;
  808. ToTitle(temp.begin(), temp.size());
  809. UNIT_ASSERT(temp == title);
  810. TVector<wchar32> buffer(WideStringTestData[0], WideStringTestData[0] + CaseTestDataSize);
  811. std::reverse(buffer.begin(), buffer.end());
  812. const TUtf16String reversed = UTF32ToWide(buffer.data(), buffer.size());
  813. temp = original;
  814. ReverseInPlace(temp);
  815. UNIT_ASSERT(temp == reversed);
  816. }
  817. void TestCountWideChars() {
  818. UNIT_ASSERT_EQUAL(CountWideChars(UTF8ToWide("привет!")), 7);
  819. TUtf16String wideStr = UTF8ToWide("\xf0\x9f\x92\xb8привет!");
  820. UNIT_ASSERT_EQUAL(wideStr.size(), 9);
  821. UNIT_ASSERT_EQUAL(CountWideChars(wideStr), 8);
  822. }
  823. void TestIsValidUTF16() {
  824. static wchar16 str1[] = {'h', 'e', 'l', 'l', 'o', '!', 0};
  825. static wchar16 str2[] = {'h', 'e', 'l', 'l', 'o', 0xD842, 0xDEAD, '!', 0};
  826. static wchar16 str3[] = {'h', 'e', 'l', 'l', 'o', 0xD842, '!', 0};
  827. static wchar16 str4[] = {'h', 'e', 'l', 'l', 'o', 0xDEAD, 0xD842, '!', 0};
  828. static wchar16 str5[] = {'h', 'e', 'l', 'l', 'o', 0xD842, 0xDEAD, 0xDEAD, '!', 0};
  829. UNIT_ASSERT(IsValidUTF16(TWtringBuf(str1)));
  830. UNIT_ASSERT(IsValidUTF16(TWtringBuf(str2)));
  831. UNIT_ASSERT(!IsValidUTF16(TWtringBuf(str3)));
  832. UNIT_ASSERT(!IsValidUTF16(TWtringBuf(str4)));
  833. UNIT_ASSERT(!IsValidUTF16(TWtringBuf(str5)));
  834. }
  835. void TestIsStringASCII() {
  836. static char charAscii[] = "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";
  837. static wchar16 char16Ascii[] = {
  838. '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A',
  839. 'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6',
  840. '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 0};
  841. // Test a variety of the fragment start positions and lengths in order to make
  842. // sure that bit masking in IsStringASCII works correctly.
  843. // Also, test that a non-ASCII character will be detected regardless of its
  844. // position inside the string.
  845. {
  846. const size_t stringLength = Y_ARRAY_SIZE(charAscii) - 1;
  847. for (size_t offset = 0; offset < 8; ++offset) {
  848. for (size_t len = 0, maxLen = stringLength - offset; len < maxLen; ++len) {
  849. UNIT_ASSERT(IsStringASCII(charAscii + offset, charAscii + offset + len));
  850. for (size_t charPos = offset; charPos < len; ++charPos) {
  851. charAscii[charPos] |= '\x80';
  852. UNIT_ASSERT(!IsStringASCII(charAscii + offset, charAscii + offset + len));
  853. charAscii[charPos] &= ~'\x80';
  854. }
  855. }
  856. }
  857. }
  858. {
  859. const size_t stringLength = Y_ARRAY_SIZE(char16Ascii) - 1;
  860. for (size_t offset = 0; offset < 4; ++offset) {
  861. for (size_t len = 0, maxLen = stringLength - offset; len < maxLen; ++len) {
  862. UNIT_ASSERT(IsStringASCII(char16Ascii + offset, char16Ascii + offset + len));
  863. for (size_t charPos = offset; charPos < len; ++charPos) {
  864. char16Ascii[charPos] |= 0x80;
  865. UNIT_ASSERT(
  866. !IsStringASCII(char16Ascii + offset, char16Ascii + offset + len));
  867. char16Ascii[charPos] &= ~0x80;
  868. // Also test when the upper half is non-zero.
  869. char16Ascii[charPos] |= 0x100;
  870. UNIT_ASSERT(
  871. !IsStringASCII(char16Ascii + offset, char16Ascii + offset + len));
  872. char16Ascii[charPos] &= ~0x100;
  873. }
  874. }
  875. }
  876. }
  877. }
  878. void TestIsLowerWordStr() {
  879. UNIT_ASSERT(IsLowerWord(TWtringBuf()));
  880. UNIT_ASSERT(IsLowerWord(UTF8ToWide("")));
  881. UNIT_ASSERT(IsLowerWord(UTF8ToWide("test")));
  882. UNIT_ASSERT(IsLowerWord(UTF8ToWide("тест"))); // "тест" is "test" in russian (cyrrilic)
  883. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("тест тест")));
  884. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("тест100500")));
  885. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("Test")));
  886. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("tesT")));
  887. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("tEst")));
  888. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("Тест")));
  889. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("теСт")));
  890. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("тесТ")));
  891. }
  892. void TestIsUpperWordStr() {
  893. UNIT_ASSERT(IsUpperWord(TWtringBuf()));
  894. UNIT_ASSERT(IsUpperWord(UTF8ToWide("")));
  895. UNIT_ASSERT(IsUpperWord(UTF8ToWide("TEST")));
  896. UNIT_ASSERT(IsUpperWord(UTF8ToWide("ТЕСТ")));
  897. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("тест тест")));
  898. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("тест100500")));
  899. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("Test")));
  900. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("tesT")));
  901. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("tEst")));
  902. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("Тест")));
  903. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("теСт")));
  904. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("тесТ")));
  905. }
  906. void TestIsTitleStr() {
  907. UNIT_ASSERT(!IsTitleWord(TWtringBuf()));
  908. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("")));
  909. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("t")));
  910. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("й")));
  911. UNIT_ASSERT(IsTitleWord(UTF8ToWide("T")));
  912. UNIT_ASSERT(IsTitleWord(UTF8ToWide("Й")));
  913. UNIT_ASSERT(IsTitleWord(UTF8ToWide("Test")));
  914. UNIT_ASSERT(IsTitleWord(UTF8ToWide("Тест")));
  915. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("тест тест")));
  916. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("тест100500")));
  917. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("Тест тест")));
  918. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("Тест100500")));
  919. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("tesT")));
  920. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("tEst")));
  921. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("теСт")));
  922. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("тесТ")));
  923. }
  924. void TestIsLowerStr() {
  925. UNIT_ASSERT(IsLower(TWtringBuf()));
  926. UNIT_ASSERT(IsLower(UTF8ToWide("")));
  927. UNIT_ASSERT(IsLower(UTF8ToWide("test")));
  928. UNIT_ASSERT(IsLower(UTF8ToWide("тест"))); // "тест" is "test" in russian (cyrrilic)
  929. UNIT_ASSERT(IsLower(UTF8ToWide("тест тест")));
  930. UNIT_ASSERT(IsLower(UTF8ToWide("тест100500")));
  931. UNIT_ASSERT(!IsLower(UTF8ToWide("Test")));
  932. UNIT_ASSERT(!IsLower(UTF8ToWide("tesT")));
  933. UNIT_ASSERT(!IsLower(UTF8ToWide("tEst")));
  934. UNIT_ASSERT(!IsLower(UTF8ToWide("Тест")));
  935. UNIT_ASSERT(!IsLower(UTF8ToWide("теСт")));
  936. UNIT_ASSERT(!IsLower(UTF8ToWide("тесТ")));
  937. }
  938. void TestIsUpperStr() {
  939. UNIT_ASSERT(IsUpper(TWtringBuf()));
  940. UNIT_ASSERT(IsUpper(UTF8ToWide("")));
  941. UNIT_ASSERT(IsUpper(UTF8ToWide("TEST")));
  942. UNIT_ASSERT(IsUpper(UTF8ToWide("ТЕСТ")));
  943. UNIT_ASSERT(IsUpper(UTF8ToWide("ТЕСТ ТЕСТ")));
  944. UNIT_ASSERT(IsUpper(UTF8ToWide("ТЕСТ100500")));
  945. UNIT_ASSERT(!IsUpper(UTF8ToWide("Test")));
  946. UNIT_ASSERT(!IsUpper(UTF8ToWide("tesT")));
  947. UNIT_ASSERT(!IsUpper(UTF8ToWide("tEst")));
  948. UNIT_ASSERT(!IsUpper(UTF8ToWide("Тест")));
  949. UNIT_ASSERT(!IsUpper(UTF8ToWide("теСт")));
  950. UNIT_ASSERT(!IsUpper(UTF8ToWide("тесТ")));
  951. }
  952. void TestToLowerStr() {
  953. // In these test and test for `ToUpper` and `ToTitle` we are checking that string keep
  954. // pointing to the same piece of memory we are doing it the following way:
  955. //
  956. // TUtf16String s = ...
  957. // const auto copy = s;
  958. // ...
  959. // UNIT_ASSERT(s.data() == copy.data())
  960. //
  961. // It saves us a couple lines (we are reusing `copy` later) and if one day `TString` will
  962. // become non-refcounted we'll need to rewrite it to something like:
  963. //
  964. // TUtf16String s = ...
  965. // const auto* const data = s.data();
  966. // const auto length = s.length();
  967. // ...
  968. // UNIT_ASSERT(s.data() == data);
  969. // UNIT_ASSERT(s.length() == length);
  970. {
  971. TUtf16String s;
  972. auto writableCopy = s;
  973. const auto copy = s;
  974. const TUtf16String lower;
  975. UNIT_ASSERT(!ToLower(s));
  976. UNIT_ASSERT(s == lower);
  977. #ifndef TSTRING_IS_STD_STRING
  978. UNIT_ASSERT(s.data() == copy.data());
  979. #endif
  980. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  981. UNIT_ASSERT(writableCopy == lower);
  982. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  983. UNIT_ASSERT(writableCopy == lower);
  984. UNIT_ASSERT(ToLowerRet(copy) == lower);
  985. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  986. }
  987. {
  988. TUtf16String s = UTF8ToWide("");
  989. auto writableCopy = s;
  990. const auto copy = s;
  991. const TUtf16String lower;
  992. UNIT_ASSERT(!ToLower(s));
  993. UNIT_ASSERT(s == lower);
  994. #ifndef TSTRING_IS_STD_STRING
  995. UNIT_ASSERT(s.data() == copy.data());
  996. #endif
  997. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  998. UNIT_ASSERT(writableCopy == lower);
  999. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1000. UNIT_ASSERT(writableCopy == lower);
  1001. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1002. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1003. }
  1004. {
  1005. TUtf16String s;
  1006. const auto copy = s;
  1007. const TUtf16String lower;
  1008. UNIT_ASSERT(!ToLower(s, 100500));
  1009. UNIT_ASSERT(s == lower);
  1010. #ifndef TSTRING_IS_STD_STRING
  1011. UNIT_ASSERT(s.data() == copy.data());
  1012. #endif
  1013. UNIT_ASSERT(ToLowerRet(copy, 100500) == lower);
  1014. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 100500) == lower);
  1015. }
  1016. {
  1017. TUtf16String s;
  1018. const auto copy = s;
  1019. const TUtf16String lower;
  1020. UNIT_ASSERT(!ToLower(s, 100500, 1111));
  1021. UNIT_ASSERT(s == lower);
  1022. #ifndef TSTRING_IS_STD_STRING
  1023. UNIT_ASSERT(s.data() == copy.data());
  1024. #endif
  1025. UNIT_ASSERT(ToLowerRet(copy, 100500, 1111) == lower);
  1026. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 100500, 1111) == lower);
  1027. }
  1028. {
  1029. auto s = UTF8ToWide("Й");
  1030. auto writableCopy = s;
  1031. const auto copy = s;
  1032. const auto lower = UTF8ToWide("й");
  1033. UNIT_ASSERT(ToLower(s));
  1034. UNIT_ASSERT(s == lower);
  1035. UNIT_ASSERT(ToLower(writableCopy.Detach(), writableCopy.size()));
  1036. UNIT_ASSERT(writableCopy == lower);
  1037. UNIT_ASSERT(ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1038. UNIT_ASSERT(writableCopy == lower);
  1039. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1040. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1041. }
  1042. {
  1043. auto s = UTF8ToWide("й");
  1044. auto writableCopy = s;
  1045. const auto copy = s;
  1046. const auto lower = UTF8ToWide("й");
  1047. UNIT_ASSERT(!ToLower(s));
  1048. UNIT_ASSERT(s == lower);
  1049. #ifndef TSTRING_IS_STD_STRING
  1050. UNIT_ASSERT(s.data() == copy.data());
  1051. #endif
  1052. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  1053. UNIT_ASSERT(writableCopy == lower);
  1054. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1055. UNIT_ASSERT(writableCopy == lower);
  1056. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1057. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1058. }
  1059. {
  1060. auto s = UTF8ToWide("тест");
  1061. auto writableCopy = s;
  1062. const auto copy = s;
  1063. const auto lower = UTF8ToWide("тест");
  1064. UNIT_ASSERT(!ToLower(s));
  1065. UNIT_ASSERT(s == lower);
  1066. #ifndef TSTRING_IS_STD_STRING
  1067. UNIT_ASSERT(s.data() == copy.data());
  1068. #endif
  1069. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  1070. UNIT_ASSERT(writableCopy == lower);
  1071. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1072. UNIT_ASSERT(writableCopy == lower);
  1073. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1074. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1075. }
  1076. {
  1077. auto s = UTF8ToWide("Тест");
  1078. auto writableCopy = s;
  1079. const auto copy = s;
  1080. const auto lower = UTF8ToWide("тест");
  1081. UNIT_ASSERT(ToLower(s));
  1082. UNIT_ASSERT(s == lower);
  1083. UNIT_ASSERT(ToLower(writableCopy.Detach(), writableCopy.size()));
  1084. UNIT_ASSERT(writableCopy == lower);
  1085. UNIT_ASSERT(ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1086. UNIT_ASSERT(writableCopy == lower);
  1087. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1088. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1089. }
  1090. {
  1091. TUtf16String s = UTF8ToWide("тЕст");
  1092. const auto copy = s;
  1093. const auto lower = UTF8ToWide("тест");
  1094. UNIT_ASSERT(ToLower(s));
  1095. UNIT_ASSERT(s == UTF8ToWide("тест"));
  1096. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1097. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1098. }
  1099. {
  1100. auto s = UTF8ToWide("тЕст");
  1101. const auto copy = s;
  1102. const auto lower = UTF8ToWide("тЕст");
  1103. UNIT_ASSERT(!ToLower(s, 2));
  1104. UNIT_ASSERT(s == lower);
  1105. #ifndef TSTRING_IS_STD_STRING
  1106. UNIT_ASSERT(s.data() == copy.data());
  1107. #endif
  1108. UNIT_ASSERT(ToLowerRet(copy, 2) == lower);
  1109. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 2) == lower);
  1110. }
  1111. {
  1112. auto s = UTF8ToWide("теСт");
  1113. const auto copy = s;
  1114. const auto lower = UTF8ToWide("тест");
  1115. UNIT_ASSERT(ToLower(s, 2));
  1116. UNIT_ASSERT(s == lower);
  1117. UNIT_ASSERT(ToLowerRet(copy, 2) == lower);
  1118. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 2) == lower);
  1119. }
  1120. {
  1121. auto s = UTF8ToWide("теСт");
  1122. const auto copy = s;
  1123. const auto lower = UTF8ToWide("теСт");
  1124. UNIT_ASSERT(!ToLower(s, 3, 1));
  1125. UNIT_ASSERT(s == copy);
  1126. #ifndef TSTRING_IS_STD_STRING
  1127. UNIT_ASSERT(s.data() == copy.data());
  1128. #endif
  1129. UNIT_ASSERT(ToLowerRet(copy, 3, 1) == lower);
  1130. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 3, 1) == lower);
  1131. }
  1132. {
  1133. auto s = UTF8ToWide("теСт");
  1134. const auto copy = s;
  1135. const auto lower = UTF8ToWide("теСт");
  1136. UNIT_ASSERT(!ToLower(s, 3, 100500));
  1137. UNIT_ASSERT(s == copy);
  1138. #ifndef TSTRING_IS_STD_STRING
  1139. UNIT_ASSERT(s.data() == copy.data());
  1140. #endif
  1141. UNIT_ASSERT(ToLowerRet(copy, 3, 100500) == lower);
  1142. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 3, 100500) == lower);
  1143. }
  1144. }
  1145. void TestToUpperStr() {
  1146. {
  1147. TUtf16String s;
  1148. auto writableCopy = s;
  1149. const auto copy = s;
  1150. const TUtf16String upper;
  1151. UNIT_ASSERT(!ToUpper(s));
  1152. UNIT_ASSERT(s == upper);
  1153. #ifndef TSTRING_IS_STD_STRING
  1154. UNIT_ASSERT(s.data() == copy.data());
  1155. #endif
  1156. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1157. UNIT_ASSERT(writableCopy == upper);
  1158. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1159. UNIT_ASSERT(writableCopy == upper);
  1160. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1161. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1162. }
  1163. {
  1164. auto s = UTF8ToWide("");
  1165. auto writableCopy = s;
  1166. const auto copy = s;
  1167. const TUtf16String upper;
  1168. UNIT_ASSERT(!ToUpper(s));
  1169. UNIT_ASSERT(s == upper);
  1170. #ifndef TSTRING_IS_STD_STRING
  1171. UNIT_ASSERT(s.data() == copy.data());
  1172. #endif
  1173. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1174. UNIT_ASSERT(writableCopy == upper);
  1175. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1176. UNIT_ASSERT(writableCopy == upper);
  1177. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1178. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1179. }
  1180. {
  1181. TUtf16String s;
  1182. auto writableCopy = s;
  1183. const auto copy = s;
  1184. const TUtf16String upper;
  1185. UNIT_ASSERT(!ToUpper(s, 100500));
  1186. UNIT_ASSERT(s == upper);
  1187. #ifndef TSTRING_IS_STD_STRING
  1188. UNIT_ASSERT(s.data() == copy.data());
  1189. #endif
  1190. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1191. UNIT_ASSERT(writableCopy == upper);
  1192. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1193. UNIT_ASSERT(writableCopy == upper);
  1194. UNIT_ASSERT(ToUpperRet(copy, 100500) == upper);
  1195. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 100500) == upper);
  1196. }
  1197. {
  1198. TUtf16String s;
  1199. const auto copy = s;
  1200. const TUtf16String upper;
  1201. UNIT_ASSERT(!ToUpper(s, 100500, 1111));
  1202. UNIT_ASSERT(s == upper);
  1203. #ifndef TSTRING_IS_STD_STRING
  1204. UNIT_ASSERT(s.data() == copy.data());
  1205. #endif
  1206. UNIT_ASSERT(ToUpperRet(copy, 100500, 1111) == upper);
  1207. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 100500, 1111) == upper);
  1208. }
  1209. {
  1210. auto s = UTF8ToWide("й");
  1211. auto writableCopy = s;
  1212. const auto copy = s;
  1213. const auto upper = UTF8ToWide("Й");
  1214. UNIT_ASSERT(ToUpper(s));
  1215. UNIT_ASSERT(s == upper);
  1216. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1217. UNIT_ASSERT(writableCopy == upper);
  1218. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1219. UNIT_ASSERT(writableCopy == upper);
  1220. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1221. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1222. }
  1223. {
  1224. auto s = UTF8ToWide("Й");
  1225. auto writableCopy = s;
  1226. const auto copy = s;
  1227. const auto upper = UTF8ToWide("Й");
  1228. UNIT_ASSERT(!ToUpper(s));
  1229. UNIT_ASSERT(s == copy);
  1230. #ifndef TSTRING_IS_STD_STRING
  1231. UNIT_ASSERT(s.data() == copy.data());
  1232. #endif
  1233. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1234. UNIT_ASSERT(writableCopy == upper);
  1235. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1236. UNIT_ASSERT(writableCopy == upper);
  1237. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1238. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1239. }
  1240. {
  1241. auto s = UTF8ToWide("тест");
  1242. auto writableCopy = s;
  1243. const auto copy = s;
  1244. const auto upper = UTF8ToWide("ТЕСТ");
  1245. UNIT_ASSERT(ToUpper(s));
  1246. UNIT_ASSERT(s == upper);
  1247. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1248. UNIT_ASSERT(writableCopy == upper);
  1249. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1250. UNIT_ASSERT(writableCopy == upper);
  1251. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1252. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1253. }
  1254. {
  1255. auto s = UTF8ToWide("Тест");
  1256. auto writableCopy = s;
  1257. const auto copy = s;
  1258. const auto upper = UTF8ToWide("ТЕСТ");
  1259. UNIT_ASSERT(ToUpper(s));
  1260. UNIT_ASSERT(s == upper);
  1261. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1262. UNIT_ASSERT(writableCopy == upper);
  1263. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1264. UNIT_ASSERT(writableCopy == upper);
  1265. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1266. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1267. }
  1268. {
  1269. auto s = UTF8ToWide("тЕст");
  1270. auto writableCopy = s;
  1271. const auto copy = s;
  1272. const auto upper = UTF8ToWide("ТЕСТ");
  1273. UNIT_ASSERT(ToUpper(s));
  1274. UNIT_ASSERT(s == upper);
  1275. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1276. UNIT_ASSERT(writableCopy == upper);
  1277. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1278. UNIT_ASSERT(writableCopy == upper);
  1279. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1280. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1281. }
  1282. {
  1283. auto s = UTF8ToWide("тЕст");
  1284. const auto copy = s;
  1285. const auto upper = UTF8ToWide("тЕСТ");
  1286. UNIT_ASSERT(ToUpper(s, 2));
  1287. UNIT_ASSERT(s == upper);
  1288. UNIT_ASSERT(ToUpperRet(copy, 2) == upper);
  1289. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 2) == upper);
  1290. }
  1291. {
  1292. auto s = UTF8ToWide("теСт");
  1293. const auto copy = s;
  1294. const auto upper = UTF8ToWide("теСТ");
  1295. UNIT_ASSERT(ToUpper(s, 2));
  1296. UNIT_ASSERT(s == upper);
  1297. UNIT_ASSERT(ToUpperRet(copy, 2) == upper);
  1298. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 2) == upper);
  1299. }
  1300. {
  1301. auto s = UTF8ToWide("теСт");
  1302. const auto copy = s;
  1303. const auto upper = UTF8ToWide("теСТ");
  1304. UNIT_ASSERT(ToUpper(s, 3, 1));
  1305. UNIT_ASSERT(s == upper);
  1306. UNIT_ASSERT(ToUpperRet(copy, 3, 1) == upper);
  1307. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 3, 1) == upper);
  1308. }
  1309. {
  1310. auto s = UTF8ToWide("теСт");
  1311. const auto copy = s;
  1312. const auto upper = UTF8ToWide("теСТ");
  1313. UNIT_ASSERT(ToUpper(s, 3, 100500));
  1314. UNIT_ASSERT(s == upper);
  1315. UNIT_ASSERT(ToUpperRet(copy, 3, 100500) == upper);
  1316. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 3, 100500) == upper);
  1317. }
  1318. }
  1319. void TestToTitleStr() {
  1320. {
  1321. TUtf16String s;
  1322. auto writableCopy = s;
  1323. const auto copy = s;
  1324. const TUtf16String title;
  1325. UNIT_ASSERT(!ToTitle(s));
  1326. UNIT_ASSERT(s == title);
  1327. #ifndef TSTRING_IS_STD_STRING
  1328. UNIT_ASSERT(s.data() == copy.data());
  1329. #endif
  1330. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1331. UNIT_ASSERT(writableCopy == title);
  1332. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1333. UNIT_ASSERT(writableCopy == title);
  1334. UNIT_ASSERT(ToTitleRet(copy) == title);
  1335. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1336. }
  1337. {
  1338. auto s = UTF8ToWide("");
  1339. auto writableCopy = s;
  1340. const auto copy = s;
  1341. const TUtf16String title;
  1342. UNIT_ASSERT(!ToTitle(s));
  1343. UNIT_ASSERT(s == title);
  1344. #ifndef TSTRING_IS_STD_STRING
  1345. UNIT_ASSERT(s.data() == copy.data());
  1346. #endif
  1347. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1348. UNIT_ASSERT(writableCopy == title);
  1349. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1350. UNIT_ASSERT(writableCopy == title);
  1351. UNIT_ASSERT(ToTitleRet(copy) == title);
  1352. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1353. }
  1354. {
  1355. TUtf16String s;
  1356. const auto copy = s;
  1357. const TUtf16String title;
  1358. UNIT_ASSERT(!ToTitle(s, 100500));
  1359. UNIT_ASSERT(s == title);
  1360. #ifndef TSTRING_IS_STD_STRING
  1361. UNIT_ASSERT(s.data() == copy.data());
  1362. #endif
  1363. UNIT_ASSERT(ToTitleRet(copy) == title);
  1364. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1365. }
  1366. {
  1367. TUtf16String s;
  1368. const auto copy = s;
  1369. const TUtf16String title;
  1370. UNIT_ASSERT(!ToTitle(s, 100500, 1111));
  1371. UNIT_ASSERT(s == title);
  1372. #ifndef TSTRING_IS_STD_STRING
  1373. UNIT_ASSERT(s.data() == copy.data());
  1374. #endif
  1375. UNIT_ASSERT(ToTitleRet(copy) == title);
  1376. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1377. }
  1378. {
  1379. auto s = UTF8ToWide("й");
  1380. auto writableCopy = s;
  1381. const auto copy = s;
  1382. const auto title = UTF8ToWide("Й");
  1383. UNIT_ASSERT(ToTitle(s));
  1384. UNIT_ASSERT(s == title);
  1385. UNIT_ASSERT(ToTitle(writableCopy.Detach(), writableCopy.size()));
  1386. UNIT_ASSERT(writableCopy == title);
  1387. UNIT_ASSERT(ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1388. UNIT_ASSERT(writableCopy == title);
  1389. UNIT_ASSERT(ToTitleRet(copy) == title);
  1390. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1391. }
  1392. {
  1393. auto s = UTF8ToWide("Й");
  1394. auto writableCopy = s;
  1395. const auto copy = s;
  1396. const auto title = UTF8ToWide("Й");
  1397. UNIT_ASSERT(!ToTitle(s));
  1398. UNIT_ASSERT(s == title);
  1399. #ifndef TSTRING_IS_STD_STRING
  1400. UNIT_ASSERT(s.data() == copy.data());
  1401. #endif
  1402. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1403. UNIT_ASSERT(writableCopy == title);
  1404. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1405. UNIT_ASSERT(writableCopy == title);
  1406. UNIT_ASSERT(ToTitleRet(copy) == title);
  1407. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1408. }
  1409. {
  1410. auto s = UTF8ToWide("тест");
  1411. auto writableCopy = s;
  1412. const auto copy = s;
  1413. const auto title = UTF8ToWide("Тест");
  1414. UNIT_ASSERT(ToTitle(s));
  1415. UNIT_ASSERT(s == title);
  1416. UNIT_ASSERT(ToTitle(writableCopy.Detach(), writableCopy.size()));
  1417. UNIT_ASSERT(writableCopy == title);
  1418. UNIT_ASSERT(ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1419. UNIT_ASSERT(writableCopy == title);
  1420. UNIT_ASSERT(ToTitleRet(copy) == title);
  1421. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1422. }
  1423. {
  1424. auto s = UTF8ToWide("Тест");
  1425. auto writableCopy = s;
  1426. const auto copy = s;
  1427. const auto title = UTF8ToWide("Тест");
  1428. UNIT_ASSERT(!ToTitle(s));
  1429. UNIT_ASSERT(s == title);
  1430. #ifndef TSTRING_IS_STD_STRING
  1431. UNIT_ASSERT(s.data() == copy.data());
  1432. #endif
  1433. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1434. UNIT_ASSERT(writableCopy == title);
  1435. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1436. UNIT_ASSERT(writableCopy == title);
  1437. UNIT_ASSERT(ToTitleRet(copy) == title);
  1438. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1439. }
  1440. {
  1441. auto s = UTF8ToWide("тЕст");
  1442. auto writableCopy = s;
  1443. const auto copy = s;
  1444. const auto title = UTF8ToWide("Тест");
  1445. UNIT_ASSERT(ToTitle(s));
  1446. UNIT_ASSERT(s == title);
  1447. UNIT_ASSERT(ToTitle(writableCopy.Detach(), writableCopy.size()));
  1448. UNIT_ASSERT(writableCopy == title);
  1449. UNIT_ASSERT(ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1450. UNIT_ASSERT(writableCopy == title);
  1451. UNIT_ASSERT(ToTitleRet(copy) == title);
  1452. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1453. }
  1454. {
  1455. auto s = UTF8ToWide("тЕст");
  1456. const auto copy = s;
  1457. const auto title = UTF8ToWide("тЕСт");
  1458. UNIT_ASSERT(ToTitle(s, 2));
  1459. UNIT_ASSERT(s == title);
  1460. UNIT_ASSERT(ToTitleRet(copy, 2) == title);
  1461. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 2) == title);
  1462. }
  1463. {
  1464. auto s = UTF8ToWide("теСт");
  1465. const auto copy = s;
  1466. const auto title = UTF8ToWide("теСт");
  1467. UNIT_ASSERT(!ToTitle(s, 2));
  1468. UNIT_ASSERT(s == title);
  1469. #ifndef TSTRING_IS_STD_STRING
  1470. UNIT_ASSERT(s.data() == copy.data());
  1471. #endif
  1472. UNIT_ASSERT(ToTitleRet(copy, 2) == title);
  1473. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 2) == title);
  1474. }
  1475. {
  1476. auto s = UTF8ToWide("теСт");
  1477. const auto copy = s;
  1478. const auto title = UTF8ToWide("теСТ");
  1479. UNIT_ASSERT(ToTitle(s, 3, 1));
  1480. UNIT_ASSERT(s == title);
  1481. UNIT_ASSERT(ToTitleRet(copy, 3, 1) == title);
  1482. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 3, 1) == title);
  1483. }
  1484. {
  1485. auto s = UTF8ToWide("теСт");
  1486. const auto copy = s;
  1487. const auto title = UTF8ToWide("теСТ");
  1488. UNIT_ASSERT(ToTitle(s, 3, 100500));
  1489. UNIT_ASSERT(s == title);
  1490. UNIT_ASSERT(ToTitleRet(copy, 3, 100500) == title);
  1491. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 3, 100500) == title);
  1492. }
  1493. }
  1494. };
  // NOTE(review): presumably registers the TWideUtilTest suite with the unit-test
  // framework pulled in via library/cpp/testing/unittest/registar.h - confirm against
  // the framework documentation.
  1495. UNIT_TEST_SUITE_REGISTRATION(TWideUtilTest);