wide_ut.cpp 62 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742
  1. #include "utf8.h"
  2. #include "wide.h"
  3. #include <library/cpp/testing/unittest/registar.h>
  4. #include <util/string/reverse.h>
  5. #include <algorithm>
  6. namespace {
//! UTF-8 encoding of three Cyrillic capital letters U+0410, U+0411, U+0412 (A, B, V).
//! Each letter takes two bytes, so the array holds six payload bytes plus the terminating NUL.
const char utext[] = "\xd0\x90\xd0\x91\xd0\x92";
//! 52 ASCII letters, upper case first, then lower case.
//! NOTE(review): the tenth letter of each half is 'G'/'g' rather than 'J'/'j'; wideLatinAlphabet
//! repeats the same sequence, so comparisons between the two are unaffected.
const char asciiLatinAlphabet[] = "ABCDEFGHIGKLMNOPQRSTUVWXYZabcdefghigklmnopqrstuvwxyz";
//! The same sequence of letters as asciiLatinAlphabet, as zero-terminated 16-bit code units.
const wchar16 wideLatinAlphabet[] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'G', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'g', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0};
//! Code points U+0410..U+044F (64 values) followed by a zero terminator.
const wchar16 wideCyrillicAlphabet[] = {
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, 0x00};
//! UTF-8 encoding of U+0410..U+044F, two bytes per letter (the counterpart of wideCyrillicAlphabet).
const char utf8CyrillicAlphabet[] =
    "\xd0\x90\xd0\x91\xd0\x92\xd0\x93\xd0\x94\xd0\x95\xd0\x96\xd0\x97"
    "\xd0\x98\xd0\x99\xd0\x9a\xd0\x9b\xd0\x9c\xd0\x9d\xd0\x9e\xd0\x9f"
    "\xd0\xa0\xd0\xa1\xd0\xa2\xd0\xa3\xd0\xa4\xd0\xa5\xd0\xa6\xd0\xa7"
    "\xd0\xa8\xd0\xa9\xd0\xaa\xd0\xab\xd0\xac\xd0\xad\xd0\xae\xd0\xaf"
    "\xd0\xb0\xd0\xb1\xd0\xb2\xd0\xb3\xd0\xb4\xd0\xb5\xd0\xb6\xd0\xb7"
    "\xd0\xb8\xd0\xb9\xd0\xba\xd0\xbb\xd0\xbc\xd0\xbd\xd0\xbe\xd0\xbf"
    "\xd1\x80\xd1\x81\xd1\x82\xd1\x83\xd1\x84\xd1\x85\xd1\x86\xd1\x87"
    "\xd1\x88\xd1\x89\xd1\x8a\xd1\x8b\xd1\x8c\xd1\x8d\xd1\x8e\xd1\x8f";
//! Masks that keep only the payload bits of a UTF-8 leading byte:
//! 5 bits for a 2-byte sequence, 4 bits for a 3-byte one and 3 bits for a 4-byte one.
//! TestReadUTF8Char applies them to a leading byte to compute the expected code point.
const wchar32 LEAD_BITS_MASK_2_BYTES = 0x1F;
const wchar32 LEAD_BITS_MASK_3_BYTES = 0x0F;
const wchar32 LEAD_BITS_MASK_4_BYTES = 0x07;
//! Space-like code points used as input by the Collapse tests, which expect a whole run of
//! them to be reduced to a single ASCII space. The array is not zero-terminated; callers
//! pass its length explicitly via Y_ARRAY_SIZE.
wchar16 ws[] = {
    0x0009, // character tabulation
    0x000A, 0x2028, 0x2029, // line feed, line separator, paragraph separator
    0x000B, // line tabulation
    0x000C, // form feed
    0x000D, // carriage return
    0x0020, 0x1680, // space, ogham space mark
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, // U+2000..U+200A spaces, zero width space
    0x202F, 0x205F, 0x3000, // narrow no-break space, medium mathematical space, ideographic space
    0x00A0}; // no-break space
//! Number of code points in every row of WideStringTestData.
const size_t CaseTestDataSize = 10;
//! Case-conversion fixture: row 0 is the source text, rows 1..3 are the same text in
//! lower, upper and title case (see the trailing comment on each row).
//! Most entries are above U+FFFF, so they do not fit into a single 16-bit code unit.
//! NOTE(review): the consumer of this table is outside the visible part of the file.
wchar32 WideStringTestData[][CaseTestDataSize] = {
    {0x01C4, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10400, 0x10415, 0x10437}, // original
    {0x01C6, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10428, 0x1043D, 0x10437}, // lower
    {0x01C4, 0x10400, 0x10401, 0x1041F, 0x10419, 0x1C03, 0x00A0, 0x10400, 0x10415, 0x1040F}, // upper
    {0x01C5, 0x10428, 0x10429, 0x10447, 0x10441, 0x1C03, 0x00A0, 0x10428, 0x1043D, 0x10437}, // title
};
  47. TUtf16String CreateUnicodeText() {
  48. const int len = 256;
  49. wchar16 text[len] = {
  50. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x00 - 0x0F
  51. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x10 - 0x1F
  52. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x20 - 0x2F
  53. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x30 - 0x3F
  54. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x40 - 0x4F
  55. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x50 - 0x5F
  56. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x60 - 0x6F
  57. 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, // 0x70 - 0x7F
  58. 0x0301, 0x00C4, 0x00D6, 0x00DC, 0x0104, 0x0106, 0x0118, 0x0141, 0x00E0, 0x00E2, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x0490, 0x00AD, // 0x80 - 0x8F
  59. 0x00DF, 0x00E4, 0x00F6, 0x00FC, 0x0105, 0x0107, 0x0119, 0x0142, 0x00EB, 0x00EE, 0x00EF, 0x00F4, 0x00F9, 0x00FB, 0x0491, 0x92CF, // 0x90 - 0x9F
  60. 0x00A0, 0x0143, 0x00D3, 0x015A, 0x017B, 0x0179, 0x046C, 0x00A7, 0x0401, 0x0462, 0x0472, 0x0474, 0x040E, 0x0406, 0x0404, 0x0407, // 0xA0 - 0xAF
  61. 0x00B0, 0x0144, 0x00F3, 0x015B, 0x017C, 0x017A, 0x046D, 0x2116, 0x0451, 0x0463, 0x0473, 0x0475, 0x045E, 0x0456, 0x0454, 0x0457 // 0xB0 - 0xBF
  62. };
  63. for (int i = 0; i < len; ++i) {
  64. if (i <= 0x7F) { // ASCII characters without 0x7 and 0x1B
  65. text[i] = static_cast<wchar16>(i);
  66. } else if (i >= 0xC0 && i <= 0xFF) { // russian characters (without YO and yo)
  67. text[i] = static_cast<wchar16>(i + 0x0350); // 0x0410 - 0x044F
  68. }
  69. }
  70. return TUtf16String(text, len);
  71. }
//! Returns the UTF-8 reference text used by the conversion tests.
//! The first 128 bytes are the ASCII values 0x00..0x7F in order; the rest are multi-byte
//! sequences. TestUTF8ToWide expects the whole text to decode to the 256 characters
//! produced by CreateUnicodeText().
TString CreateUTF8Text() {
    char text[] = {
        '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
        '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
        '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27', '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
        '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37', '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
        '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47', '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
        '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57', '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
        '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67', '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
        '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77', '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
        '\xcc', '\x81', '\xc3', '\x84', '\xc3', '\x96', '\xc3', '\x9c', '\xc4', '\x84', '\xc4', '\x86', '\xc4', '\x98', '\xc5', '\x81',
        '\xc3', '\xa0', '\xc3', '\xa2', '\xc3', '\xa7', '\xc3', '\xa8', '\xc3', '\xa9', '\xc3', '\xaa', '\xd2', '\x90', '\xc2', '\xad',
        '\xc3', '\x9f', '\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc', '\xc4', '\x85', '\xc4', '\x87', '\xc4', '\x99', '\xc5', '\x82',
        '\xc3', '\xab', '\xc3', '\xae', '\xc3', '\xaf', '\xc3', '\xb4', '\xc3', '\xb9', '\xc3', '\xbb', '\xd2', '\x91', '\xe9', '\x8b',
        '\x8f', '\xc2', '\xa0', '\xc5', '\x83', '\xc3', '\x93', '\xc5', '\x9a', '\xc5', '\xbb', '\xc5', '\xb9', '\xd1', '\xac', '\xc2',
        '\xa7', '\xd0', '\x81', '\xd1', '\xa2', '\xd1', '\xb2', '\xd1', '\xb4', '\xd0', '\x8e', '\xd0', '\x86', '\xd0', '\x84', '\xd0',
        '\x87', '\xc2', '\xb0', '\xc5', '\x84', '\xc3', '\xb3', '\xc5', '\x9b', '\xc5', '\xbc', '\xc5', '\xba', '\xd1', '\xad', '\xe2',
        '\x84', '\x96', '\xd1', '\x91', '\xd1', '\xa3', '\xd1', '\xb3', '\xd1', '\xb5', '\xd1', '\x9e', '\xd1', '\x96', '\xd1', '\x94',
        '\xd1', '\x97', '\xd0', '\x90', '\xd0', '\x91', '\xd0', '\x92', '\xd0', '\x93', '\xd0', '\x94', '\xd0', '\x95', '\xd0', '\x96',
        '\xd0', '\x97', '\xd0', '\x98', '\xd0', '\x99', '\xd0', '\x9a', '\xd0', '\x9b', '\xd0', '\x9c', '\xd0', '\x9d', '\xd0', '\x9e',
        '\xd0', '\x9f', '\xd0', '\xa0', '\xd0', '\xa1', '\xd0', '\xa2', '\xd0', '\xa3', '\xd0', '\xa4', '\xd0', '\xa5', '\xd0', '\xa6',
        '\xd0', '\xa7', '\xd0', '\xa8', '\xd0', '\xa9', '\xd0', '\xaa', '\xd0', '\xab', '\xd0', '\xac', '\xd0', '\xad', '\xd0', '\xae',
        '\xd0', '\xaf', '\xd0', '\xb0', '\xd0', '\xb1', '\xd0', '\xb2', '\xd0', '\xb3', '\xd0', '\xb4', '\xd0', '\xb5', '\xd0', '\xb6',
        '\xd0', '\xb7', '\xd0', '\xb8', '\xd0', '\xb9', '\xd0', '\xba', '\xd0', '\xbb', '\xd0', '\xbc', '\xd0', '\xbd', '\xd0', '\xbe',
        '\xd0', '\xbf', '\xd1', '\x80', '\xd1', '\x81', '\xd1', '\x82', '\xd1', '\x83', '\xd1', '\x84', '\xd1', '\x85', '\xd1', '\x86',
        '\xd1', '\x87', '\xd1', '\x88', '\xd1', '\x89', '\xd1', '\x8a', '\xd1', '\x8b', '\xd1', '\x8c', '\xd1', '\x8d', '\xd1', '\x8e',
        '\xd1', '\x8f'};
    // the length is passed explicitly because the very first byte of the text is '\x00'
    return TString(text, Y_ARRAY_SIZE(text));
}
  101. //! use this function to dump UTF8 text into a file in case of any changes
  102. // void DumpUTF8Text() {
  103. // TString s = WideToUTF8(UnicodeText);
  104. // std::ofstream f("utf8.txt");
  105. // f << std::hex;
  106. // for (int i = 0; i < (int)s.size(); ++i) {
  107. // f << "0x" << std::setw(2) << std::setfill('0') << (int)(ui8)s[i] << ", ";
  108. // if ((i + 1) % 16 == 0)
  109. // f << std::endl;
  110. // }
  111. // }
  112. void CheckRecodeOK(wchar32 expected, unsigned char* first, size_t n) {
  113. wchar32 w = 0;
  114. const unsigned char* p = first;
  115. RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n);
  116. UNIT_ASSERT(w == expected);
  117. UNIT_ASSERT(size_t(p - first) == n);
  118. UNIT_ASSERT(r == RECODE_OK);
  119. }
  120. void CheckBrokenSymbol(unsigned char* first, unsigned char* last) {
  121. wchar32 w = 0;
  122. const unsigned char* p = first;
  123. RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, last);
  124. UNIT_ASSERT(w == BROKEN_RUNE);
  125. UNIT_ASSERT(p - first == 0);
  126. UNIT_ASSERT(r == RECODE_BROKENSYMBOL);
  127. }
  128. void CheckEndOfInput(unsigned char* first, size_t n) {
  129. wchar32 w = 0;
  130. const unsigned char* p = first;
  131. RECODE_RESULT r = ReadUTF8CharAndAdvance(w, p, first + n);
  132. (void)w;
  133. UNIT_ASSERT(p - first == 0);
  134. UNIT_ASSERT(r == RECODE_EOINPUT);
  135. }
  136. void CheckCharLen(unsigned char* first, unsigned char* last, size_t len, RECODE_RESULT result) {
  137. size_t n = 0;
  138. RECODE_RESULT r = GetUTF8CharLen(n, first, last);
  139. UNIT_ASSERT(n == len);
  140. UNIT_ASSERT(r == result);
  141. }
  142. }
  143. class TConversionTest: public TTestBase {
  144. private:
  145. //! @note every of the text can have zeros in the middle
  146. const TUtf16String UnicodeText_;
  147. const TString Utf8Text_;
  148. private:
  149. UNIT_TEST_SUITE(TConversionTest);
  150. UNIT_TEST(TestReadUTF8Char);
  151. UNIT_TEST(TestGetUTF8CharLen);
  152. UNIT_TEST(TestWriteUTF8Char);
  153. UNIT_TEST(TestUTF8ToWide);
  154. UNIT_TEST(TestWideToUTF8);
  155. UNIT_TEST(TestGetNumOfUTF8Chars);
  156. UNIT_TEST(TestSubstrUTF8);
  157. UNIT_TEST(TestUnicodeCase);
  158. UNIT_TEST(TestUnicodeDetails);
  159. UNIT_TEST(TestHexConversion);
  160. UNIT_TEST_SUITE_END();
  161. public:
  162. TConversionTest()
  163. : UnicodeText_(CreateUnicodeText())
  164. , Utf8Text_(CreateUTF8Text())
  165. {
  166. }
  167. void TestReadUTF8Char();
  168. void TestGetUTF8CharLen();
  169. void TestWriteUTF8Char();
  170. void TestUTF8ToWide();
  171. void TestWideToUTF8();
  172. void TestGetNumOfUTF8Chars();
  173. void TestSubstrUTF8();
  174. void TestUnicodeCase();
  175. void TestUnicodeDetails();
  176. void TestHexConversion();
  177. };
  178. UNIT_TEST_SUITE_REGISTRATION(TConversionTest);
  179. void TConversionTest::TestHexConversion() {
  180. for (char ch = '0'; ch <= '9'; ++ch) {
  181. UNIT_ASSERT(isxdigit(ch));
  182. UNIT_ASSERT(IsHexdigit(ch));
  183. }
  184. for (char ch = 'a'; ch <= 'f'; ++ch) {
  185. UNIT_ASSERT(isxdigit(ch));
  186. UNIT_ASSERT(IsHexdigit(ch));
  187. }
  188. for (char ch = 'A'; ch <= 'F'; ++ch) {
  189. UNIT_ASSERT(isxdigit(ch));
  190. UNIT_ASSERT(IsHexdigit(ch));
  191. }
  192. for (wchar16 i = std::numeric_limits<wchar16>::min(); i < std::numeric_limits<wchar16>::max(); ++i) {
  193. if (IsHexdigit(i)) {
  194. UNIT_ASSERT(isxdigit(char(i)));
  195. }
  196. }
  197. }
//! Exercises ReadUTF8CharAndAdvance over every possible leading byte.
//! A candidate sequence is assembled inside the integer `u` and read back through the
//! byte pointer `first`: the leading byte is stored in the lowest 8 bits and continuation
//! bytes in the following ones, so the test relies on the low-order byte of `u` being the
//! first byte in memory (little-endian layout).
//! For each multi-byte leading byte three things are checked: the lone leading byte followed
//! by zero bytes is rejected, the complete sequence decodes to the expected value, and every
//! truncated prefix reports end of input.
void TConversionTest::TestReadUTF8Char() {
    wchar32 e; // expected unicode char
    wchar32 c; // leading byte under test
    unsigned long u; // single UTF8 encoded character
    unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
    unsigned char* const last = first + sizeof(u);
    // all ASCII characters are converted with no change (zero converted successfully as well)
    for (c = 0; c <= 0x7F; ++c) {
        u = c;
        CheckRecodeOK(c, first, 1);
    }
    // continuation bytes cannot start a character (1000 0000 - 1011 1111)
    for (c = 0x80; c <= 0xBF; ++c) {
        u = c;
        CheckBrokenSymbol(first, last);
    }
    // overlong encoding: leading byte of 2-byte symbol: 1100 0000 - 1100 0001
    for (c = 0xC0; c <= 0xC1; ++c) {
        u = c;
        CheckBrokenSymbol(first, last);
        u |= 0x8000; // append the continuation byte 0x80
        CheckBrokenSymbol(first, first + 2);
        CheckEndOfInput(first, 1);
    }
    // leading byte of 2-byte symbol: 1100 0010 - 1101 1111
    for (c = 0xC2; c <= 0xDF; ++c) {
        u = c;
        CheckBrokenSymbol(first, last);
        u |= 0x8000; // append the continuation byte 0x80
        // e: 0000 0000 1000 0000 - 0000 0111 1100 0000 (0x80 - 0x7C0)
        e = c & LEAD_BITS_MASK_2_BYTES;
        e <<= 6;
        CheckRecodeOK(e, first, 2);
        CheckEndOfInput(first, 1);
    }
    // possible overlong encoding with leading byte 1110 0000
    {
        u = c = 0xE0;
        CheckBrokenSymbol(first, last);
        u |= 0x808000; // E0 80 80 is an overlong form and must be rejected
        CheckBrokenSymbol(first, first + 3);
        u = c | 0x80A000; // E0 A0 80 is the smallest valid 3-byte sequence
        e = 0x800;
        CheckRecodeOK(e, first, 3);
        CheckEndOfInput(first, 2);
        CheckEndOfInput(first, 1);
    }
    // leading byte of 3-byte symbol: 1110 0001 - 1110 1111
    for (c = 0xE1; c <= 0xEF; ++c) {
        u = c;
        CheckBrokenSymbol(first, last);
        u |= 0x808000; // append two continuation bytes 0x80
        // e: 0x1000 - 0xF000 (payload of the leading byte shifted by 12 bits)
        e = c & LEAD_BITS_MASK_3_BYTES;
        e <<= 12;
        CheckRecodeOK(e, first, 3);
        CheckEndOfInput(first, 2);
        CheckEndOfInput(first, 1);
    }
    // possible overlong encoding with leading byte 1111 0000
    {
        u = c = 0xF0;
        CheckBrokenSymbol(first, last);
        u |= 0x80808000; // F0 80 80 80 is an overlong form and must be rejected
        CheckBrokenSymbol(first, first + 4);
        u = c | 0x80809000; // F0 90 80 80 is the smallest valid 4-byte sequence
        e = 0x10000;
        CheckRecodeOK(e, first, 4);
        CheckEndOfInput(first, 3);
        CheckEndOfInput(first, 2);
        CheckEndOfInput(first, 1);
    }
    // leading byte of 4-byte symbol: 1111 0001 - 1111 0011
    for (c = 0xF1; c <= 0xF3; ++c) {
        u = c;
        CheckBrokenSymbol(first, last);
        u |= 0x80808000; // append three continuation bytes 0x80
        // e: 0x40000 - 0xC0000 (payload of the leading byte shifted by 18 bits)
        e = c & LEAD_BITS_MASK_4_BYTES;
        e <<= 18;
        CheckRecodeOK(e, first, 4);
        CheckEndOfInput(first, 3);
        CheckEndOfInput(first, 2);
        CheckEndOfInput(first, 1);
    }
    // possible invalid code points with leading byte 1111 0100
    {
        c = 0xF4;
        u = 0x80808000 | c; // F4 80 80 80 -> 0x100000
        e = c & LEAD_BITS_MASK_4_BYTES;
        e <<= 18;
        CheckRecodeOK(e, first, 4);
        // the largest possible Unicode code point
        u = 0xBFBF8F00 | c; // F4 8F BF BF
        e = 0x10FFFF;
        CheckRecodeOK(e, first, 4);
        u = 0x80809000 | c; // F4 90 80 80 would be 0x110000, past the largest code point
        CheckBrokenSymbol(first, last);
    }
    // broken symbols: 1111 0101 - 1111 1111
    for (c = 0xF5; c <= 0xFF; ++c) {
        u = c;
        CheckBrokenSymbol(first, last);
    }
}
  303. void TConversionTest::TestGetUTF8CharLen() {
  304. wchar32 c;
  305. unsigned long u; // single UTF8 encoded character
  306. unsigned char* const first = reinterpret_cast<unsigned char*>(&u);
  307. unsigned char* const last = first + sizeof(u);
  308. // all ASCII characters are converted with no change (zero converted successfully as well)
  309. for (c = 0; c <= 0x7F; ++c) {
  310. u = c;
  311. CheckCharLen(first, last, 1, RECODE_OK);
  312. }
  313. // broken symbols from the second half of ASCII table (1000 0000 - 1011 1111)
  314. for (c = 0x80; c <= 0xBF; ++c) {
  315. u = c;
  316. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  317. }
  318. // leading byte of 2-byte symbol: 1100 0000 - 1101 1111
  319. for (c = 0xC0; c <= 0xDF; ++c) {
  320. u = c;
  321. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  322. u |= 0x8000;
  323. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  324. CheckCharLen(first, last, 2, RECODE_OK);
  325. CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
  326. }
  327. // leading byte of 3-byte symbol: 1110 0000 - 1110 1111
  328. for (c = 0xE0; c <= 0xEF; ++c) {
  329. u = c;
  330. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  331. u |= 0x808000;
  332. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  333. CheckCharLen(first, last, 3, RECODE_OK);
  334. CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
  335. CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
  336. }
  337. // leading byte of 4-byte symbol: 1111 0000 - 1111 0111
  338. for (c = 0xF0; c <= 0xF3; ++c) {
  339. u = c;
  340. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  341. u |= 0x80808000;
  342. // w: 0000 0000 0000 0000 - 0000 0111 1100 0000
  343. CheckCharLen(first, last, 4, RECODE_OK);
  344. CheckCharLen(first, first + 3, 0, RECODE_EOINPUT);
  345. CheckCharLen(first, first + 2, 0, RECODE_EOINPUT);
  346. CheckCharLen(first, first + 1, 0, RECODE_EOINPUT);
  347. }
  348. // broken symbols: 1111 1000 - 1111 1111
  349. for (c = 0xF8; c <= 0xFF; ++c) {
  350. u = c;
  351. CheckCharLen(first, last, 0, RECODE_BROKENSYMBOL);
  352. }
  353. }
  354. void TConversionTest::TestWriteUTF8Char() {
  355. wchar32 w;
  356. unsigned long u; // single UTF8 encoded character
  357. size_t n;
  358. for (w = 0x00; w < 0x80; ++w) {
  359. u = 0;
  360. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  361. UNIT_ASSERT((u & 0xFFFFFF80) == 0x00000000);
  362. UNIT_ASSERT(n == 1);
  363. }
  364. for (w = 0x80; w < 0x800; ++w) {
  365. u = 0;
  366. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  367. UNIT_ASSERT((u & 0xFFFFC000) == 0x00008000); // see constants in ReadUTF8Char
  368. UNIT_ASSERT(n == 2);
  369. }
  370. for (w = 0x800; w < 0x10000; ++w) {
  371. u = 0;
  372. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  373. UNIT_ASSERT((u & 0xFFC0C000) == 0x00808000); // see constants in ReadUTF8Char
  374. UNIT_ASSERT(n == 3);
  375. }
  376. for (w = 0x10000; w < 0x80; ++w) {
  377. WriteUTF8Char(w, n, reinterpret_cast<unsigned char*>(&u));
  378. UNIT_ASSERT((u & 0xC0C0C000) == 0x80808000); // see constants in ReadUTF8Char
  379. UNIT_ASSERT(n == 4);
  380. }
  381. }
  382. static void TestSurrogates(const char* str, const wchar16* wide, size_t wideSize) {
  383. TUtf16String w = UTF8ToWide(str);
  384. UNIT_ASSERT(w.size() == wideSize);
  385. UNIT_ASSERT(!memcmp(w.c_str(), wide, wideSize));
  386. TString s = WideToUTF8(w);
  387. UNIT_ASSERT(s == str);
  388. }
  389. void TConversionTest::TestUTF8ToWide() {
  390. TUtf16String w = UTF8ToWide(Utf8Text_);
  391. UNIT_ASSERT(w.size() == 256);
  392. UNIT_ASSERT(w.size() == UnicodeText_.size());
  393. for (int i = 0; i < 256; ++i) {
  394. UNIT_ASSERT_VALUES_EQUAL(w[i], UnicodeText_[i]);
  395. }
  396. wchar16 buffer[4] = {0};
  397. size_t written = 0;
  398. // the function must extract 2 symbols only
  399. bool result = UTF8ToWide(utext, 5, buffer, written);
  400. UNIT_ASSERT(!result);
  401. UNIT_ASSERT(buffer[0] == 0x0410);
  402. UNIT_ASSERT(buffer[1] == 0x0411);
  403. UNIT_ASSERT(buffer[2] == 0x0000);
  404. UNIT_ASSERT(buffer[3] == 0x0000);
  405. UNIT_ASSERT(written == 2);
  406. memset(buffer, 0, 4);
  407. written = 0;
  408. result = UTF8ToWide(utext, 1, buffer, written);
  409. UNIT_ASSERT(!result);
  410. UNIT_ASSERT(buffer[0] == 0x0000);
  411. UNIT_ASSERT(buffer[1] == 0x0000);
  412. UNIT_ASSERT(buffer[2] == 0x0000);
  413. UNIT_ASSERT(buffer[3] == 0x0000);
  414. UNIT_ASSERT(written == 0);
  415. w = UTF8ToWide(asciiLatinAlphabet, strlen(asciiLatinAlphabet));
  416. UNIT_ASSERT(w == wideLatinAlphabet);
  417. w = UTF8ToWide(utf8CyrillicAlphabet, strlen(utf8CyrillicAlphabet));
  418. UNIT_ASSERT(w == wideCyrillicAlphabet);
  419. const char* utf8NonBMP = "\xf4\x80\x89\x84\xf4\x80\x89\x87\xf4\x80\x88\xba";
  420. wchar16 wNonBMPDummy[] = {0xDBC0, 0xDE44, 0xDBC0, 0xDE47, 0xDBC0, 0xDE3A};
  421. TestSurrogates(utf8NonBMP, wNonBMPDummy, Y_ARRAY_SIZE(wNonBMPDummy));
  422. const char* utf8NonBMP2 = "ab\xf4\x80\x89\x87n";
  423. wchar16 wNonBMPDummy2[] = {'a', 'b', 0xDBC0, 0xDE47, 'n'};
  424. TestSurrogates(utf8NonBMP2, wNonBMPDummy2, Y_ARRAY_SIZE(wNonBMPDummy2));
  425. UNIT_ASSERT_VALUES_EQUAL(WideToUTF8(UTF8ToWide(WideToUTF8(UTF8ToWide<true>(
  426. "m\xFB\xB2\xA5\xAA\xAFyeuse.sexwebcamz.com")))),
  427. TString(
  428. "m\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDyeuse.sexwebcamz.com"));
  429. }
  430. void TConversionTest::TestWideToUTF8() {
  431. TString s = WideToUTF8(UnicodeText_);
  432. size_t len = 0;
  433. for (TUtf16String::const_iterator i = UnicodeText_.begin(), ie = UnicodeText_.end(); i != ie; ++i) {
  434. len += UTF8RuneLenByUCS(*i);
  435. }
  436. UNIT_ASSERT(s.size() == Utf8Text_.size());
  437. UNIT_ASSERT(s.size() == len);
  438. for (int i = 0; i < static_cast<int>(s.size()); ++i) {
  439. UNIT_ASSERT_VALUES_EQUAL(s[i], Utf8Text_[i]);
  440. }
  441. }
  442. void TConversionTest::TestGetNumOfUTF8Chars() {
  443. size_t n = 0;
  444. bool result = GetNumberOfUTF8Chars(Utf8Text_.c_str(), Utf8Text_.size(), n);
  445. UNIT_ASSERT(result);
  446. UNIT_ASSERT(n == 256);
  447. n = 0;
  448. result = GetNumberOfUTF8Chars(utext, 5, n);
  449. UNIT_ASSERT(!result);
  450. UNIT_ASSERT(n == 2);
  451. n = 0;
  452. result = GetNumberOfUTF8Chars(utext, 1, n);
  453. UNIT_ASSERT(!result);
  454. UNIT_ASSERT(n == 0);
  455. UNIT_ASSERT_EQUAL(GetNumberOfUTF8Chars("привет!"), 7);
  456. }
  457. void TConversionTest::TestSubstrUTF8() {
  458. TStringBuf utextBuf(utext, sizeof(utext));
  459. UNIT_ASSERT(SubstrUTF8(utextBuf, 0, 2) == utextBuf.substr(0, 4));
  460. UNIT_ASSERT(SubstrUTF8(utextBuf, 1, 1) == utextBuf.substr(2, 2));
  461. UNIT_ASSERT(SubstrUTF8(utextBuf, 1, 2) == utextBuf.substr(2, 4));
  462. UNIT_ASSERT(SubstrUTF8(utextBuf, 1, 3) == utextBuf.substr(2, 6));
  463. }
  464. inline bool MustBeSurrogate(wchar32 ch) {
  465. return ch > 0xFFFF;
  466. }
  467. void TConversionTest::TestUnicodeCase() {
  468. // ToLower, ToUpper, ToTitle functions depend on equal size of both original and changed characters
  469. for (wchar32 i = 0; i != NUnicode::UnicodeInstancesLimit(); ++i) {
  470. UNIT_ASSERT(MustBeSurrogate(i) == MustBeSurrogate(ToLower(i)));
  471. UNIT_ASSERT(MustBeSurrogate(i) == MustBeSurrogate(ToUpper(i)));
  472. UNIT_ASSERT(MustBeSurrogate(i) == MustBeSurrogate(ToTitle(i)));
  473. }
  474. }
  475. void TConversionTest::TestUnicodeDetails() {
  476. TUtf16String temp;
  477. for (wchar32 i = 0; i != NUnicode::UnicodeInstancesLimit(); ++i) {
  478. temp.clear();
  479. WriteSymbol(i, temp);
  480. UNIT_ASSERT(temp.size() == W16SymbolSize(temp.c_str(), temp.c_str() + temp.size()));
  481. }
  482. }
  483. class TWideUtilTest: public TTestBase {
  484. UNIT_TEST_SUITE(TWideUtilTest);
  485. UNIT_TEST(TestCollapse);
  486. UNIT_TEST(TestCollapseBuffer);
  487. UNIT_TEST(TestStrip);
  488. UNIT_TEST(TestIsSpace);
  489. UNIT_TEST(TestEscapeHtmlChars);
  490. UNIT_TEST(TestToLower);
  491. UNIT_TEST(TestToUpper);
  492. UNIT_TEST(TestWideString);
  493. UNIT_TEST(TestCountWideChars);
  494. UNIT_TEST(TestIsValidUTF16);
  495. UNIT_TEST(TestIsStringASCII);
  496. UNIT_TEST(TestIsLowerWordStr);
  497. UNIT_TEST(TestIsUpperWordStr);
  498. UNIT_TEST(TestIsTitleStr);
  499. UNIT_TEST(TestIsLowerStr);
  500. UNIT_TEST(TestIsUpperStr);
  501. UNIT_TEST(TestToLowerStr);
  502. UNIT_TEST(TestToUpperStr);
  503. UNIT_TEST(TestToTitleStr);
  504. UNIT_TEST_SUITE_END();
  505. public:
//! Tests the in-place Collapse(TUtf16String&) overload.
//! Each case assigns an input to `s`, collapses it and compares with the expected text;
//! the cases share `s`, so every one starts with a fresh assignment.
//! NOTE(review): in several cases the input and the expected literal are identical, so they
//! only check that the text is left as is — confirm that no run of spaces was lost from the
//! input literals.
void TestCollapse() {
    TUtf16String s;
    // every run of the characters from ws is expected to become a single ASCII space
    s.append(ws, Y_ARRAY_SIZE(ws)).append(3, 'a').append(ws, Y_ARRAY_SIZE(ws)).append(3, 'b').append(ws, Y_ARRAY_SIZE(ws));
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(" aaa bbb "));
    {
        // nothing to collapse: the content must stay the same
        const TUtf16String w(ASCIIToWide(" a b c "));
        s = w;
        Collapse(s);
        UNIT_ASSERT(s == w);
#ifndef TSTRING_IS_STD_STRING
        // pointer comparison: `s` must still use the same character buffer as `w`
        UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
#endif
    }
    s = ASCIIToWide(" 123 456 ");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(" 123 456 "));
    // mixed runs of line feeds, tabs, spaces and form feeds
    s = ASCIIToWide(" 1\n\n\n23\t 4\f\f56 ");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(" 1 23 4 56 "));
    s = ASCIIToWide(" 1\n\n\n\f\f56 ");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(" 1 56 "));
    // whitespace next to punctuation
    s = ASCIIToWide(" 1\r\n,\n(\n23\t 4\f\f56 ");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(" 1 , ( 23 4 56 "));
    s = ASCIIToWide("1 23 ");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide("1 23 "));
    {
        // a single space is already collapsed
        const TUtf16String w = ASCIIToWide(" ");
        s = w;
        Collapse(s);
        UNIT_ASSERT(s == w);
#ifndef TSTRING_IS_STD_STRING
        // pointer comparison: `s` must still use the same character buffer as `w`
        UNIT_ASSERT(s.c_str() == w.c_str()); // Collapse() does not change the string at all
#endif
    }
    s = ASCIIToWide(" ");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(" "));
    // CR LF between punctuation characters becomes one space
    s = ASCIIToWide(",\r\n\"");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide(", \""));
    // no whitespace at all
    s = ASCIIToWide("-");
    Collapse(s);
    UNIT_ASSERT(s == ASCIIToWide("-"));
    // empty input stays empty
    s.clear();
    Collapse(s);
    UNIT_ASSERT(s == TUtf16String());
}
//! Tests the buffer overload Collapse(begin, size), which works on the characters in place
//! and returns the resulting length; the caller shrinks the string with resize(n).
//! The cases share `s` and `n`, so every one starts with a fresh assignment.
//! NOTE(review): in several cases the input and the expected literal are identical, so they
//! only check that the text is left as is — confirm that no run of spaces was lost from the
//! input literals.
void TestCollapseBuffer() {
    TUtf16String s;
    // every run of the characters from ws is expected to become a single ASCII space
    s.append(ws, Y_ARRAY_SIZE(ws)).append(3, 'a').append(ws, Y_ARRAY_SIZE(ws)).append(3, 'b').append(ws, Y_ARRAY_SIZE(ws));
    size_t n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(" aaa bbb "));
    // nothing to collapse: the returned length equals the original one
    s = ASCIIToWide(" a b c ");
    n = Collapse(s.begin(), s.size());
    UNIT_ASSERT(n == s.size()); // length was not changed
    UNIT_ASSERT(s == ASCIIToWide(" a b c "));
    s = ASCIIToWide(" 123 456 ");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(" 123 456 "));
    // mixed runs of line feeds, tabs, spaces and form feeds
    s = ASCIIToWide(" 1\n\n\n23\t 4\f\f56 ");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(" 1 23 4 56 "));
    s = ASCIIToWide(" 1\n\n\n\f\f56 ");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(" 1 56 "));
    // whitespace next to punctuation
    s = ASCIIToWide(" 1\r\n,\n(\n23\t 4\f\f56 ");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(" 1 , ( 23 4 56 "));
    s = ASCIIToWide("1 23 ");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide("1 23 "));
    // a single space is already collapsed
    s = ASCIIToWide(" ");
    n = Collapse(s.begin(), s.size());
    UNIT_ASSERT(n == 1);
    UNIT_ASSERT(s == ASCIIToWide(" "));
    s = ASCIIToWide(" ");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(" "));
    // CR LF between punctuation characters becomes one space
    s = ASCIIToWide(",\r\n\"");
    n = Collapse(s.begin(), s.size());
    s.resize(n);
    UNIT_ASSERT(s == ASCIIToWide(", \""));
    // no whitespace at all
    s = ASCIIToWide("-");
    n = Collapse(s.begin(), s.size());
    UNIT_ASSERT(n == 1);
    UNIT_ASSERT(s == ASCIIToWide("-"));
    // a lone tab is replaced by a space even though the length stays the same
    s = ASCIIToWide("\t");
    n = Collapse(s.begin(), s.size());
    UNIT_ASSERT(n == 1);
    UNIT_ASSERT(s == ASCIIToWide(" "));
    // empty input: zero length in, zero length out
    s.clear();
    n = Collapse(s.begin(), s.size());
    UNIT_ASSERT(n == 0);
    UNIT_ASSERT(s == TUtf16String());
}
  612. void TestStrip() {
  613. TUtf16String s;
  614. Strip(s);
  615. UNIT_ASSERT(s == TUtf16String());
  616. StripLeft(s);
  617. UNIT_ASSERT(s == TUtf16String());
  618. StripRight(s);
  619. UNIT_ASSERT(s == TUtf16String());
  620. s = ASCIIToWide(" \t\r\n");
  621. Strip(s);
  622. UNIT_ASSERT(s == TUtf16String());
  623. s = ASCIIToWide(" \t\r\n");
  624. StripLeft(s);
  625. UNIT_ASSERT(s == TUtf16String());
  626. s = ASCIIToWide(" \t\r\n");
  627. StripRight(s);
  628. UNIT_ASSERT(s == TUtf16String());
  629. s = ASCIIToWide("\t\f\va \r\n");
  630. Strip(s);
  631. UNIT_ASSERT(s == ASCIIToWide("a"));
  632. s = ASCIIToWide("\t\f\va \r\n");
  633. StripLeft(s);
  634. UNIT_ASSERT(s == ASCIIToWide("a \r\n"));
  635. s = ASCIIToWide("\t\f\va \r\n");
  636. StripRight(s);
  637. UNIT_ASSERT(s == ASCIIToWide("\t\f\va"));
  638. s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
  639. Strip(s);
  640. UNIT_ASSERT(s == ASCIIToWide("a\r\nb\t\tc"));
  641. s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
  642. StripLeft(s);
  643. UNIT_ASSERT(s == ASCIIToWide("a\r\nb\t\tc\r\n"));
  644. s = ASCIIToWide("\r\na\r\nb\t\tc\r\n");
  645. StripRight(s);
  646. UNIT_ASSERT(s == ASCIIToWide("\r\na\r\nb\t\tc"));
  647. const TUtf16String w(ASCIIToWide("a b"));
  648. s = w;
  649. Strip(s);
  650. UNIT_ASSERT(s == w);
  651. #ifndef TSTRING_IS_STD_STRING
  652. UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
  653. #endif
  654. s = w;
  655. StripLeft(s);
  656. UNIT_ASSERT(s == w);
  657. #ifndef TSTRING_IS_STD_STRING
  658. UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
  659. #endif
  660. s = w;
  661. StripRight(s);
  662. UNIT_ASSERT(s == w);
  663. #ifndef TSTRING_IS_STD_STRING
  664. UNIT_ASSERT(s.c_str() == w.c_str()); // Strip() does not change the string at all
  665. #endif
  666. }
  667. void TestIsSpace() {
  668. UNIT_ASSERT(!IsSpace(TUtf16String()));
  669. UNIT_ASSERT(IsSpace(ws, Y_ARRAY_SIZE(ws)));
  670. TUtf16String w;
  671. w.assign(ws, Y_ARRAY_SIZE(ws)).append(TUtf16String(1, '!'));
  672. UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
  673. w.assign(TUtf16String(1, '_')).append(ws, Y_ARRAY_SIZE(ws));
  674. UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
  675. w.assign(ws, Y_ARRAY_SIZE(ws)).append(TUtf16String(1, '$')).append(ws, Y_ARRAY_SIZE(ws));
  676. UNIT_ASSERT(!IsSpace(w.c_str(), w.size()));
  677. }
  678. void TestEscapeHtmlChars() {
  679. // characters from the first half of the ASCII table
  680. for (wchar16 c = 1; c < 0x7F; ++c) {
  681. TUtf16String w(1, c);
  682. EscapeHtmlChars<false>(w);
  683. switch (c) {
  684. case '<':
  685. UNIT_ASSERT(w == ASCIIToWide("&lt;"));
  686. break;
  687. case '>':
  688. UNIT_ASSERT(w == ASCIIToWide("&gt;"));
  689. break;
  690. case '&':
  691. UNIT_ASSERT(w == ASCIIToWide("&amp;"));
  692. break;
  693. case '"':
  694. UNIT_ASSERT(w == ASCIIToWide("&quot;"));
  695. break;
  696. default:
  697. UNIT_ASSERT(w == TUtf16String(1, c));
  698. break;
  699. }
  700. }
  701. for (wchar16 c = 1; c < 0x7F; ++c) {
  702. TUtf16String w(1, c);
  703. EscapeHtmlChars<true>(w);
  704. switch (c) {
  705. case '<':
  706. UNIT_ASSERT(w == ASCIIToWide("&lt;"));
  707. break;
  708. case '>':
  709. UNIT_ASSERT(w == ASCIIToWide("&gt;"));
  710. break;
  711. case '&':
  712. UNIT_ASSERT(w == ASCIIToWide("&amp;"));
  713. break;
  714. case '"':
  715. UNIT_ASSERT(w == ASCIIToWide("&quot;"));
  716. break;
  717. case '\r':
  718. case '\n':
  719. UNIT_ASSERT(w == ASCIIToWide("<BR>"));
  720. break;
  721. default:
  722. UNIT_ASSERT(w == TUtf16String(1, c));
  723. break;
  724. }
  725. }
  726. }
  727. void TestToLower() {
  728. const size_t n = 32;
  729. wchar16 upperCase[n];
  730. std::copy(wideCyrillicAlphabet, wideCyrillicAlphabet + n, upperCase);
  731. ToLower(upperCase, n);
  732. UNIT_ASSERT(TWtringBuf(upperCase, n) == TWtringBuf(wideCyrillicAlphabet + n, n));
  733. }
  734. void TestToUpper() {
  735. const size_t n = 32;
  736. wchar16 lowerCase[n];
  737. std::copy(wideCyrillicAlphabet + n, wideCyrillicAlphabet + n * 2, lowerCase);
  738. ToUpper(lowerCase, n);
  739. UNIT_ASSERT(TWtringBuf(lowerCase, n) == TWtringBuf(wideCyrillicAlphabet, n));
  740. }
  741. void TestWideString() {
  742. const TUtf16String original = UTF32ToWide(WideStringTestData[0], CaseTestDataSize);
  743. const TUtf16String lower = UTF32ToWide(WideStringTestData[1], CaseTestDataSize);
  744. const TUtf16String upper = UTF32ToWide(WideStringTestData[2], CaseTestDataSize);
  745. const TUtf16String title = UTF32ToWide(WideStringTestData[3], CaseTestDataSize);
  746. TUtf16String temp;
  747. temp = original;
  748. temp.to_lower();
  749. UNIT_ASSERT(temp == lower);
  750. temp = original;
  751. ToLower(temp.begin(), temp.size());
  752. UNIT_ASSERT(temp == lower);
  753. temp = original;
  754. temp.to_upper();
  755. UNIT_ASSERT(temp == upper);
  756. temp = original;
  757. ToUpper(temp.begin(), temp.size());
  758. UNIT_ASSERT(temp == upper);
  759. temp = original;
  760. temp.to_title();
  761. UNIT_ASSERT(temp == title);
  762. temp = original;
  763. ToTitle(temp.begin(), temp.size());
  764. UNIT_ASSERT(temp == title);
  765. TVector<wchar32> buffer(WideStringTestData[0], WideStringTestData[0] + CaseTestDataSize);
  766. std::reverse(buffer.begin(), buffer.end());
  767. const TUtf16String reversed = UTF32ToWide(buffer.data(), buffer.size());
  768. temp = original;
  769. ReverseInPlace(temp);
  770. UNIT_ASSERT(temp == reversed);
  771. }
  772. void TestCountWideChars() {
  773. UNIT_ASSERT_EQUAL(CountWideChars(UTF8ToWide("привет!")), 7);
  774. TUtf16String wideStr = UTF8ToWide("\xf0\x9f\x92\xb8привет!");
  775. UNIT_ASSERT_EQUAL(wideStr.size(), 9);
  776. UNIT_ASSERT_EQUAL(CountWideChars(wideStr), 8);
  777. }
  778. void TestIsValidUTF16() {
  779. static wchar16 str1[] = {'h', 'e', 'l', 'l', 'o', '!', 0};
  780. static wchar16 str2[] = {'h', 'e', 'l', 'l', 'o', 0xD842, 0xDEAD, '!', 0};
  781. static wchar16 str3[] = {'h', 'e', 'l', 'l', 'o', 0xD842, '!', 0};
  782. static wchar16 str4[] = {'h', 'e', 'l', 'l', 'o', 0xDEAD, 0xD842, '!', 0};
  783. static wchar16 str5[] = {'h', 'e', 'l', 'l', 'o', 0xD842, 0xDEAD, 0xDEAD, '!', 0};
  784. UNIT_ASSERT(IsValidUTF16(TWtringBuf(str1)));
  785. UNIT_ASSERT(IsValidUTF16(TWtringBuf(str2)));
  786. UNIT_ASSERT(!IsValidUTF16(TWtringBuf(str3)));
  787. UNIT_ASSERT(!IsValidUTF16(TWtringBuf(str4)));
  788. UNIT_ASSERT(!IsValidUTF16(TWtringBuf(str5)));
  789. }
  790. void TestIsStringASCII() {
  791. static char charAscii[] = "0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF";
  792. static wchar16 char16Ascii[] = {
  793. '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'A',
  794. 'B', 'C', 'D', 'E', 'F', '0', '1', '2', '3', '4', '5', '6',
  795. '7', '8', '9', '0', 'A', 'B', 'C', 'D', 'E', 'F', 0};
  796. // Test a variety of the fragment start positions and lengths in order to make
  797. // sure that bit masking in IsStringASCII works correctly.
  798. // Also, test that a non-ASCII character will be detected regardless of its
  799. // position inside the string.
  800. {
  801. const size_t stringLength = Y_ARRAY_SIZE(charAscii) - 1;
  802. for (size_t offset = 0; offset < 8; ++offset) {
  803. for (size_t len = 0, maxLen = stringLength - offset; len < maxLen; ++len) {
  804. UNIT_ASSERT(IsStringASCII(charAscii + offset, charAscii + offset + len));
  805. for (size_t charPos = offset; charPos < len; ++charPos) {
  806. charAscii[charPos] |= '\x80';
  807. UNIT_ASSERT(!IsStringASCII(charAscii + offset, charAscii + offset + len));
  808. charAscii[charPos] &= ~'\x80';
  809. }
  810. }
  811. }
  812. }
  813. {
  814. const size_t stringLength = Y_ARRAY_SIZE(char16Ascii) - 1;
  815. for (size_t offset = 0; offset < 4; ++offset) {
  816. for (size_t len = 0, maxLen = stringLength - offset; len < maxLen; ++len) {
  817. UNIT_ASSERT(IsStringASCII(char16Ascii + offset, char16Ascii + offset + len));
  818. for (size_t charPos = offset; charPos < len; ++charPos) {
  819. char16Ascii[charPos] |= 0x80;
  820. UNIT_ASSERT(
  821. !IsStringASCII(char16Ascii + offset, char16Ascii + offset + len));
  822. char16Ascii[charPos] &= ~0x80;
  823. // Also test when the upper half is non-zero.
  824. char16Ascii[charPos] |= 0x100;
  825. UNIT_ASSERT(
  826. !IsStringASCII(char16Ascii + offset, char16Ascii + offset + len));
  827. char16Ascii[charPos] &= ~0x100;
  828. }
  829. }
  830. }
  831. }
  832. }
  833. void TestIsLowerWordStr() {
  834. UNIT_ASSERT(IsLowerWord(TWtringBuf()));
  835. UNIT_ASSERT(IsLowerWord(UTF8ToWide("")));
  836. UNIT_ASSERT(IsLowerWord(UTF8ToWide("test")));
  837. UNIT_ASSERT(IsLowerWord(UTF8ToWide("тест"))); // "тест" is "test" in russian (cyrrilic)
  838. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("тест тест")));
  839. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("тест100500")));
  840. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("Test")));
  841. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("tesT")));
  842. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("tEst")));
  843. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("Тест")));
  844. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("теСт")));
  845. UNIT_ASSERT(!IsLowerWord(UTF8ToWide("тесТ")));
  846. }
  847. void TestIsUpperWordStr() {
  848. UNIT_ASSERT(IsUpperWord(TWtringBuf()));
  849. UNIT_ASSERT(IsUpperWord(UTF8ToWide("")));
  850. UNIT_ASSERT(IsUpperWord(UTF8ToWide("TEST")));
  851. UNIT_ASSERT(IsUpperWord(UTF8ToWide("ТЕСТ")));
  852. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("тест тест")));
  853. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("тест100500")));
  854. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("Test")));
  855. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("tesT")));
  856. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("tEst")));
  857. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("Тест")));
  858. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("теСт")));
  859. UNIT_ASSERT(!IsUpperWord(UTF8ToWide("тесТ")));
  860. }
  861. void TestIsTitleStr() {
  862. UNIT_ASSERT(!IsTitleWord(TWtringBuf()));
  863. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("")));
  864. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("t")));
  865. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("й")));
  866. UNIT_ASSERT(IsTitleWord(UTF8ToWide("T")));
  867. UNIT_ASSERT(IsTitleWord(UTF8ToWide("Й")));
  868. UNIT_ASSERT(IsTitleWord(UTF8ToWide("Test")));
  869. UNIT_ASSERT(IsTitleWord(UTF8ToWide("Тест")));
  870. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("тест тест")));
  871. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("тест100500")));
  872. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("Тест тест")));
  873. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("Тест100500")));
  874. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("tesT")));
  875. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("tEst")));
  876. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("теСт")));
  877. UNIT_ASSERT(!IsTitleWord(UTF8ToWide("тесТ")));
  878. }
  879. void TestIsLowerStr() {
  880. UNIT_ASSERT(IsLower(TWtringBuf()));
  881. UNIT_ASSERT(IsLower(UTF8ToWide("")));
  882. UNIT_ASSERT(IsLower(UTF8ToWide("test")));
  883. UNIT_ASSERT(IsLower(UTF8ToWide("тест"))); // "тест" is "test" in russian (cyrrilic)
  884. UNIT_ASSERT(IsLower(UTF8ToWide("тест тест")));
  885. UNIT_ASSERT(IsLower(UTF8ToWide("тест100500")));
  886. UNIT_ASSERT(!IsLower(UTF8ToWide("Test")));
  887. UNIT_ASSERT(!IsLower(UTF8ToWide("tesT")));
  888. UNIT_ASSERT(!IsLower(UTF8ToWide("tEst")));
  889. UNIT_ASSERT(!IsLower(UTF8ToWide("Тест")));
  890. UNIT_ASSERT(!IsLower(UTF8ToWide("теСт")));
  891. UNIT_ASSERT(!IsLower(UTF8ToWide("тесТ")));
  892. }
  893. void TestIsUpperStr() {
  894. UNIT_ASSERT(IsUpper(TWtringBuf()));
  895. UNIT_ASSERT(IsUpper(UTF8ToWide("")));
  896. UNIT_ASSERT(IsUpper(UTF8ToWide("TEST")));
  897. UNIT_ASSERT(IsUpper(UTF8ToWide("ТЕСТ")));
  898. UNIT_ASSERT(IsUpper(UTF8ToWide("ТЕСТ ТЕСТ")));
  899. UNIT_ASSERT(IsUpper(UTF8ToWide("ТЕСТ100500")));
  900. UNIT_ASSERT(!IsUpper(UTF8ToWide("Test")));
  901. UNIT_ASSERT(!IsUpper(UTF8ToWide("tesT")));
  902. UNIT_ASSERT(!IsUpper(UTF8ToWide("tEst")));
  903. UNIT_ASSERT(!IsUpper(UTF8ToWide("Тест")));
  904. UNIT_ASSERT(!IsUpper(UTF8ToWide("теСт")));
  905. UNIT_ASSERT(!IsUpper(UTF8ToWide("тесТ")));
  906. }
  907. void TestToLowerStr() {
  908. // In these test and test for `ToUpper` and `ToTitle` we are checking that string keep
  909. // pointing to the same piece of memory we are doing it the following way:
  910. //
  911. // TUtf16String s = ...
  912. // const auto copy = s;
  913. // ...
  914. // UNIT_ASSERT(s.data() == copy.data())
  915. //
  916. // It saves us a couple lines (we are reusing `copy` later) and if one day `TString` will
  917. // become non-refcounted we'll need to rewrite it to something like:
  918. //
  919. // TUtf16String s = ...
  920. // const auto* const data = s.data();
  921. // const auto length = s.length();
  922. // ...
  923. // UNIT_ASSERT(s.data() == data);
  924. // UNIT_ASSERT(s.length() == length);
  925. {
  926. TUtf16String s;
  927. auto writableCopy = s;
  928. const auto copy = s;
  929. const TUtf16String lower;
  930. UNIT_ASSERT(!ToLower(s));
  931. UNIT_ASSERT(s == lower);
  932. #ifndef TSTRING_IS_STD_STRING
  933. UNIT_ASSERT(s.data() == copy.data());
  934. #endif
  935. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  936. UNIT_ASSERT(writableCopy == lower);
  937. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  938. UNIT_ASSERT(writableCopy == lower);
  939. UNIT_ASSERT(ToLowerRet(copy) == lower);
  940. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  941. }
  942. {
  943. TUtf16String s = UTF8ToWide("");
  944. auto writableCopy = s;
  945. const auto copy = s;
  946. const TUtf16String lower;
  947. UNIT_ASSERT(!ToLower(s));
  948. UNIT_ASSERT(s == lower);
  949. #ifndef TSTRING_IS_STD_STRING
  950. UNIT_ASSERT(s.data() == copy.data());
  951. #endif
  952. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  953. UNIT_ASSERT(writableCopy == lower);
  954. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  955. UNIT_ASSERT(writableCopy == lower);
  956. UNIT_ASSERT(ToLowerRet(copy) == lower);
  957. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  958. }
  959. {
  960. TUtf16String s;
  961. const auto copy = s;
  962. const TUtf16String lower;
  963. UNIT_ASSERT(!ToLower(s, 100500));
  964. UNIT_ASSERT(s == lower);
  965. #ifndef TSTRING_IS_STD_STRING
  966. UNIT_ASSERT(s.data() == copy.data());
  967. #endif
  968. UNIT_ASSERT(ToLowerRet(copy, 100500) == lower);
  969. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 100500) == lower);
  970. }
  971. {
  972. TUtf16String s;
  973. const auto copy = s;
  974. const TUtf16String lower;
  975. UNIT_ASSERT(!ToLower(s, 100500, 1111));
  976. UNIT_ASSERT(s == lower);
  977. #ifndef TSTRING_IS_STD_STRING
  978. UNIT_ASSERT(s.data() == copy.data());
  979. #endif
  980. UNIT_ASSERT(ToLowerRet(copy, 100500, 1111) == lower);
  981. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 100500, 1111) == lower);
  982. }
  983. {
  984. auto s = UTF8ToWide("Й");
  985. auto writableCopy = s;
  986. const auto copy = s;
  987. const auto lower = UTF8ToWide("й");
  988. UNIT_ASSERT(ToLower(s));
  989. UNIT_ASSERT(s == lower);
  990. UNIT_ASSERT(ToLower(writableCopy.Detach(), writableCopy.size()));
  991. UNIT_ASSERT(writableCopy == lower);
  992. UNIT_ASSERT(ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  993. UNIT_ASSERT(writableCopy == lower);
  994. UNIT_ASSERT(ToLowerRet(copy) == lower);
  995. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  996. }
  997. {
  998. auto s = UTF8ToWide("й");
  999. auto writableCopy = s;
  1000. const auto copy = s;
  1001. const auto lower = UTF8ToWide("й");
  1002. UNIT_ASSERT(!ToLower(s));
  1003. UNIT_ASSERT(s == lower);
  1004. #ifndef TSTRING_IS_STD_STRING
  1005. UNIT_ASSERT(s.data() == copy.data());
  1006. #endif
  1007. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  1008. UNIT_ASSERT(writableCopy == lower);
  1009. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1010. UNIT_ASSERT(writableCopy == lower);
  1011. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1012. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1013. }
  1014. {
  1015. auto s = UTF8ToWide("тест");
  1016. auto writableCopy = s;
  1017. const auto copy = s;
  1018. const auto lower = UTF8ToWide("тест");
  1019. UNIT_ASSERT(!ToLower(s));
  1020. UNIT_ASSERT(s == lower);
  1021. #ifndef TSTRING_IS_STD_STRING
  1022. UNIT_ASSERT(s.data() == copy.data());
  1023. #endif
  1024. UNIT_ASSERT(!ToLower(writableCopy.Detach(), writableCopy.size()));
  1025. UNIT_ASSERT(writableCopy == lower);
  1026. UNIT_ASSERT(!ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1027. UNIT_ASSERT(writableCopy == lower);
  1028. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1029. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1030. }
  1031. {
  1032. auto s = UTF8ToWide("Тест");
  1033. auto writableCopy = s;
  1034. const auto copy = s;
  1035. const auto lower = UTF8ToWide("тест");
  1036. UNIT_ASSERT(ToLower(s));
  1037. UNIT_ASSERT(s == lower);
  1038. UNIT_ASSERT(ToLower(writableCopy.Detach(), writableCopy.size()));
  1039. UNIT_ASSERT(writableCopy == lower);
  1040. UNIT_ASSERT(ToLower(copy.data(), copy.size(), writableCopy.Detach()));
  1041. UNIT_ASSERT(writableCopy == lower);
  1042. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1043. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1044. }
  1045. {
  1046. TUtf16String s = UTF8ToWide("тЕст");
  1047. const auto copy = s;
  1048. const auto lower = UTF8ToWide("тест");
  1049. UNIT_ASSERT(ToLower(s));
  1050. UNIT_ASSERT(s == UTF8ToWide("тест"));
  1051. UNIT_ASSERT(ToLowerRet(copy) == lower);
  1052. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy)) == lower);
  1053. }
  1054. {
  1055. auto s = UTF8ToWide("тЕст");
  1056. const auto copy = s;
  1057. const auto lower = UTF8ToWide("тЕст");
  1058. UNIT_ASSERT(!ToLower(s, 2));
  1059. UNIT_ASSERT(s == lower);
  1060. #ifndef TSTRING_IS_STD_STRING
  1061. UNIT_ASSERT(s.data() == copy.data());
  1062. #endif
  1063. UNIT_ASSERT(ToLowerRet(copy, 2) == lower);
  1064. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 2) == lower);
  1065. }
  1066. {
  1067. auto s = UTF8ToWide("теСт");
  1068. const auto copy = s;
  1069. const auto lower = UTF8ToWide("тест");
  1070. UNIT_ASSERT(ToLower(s, 2));
  1071. UNIT_ASSERT(s == lower);
  1072. UNIT_ASSERT(ToLowerRet(copy, 2) == lower);
  1073. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 2) == lower);
  1074. }
  1075. {
  1076. auto s = UTF8ToWide("теСт");
  1077. const auto copy = s;
  1078. const auto lower = UTF8ToWide("теСт");
  1079. UNIT_ASSERT(!ToLower(s, 3, 1));
  1080. UNIT_ASSERT(s == copy);
  1081. #ifndef TSTRING_IS_STD_STRING
  1082. UNIT_ASSERT(s.data() == copy.data());
  1083. #endif
  1084. UNIT_ASSERT(ToLowerRet(copy, 3, 1) == lower);
  1085. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 3, 1) == lower);
  1086. }
  1087. {
  1088. auto s = UTF8ToWide("теСт");
  1089. const auto copy = s;
  1090. const auto lower = UTF8ToWide("теСт");
  1091. UNIT_ASSERT(!ToLower(s, 3, 100500));
  1092. UNIT_ASSERT(s == copy);
  1093. #ifndef TSTRING_IS_STD_STRING
  1094. UNIT_ASSERT(s.data() == copy.data());
  1095. #endif
  1096. UNIT_ASSERT(ToLowerRet(copy, 3, 100500) == lower);
  1097. UNIT_ASSERT(ToLowerRet(TWtringBuf(copy), 3, 100500) == lower);
  1098. }
  1099. }
  1100. void TestToUpperStr() {
  1101. {
  1102. TUtf16String s;
  1103. auto writableCopy = s;
  1104. const auto copy = s;
  1105. const TUtf16String upper;
  1106. UNIT_ASSERT(!ToUpper(s));
  1107. UNIT_ASSERT(s == upper);
  1108. #ifndef TSTRING_IS_STD_STRING
  1109. UNIT_ASSERT(s.data() == copy.data());
  1110. #endif
  1111. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1112. UNIT_ASSERT(writableCopy == upper);
  1113. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1114. UNIT_ASSERT(writableCopy == upper);
  1115. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1116. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1117. }
  1118. {
  1119. auto s = UTF8ToWide("");
  1120. auto writableCopy = s;
  1121. const auto copy = s;
  1122. const TUtf16String upper;
  1123. UNIT_ASSERT(!ToUpper(s));
  1124. UNIT_ASSERT(s == upper);
  1125. #ifndef TSTRING_IS_STD_STRING
  1126. UNIT_ASSERT(s.data() == copy.data());
  1127. #endif
  1128. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1129. UNIT_ASSERT(writableCopy == upper);
  1130. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1131. UNIT_ASSERT(writableCopy == upper);
  1132. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1133. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1134. }
  1135. {
  1136. TUtf16String s;
  1137. auto writableCopy = s;
  1138. const auto copy = s;
  1139. const TUtf16String upper;
  1140. UNIT_ASSERT(!ToUpper(s, 100500));
  1141. UNIT_ASSERT(s == upper);
  1142. #ifndef TSTRING_IS_STD_STRING
  1143. UNIT_ASSERT(s.data() == copy.data());
  1144. #endif
  1145. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1146. UNIT_ASSERT(writableCopy == upper);
  1147. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1148. UNIT_ASSERT(writableCopy == upper);
  1149. UNIT_ASSERT(ToUpperRet(copy, 100500) == upper);
  1150. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 100500) == upper);
  1151. }
  1152. {
  1153. TUtf16String s;
  1154. const auto copy = s;
  1155. const TUtf16String upper;
  1156. UNIT_ASSERT(!ToUpper(s, 100500, 1111));
  1157. UNIT_ASSERT(s == upper);
  1158. #ifndef TSTRING_IS_STD_STRING
  1159. UNIT_ASSERT(s.data() == copy.data());
  1160. #endif
  1161. UNIT_ASSERT(ToUpperRet(copy, 100500, 1111) == upper);
  1162. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 100500, 1111) == upper);
  1163. }
  1164. {
  1165. auto s = UTF8ToWide("й");
  1166. auto writableCopy = s;
  1167. const auto copy = s;
  1168. const auto upper = UTF8ToWide("Й");
  1169. UNIT_ASSERT(ToUpper(s));
  1170. UNIT_ASSERT(s == upper);
  1171. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1172. UNIT_ASSERT(writableCopy == upper);
  1173. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1174. UNIT_ASSERT(writableCopy == upper);
  1175. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1176. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1177. }
  1178. {
  1179. auto s = UTF8ToWide("Й");
  1180. auto writableCopy = s;
  1181. const auto copy = s;
  1182. const auto upper = UTF8ToWide("Й");
  1183. UNIT_ASSERT(!ToUpper(s));
  1184. UNIT_ASSERT(s == copy);
  1185. #ifndef TSTRING_IS_STD_STRING
  1186. UNIT_ASSERT(s.data() == copy.data());
  1187. #endif
  1188. UNIT_ASSERT(!ToUpper(writableCopy.Detach(), writableCopy.size()));
  1189. UNIT_ASSERT(writableCopy == upper);
  1190. UNIT_ASSERT(!ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1191. UNIT_ASSERT(writableCopy == upper);
  1192. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1193. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1194. }
  1195. {
  1196. auto s = UTF8ToWide("тест");
  1197. auto writableCopy = s;
  1198. const auto copy = s;
  1199. const auto upper = UTF8ToWide("ТЕСТ");
  1200. UNIT_ASSERT(ToUpper(s));
  1201. UNIT_ASSERT(s == upper);
  1202. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1203. UNIT_ASSERT(writableCopy == upper);
  1204. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1205. UNIT_ASSERT(writableCopy == upper);
  1206. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1207. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1208. }
  1209. {
  1210. auto s = UTF8ToWide("Тест");
  1211. auto writableCopy = s;
  1212. const auto copy = s;
  1213. const auto upper = UTF8ToWide("ТЕСТ");
  1214. UNIT_ASSERT(ToUpper(s));
  1215. UNIT_ASSERT(s == upper);
  1216. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1217. UNIT_ASSERT(writableCopy == upper);
  1218. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1219. UNIT_ASSERT(writableCopy == upper);
  1220. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1221. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1222. }
  1223. {
  1224. auto s = UTF8ToWide("тЕст");
  1225. auto writableCopy = s;
  1226. const auto copy = s;
  1227. const auto upper = UTF8ToWide("ТЕСТ");
  1228. UNIT_ASSERT(ToUpper(s));
  1229. UNIT_ASSERT(s == upper);
  1230. UNIT_ASSERT(ToUpper(writableCopy.Detach(), writableCopy.size()));
  1231. UNIT_ASSERT(writableCopy == upper);
  1232. UNIT_ASSERT(ToUpper(copy.data(), copy.size(), writableCopy.Detach()));
  1233. UNIT_ASSERT(writableCopy == upper);
  1234. UNIT_ASSERT(ToUpperRet(copy) == upper);
  1235. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy)) == upper);
  1236. }
  1237. {
  1238. auto s = UTF8ToWide("тЕст");
  1239. const auto copy = s;
  1240. const auto upper = UTF8ToWide("тЕСТ");
  1241. UNIT_ASSERT(ToUpper(s, 2));
  1242. UNIT_ASSERT(s == upper);
  1243. UNIT_ASSERT(ToUpperRet(copy, 2) == upper);
  1244. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 2) == upper);
  1245. }
  1246. {
  1247. auto s = UTF8ToWide("теСт");
  1248. const auto copy = s;
  1249. const auto upper = UTF8ToWide("теСТ");
  1250. UNIT_ASSERT(ToUpper(s, 2));
  1251. UNIT_ASSERT(s == upper);
  1252. UNIT_ASSERT(ToUpperRet(copy, 2) == upper);
  1253. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 2) == upper);
  1254. }
  1255. {
  1256. auto s = UTF8ToWide("теСт");
  1257. const auto copy = s;
  1258. const auto upper = UTF8ToWide("теСТ");
  1259. UNIT_ASSERT(ToUpper(s, 3, 1));
  1260. UNIT_ASSERT(s == upper);
  1261. UNIT_ASSERT(ToUpperRet(copy, 3, 1) == upper);
  1262. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 3, 1) == upper);
  1263. }
  1264. {
  1265. auto s = UTF8ToWide("теСт");
  1266. const auto copy = s;
  1267. const auto upper = UTF8ToWide("теСТ");
  1268. UNIT_ASSERT(ToUpper(s, 3, 100500));
  1269. UNIT_ASSERT(s == upper);
  1270. UNIT_ASSERT(ToUpperRet(copy, 3, 100500) == upper);
  1271. UNIT_ASSERT(ToUpperRet(TWtringBuf(copy), 3, 100500) == upper);
  1272. }
  1273. }
  1274. void TestToTitleStr() {
  1275. {
  1276. TUtf16String s;
  1277. auto writableCopy = s;
  1278. const auto copy = s;
  1279. const TUtf16String title;
  1280. UNIT_ASSERT(!ToTitle(s));
  1281. UNIT_ASSERT(s == title);
  1282. #ifndef TSTRING_IS_STD_STRING
  1283. UNIT_ASSERT(s.data() == copy.data());
  1284. #endif
  1285. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1286. UNIT_ASSERT(writableCopy == title);
  1287. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1288. UNIT_ASSERT(writableCopy == title);
  1289. UNIT_ASSERT(ToTitleRet(copy) == title);
  1290. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1291. }
  1292. {
  1293. auto s = UTF8ToWide("");
  1294. auto writableCopy = s;
  1295. const auto copy = s;
  1296. const TUtf16String title;
  1297. UNIT_ASSERT(!ToTitle(s));
  1298. UNIT_ASSERT(s == title);
  1299. #ifndef TSTRING_IS_STD_STRING
  1300. UNIT_ASSERT(s.data() == copy.data());
  1301. #endif
  1302. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1303. UNIT_ASSERT(writableCopy == title);
  1304. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1305. UNIT_ASSERT(writableCopy == title);
  1306. UNIT_ASSERT(ToTitleRet(copy) == title);
  1307. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1308. }
  1309. {
  1310. TUtf16String s;
  1311. const auto copy = s;
  1312. const TUtf16String title;
  1313. UNIT_ASSERT(!ToTitle(s, 100500));
  1314. UNIT_ASSERT(s == title);
  1315. #ifndef TSTRING_IS_STD_STRING
  1316. UNIT_ASSERT(s.data() == copy.data());
  1317. #endif
  1318. UNIT_ASSERT(ToTitleRet(copy) == title);
  1319. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1320. }
  1321. {
  1322. TUtf16String s;
  1323. const auto copy = s;
  1324. const TUtf16String title;
  1325. UNIT_ASSERT(!ToTitle(s, 100500, 1111));
  1326. UNIT_ASSERT(s == title);
  1327. #ifndef TSTRING_IS_STD_STRING
  1328. UNIT_ASSERT(s.data() == copy.data());
  1329. #endif
  1330. UNIT_ASSERT(ToTitleRet(copy) == title);
  1331. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1332. }
  1333. {
  1334. auto s = UTF8ToWide("й");
  1335. auto writableCopy = s;
  1336. const auto copy = s;
  1337. const auto title = UTF8ToWide("Й");
  1338. UNIT_ASSERT(ToTitle(s));
  1339. UNIT_ASSERT(s == title);
  1340. UNIT_ASSERT(ToTitle(writableCopy.Detach(), writableCopy.size()));
  1341. UNIT_ASSERT(writableCopy == title);
  1342. UNIT_ASSERT(ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1343. UNIT_ASSERT(writableCopy == title);
  1344. UNIT_ASSERT(ToTitleRet(copy) == title);
  1345. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1346. }
  1347. {
  1348. auto s = UTF8ToWide("Й");
  1349. auto writableCopy = s;
  1350. const auto copy = s;
  1351. const auto title = UTF8ToWide("Й");
  1352. UNIT_ASSERT(!ToTitle(s));
  1353. UNIT_ASSERT(s == title);
  1354. #ifndef TSTRING_IS_STD_STRING
  1355. UNIT_ASSERT(s.data() == copy.data());
  1356. #endif
  1357. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1358. UNIT_ASSERT(writableCopy == title);
  1359. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1360. UNIT_ASSERT(writableCopy == title);
  1361. UNIT_ASSERT(ToTitleRet(copy) == title);
  1362. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1363. }
  1364. {
  1365. auto s = UTF8ToWide("тест");
  1366. auto writableCopy = s;
  1367. const auto copy = s;
  1368. const auto title = UTF8ToWide("Тест");
  1369. UNIT_ASSERT(ToTitle(s));
  1370. UNIT_ASSERT(s == title);
  1371. UNIT_ASSERT(ToTitle(writableCopy.Detach(), writableCopy.size()));
  1372. UNIT_ASSERT(writableCopy == title);
  1373. UNIT_ASSERT(ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1374. UNIT_ASSERT(writableCopy == title);
  1375. UNIT_ASSERT(ToTitleRet(copy) == title);
  1376. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1377. }
  1378. {
  1379. auto s = UTF8ToWide("Тест");
  1380. auto writableCopy = s;
  1381. const auto copy = s;
  1382. const auto title = UTF8ToWide("Тест");
  1383. UNIT_ASSERT(!ToTitle(s));
  1384. UNIT_ASSERT(s == title);
  1385. #ifndef TSTRING_IS_STD_STRING
  1386. UNIT_ASSERT(s.data() == copy.data());
  1387. #endif
  1388. UNIT_ASSERT(!ToTitle(writableCopy.Detach(), writableCopy.size()));
  1389. UNIT_ASSERT(writableCopy == title);
  1390. UNIT_ASSERT(!ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1391. UNIT_ASSERT(writableCopy == title);
  1392. UNIT_ASSERT(ToTitleRet(copy) == title);
  1393. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1394. }
  1395. {
  1396. auto s = UTF8ToWide("тЕст");
  1397. auto writableCopy = s;
  1398. const auto copy = s;
  1399. const auto title = UTF8ToWide("Тест");
  1400. UNIT_ASSERT(ToTitle(s));
  1401. UNIT_ASSERT(s == title);
  1402. UNIT_ASSERT(ToTitle(writableCopy.Detach(), writableCopy.size()));
  1403. UNIT_ASSERT(writableCopy == title);
  1404. UNIT_ASSERT(ToTitle(copy.data(), copy.size(), writableCopy.Detach()));
  1405. UNIT_ASSERT(writableCopy == title);
  1406. UNIT_ASSERT(ToTitleRet(copy) == title);
  1407. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy)) == title);
  1408. }
  1409. {
  1410. auto s = UTF8ToWide("тЕст");
  1411. const auto copy = s;
  1412. const auto title = UTF8ToWide("тЕСт");
  1413. UNIT_ASSERT(ToTitle(s, 2));
  1414. UNIT_ASSERT(s == title);
  1415. UNIT_ASSERT(ToTitleRet(copy, 2) == title);
  1416. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 2) == title);
  1417. }
  1418. {
  1419. auto s = UTF8ToWide("теСт");
  1420. const auto copy = s;
  1421. const auto title = UTF8ToWide("теСт");
  1422. UNIT_ASSERT(!ToTitle(s, 2));
  1423. UNIT_ASSERT(s == title);
  1424. #ifndef TSTRING_IS_STD_STRING
  1425. UNIT_ASSERT(s.data() == copy.data());
  1426. #endif
  1427. UNIT_ASSERT(ToTitleRet(copy, 2) == title);
  1428. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 2) == title);
  1429. }
  1430. {
  1431. auto s = UTF8ToWide("теСт");
  1432. const auto copy = s;
  1433. const auto title = UTF8ToWide("теСТ");
  1434. UNIT_ASSERT(ToTitle(s, 3, 1));
  1435. UNIT_ASSERT(s == title);
  1436. UNIT_ASSERT(ToTitleRet(copy, 3, 1) == title);
  1437. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 3, 1) == title);
  1438. }
  1439. {
  1440. auto s = UTF8ToWide("теСт");
  1441. const auto copy = s;
  1442. const auto title = UTF8ToWide("теСТ");
  1443. UNIT_ASSERT(ToTitle(s, 3, 100500));
  1444. UNIT_ASSERT(s == title);
  1445. UNIT_ASSERT(ToTitleRet(copy, 3, 100500) == title);
  1446. UNIT_ASSERT(ToTitleRet(TWtringBuf(copy), 3, 100500) == title);
  1447. }
  1448. }
  1449. };
  // Hands the TWideUtilTest suite to the unit-test framework's registration
  // macro; presumably this is what makes the suite's test cases discoverable
  // by the test runner (the macro is defined outside this file — confirm there).
  1450. UNIT_TEST_SUITE_REGISTRATION(TWideUtilTest);