utf8_ut.cpp 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. #include "utf8.h"
  2. #include "wide.h"
  3. #include <util/stream/file.h>
  4. #include <util/ysaveload.h>
  5. #include <library/cpp/testing/unittest/registar.h>
  6. #include <library/cpp/testing/unittest/env.h>
  7. Y_UNIT_TEST_SUITE(TUtfUtilTest) {
  8. Y_UNIT_TEST(TestUTF8Len) {
  9. UNIT_ASSERT_EQUAL(GetNumberOfUTF8Chars("привет!"), 7);
  10. }
  11. Y_UNIT_TEST(TestToLowerUtfString) {
  12. UNIT_ASSERT_VALUES_EQUAL(ToLowerUTF8("xyz XYZ ПРИВЕТ!"), "xyz xyz привет!");
  13. UNIT_ASSERT_VALUES_EQUAL(ToLowerUTF8(TStringBuf("xyz")), "xyz");
  14. {
  15. TString s = "привет!";
  16. TString q = "ПРИВЕТ!";
  17. TString tmp;
  18. UNIT_ASSERT(ToLowerUTF8Impl(s.data(), s.size(), tmp) == false);
  19. UNIT_ASSERT(ToLowerUTF8Impl(q.data(), q.size(), tmp) == true);
  20. }
  21. {
  22. const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(tolower_utf8(weird)) is 3
  23. const char* turkI = "İ"; //strlen("İ") == 2, strlen(tolower_utf8("İ") == 1
  24. TStringBuf chars[] = {"f", "F", "Б", "б", weird, turkI};
  25. const int N = Y_ARRAY_SIZE(chars);
  26. //try all combinations of these letters.
  27. int numberOfVariants = 1;
  28. for (int len = 0; len <= 4; ++len) {
  29. for (int i = 0; i < numberOfVariants; ++i) {
  30. TString s;
  31. int k = i;
  32. for (int j = 0; j < len; ++j) {
  33. //Treat 'i' like number in base-N system with digits from 'chars'-array
  34. s += chars[k % N];
  35. k /= N;
  36. }
  37. TUtf16String tmp = UTF8ToWide(s);
  38. tmp.to_lower();
  39. UNIT_ASSERT_VALUES_EQUAL(ToLowerUTF8(s), WideToUTF8(tmp));
  40. }
  41. numberOfVariants *= N;
  42. }
  43. }
  44. }
  45. Y_UNIT_TEST(TestToUpperUtfString) {
  46. UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8("xyz XYZ привет!"), "XYZ XYZ ПРИВЕТ!");
  47. UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(TStringBuf("XYZ")), "XYZ");
  48. {
  49. TString s = "ПРИВЕТ!";
  50. TString q = "привет!";
  51. TString tmp;
  52. UNIT_ASSERT(ToUpperUTF8Impl(s.data(), s.size(), tmp) == false);
  53. UNIT_ASSERT(ToUpperUTF8Impl(q.data(), q.size(), tmp) == true);
  54. }
  55. {
  56. const char* weird = "\xC8\xBE"; // 'Ⱦ', U+023E. strlen(weird)==2, strlen(ToUpper_utf8(weird)) is 3
  57. const char* turkI = "İ"; //strlen("İ") == 2, strlen(ToUpper_utf8("İ") == 1
  58. TStringBuf chars[] = {"F", "f", "б", "Б", turkI, weird};
  59. const int N = Y_ARRAY_SIZE(chars);
  60. //try all combinations of these letters.
  61. int numberOfVariants = 1;
  62. for (int len = 0; len <= 4; ++len) {
  63. for (int i = 0; i < numberOfVariants; ++i) {
  64. TString s;
  65. int k = i;
  66. for (int j = 0; j < len; ++j) {
  67. //Treat 'i' like number in base-N system with digits from 'chars'-array
  68. s += chars[k % N];
  69. k /= N;
  70. }
  71. TUtf16String tmp = UTF8ToWide(s);
  72. tmp.to_upper();
  73. UNIT_ASSERT_VALUES_EQUAL(ToUpperUTF8(s), WideToUTF8(tmp));
  74. }
  75. numberOfVariants *= N;
  76. }
  77. }
  78. }
  79. Y_UNIT_TEST(TestUTF8ToWide) {
  80. TFileInput in(ArcadiaSourceRoot() + TStringBuf("/util/charset/ut/utf8/test1.txt"));
  81. TString text = in.ReadAll();
  82. UNIT_ASSERT(WideToUTF8(UTF8ToWide(text)) == text);
  83. }
  84. Y_UNIT_TEST(TestInvalidUTF8) {
  85. TVector<TString> testData;
  86. TFileInput input(ArcadiaSourceRoot() + TStringBuf("/util/charset/ut/utf8/invalid_UTF8.bin"));
  87. Load(&input, testData);
  88. for (const auto& text : testData) {
  89. UNIT_ASSERT_EXCEPTION(UTF8ToWide(text), yexception);
  90. }
  91. }
  92. Y_UNIT_TEST(TestUTF8ToWideScalar) {
  93. TFileInput in(ArcadiaSourceRoot() + TStringBuf("/util/charset/ut/utf8/test1.txt"));
  94. TString text = in.ReadAll();
  95. TUtf16String wtextSSE = UTF8ToWide(text);
  96. TUtf16String wtextScalar = TUtf16String::Uninitialized(text.size());
  97. const unsigned char* textBegin = reinterpret_cast<const unsigned char*>(text.c_str());
  98. wchar16* wtextBegin = wtextScalar.begin();
  99. ::NDetail::UTF8ToWideImplScalar<false>(textBegin, textBegin + text.size(), wtextBegin);
  100. UNIT_ASSERT(wtextBegin == wtextScalar.begin() + wtextSSE.size());
  101. UNIT_ASSERT(textBegin == reinterpret_cast<const unsigned char*>(text.end()));
  102. wtextScalar.remove(wtextSSE.size());
  103. UNIT_ASSERT(wtextScalar == wtextSSE);
  104. }
  105. }