Browse Source

Support wide strings (and string_views) in Out<> / IOutputStream

thegeorg 1 year ago
parent
commit
eda4f4ec39
3 changed files with 104 additions and 20 deletions
  1. 42 5
      util/charset/wide.h
  2. 24 15
      util/stream/output.cpp
  3. 38 0
      util/stream/str_ut.cpp

+ 42 - 5
util/charset/wide.h

@@ -49,8 +49,8 @@ namespace NDetail {
 
     inline wchar32 ReadSurrogatePair(const wchar16* chars) noexcept {
         const wchar32 SURROGATE_OFFSET = static_cast<wchar32>(0x10000 - (0xD800 << 10) - 0xDC00);
-        wchar16 lead = chars[0];
-        wchar16 tail = chars[1];
+        wchar32 lead = chars[0];
+        wchar32 tail = chars[1];
 
         Y_ASSERT(IsW16SurrogateLead(lead));
         Y_ASSERT(IsW16SurrogateTail(tail));
@@ -98,7 +98,7 @@ inline wchar32 ReadSymbol(const wchar32* begin, const wchar32* end) noexcept {
 }
 
 //! presuming input data is either big enough or null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin) noexcept {
     Y_ASSERT(*begin);
     if (IsW16SurrogateLead(begin[0])) {
         if (IsW16SurrogateTail(begin[1])) {
@@ -117,12 +117,30 @@ inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin) noexcept {
 }
 
 //! presuming input data is either big enough or null terminated
-inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const char32_t*& begin) noexcept {
     Y_ASSERT(*begin);
     return *(begin++);
 }
 
-inline wchar32 ReadSymbolAndAdvance(const wchar16*& begin, const wchar16* end) noexcept {
+inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin) noexcept {
+    // According to
+    // https://en.cppreference.com/w/cpp/language/types
+    // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS
+    //
+    // Convert the pointer value to the matching fixed-width type, dispatch, and write the advanced position back.
+    // A local copy is used: aliasing `begin` via a reference to another pointer type breaks strict aliasing.
+
+#ifdef _win_
+    using TDistinctChar = char16_t;
+#else
+    using TDistinctChar = char32_t;
+#endif
+    const TDistinctChar* distinctBegin = reinterpret_cast<const TDistinctChar*>(begin);
+    const wchar32 result = ::ReadSymbolAndAdvance(distinctBegin);
+    begin = reinterpret_cast<const wchar_t*>(distinctBegin);
+    return result;
+}
+
+inline wchar32 ReadSymbolAndAdvance(const char16_t*& begin, const char16_t* end) noexcept {
     Y_ASSERT(begin < end);
     if (IsW16SurrogateLead(begin[0])) {
         if (begin + 1 != end && IsW16SurrogateTail(begin[1])) {
@@ -144,6 +162,25 @@ inline wchar32 ReadSymbolAndAdvance(const wchar32*& begin, const wchar32* end) n
     return *(begin++);
 }
 
+inline wchar32 ReadSymbolAndAdvance(const wchar_t*& begin, const wchar_t* end) noexcept {
+    // Bounded variant: dispatches to the char16_t / char32_t overload taking (begin, end).
+    // wchar_t holds UTF-16 on Windows and UTF-32 on Linux / macOS, see
+    // https://en.cppreference.com/w/cpp/language/types
+    //
+    // Both pointers are converted by value; the position advanced by the callee is then stored back into begin.
+
+#ifdef _win_
+    using TDistinctChar = char16_t;
+#else
+    using TDistinctChar = char32_t;
+#endif
+    const TDistinctChar* distinctBegin = reinterpret_cast<const TDistinctChar*>(begin);
+    const TDistinctChar* distinctEnd = reinterpret_cast<const TDistinctChar*>(end);
+    wchar32 result = ::ReadSymbolAndAdvance(distinctBegin, distinctEnd);
+    begin = reinterpret_cast<const wchar_t*>(distinctBegin);
+    return result;
+}
+
 template <class T>
 inline size_t WriteSymbol(wchar16 s, T& dest) noexcept {
     ::NDetail::TSelector<std::is_pointer<T>::value>::WriteSymbol(s, dest);

+ 24 - 15
util/stream/output.cpp

@@ -70,24 +70,13 @@ void Out<wchar32>(IOutputStream& o, wchar32 ch) {
     o.Write(buffer, length);
 }
 
-static void WriteString(IOutputStream& o, const wchar16* w, size_t n) {
+template <typename TCharType>
+static void WriteString(IOutputStream& o, const TCharType* w, size_t n) {
     const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8
     TTempBuf buffer(buflen + 1);
-    char* const data = buffer.Data();
     size_t written = 0;
-    WideToUTF8(w, n, data, written);
-    data[written] = 0;
-    o.Write(data, written);
-}
-
-static void WriteString(IOutputStream& o, const wchar32* w, size_t n) {
-    const size_t buflen = (n * MAX_UTF8_BYTES); // * 4 because the conversion functions can convert unicode character into maximum 4 bytes of UTF8
-    TTempBuf buffer(buflen + 1);
-    char* const data = buffer.Data();
-    size_t written = 0;
-    WideToUTF8(w, n, data, written);
-    data[written] = 0;
-    o.Write(data, written);
+    WideToUTF8(w, n, buffer.Data(), written);
+    o.Write(buffer.Data(), written);
 }
 
 template <>
@@ -100,11 +89,31 @@ void Out<std::string>(IOutputStream& o, const std::string& p) {
     o.Write(p.data(), p.length());
 }
 
+template <>
+void Out<std::wstring>(IOutputStream& o, const std::wstring& p) {
+    WriteString(o, p.data(), p.length());
+}
+
+template <>
+void Out<std::u16string>(IOutputStream& o, const std::u16string& p) {
+    WriteString(o, p.data(), p.length());
+}
+
+template <>
+void Out<std::u32string>(IOutputStream& o, const std::u32string& p) {
+    WriteString(o, p.data(), p.length());
+}
+
 template <>
 void Out<std::string_view>(IOutputStream& o, const std::string_view& p) {
     o.Write(p.data(), p.length());
 }
 
+template <>
+void Out<std::wstring_view>(IOutputStream& o, const std::wstring_view& p) {
+    WriteString(o, p.data(), p.length());
+}
+
 template <>
 void Out<std::u16string_view>(IOutputStream& o, const std::u16string_view& p) {
     WriteString(o, p.data(), p.length());

+ 38 - 0
util/stream/str_ut.cpp

@@ -149,4 +149,42 @@ Y_UNIT_TEST_SUITE(TStringInputOutputTest) {
         // Check old stream is in a valid state
         output1 << "baz";
     }
+
+    // There are no distinct tests for Out<> via IOutputStream.
+    // Let's test string output here.
+    Y_UNIT_TEST(TestWritingWideStrings) {
+        using namespace std::literals::string_literals;
+        TString str;
+        TStringOutput stream(str);
+
+        // test char16_t
+        const char16_t* utf16Data = u"Быть или не быть? Вот в чём вопрос";
+        stream << std::u16string(utf16Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        stream << std::u16string_view(utf16Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        // test char32_t
+        const char32_t* utf32Data = U"Быть или не быть? Вот в чём вопрос";
+        stream << std::u32string(utf32Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        stream << std::u32string_view(utf32Data);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        // test wchar_t
+        const wchar_t* wcharData = L"Быть или не быть? Вот в чём вопрос";
+        stream << std::wstring(wcharData);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+
+        stream << std::wstring_view(wcharData);
+        UNIT_ASSERT_STRINGS_EQUAL(str, "Быть или не быть? Вот в чём вопрос");
+        str.clear();
+    }
 }