#pragma once #include "codepage.h" #include "iconv.h" #include #include #include #include #include #include #include #include #include //! converts text from unicode to yandex codepage //! @attention destination buffer must be long enough to fit all characters of the text //! @note @c dest buffer must fit at least @c len number of characters template inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) { Y_ASSERT(SingleByteCodepage(enc)); const char* start = dest; const Encoder* const encoder = &EncoderByCharset(enc); const TCharType* const last = text + len; for (const TCharType* cur = text; cur != last; ++dest) { *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last)); } return dest - start; } //! converts text to unicode using a codepage object //! @attention destination buffer must be long enough to fit all characters of the text //! @note @c dest buffer must fit at least @c len number of characters; //! if you need convert zero terminated string you should determine length of the //! string using the @c strlen function and pass as the @c len parameter; //! it does not make sense to create an additional version of this function because //! it will call to @c strlen anyway in order to allocate destination buffer template inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) { const unsigned char* cur = reinterpret_cast(text); const unsigned char* const last = cur + len; for (; cur != last; ++cur, ++dest) { *dest = static_cast(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols } } namespace NDetail { namespace NBaseOps { // Template interface base recoding drivers, do not perform any memory management, // do not care about buffer size, so supplied @dst // should have enough room for the result (with proper reserve for the worst case) // Depending on template params, perform conversion of single-byte/multi-byte/utf8 string to/from wide string. template inline TBasicStringBuf RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) { Y_ASSERT(cp.SingleByteCodepage()); ::CharToWide(src.data(), src.size(), dst, cp); return TBasicStringBuf(dst, src.size()); } template inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf src, char* dst, const CodePage& cp) { Y_ASSERT(cp.SingleByteCodepage()); ::WideToChar(src.data(), src.size(), dst, cp.CPEnum); return TStringBuf(dst, src.size()); } template inline TBasicStringBuf RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) { Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding)); size_t read = 0; size_t written = 0; ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written); return TBasicStringBuf(dst, written); } template inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf src, char* dst, ECharset encoding) { Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding)); size_t read = 0; size_t written = 0; ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written); return TStringBuf(dst, written); } template inline TBasicStringBuf RecodeUtf8(const TStringBuf src, TCharType* dst) { size_t len = 0; if (!::UTF8ToWide(src.data(), src.size(), dst, len)) ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\""); return TBasicStringBuf(dst, len); } template inline TStringBuf RecodeUtf8(const TBasicStringBuf src, char* dst) { size_t len = 0; ::WideToUTF8(src.data(), src.size(), dst, len); return TStringBuf(dst, len); } // Select one of re-coding methods from above, based on provided @encoding template TBasicStringBuf Recode(const TBasicStringBuf src, TCharTo* dst, ECharset encoding) { if (encoding == CODES_UTF8) return RecodeUtf8(src, dst); else if (SingleByteCodepage(encoding)) return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding)); else return RecodeMultiByteChar(src, dst, encoding); } } template struct TRecodeTraits; template <> struct TRecodeTraits { using TCharTo = wchar16; using TStringBufTo = TWtringBuf; using TStringTo = TUtf16String; enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case // Here an unicode character can be converted up to 4 bytes of UTF8 }; template <> struct TRecodeTraits { using TCharTo = char; using TStringBufTo = TStringBuf; using TStringTo = TString; enum { ReserveSize = 2 }; // possible surrogate pairs ? }; // Operations with destination buffer where recoded string will be written template struct TRecodeResultOps { // default implementation will work with TString and TUtf16String - 99% of usage using TResultChar = typename TResult::char_type; static inline size_t Size(const TResult& dst) { return dst.size(); } static inline TResultChar* Reserve(TResult& dst, size_t len) { dst.ReserveAndResize(len); return dst.begin(); } static inline void Truncate(TResult& dst, size_t len) { dst.resize(len); } }; // Main template interface for recoding in both directions template typename TRecodeTraits::TStringBufTo Recode(const TBasicStringBuf src, TResult& dst, ECharset encoding) { using TCharTo = typename TRecodeTraits::TCharTo; // make enough room for re-coded string TCharTo* dstbuf = TRecodeResultOps::Reserve(dst, src.size() * TRecodeTraits::ReserveSize); // do re-coding TBasicStringBuf res = NBaseOps::Recode(src, dstbuf, encoding); // truncate result back to proper size TRecodeResultOps::Truncate(dst, res.size()); return res; } // appending version of Recode() template typename TRecodeTraits::TStringBufTo RecodeAppend(const TBasicStringBuf src, TResult& dst, ECharset encoding) { using TCharTo = typename TRecodeTraits::TCharTo; size_t dstOrigSize = TRecodeResultOps::Size(dst); TCharTo* dstbuf = TRecodeResultOps::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits::ReserveSize); TBasicStringBuf appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding); size_t dstFinalSize = dstOrigSize + appended.size(); TRecodeResultOps::Truncate(dst, dstFinalSize); return TBasicStringBuf(dstbuf, dstFinalSize); } // special implementation for robust utf8 functions template TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) { // make enough room for re-coded string wchar16* dstbuf = TRecodeResultOps::Reserve(dst, src.size() * TRecodeTraits::ReserveSize); // do re-coding size_t written = 0; UTF8ToWide(src.data(), src.size(), dstbuf, written); // truncate result back to proper size TRecodeResultOps::Truncate(dst, written); return TWtringBuf(dstbuf, written); } template inline typename TRecodeTraits::TStringTo Recode(const TBasicStringBuf src, ECharset encoding) { typename TRecodeTraits::TStringTo res; Recode(src, res, encoding); return res; } } // Write result into @dst. Return string-buffer pointing to re-coded content of @dst. template inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) { if (robust && CODES_UTF8 == encoding) return ::NDetail::RecodeUTF8Robust(src, dst); return ::NDetail::Recode(src, dst, encoding); } inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) { return ::NDetail::Recode(src, dst, encoding); } inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) { return ::NDetail::Recode(src, dst, encoding); } //! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) { if (NCodepagePrivate::NativeCodepage(enc)) { if (enc == CODES_UTF8) return WideToUTF8(text, len); TString s = TString::Uninitialized(len); s.remove(WideToChar(text, len, s.begin(), enc)); return s; } TString s = TString::Uninitialized(len * 3); size_t read = 0; size_t written = 0; NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written); s.remove(written); return s; } inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) { TUtf16String w = TUtf16String::Uninitialized(len); CharToWide(text, len, w.begin(), cp); return w; } //! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type template inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) { if (NCodepagePrivate::NativeCodepage(enc)) { if (enc == CODES_UTF8) return UTF8ToWide(text, len); return CharToWide(text, len, *CodePageByCharset(enc)); } TUtf16String w = TUtf16String::Uninitialized(len * 2); size_t read = 0; size_t written = 0; NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written); w.remove(written); return w; } //! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text //! @param text text to be converted //! @param len length of the text in characters //! @param cp a codepage that is used in case of failed conversion from UTF8 inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) { TUtf16String w = TUtf16String::Uninitialized(len); size_t written = 0; if (UTF8ToWide(text, len, w.begin(), written)) w.remove(written); else CharToWide(text, len, w.begin(), cp); return w; } inline TString WideToChar(const TWtringBuf w, ECharset enc) { return WideToChar(w.data(), w.size(), enc); } inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) { return CharToWide(s.data(), s.size(), enc); } template inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) { return CharToWide(s.data(), s.size(), enc); } inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) { return CharToWide(s.data(), s.size(), cp); } // true if @text can be fully encoded to specified @encoding, // with possibility to recover exact original text after decoding bool CanBeEncoded(TWtringBuf text, ECharset encoding);