SMusatov
/
ydb
mirror of https://github.com/ydb-platform/ydb.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
							#pragma once

#include "codepage.h"
#include "iconv.h"

#include <util/charset/recode_result.h>
#include <util/charset/unidata.h>
#include <util/charset/utf8.h>
#include <util/charset/wide.h>
#include <util/generic/string.h>
#include <util/generic/algorithm.h>
#include <util/generic/yexception.h>
#include <util/memory/tempbuf.h>
#include <util/system/yassert.h>

//! converts text from unicode to yandex codepage
//! @attention destination buffer must be long enough to fit all characters of the text
//! @note @c dest buffer must fit at least @c len number of characters
template <typename TCharType>
inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) {
    Y_ASSERT(SingleByteCodepage(enc));

    const char* start = dest;

    const Encoder* const encoder = &EncoderByCharset(enc);
    const TCharType* const last = text + len;
    for (const TCharType* cur = text; cur != last; ++dest) {
        *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last));
    }

    return dest - start;
}

//! converts text to unicode using a codepage object
//! @attention destination buffer must be long enough to fit all characters of the text
//! @note @c dest buffer must fit at least @c len number of characters;
//!       if you need convert zero terminated string you should determine length of the
//!       string using the @c strlen function and pass as the @c len parameter;
//!       it does not make sense to create an additional version of this function because
//!       it will call to @c strlen anyway in order to allocate destination buffer
template <typename TCharType>
inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) {
    const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
    const unsigned char* const last = cur + len;
    for (; cur != last; ++cur, ++dest) {
        *dest = static_cast<TCharType>(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols
    }
}

namespace NDetail {
    namespace NBaseOps {
        // Template interface base recoding drivers, do not perform any memory management,
        // do not care about buffer size, so supplied @dst
        // should have enough room for the result (with proper reserve for the worst case)

        // Depending on template params, perform conversion of single-byte/multi-byte/utf8 string to/from wide string.

        template <typename TCharType>
        inline TBasicStringBuf<TCharType> RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) {
            Y_ASSERT(cp.SingleByteCodepage());
            ::CharToWide(src.data(), src.size(), dst, cp);
            return TBasicStringBuf<TCharType>(dst, src.size());
        }

        template <typename TCharType>
        inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf<TCharType> src, char* dst, const CodePage& cp) {
            Y_ASSERT(cp.SingleByteCodepage());
            ::WideToChar(src.data(), src.size(), dst, cp.CPEnum);
            return TStringBuf(dst, src.size());
        }

        template <typename TCharType>
        inline TBasicStringBuf<TCharType> RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) {
            Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
            size_t read = 0;
            size_t written = 0;
            ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written);
            return TBasicStringBuf<TCharType>(dst, written);
        }

        template <typename TCharType>
        inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf<TCharType> src, char* dst, ECharset encoding) {
            Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
            size_t read = 0;
            size_t written = 0;
            ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written);
            return TStringBuf(dst, written);
        }

        template <typename TCharType>
        inline TBasicStringBuf<TCharType> RecodeUtf8(const TStringBuf src, TCharType* dst) {
            size_t len = 0;
            if (!::UTF8ToWide(src.data(), src.size(), dst, len))
                ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\"");
            return TBasicStringBuf<TCharType>(dst, len);
        }

        template <typename TCharType>
        inline TStringBuf RecodeUtf8(const TBasicStringBuf<TCharType> src, char* dst) {
            size_t len = 0;
            ::WideToUTF8(src.data(), src.size(), dst, len);
            return TStringBuf(dst, len);
        }

        // Select one of re-coding methods from above, based on provided @encoding

        template <typename TCharFrom, typename TCharTo>
        TBasicStringBuf<TCharTo> Recode(const TBasicStringBuf<TCharFrom> src, TCharTo* dst, ECharset encoding) {
            if (encoding == CODES_UTF8)
                return RecodeUtf8(src, dst);
            else if (SingleByteCodepage(encoding))
                return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding));
            else
                return RecodeMultiByteChar(src, dst, encoding);
        }

    }

    template <typename TCharFrom>
    struct TRecodeTraits;

    template <>
    struct TRecodeTraits<char> {
        using TCharTo = wchar16;
        using TStringBufTo = TWtringBuf;
        using TStringTo = TUtf16String;
        enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
                                  // Here an unicode character can be converted up to 4 bytes of UTF8
    };

    template <>
    struct TRecodeTraits<wchar16> {
        using TCharTo = char;
        using TStringBufTo = TStringBuf;
        using TStringTo = TString;
        enum { ReserveSize = 2 }; // possible surrogate pairs ?
    };

    // Operations with destination buffer where recoded string will be written
    template <typename TResult>
    struct TRecodeResultOps {
        // default implementation will work with TString and TUtf16String - 99% of usage
        using TResultChar = typename TResult::char_type;

        static inline size_t Size(const TResult& dst) {
            return dst.size();
        }

        static inline TResultChar* Reserve(TResult& dst, size_t len) {
            dst.ReserveAndResize(len);
            return dst.begin();
        }

        static inline void Truncate(TResult& dst, size_t len) {
            dst.resize(len);
        }
    };

    // Main template interface for recoding in both directions

    template <typename TCharFrom, typename TResult>
    typename TRecodeTraits<TCharFrom>::TStringBufTo Recode(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
        using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
        // make enough room for re-coded string
        TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<TCharTo>::ReserveSize);
        // do re-coding
        TBasicStringBuf<TCharTo> res = NBaseOps::Recode(src, dstbuf, encoding);
        // truncate result back to proper size
        TRecodeResultOps<TResult>::Truncate(dst, res.size());
        return res;
    }

    // appending version of Recode()
    template <typename TCharFrom, typename TResult>
    typename TRecodeTraits<TCharFrom>::TStringBufTo RecodeAppend(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
        using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
        size_t dstOrigSize = TRecodeResultOps<TResult>::Size(dst);
        TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits<TCharTo>::ReserveSize);
        TBasicStringBuf<TCharTo> appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding);
        size_t dstFinalSize = dstOrigSize + appended.size();
        TRecodeResultOps<TResult>::Truncate(dst, dstFinalSize);
        return TBasicStringBuf<TCharTo>(dstbuf, dstFinalSize);
    }

    // special implementation for robust utf8 functions
    template <typename TResult>
    TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) {
        // make enough room for re-coded string
        wchar16* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<wchar16>::ReserveSize);

        // do re-coding
        size_t written = 0;
        UTF8ToWide<true>(src.data(), src.size(), dstbuf, written);

        // truncate result back to proper size
        TRecodeResultOps<TResult>::Truncate(dst, written);
        return TWtringBuf(dstbuf, written);
    }

    template <typename TCharFrom>
    inline typename TRecodeTraits<TCharFrom>::TStringTo Recode(const TBasicStringBuf<TCharFrom> src, ECharset encoding) {
        typename TRecodeTraits<TCharFrom>::TStringTo res;
        Recode<TCharFrom>(src, res, encoding);
        return res;
    }
}

// Write result into @dst. Return string-buffer pointing to re-coded content of @dst.

template <bool robust>
inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
    if (robust && CODES_UTF8 == encoding)
        return ::NDetail::RecodeUTF8Robust(src, dst);
    return ::NDetail::Recode<char>(src, dst, encoding);
}

inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
    return ::NDetail::Recode<char>(src, dst, encoding);
}

inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) {
    return ::NDetail::Recode<wchar16>(src, dst, encoding);
}

//! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type
inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) {
    if (NCodepagePrivate::NativeCodepage(enc)) {
        if (enc == CODES_UTF8)
            return WideToUTF8(text, len);

        TString s = TString::Uninitialized(len);
        s.remove(WideToChar(text, len, s.begin(), enc));

        return s;
    }

    TString s = TString::Uninitialized(len * 3);

    size_t read = 0;
    size_t written = 0;
    NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written);
    s.remove(written);

    return s;
}

inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) {
    TUtf16String w = TUtf16String::Uninitialized(len);
    CharToWide(text, len, w.begin(), cp);
    return w;
}

//! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type
template <bool robust>
inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) {
    if (NCodepagePrivate::NativeCodepage(enc)) {
        if (enc == CODES_UTF8)
            return UTF8ToWide<robust>(text, len);

        return CharToWide(text, len, *CodePageByCharset(enc));
    }

    TUtf16String w = TUtf16String::Uninitialized(len * 2);

    size_t read = 0;
    size_t written = 0;
    NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written);
    w.remove(written);

    return w;
}

//! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text
//! @param text     text to be converted
//! @param len      length of the text in characters
//! @param cp       a codepage that is used in case of failed conversion from UTF8
inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) {
    TUtf16String w = TUtf16String::Uninitialized(len);
    size_t written = 0;
    if (UTF8ToWide(text, len, w.begin(), written))
        w.remove(written);
    else
        CharToWide(text, len, w.begin(), cp);
    return w;
}

inline TString WideToChar(const TWtringBuf w, ECharset enc) {
    return WideToChar(w.data(), w.size(), enc);
}

inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
    return CharToWide<false>(s.data(), s.size(), enc);
}

template <bool robust>
inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
    return CharToWide<robust>(s.data(), s.size(), enc);
}

inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) {
    return CharToWide(s.data(), s.size(), cp);
}

// true if @text can be fully encoded to specified @encoding,
// with possibility to recover exact original text after decoding
bool CanBeEncoded(TWtringBuf text, ECharset encoding);