123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306 |
- #pragma once
- #include "codepage.h"
- #include "iconv.h"
- #include <util/charset/recode_result.h>
- #include <util/charset/unidata.h>
- #include <util/charset/utf8.h>
- #include <util/charset/wide.h>
- #include <util/generic/string.h>
- #include <util/generic/algorithm.h>
- #include <util/generic/yexception.h>
- #include <util/memory/tempbuf.h>
- #include <util/system/yassert.h>
- //! converts text from unicode to yandex codepage
- //! @attention destination buffer must be long enough to fit all characters of the text
- //! @note @c dest buffer must fit at least @c len number of characters
- template <typename TCharType>
- inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) {
- Y_ASSERT(SingleByteCodepage(enc));
- const char* start = dest;
- const Encoder* const encoder = &EncoderByCharset(enc);
- const TCharType* const last = text + len;
- for (const TCharType* cur = text; cur != last; ++dest) {
- *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last));
- }
- return dest - start;
- }
- //! converts text to unicode using a codepage object
- //! @attention destination buffer must be long enough to fit all characters of the text
- //! @note @c dest buffer must fit at least @c len number of characters;
- //! if you need convert zero terminated string you should determine length of the
- //! string using the @c strlen function and pass as the @c len parameter;
- //! it does not make sense to create an additional version of this function because
- //! it will call to @c strlen anyway in order to allocate destination buffer
- template <typename TCharType>
- inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) {
- const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
- const unsigned char* const last = cur + len;
- for (; cur != last; ++cur, ++dest) {
- *dest = static_cast<TCharType>(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols
- }
- }
- namespace NDetail {
- namespace NBaseOps {
// Base template recoding drivers. They perform no memory management and do not
// check buffer sizes, so the supplied @dst must have enough room for the result
// (with a proper reserve for the worst case).
// Depending on the template parameters, they convert a single-byte/multi-byte/UTF-8 string to or from a wide string.
- template <typename TCharType>
- inline TBasicStringBuf<TCharType> RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) {
- Y_ASSERT(cp.SingleByteCodepage());
- ::CharToWide(src.data(), src.size(), dst, cp);
- return TBasicStringBuf<TCharType>(dst, src.size());
- }
- template <typename TCharType>
- inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf<TCharType> src, char* dst, const CodePage& cp) {
- Y_ASSERT(cp.SingleByteCodepage());
- ::WideToChar(src.data(), src.size(), dst, cp.CPEnum);
- return TStringBuf(dst, src.size());
- }
- template <typename TCharType>
- inline TBasicStringBuf<TCharType> RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) {
- Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
- size_t read = 0;
- size_t written = 0;
- ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written);
- return TBasicStringBuf<TCharType>(dst, written);
- }
- template <typename TCharType>
- inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf<TCharType> src, char* dst, ECharset encoding) {
- Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
- size_t read = 0;
- size_t written = 0;
- ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written);
- return TStringBuf(dst, written);
- }
- template <typename TCharType>
- inline TBasicStringBuf<TCharType> RecodeUtf8(const TStringBuf src, TCharType* dst) {
- size_t len = 0;
- if (!::UTF8ToWide(src.data(), src.size(), dst, len))
- ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\"");
- return TBasicStringBuf<TCharType>(dst, len);
- }
- template <typename TCharType>
- inline TStringBuf RecodeUtf8(const TBasicStringBuf<TCharType> src, char* dst) {
- size_t len = 0;
- ::WideToUTF8(src.data(), src.size(), dst, len);
- return TStringBuf(dst, len);
- }
- // Select one of re-coding methods from above, based on provided @encoding
- template <typename TCharFrom, typename TCharTo>
- TBasicStringBuf<TCharTo> Recode(const TBasicStringBuf<TCharFrom> src, TCharTo* dst, ECharset encoding) {
- if (encoding == CODES_UTF8)
- return RecodeUtf8(src, dst);
- else if (SingleByteCodepage(encoding))
- return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding));
- else
- return RecodeMultiByteChar(src, dst, encoding);
- }
- }
- template <typename TCharFrom>
- struct TRecodeTraits;
- template <>
- struct TRecodeTraits<char> {
- using TCharTo = wchar16;
- using TStringBufTo = TWtringBuf;
- using TStringTo = TUtf16String;
- enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
- // Here an unicode character can be converted up to 4 bytes of UTF8
- };
- template <>
- struct TRecodeTraits<wchar16> {
- using TCharTo = char;
- using TStringBufTo = TStringBuf;
- using TStringTo = TString;
- enum { ReserveSize = 2 }; // possible surrogate pairs ?
- };
// Operations on the destination object that receives the recoded string.
// The primary template relies on TResult providing char_type, size(), ReserveAndResize(),
// begin() and resize(); it works with TString and TUtf16String - 99% of usage.
template <typename TResult>
struct TRecodeResultOps {
    using TResultChar = typename TResult::char_type;

    // Current length of @dst.
    static size_t Size(const TResult& dst) {
        return dst.size();
    }

    // Resizes @dst to @len elements and returns a pointer to its first element.
    static TResultChar* Reserve(TResult& dst, size_t len) {
        dst.ReserveAndResize(len);
        TResultChar* const first = dst.begin();
        return first;
    }

    // Sets the length of @dst to @len.
    static void Truncate(TResult& dst, size_t len) {
        dst.resize(len);
    }
};
- // Main template interface for recoding in both directions
- template <typename TCharFrom, typename TResult>
- typename TRecodeTraits<TCharFrom>::TStringBufTo Recode(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
- using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
- // make enough room for re-coded string
- TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<TCharTo>::ReserveSize);
- // do re-coding
- TBasicStringBuf<TCharTo> res = NBaseOps::Recode(src, dstbuf, encoding);
- // truncate result back to proper size
- TRecodeResultOps<TResult>::Truncate(dst, res.size());
- return res;
- }
- // appending version of Recode()
- template <typename TCharFrom, typename TResult>
- typename TRecodeTraits<TCharFrom>::TStringBufTo RecodeAppend(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
- using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
- size_t dstOrigSize = TRecodeResultOps<TResult>::Size(dst);
- TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits<TCharTo>::ReserveSize);
- TBasicStringBuf<TCharTo> appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding);
- size_t dstFinalSize = dstOrigSize + appended.size();
- TRecodeResultOps<TResult>::Truncate(dst, dstFinalSize);
- return TBasicStringBuf<TCharTo>(dstbuf, dstFinalSize);
- }
- // special implementation for robust utf8 functions
- template <typename TResult>
- TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) {
- // make enough room for re-coded string
- wchar16* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<wchar16>::ReserveSize);
- // do re-coding
- size_t written = 0;
- UTF8ToWide<true>(src.data(), src.size(), dstbuf, written);
- // truncate result back to proper size
- TRecodeResultOps<TResult>::Truncate(dst, written);
- return TWtringBuf(dstbuf, written);
- }
- template <typename TCharFrom>
- inline typename TRecodeTraits<TCharFrom>::TStringTo Recode(const TBasicStringBuf<TCharFrom> src, ECharset encoding) {
- typename TRecodeTraits<TCharFrom>::TStringTo res;
- Recode<TCharFrom>(src, res, encoding);
- return res;
- }
- }
- // Write result into @dst. Return string-buffer pointing to re-coded content of @dst.
- template <bool robust>
- inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
- if (robust && CODES_UTF8 == encoding)
- return ::NDetail::RecodeUTF8Robust(src, dst);
- return ::NDetail::Recode<char>(src, dst, encoding);
- }
- inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
- return ::NDetail::Recode<char>(src, dst, encoding);
- }
- inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) {
- return ::NDetail::Recode<wchar16>(src, dst, encoding);
- }
- //! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type
- inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) {
- if (NCodepagePrivate::NativeCodepage(enc)) {
- if (enc == CODES_UTF8)
- return WideToUTF8(text, len);
- TString s = TString::Uninitialized(len);
- s.remove(WideToChar(text, len, s.begin(), enc));
- return s;
- }
- TString s = TString::Uninitialized(len * 3);
- size_t read = 0;
- size_t written = 0;
- NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written);
- s.remove(written);
- return s;
- }
- inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) {
- TUtf16String w = TUtf16String::Uninitialized(len);
- CharToWide(text, len, w.begin(), cp);
- return w;
- }
- //! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type
- template <bool robust>
- inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) {
- if (NCodepagePrivate::NativeCodepage(enc)) {
- if (enc == CODES_UTF8)
- return UTF8ToWide<robust>(text, len);
- return CharToWide(text, len, *CodePageByCharset(enc));
- }
- TUtf16String w = TUtf16String::Uninitialized(len * 2);
- size_t read = 0;
- size_t written = 0;
- NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written);
- w.remove(written);
- return w;
- }
- //! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text
- //! @param text text to be converted
- //! @param len length of the text in characters
- //! @param cp a codepage that is used in case of failed conversion from UTF8
- inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) {
- TUtf16String w = TUtf16String::Uninitialized(len);
- size_t written = 0;
- if (UTF8ToWide(text, len, w.begin(), written))
- w.remove(written);
- else
- CharToWide(text, len, w.begin(), cp);
- return w;
- }
- inline TString WideToChar(const TWtringBuf w, ECharset enc) {
- return WideToChar(w.data(), w.size(), enc);
- }
- inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
- return CharToWide<false>(s.data(), s.size(), enc);
- }
- template <bool robust>
- inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
- return CharToWide<robust>(s.data(), s.size(), enc);
- }
- inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) {
- return CharToWide(s.data(), s.size(), cp);
- }
// Returns true if @text can be fully encoded in the specified @encoding
// in such a way that decoding the result recovers the exact original text.
- bool CanBeEncoded(TWtringBuf text, ECharset encoding);
|