wide.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. #pragma once
  2. #include "codepage.h"
  3. #include "iconv.h"
  4. #include <util/charset/recode_result.h>
  5. #include <util/charset/unidata.h>
  6. #include <util/charset/utf8.h>
  7. #include <util/charset/wide.h>
  8. #include <util/generic/string.h>
  9. #include <util/generic/algorithm.h>
  10. #include <util/generic/yexception.h>
  11. #include <util/memory/tempbuf.h>
  12. #include <util/system/yassert.h>
  13. //! converts text from unicode to yandex codepage
  14. //! @attention destination buffer must be long enough to fit all characters of the text
  15. //! @note @c dest buffer must fit at least @c len number of characters
  16. template <typename TCharType>
  17. inline size_t WideToChar(const TCharType* text, size_t len, char* dest, ECharset enc) {
  18. Y_ASSERT(SingleByteCodepage(enc));
  19. const char* start = dest;
  20. const Encoder* const encoder = &EncoderByCharset(enc);
  21. const TCharType* const last = text + len;
  22. for (const TCharType* cur = text; cur != last; ++dest) {
  23. *dest = encoder->Tr(ReadSymbolAndAdvance(cur, last));
  24. }
  25. return dest - start;
  26. }
  27. //! converts text to unicode using a codepage object
  28. //! @attention destination buffer must be long enough to fit all characters of the text
  29. //! @note @c dest buffer must fit at least @c len number of characters;
  30. //! if you need convert zero terminated string you should determine length of the
  31. //! string using the @c strlen function and pass as the @c len parameter;
  32. //! it does not make sense to create an additional version of this function because
  33. //! it will call to @c strlen anyway in order to allocate destination buffer
  34. template <typename TCharType>
  35. inline void CharToWide(const char* text, size_t len, TCharType* dest, const CodePage& cp) {
  36. const unsigned char* cur = reinterpret_cast<const unsigned char*>(text);
  37. const unsigned char* const last = cur + len;
  38. for (; cur != last; ++cur, ++dest) {
  39. *dest = static_cast<TCharType>(cp.unicode[*cur]); // static_cast is safe as no 1char codepage contains non-BMP symbols
  40. }
  41. }
  42. namespace NDetail {
  43. namespace NBaseOps {
  // Base recoding drivers (template interface). They perform no memory management
  // and do not check the buffer size, so the supplied @dst
  // must have enough room for the result (with a proper reserve for the worst case).
  // Depending on the template parameters, they convert a single-byte/multi-byte/UTF-8 string to/from a wide string.
  48. template <typename TCharType>
  49. inline TBasicStringBuf<TCharType> RecodeSingleByteChar(const TStringBuf src, TCharType* dst, const CodePage& cp) {
  50. Y_ASSERT(cp.SingleByteCodepage());
  51. ::CharToWide(src.data(), src.size(), dst, cp);
  52. return TBasicStringBuf<TCharType>(dst, src.size());
  53. }
  54. template <typename TCharType>
  55. inline TStringBuf RecodeSingleByteChar(const TBasicStringBuf<TCharType> src, char* dst, const CodePage& cp) {
  56. Y_ASSERT(cp.SingleByteCodepage());
  57. ::WideToChar(src.data(), src.size(), dst, cp.CPEnum);
  58. return TStringBuf(dst, src.size());
  59. }
  60. template <typename TCharType>
  61. inline TBasicStringBuf<TCharType> RecodeMultiByteChar(const TStringBuf src, TCharType* dst, ECharset encoding) {
  62. Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
  63. size_t read = 0;
  64. size_t written = 0;
  65. ::NICONVPrivate::RecodeToUnicode(encoding, src.data(), dst, src.size(), src.size(), read, written);
  66. return TBasicStringBuf<TCharType>(dst, written);
  67. }
  68. template <typename TCharType>
  69. inline TStringBuf RecodeMultiByteChar(const TBasicStringBuf<TCharType> src, char* dst, ECharset encoding) {
  70. Y_ASSERT(!NCodepagePrivate::NativeCodepage(encoding));
  71. size_t read = 0;
  72. size_t written = 0;
  73. ::NICONVPrivate::RecodeFromUnicode(encoding, src.data(), dst, src.size(), src.size() * 3, read, written);
  74. return TStringBuf(dst, written);
  75. }
  76. template <typename TCharType>
  77. inline TBasicStringBuf<TCharType> RecodeUtf8(const TStringBuf src, TCharType* dst) {
  78. size_t len = 0;
  79. if (!::UTF8ToWide(src.data(), src.size(), dst, len))
  80. ythrow yexception() << "Invalid UTF8: \"" << src.SubStr(0, 50) << (src.size() > 50 ? "...\"" : "\"");
  81. return TBasicStringBuf<TCharType>(dst, len);
  82. }
  83. template <typename TCharType>
  84. inline TStringBuf RecodeUtf8(const TBasicStringBuf<TCharType> src, char* dst) {
  85. size_t len = 0;
  86. ::WideToUTF8(src.data(), src.size(), dst, len);
  87. return TStringBuf(dst, len);
  88. }
  89. // Select one of re-coding methods from above, based on provided @encoding
  90. template <typename TCharFrom, typename TCharTo>
  91. TBasicStringBuf<TCharTo> Recode(const TBasicStringBuf<TCharFrom> src, TCharTo* dst, ECharset encoding) {
  92. if (encoding == CODES_UTF8)
  93. return RecodeUtf8(src, dst);
  94. else if (SingleByteCodepage(encoding))
  95. return RecodeSingleByteChar(src, dst, *CodePageByCharset(encoding));
  96. else
  97. return RecodeMultiByteChar(src, dst, encoding);
  98. }
  99. }
  100. template <typename TCharFrom>
  101. struct TRecodeTraits;
  102. template <>
  103. struct TRecodeTraits<char> {
  104. using TCharTo = wchar16;
  105. using TStringBufTo = TWtringBuf;
  106. using TStringTo = TUtf16String;
  107. enum { ReserveSize = 4 }; // How many TCharFrom characters we should reserve for one TCharTo character in worst case
  108. // Here an unicode character can be converted up to 4 bytes of UTF8
  109. };
  110. template <>
  111. struct TRecodeTraits<wchar16> {
  112. using TCharTo = char;
  113. using TStringBufTo = TStringBuf;
  114. using TStringTo = TString;
  115. enum { ReserveSize = 2 }; // possible surrogate pairs ?
  116. };
  // Operations on the destination object that receives the recoded string.
  // The default implementation works with TString and TUtf16String (99% of usage).
  template <typename TResult>
  struct TRecodeResultOps {
      using TResultChar = typename TResult::char_type;

      // Current length of @dst.
      static size_t Size(const TResult& dst) {
          return dst.size();
      }

      // Resizes @dst to @len characters and returns a pointer to the start of its storage.
      static TResultChar* Reserve(TResult& dst, size_t len) {
          dst.ReserveAndResize(len);
          return dst.begin();
      }

      // Sets the length of @dst to @len once the real result size is known.
      static void Truncate(TResult& dst, size_t len) {
          dst.resize(len);
      }
  };
  133. // Main template interface for recoding in both directions
  134. template <typename TCharFrom, typename TResult>
  135. typename TRecodeTraits<TCharFrom>::TStringBufTo Recode(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
  136. using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
  137. // make enough room for re-coded string
  138. TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<TCharTo>::ReserveSize);
  139. // do re-coding
  140. TBasicStringBuf<TCharTo> res = NBaseOps::Recode(src, dstbuf, encoding);
  141. // truncate result back to proper size
  142. TRecodeResultOps<TResult>::Truncate(dst, res.size());
  143. return res;
  144. }
  145. // appending version of Recode()
  146. template <typename TCharFrom, typename TResult>
  147. typename TRecodeTraits<TCharFrom>::TStringBufTo RecodeAppend(const TBasicStringBuf<TCharFrom> src, TResult& dst, ECharset encoding) {
  148. using TCharTo = typename TRecodeTraits<TCharFrom>::TCharTo;
  149. size_t dstOrigSize = TRecodeResultOps<TResult>::Size(dst);
  150. TCharTo* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, dstOrigSize + src.size() * TRecodeTraits<TCharTo>::ReserveSize);
  151. TBasicStringBuf<TCharTo> appended = NBaseOps::Recode(src, dstbuf + dstOrigSize, encoding);
  152. size_t dstFinalSize = dstOrigSize + appended.size();
  153. TRecodeResultOps<TResult>::Truncate(dst, dstFinalSize);
  154. return TBasicStringBuf<TCharTo>(dstbuf, dstFinalSize);
  155. }
  156. // special implementation for robust utf8 functions
  157. template <typename TResult>
  158. TWtringBuf RecodeUTF8Robust(const TStringBuf src, TResult& dst) {
  159. // make enough room for re-coded string
  160. wchar16* dstbuf = TRecodeResultOps<TResult>::Reserve(dst, src.size() * TRecodeTraits<wchar16>::ReserveSize);
  161. // do re-coding
  162. size_t written = 0;
  163. UTF8ToWide<true>(src.data(), src.size(), dstbuf, written);
  164. // truncate result back to proper size
  165. TRecodeResultOps<TResult>::Truncate(dst, written);
  166. return TWtringBuf(dstbuf, written);
  167. }
  168. template <typename TCharFrom>
  169. inline typename TRecodeTraits<TCharFrom>::TStringTo Recode(const TBasicStringBuf<TCharFrom> src, ECharset encoding) {
  170. typename TRecodeTraits<TCharFrom>::TStringTo res;
  171. Recode<TCharFrom>(src, res, encoding);
  172. return res;
  173. }
  174. }
  175. // Write result into @dst. Return string-buffer pointing to re-coded content of @dst.
  176. template <bool robust>
  177. inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
  178. if (robust && CODES_UTF8 == encoding)
  179. return ::NDetail::RecodeUTF8Robust(src, dst);
  180. return ::NDetail::Recode<char>(src, dst, encoding);
  181. }
  182. inline TWtringBuf CharToWide(const TStringBuf src, TUtf16String& dst, ECharset encoding) {
  183. return ::NDetail::Recode<char>(src, dst, encoding);
  184. }
  185. inline TStringBuf WideToChar(const TWtringBuf src, TString& dst, ECharset encoding) {
  186. return ::NDetail::Recode<wchar16>(src, dst, encoding);
  187. }
  188. //! calls either to @c WideToUTF8 or @c WideToChar depending on the encoding type
  189. inline TString WideToChar(const wchar16* text, size_t len, ECharset enc) {
  190. if (NCodepagePrivate::NativeCodepage(enc)) {
  191. if (enc == CODES_UTF8)
  192. return WideToUTF8(text, len);
  193. TString s = TString::Uninitialized(len);
  194. s.remove(WideToChar(text, len, s.begin(), enc));
  195. return s;
  196. }
  197. TString s = TString::Uninitialized(len * 3);
  198. size_t read = 0;
  199. size_t written = 0;
  200. NICONVPrivate::RecodeFromUnicode(enc, text, s.begin(), len, s.size(), read, written);
  201. s.remove(written);
  202. return s;
  203. }
  204. inline TUtf16String CharToWide(const char* text, size_t len, const CodePage& cp) {
  205. TUtf16String w = TUtf16String::Uninitialized(len);
  206. CharToWide(text, len, w.begin(), cp);
  207. return w;
  208. }
  209. //! calls either to @c UTF8ToWide or @c CharToWide depending on the encoding type
  210. template <bool robust>
  211. inline TUtf16String CharToWide(const char* text, size_t len, ECharset enc) {
  212. if (NCodepagePrivate::NativeCodepage(enc)) {
  213. if (enc == CODES_UTF8)
  214. return UTF8ToWide<robust>(text, len);
  215. return CharToWide(text, len, *CodePageByCharset(enc));
  216. }
  217. TUtf16String w = TUtf16String::Uninitialized(len * 2);
  218. size_t read = 0;
  219. size_t written = 0;
  220. NICONVPrivate::RecodeToUnicode(enc, text, w.begin(), len, len, read, written);
  221. w.remove(written);
  222. return w;
  223. }
  224. //! converts text from UTF8 to unicode, if conversion fails it uses codepage to convert the text
  225. //! @param text text to be converted
  226. //! @param len length of the text in characters
  227. //! @param cp a codepage that is used in case of failed conversion from UTF8
  228. inline TUtf16String UTF8ToWide(const char* text, size_t len, const CodePage& cp) {
  229. TUtf16String w = TUtf16String::Uninitialized(len);
  230. size_t written = 0;
  231. if (UTF8ToWide(text, len, w.begin(), written))
  232. w.remove(written);
  233. else
  234. CharToWide(text, len, w.begin(), cp);
  235. return w;
  236. }
  237. inline TString WideToChar(const TWtringBuf w, ECharset enc) {
  238. return WideToChar(w.data(), w.size(), enc);
  239. }
  240. inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
  241. return CharToWide<false>(s.data(), s.size(), enc);
  242. }
  243. template <bool robust>
  244. inline TUtf16String CharToWide(const TStringBuf s, ECharset enc) {
  245. return CharToWide<robust>(s.data(), s.size(), enc);
  246. }
  247. inline TUtf16String CharToWide(const TStringBuf s, const CodePage& cp) {
  248. return CharToWide(s.data(), s.size(), cp);
  249. }
  // Returns true if @text can be fully encoded to the specified @encoding
  // in such a way that decoding the result recovers the exact original text.
  252. bool CanBeEncoded(TWtringBuf text, ECharset encoding);