recyr_int.hh 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. #pragma once
  2. #include <util/charset/recode_result.h>
  3. #include <util/charset/utf8.h>
  4. #include <util/generic/ptr.h>
  5. #include <util/generic/string.h>
  6. #include <util/system/defaults.h>
  7. #include "codepage.h"
  8. #include "doccodes.h"
  9. #include "iconv.h"
  10. #include "wide.h"
  11. namespace NCodepagePrivate {
  12. inline RECODE_RESULT _recodeCopy(const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  13. in_readed = in_size;
  14. RECODE_RESULT res = RECODE_OK;
  15. if (in_readed > out_size) {
  16. res = RECODE_EOOUTPUT;
  17. in_readed = out_size;
  18. }
  19. if (in != out)
  20. memcpy(out, in, in_readed);
  21. out_writed = in_readed;
  22. return res;
  23. }
  24. inline RECODE_RESULT _recodeToUTF8(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  25. if (From == CODES_UTF8)
  26. return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
  27. const CodePage* cp = CodePageByCharset(From);
  28. const unsigned char* in_start = (const unsigned char*)in;
  29. const unsigned char* in_end = in_start + in_size;
  30. const unsigned char* out_start = (unsigned char*)out;
  31. const unsigned char* out_end = out_start + out_size;
  32. size_t rune_len;
  33. RECODE_RESULT res = RECODE_OK;
  34. while ((unsigned char*)in < in_end && res == RECODE_OK) {
  35. res = SafeWriteUTF8Char(cp->unicode[(unsigned char)(*in++)], rune_len, (unsigned char*)out, out_end);
  36. out += rune_len;
  37. }
  38. in_readed = (unsigned char*)in - in_start;
  39. out_writed = (unsigned char*)out - out_start;
  40. return res;
  41. }
  42. inline RECODE_RESULT _recodeFromUTF8(ECharset to, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  43. if (to == CODES_UTF8)
  44. return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
  45. Y_ASSERT(CODES_UNKNOWN < to && to < CODES_MAX);
  46. const Encoder* enc = &EncoderByCharset(to);
  47. const unsigned char* in_start = (const unsigned char*)in;
  48. const unsigned char* in_end = in_start + in_size;
  49. const unsigned char* out_start = (unsigned char*)out;
  50. const unsigned char* out_end = out_start + out_size;
  51. wchar32 rune;
  52. size_t rune_len;
  53. RECODE_RESULT res = RECODE_OK;
  54. while ((const unsigned char*)in < in_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL)) {
  55. res = SafeReadUTF8Char(rune, rune_len, (const unsigned char*)in, in_end);
  56. if (res == RECODE_BROKENSYMBOL)
  57. rune_len = 1;
  58. if (res != RECODE_EOINPUT)
  59. *out++ = enc->Tr(rune);
  60. in += rune_len;
  61. if (res == RECODE_OK && (const unsigned char*)in < in_end && (unsigned char*)out >= out_end)
  62. res = RECODE_EOOUTPUT;
  63. }
  64. in_readed = (unsigned char*)in - in_start;
  65. out_writed = (unsigned char*)out - out_start;
  66. return res;
  67. }
  68. inline RECODE_RESULT _recodeToYandex(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  69. if (From == CODES_YANDEX)
  70. return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
  71. if (From == CODES_UTF8)
  72. return _recodeFromUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
  73. in_readed = (out_size > in_size) ? in_size : out_size;
  74. const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_to_yandex[From];
  75. rcdr.Tr(in, out, in_readed);
  76. out_writed = in_readed;
  77. if (out_size < in_size)
  78. return RECODE_EOOUTPUT;
  79. return RECODE_OK;
  80. }
  81. inline RECODE_RESULT _recodeFromYandex(ECharset To, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  82. if (To == CODES_YANDEX)
  83. return _recodeCopy(in, out, in_size, out_size, in_readed, out_writed);
  84. if (To == CODES_UTF8)
  85. return _recodeToUTF8(CODES_YANDEX, in, out, in_size, out_size, in_readed, out_writed);
  86. in_readed = (out_size > in_size) ? in_size : out_size;
  87. const Recoder& rcdr = NCodepagePrivate::TCodePageData::rcdr_from_yandex[To];
  88. rcdr.Tr(in, out, in_readed);
  89. out_writed = in_readed;
  90. if (out_size < in_size)
  91. return RECODE_EOOUTPUT;
  92. return RECODE_OK;
  93. }
  94. template <class TCharType>
  95. inline RECODE_RESULT _recodeUTF8ToUnicode(const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  96. const unsigned char* inp = (const unsigned char*)in;
  97. const unsigned char* in_end = inp + in_size;
  98. TCharType* outp = out;
  99. const TCharType* out_end = outp + out_size;
  100. size_t rune_len;
  101. wchar32 rune;
  102. RECODE_RESULT res = RECODE_OK;
  103. while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp < in_end && outp < out_end) {
  104. res = SafeReadUTF8Char(rune, rune_len, inp, in_end);
  105. if (res == RECODE_BROKENSYMBOL)
  106. rune_len = 1;
  107. if (res == RECODE_OK || res == RECODE_BROKENSYMBOL) {
  108. if (!WriteSymbol(rune, outp, out_end)) {
  109. break;
  110. }
  111. inp += rune_len;
  112. }
  113. }
  114. in_readed = inp - (const unsigned char*)in;
  115. out_writed = outp - out;
  116. if ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && in_readed != in_size)
  117. return RECODE_EOOUTPUT;
  118. return res;
  119. }
  120. template <class TCharType>
  121. inline RECODE_RESULT _recodeSBToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  122. const CodePage* cp = CodePageByCharset(From);
  123. const unsigned char* inp = (const unsigned char*)in;
  124. const unsigned char* in_end = inp + in_size;
  125. TCharType* outp = out;
  126. const TCharType* out_end = outp + out_size;
  127. while (inp < in_end && outp < out_end)
  128. *outp++ = static_cast<TCharType>(cp->unicode[*inp++]);
  129. in_readed = inp - (const unsigned char*)in;
  130. out_writed = outp - out;
  131. if (in_readed != in_size)
  132. return RECODE_EOOUTPUT;
  133. return RECODE_OK;
  134. }
  135. template <class TCharType>
  136. inline RECODE_RESULT _recodeUnicodeToUTF8Impl(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  137. const TCharType* inp = in;
  138. const TCharType* in_end = in + in_size;
  139. unsigned char* outp = (unsigned char*)out;
  140. const unsigned char* out_end = outp + out_size;
  141. size_t rune_len;
  142. wchar32 rune;
  143. RECODE_RESULT res = RECODE_OK;
  144. while ((res == RECODE_OK || res == RECODE_BROKENSYMBOL) && inp != in_end) {
  145. rune = ReadSymbolAndAdvance(inp, in_end);
  146. res = SafeWriteUTF8Char(rune, rune_len, outp, out_end);
  147. if (outp >= out_end && (res == RECODE_OK || res == RECODE_BROKENSYMBOL))
  148. res = RECODE_EOOUTPUT;
  149. outp += rune_len;
  150. }
  151. in_readed = inp - in;
  152. out_writed = outp - (const unsigned char*)out;
  153. return res;
  154. }
  155. inline RECODE_RESULT _recodeUnicodeToUTF8(wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
  156. return SafeWriteUTF8Char(rune, nwritten, (unsigned char*)out, out_size);
  157. }
  158. template <class TCharType, int Size = sizeof(TCharType)>
  159. struct TCharTypeSwitch;
  160. template <class TCharType>
  161. struct TCharTypeSwitch<TCharType, 2> {
  162. using TRealCharType = wchar16;
  163. };
  164. template <class TCharType>
  165. struct TCharTypeSwitch<TCharType, 4> {
  166. using TRealCharType = wchar32;
  167. };
  168. template <class TCharType>
  169. inline RECODE_RESULT _recodeUnicodeToUTF8(const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  170. static_assert(sizeof(TCharType) > 1, "expect some wide type");
  171. using TRealCharType = typename TCharTypeSwitch<TCharType>::TRealCharType;
  172. return _recodeUnicodeToUTF8Impl(reinterpret_cast<const TRealCharType*>(in), out, in_size, out_size, in_readed, out_writed);
  173. }
  174. template <class TCharType>
  175. inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  176. const TCharType* inp = in;
  177. const TCharType* in_end = in + in_size;
  178. const char* out_begin = out;
  179. const char* out_end = out + out_size;
  180. const Encoder* enc = &EncoderByCharset(To);
  181. while (inp != in_end && out != out_end) {
  182. *out++ = enc->Tr(ReadSymbolAndAdvance(inp, in_end));
  183. }
  184. in_readed = inp - in;
  185. out_writed = out - out_begin;
  186. if (in_readed != in_size)
  187. return RECODE_EOOUTPUT;
  188. return RECODE_OK;
  189. }
  190. inline RECODE_RESULT _recodeUnicodeToSB(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
  191. if (0 == out_size)
  192. return RECODE_EOOUTPUT;
  193. *out = EncoderByCharset(To).Tr(rune);
  194. nwritten = 1;
  195. return RECODE_OK;
  196. }
  197. inline RECODE_RESULT _rune2hex(wchar32 in, char* out, size_t out_size, size_t& out_writed) {
  198. static const char hex_digs[] = "0123456789ABCDEF";
  199. out_writed = 0;
  200. RECODE_RESULT res = RECODE_OK;
  201. for (int i = 7; i >= 0; i--) {
  202. unsigned char h = (unsigned char)(in >> (i * 4) & 0x0F);
  203. if (h || i == 0) {
  204. if (out_writed + 1 >= out_size) {
  205. res = RECODE_EOOUTPUT;
  206. break;
  207. }
  208. out[out_writed++] = hex_digs[h];
  209. }
  210. }
  211. return res;
  212. }
  213. inline RECODE_RESULT _recodeUnicodeToHTMLEntities(const wchar32* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  214. const wchar32* in_end = in + in_size;
  215. const char* out_beg = out;
  216. const wchar32* in_beg = in;
  217. RECODE_RESULT res = RECODE_OK;
  218. const char* out_end = out + out_size - 1;
  219. while (in < in_end && out < out_end) {
  220. if (*in < 0x80 && *in != '<' && *in != '&' && *in != '>') { //ascii
  221. *out++ = char(*in & 0x00FF);
  222. } else { //entity
  223. char* ent = out;
  224. size_t ent_writed;
  225. if (ent > out_end - 6) {
  226. res = RECODE_EOOUTPUT;
  227. break;
  228. }
  229. memcpy(ent, "&#x", 3);
  230. ent += 3;
  231. res = _rune2hex(*in, ent, out_end - 1 - ent, ent_writed);
  232. if (res != RECODE_OK)
  233. break;
  234. ent += ent_writed;
  235. *ent++ = ';';
  236. out = ent;
  237. }
  238. in++;
  239. }
  240. *out++ = '\x00';
  241. out_writed = out - out_beg;
  242. in_readed = in - in_beg;
  243. return res;
  244. }
  245. template <class TCharType>
  246. inline RECODE_RESULT _recodeToUnicode(ECharset From, const char* in, TCharType* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  247. if (!ValidCodepage(From))
  248. return RECODE_ERROR;
  249. if (!NCodepagePrivate::NativeCodepage(From))
  250. return NICONVPrivate::RecodeToUnicodeNoThrow(From, in, out, in_size, out_size, in_readed, out_writed);
  251. if (From == CODES_UTF8)
  252. return _recodeUTF8ToUnicode(in, out, in_size, out_size, in_readed, out_writed);
  253. return _recodeSBToUnicode(From, in, out, in_size, out_size, in_readed, out_writed);
  254. }
  255. template <class TCharType>
  256. inline RECODE_RESULT _recodeFromUnicode(ECharset To, const TCharType* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  257. if (!ValidCodepage(To))
  258. return RECODE_ERROR;
  259. if (!NCodepagePrivate::NativeCodepage(To))
  260. return NICONVPrivate::RecodeFromUnicodeNoThrow(To, in, out, in_size, out_size, in_readed, out_writed);
  261. if (To == CODES_UTF8)
  262. return NCodepagePrivate::_recodeUnicodeToUTF8(in, out, in_size, out_size, in_readed, out_writed);
  263. return NCodepagePrivate::_recodeUnicodeToSB(To, in, out, in_size, out_size, in_readed, out_writed);
  264. }
  265. inline RECODE_RESULT _recodeFromUnicode(ECharset To, wchar32 rune, char* out, size_t out_size, size_t& nwritten) {
  266. if (!ValidCodepage(To))
  267. return RECODE_ERROR;
  268. if (!NCodepagePrivate::NativeCodepage(To)) {
  269. size_t nread = 0;
  270. return NICONVPrivate::RecodeFromUnicodeNoThrow(To, &rune, out, 1, out_size, nread, nwritten);
  271. }
  272. if (To == CODES_UTF8)
  273. return NCodepagePrivate::_recodeUnicodeToUTF8(rune, out, out_size, nwritten);
  274. return NCodepagePrivate::_recodeUnicodeToSB(To, rune, out, out_size, nwritten);
  275. }
  276. inline RECODE_RESULT _recodeToHTMLEntities(ECharset From, const char* in, char* out, size_t in_size, size_t out_size, size_t& in_readed, size_t& out_writed) {
  277. TArrayHolder<wchar32> bufHolder(new wchar32[in_size]);
  278. wchar32* buf = bufHolder.Get();
  279. size_t unicode_size;
  280. RECODE_RESULT res1, res2;
  281. //first pass - to unicode
  282. res1 = _recodeToUnicode(From, in, buf, in_size, in_size, in_readed, unicode_size);
  283. //second pass - to entities
  284. res2 = _recodeUnicodeToHTMLEntities(buf, out, in_size, out_size, in_readed, out_writed);
  285. return (res2 != RECODE_OK) ? res2 : res1;
  286. }
  287. }