recyr.hh 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. #pragma once
  2. #include <cstdlib>
  3. #include <util/charset/recode_result.h>
  4. #include <util/generic/ptr.h>
  5. #include <util/generic/yexception.h>
  6. #include "codepage.h"
  7. #include "doccodes.h"
  8. #include "iconv.h"
  9. #include "recyr_int.hh"
  10. ///////////////////////////////////////////////////////////////////////////////////////
  11. // input buf -> output buf //
  12. ///////////////////////////////////////////////////////////////////////////////////////
  13. template <class TCharType>
  14. inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
  15. static_assert(sizeof(TCharType) > 1, "expect wide character type");
  16. return NCodepagePrivate::_recodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten);
  17. }
  18. template <class TCharType>
  19. inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
  20. static_assert(sizeof(TCharType) > 1, "expect wide character type");
  21. return NCodepagePrivate::_recodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten);
  22. }
  23. inline RECODE_RESULT RecodeFromUnicode(ECharset to, wchar32 rune, char* out, size_t outSize, size_t& outWritten) {
  24. return NCodepagePrivate::_recodeFromUnicode(to, rune, out, outSize, outWritten);
  25. }
  26. template <class TCharType>
  27. inline RECODE_RESULT RecodeToUnicode(ECharset from, const char* in, TCharType* out, size_t inSize, size_t outSize) {
  28. size_t inRead = 0;
  29. size_t outWritten = 0;
  30. return RecodeToUnicode(from, in, out, inSize, outSize, inRead, outWritten);
  31. }
  32. template <class TCharType>
  33. inline RECODE_RESULT RecodeFromUnicode(ECharset to, const TCharType* in, char* out, size_t inSize, size_t outSize) {
  34. size_t inRead = 0;
  35. size_t outWritten = 0;
  36. return RecodeFromUnicode(to, in, out, inSize, outSize, inRead, outWritten);
  37. }
  38. inline RECODE_RESULT RecodeFromUnicode(ECharset theEncoding, const wchar16* chars, size_t length,
  39. char* bytes, size_t size, size_t* read = nullptr, size_t* written = nullptr) {
  40. size_t w = 0, r = 0;
  41. RECODE_RESULT rc = ::RecodeFromUnicode(theEncoding, chars, bytes, length, size, r, w);
  42. if (read)
  43. *read = r;
  44. if (written)
  45. *written = w;
  46. return rc;
  47. }
  48. inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize, size_t& inRead, size_t& outWritten) {
  49. inRead = 0;
  50. outWritten = 0;
  51. if (!ValidCodepage(to) || !ValidCodepage(from))
  52. return RECODE_ERROR;
  53. if (to == from)
  54. return NCodepagePrivate::_recodeCopy(in, out, inSize, outSize, inRead, outWritten);
  55. if (NCodepagePrivate::NativeCodepage(from) && NCodepagePrivate::NativeCodepage(to)) {
  56. if (from == CODES_UTF8)
  57. return NCodepagePrivate::_recodeFromUTF8(to, in, out, inSize, outSize, inRead, outWritten);
  58. if (to == CODES_UTF8)
  59. return NCodepagePrivate::_recodeToUTF8(from, in, out, inSize, outSize, inRead, outWritten);
  60. if (from == CODES_YANDEX)
  61. return NCodepagePrivate::_recodeFromYandex(to, in, out, inSize, outSize, inRead, outWritten);
  62. if (to == CODES_YANDEX)
  63. return NCodepagePrivate::_recodeToYandex(from, in, out, inSize, outSize, inRead, outWritten);
  64. } else if (NICONVPrivate::CanConvert(from, to)) {
  65. return NICONVPrivate::RecodeNoThrow(from, to, in, out, inSize, outSize, inRead, outWritten);
  66. }
  67. size_t wideSize = inSize * 3;
  68. TArrayHolder<wchar16> wide(new wchar16[wideSize]);
  69. size_t wideRead = 0;
  70. size_t wideWritten = 0;
  71. RECODE_RESULT res = RecodeToUnicode(from, in, wide.Get(), inSize, wideSize, inRead, wideWritten);
  72. if (res != RECODE_OK)
  73. return res;
  74. res = RecodeFromUnicode(to, wide.Get(), out, wideWritten, outSize, wideRead, outWritten);
  75. return res;
  76. }
  77. inline RECODE_RESULT Recode(ECharset from, ECharset to, const char* in, char* out, size_t inSize, size_t outSize) {
  78. size_t inRead = 0;
  79. size_t outWritten = 0;
  80. return Recode(from, to, in, out, inSize, outSize, inRead, outWritten);
  81. }
  82. /**
  83. * Recode from one charset to another; throw an exception if conversion failed
  84. * @param[in] from the source character set
  85. * @param[in] to the target character set
  86. * @param[in] in the input string buffer
  87. * @param[out] out the output string object if conversion was successful
  88. * @return false if conversion was not attempted (charsets were the same),
  89. * true if successful
  90. */
  91. inline bool Recode(ECharset from, ECharset to, const TStringBuf& in, TString& out) {
  92. if (to == from)
  93. return false;
  94. const size_t inSize = in.length();
  95. const size_t outSize = SingleByteCodepage(to) ? inSize : 3 * inSize;
  96. out.clear(); // so we don't copy stuff around when resizing
  97. out.ReserveAndResize(outSize);
  98. size_t inRead = 0;
  99. size_t outWritten = 0;
  100. const RECODE_RESULT res = Recode(from, to, in.data(), out.begin(), inSize, outSize, inRead, outWritten);
  101. Y_ENSURE(RECODE_OK == res, "Recode failed. ");
  102. if (outWritten > outSize)
  103. ythrow yexception() << "Recode overrun the buffer: size="
  104. << outSize << " need=" << outWritten;
  105. out.remove(outWritten);
  106. return true;
  107. }
  108. ///////////////////////////////////////////////////////////////////////////////////////
  109. // TString -> TString //
  110. ///////////////////////////////////////////////////////////////////////////////////////
  111. inline TString Recode(ECharset from, ECharset to, const TString& in) {
  112. TString out;
  113. return to != from && Recode(from, to, in, out) ? out : in;
  114. }
  115. inline TString RecodeToYandex(ECharset from, const TString& in) {
  116. return Recode(from, CODES_YANDEX, in);
  117. }
  118. inline TString RecodeFromYandex(ECharset to, const TString& in) {
  119. return Recode(CODES_YANDEX, to, in);
  120. }
  121. inline TString RecodeToHTMLEntities(ECharset from, const TString& in) {
  122. RECODE_RESULT res;
  123. size_t outWritten, inRead;
  124. TString out;
  125. out.resize(in.length() * (4 + 4));
  126. res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten);
  127. if (res == RECODE_EOOUTPUT) { //input contains many 8-byte characters?
  128. out.resize(in.length() * (4 + 8));
  129. res = NCodepagePrivate::_recodeToHTMLEntities(from, in.c_str(), out.begin(), in.length(), out.length(), inRead, outWritten);
  130. }
  131. if (res != RECODE_OK) {
  132. ythrow yexception() << "Recode to HTML entities failed";
  133. }
  134. out.resize(outWritten - 1);
  135. return out;
  136. }