ConvertUTF.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. /*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
  7. *
  8. * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. * See https://llvm.org/LICENSE.txt for license information.
  10. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. *
  12. *==------------------------------------------------------------------------==*/
  13. /*
  14. * Copyright 2001-2004 Unicode, Inc.
  15. *
  16. * Disclaimer
  17. *
  18. * This source code is provided as is by Unicode, Inc. No claims are
  19. * made as to fitness for any particular purpose. No warranties of any
  20. * kind are expressed or implied. The recipient agrees to determine
  21. * applicability of information provided. If this file has been
  22. * purchased on magnetic or optical media from Unicode, Inc., the
  23. * sole remedy for any claim will be exchange of defective media
  24. * within 90 days of receipt.
  25. *
  26. * Limitations on Rights to Redistribute This Code
  27. *
  28. * Unicode, Inc. hereby grants the right to freely use the information
  29. * supplied in this file in the creation of products supporting the
  30. * Unicode Standard, and to make copies of this file in any form
  31. * for internal or external distribution as long as this notice
  32. * remains attached.
  33. */
  34. /* ---------------------------------------------------------------------
  35. Conversions between UTF32, UTF-16, and UTF-8. Header file.
  36. Several functions are included here, forming a complete set of
  37. conversions between the three formats. UTF-7 is not included
  38. here, but is handled in a separate source file.
  39. Each of these routines takes pointers to input buffers and output
  40. buffers. The input buffers are const.
  41. Each routine converts the text between *sourceStart and sourceEnd,
  42. putting the result into the buffer between *targetStart and
  43. targetEnd. Note: the end pointers are *after* the last item: e.g.
  44. *(sourceEnd - 1) is the last item.
  45. The return result indicates whether the conversion was successful,
  46. and if not, whether the problem was in the source or target buffers.
  47. (Only the first encountered problem is indicated.)
  48. After the conversion, *sourceStart and *targetStart are both
  49. updated to point to the end of last text successfully converted in
  50. the respective buffers.
  51. Input parameters:
  52. sourceStart - pointer to a pointer to the source buffer.
  53. The contents of this are modified on return so that
  54. it points at the next thing to be converted.
  55. targetStart - similarly, pointer to pointer to the target buffer.
  56. sourceEnd, targetEnd - respectively pointers to the ends of the
  57. two buffers, for overflow checking only.
  58. These conversion functions take a ConversionFlags argument. When this
  59. flag is set to strict, both irregular sequences and isolated surrogates
  60. will cause an error. When the flag is set to lenient, both irregular
  61. sequences and isolated surrogates are converted.
  62. Whether the flag is strict or lenient, all illegal sequences will cause
  63. an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
  64. or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
  65. must check for illegal sequences.
  66. When the flag is set to lenient, characters over 0x10FFFF are converted
  67. to the replacement character; otherwise (when the flag is set to strict)
  68. they constitute an error.
  69. Output parameters:
  70. The value "sourceIllegal" is returned from some routines if the input
  71. sequence is malformed. When "sourceIllegal" is returned, the source
  72. value will point to the illegal value that caused the problem. E.g.,
  73. in UTF-8 when a sequence is malformed, it points to the start of the
  74. malformed sequence.
  75. Author: Mark E. Davis, 1994.
  76. Rev History: Rick McGowan, fixes & updates May 2001.
  77. Fixes & updates, Sept 2001.
  78. ------------------------------------------------------------------------ */
  79. #ifndef LLVM_SUPPORT_CONVERTUTF_H
  80. #define LLVM_SUPPORT_CONVERTUTF_H
  81. #include <cstddef>
  82. #include <string>
  83. #if defined(_WIN32)
  84. #include <system_error>
  85. #endif
  86. // Wrap everything in namespace llvm so that programs can link with llvm and
  87. // their own version of the unicode libraries.
  88. namespace llvm {
  89. /* ---------------------------------------------------------------------
  90. The following 4 definitions are compiler-specific.
  91. The C standard does not guarantee that wchar_t has at least
  92. 16 bits, so wchar_t is no less portable than unsigned short!
  93. All should be unsigned values to avoid sign extension during
  94. bit mask & shift operations.
  95. ------------------------------------------------------------------------ */
  96. typedef unsigned int UTF32; /* at least 32 bits */
  97. typedef unsigned short UTF16; /* at least 16 bits */
  98. typedef unsigned char UTF8; /* typically 8 bits */
  99. typedef unsigned char Boolean; /* 0 or 1 */
  100. /* Some fundamental constants */
  101. #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
  102. #define UNI_MAX_BMP (UTF32)0x0000FFFF
  103. #define UNI_MAX_UTF16 (UTF32)0x0010FFFF
  104. #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
  105. #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
  106. #define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4
  107. #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
  108. #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
  109. typedef enum {
  110. conversionOK, /* conversion successful */
  111. sourceExhausted, /* partial character in source, but hit end */
  112. targetExhausted, /* insuff. room in target for conversion */
  113. sourceIllegal /* source sequence is illegal/malformed */
  114. } ConversionResult;
  115. typedef enum {
  116. strictConversion = 0,
  117. lenientConversion
  118. } ConversionFlags;
  119. ConversionResult ConvertUTF8toUTF16 (
  120. const UTF8** sourceStart, const UTF8* sourceEnd,
  121. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
  122. /**
  123. * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
  124. * incomplete code unit sequence, returns \c sourceExhausted.
  125. */
  126. ConversionResult ConvertUTF8toUTF32Partial(
  127. const UTF8** sourceStart, const UTF8* sourceEnd,
  128. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
  129. /**
  130. * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
  131. * incomplete code unit sequence, returns \c sourceIllegal.
  132. */
  133. ConversionResult ConvertUTF8toUTF32(
  134. const UTF8** sourceStart, const UTF8* sourceEnd,
  135. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
  136. ConversionResult ConvertUTF16toUTF8 (
  137. const UTF16** sourceStart, const UTF16* sourceEnd,
  138. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
  139. ConversionResult ConvertUTF32toUTF8 (
  140. const UTF32** sourceStart, const UTF32* sourceEnd,
  141. UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
  142. ConversionResult ConvertUTF16toUTF32 (
  143. const UTF16** sourceStart, const UTF16* sourceEnd,
  144. UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
  145. ConversionResult ConvertUTF32toUTF16 (
  146. const UTF32** sourceStart, const UTF32* sourceEnd,
  147. UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
  148. Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
  149. Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
  150. unsigned getNumBytesForUTF8(UTF8 firstByte);
  151. /*************************************************************************/
  152. /* Below are LLVM-specific wrappers of the functions above. */
  153. template <typename T> class ArrayRef;
  154. template <typename T> class SmallVectorImpl;
  155. class StringRef;
  156. /**
  157. * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
  158. * WideCharWidth. The converted data is written to ResultPtr, which needs to
  159. * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
  160. * ResultPtr will point one after the end of the copied string. On failure,
  161. * ResultPtr will not be changed, and ErrorPtr will be set to the location of
  162. * the first character which could not be converted.
  163. * \return true on success.
  164. */
  165. bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
  166. char *&ResultPtr, const UTF8 *&ErrorPtr);
  167. /**
  168. * Converts a UTF-8 StringRef to a std::wstring.
  169. * \return true on success.
  170. */
  171. bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
  172. /**
  173. * Converts a UTF-8 C-string to a std::wstring.
  174. * \return true on success.
  175. */
  176. bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
  177. /**
  178. * Converts a std::wstring to a UTF-8 encoded std::string.
  179. * \return true on success.
  180. */
  181. bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
  182. /**
  183. * Convert an Unicode code point to UTF8 sequence.
  184. *
  185. * \param Source a Unicode code point.
  186. * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
  187. * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
  188. * updated one past end of the converted sequence.
  189. *
  190. * \returns true on success.
  191. */
  192. bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
  193. /**
  194. * Convert the first UTF8 sequence in the given source buffer to a UTF32
  195. * code point.
  196. *
  197. * \param [in,out] source A pointer to the source buffer. If the conversion
  198. * succeeds, this pointer will be updated to point to the byte just past the
  199. * end of the converted sequence.
  200. * \param sourceEnd A pointer just past the end of the source buffer.
  201. * \param [out] target The converted code
  202. * \param flags Whether the conversion is strict or lenient.
  203. *
  204. * \returns conversionOK on success
  205. *
  206. * \sa ConvertUTF8toUTF32
  207. */
  208. inline ConversionResult convertUTF8Sequence(const UTF8 **source,
  209. const UTF8 *sourceEnd,
  210. UTF32 *target,
  211. ConversionFlags flags) {
  212. if (*source == sourceEnd)
  213. return sourceExhausted;
  214. unsigned size = getNumBytesForUTF8(**source);
  215. if ((ptrdiff_t)size > sourceEnd - *source)
  216. return sourceExhausted;
  217. return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
  218. }
  219. /**
  220. * Returns true if a blob of text starts with a UTF-16 big or little endian byte
  221. * order mark.
  222. */
  223. bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
  224. /**
  225. * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
  226. *
  227. * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
  228. * \param [out] Out Converted UTF-8 is stored here on success.
  229. * \returns true on success
  230. */
  231. bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
  232. /**
  233. * Converts a UTF16 string into a UTF8 std::string.
  234. *
  235. * \param [in] Src A buffer of UTF-16 encoded text.
  236. * \param [out] Out Converted UTF-8 is stored here on success.
  237. * \returns true on success
  238. */
  239. bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
  240. /**
  241. * Converts a UTF-8 string into a UTF-16 string with native endianness.
  242. *
  243. * \returns true on success
  244. */
  245. bool convertUTF8ToUTF16String(StringRef SrcUTF8,
  246. SmallVectorImpl<UTF16> &DstUTF16);
  247. #if defined(_WIN32)
  248. namespace sys {
  249. namespace windows {
  250. std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
  251. /// Convert to UTF16 from the current code page used in the system
  252. std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
  253. std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
  254. SmallVectorImpl<char> &utf8);
  255. /// Convert from UTF16 to the current code page used in the system
  256. std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
  257. SmallVectorImpl<char> &utf8);
  258. } // namespace windows
  259. } // namespace sys
  260. #endif
  261. } /* end namespace llvm */
  262. #endif
  263. #ifdef __GNUC__
  264. #pragma GCC diagnostic pop
  265. #endif