Unicode.h 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // This file defines functions that allow querying certain properties of Unicode
  15. // characters.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_SUPPORT_UNICODE_H
  19. #define LLVM_SUPPORT_UNICODE_H
  20. #include "llvm/ADT/SmallString.h"
  21. #include <optional>
  22. #include <string>
  23. namespace llvm {
  24. class StringRef;
  25. namespace sys {
  26. namespace unicode {
  27. enum ColumnWidthErrors {
  28. ErrorInvalidUTF8 = -2,
  29. ErrorNonPrintableCharacter = -1
  30. };
  31. /// Determines if a character is likely to be displayed correctly on the
  32. /// terminal. Exact implementation would have to depend on the specific
  33. /// terminal, so we define the semantic that should be suitable for generic case
  34. /// of a terminal capable to output Unicode characters.
  35. ///
  36. /// Printable codepoints are those in the categories L, M, N, P, S and Zs
  37. /// \return true if the character is considered printable.
  38. bool isPrintable(int UCS);
  39. // Formatting codepoints are codepoints in the Cf category.
  40. bool isFormatting(int UCS);
  41. /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
  42. /// when output on a terminal ("character width"). This depends on the
  43. /// implementation of the terminal, and there's no standard definition of
  44. /// character width.
  45. ///
  46. /// The implementation defines it in a way that is expected to be compatible
  47. /// with a generic Unicode-capable terminal.
  48. ///
  49. /// \return Character width:
  50. /// * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
  51. /// characters (as identified by isPrintable);
  52. /// * 0 for each non-spacing and enclosing combining mark;
  53. /// * 2 for each CJK character excluding halfwidth forms;
  54. /// * 1 for each of the remaining characters.
  55. int columnWidthUTF8(StringRef Text);
  56. /// Fold input unicode character according the Simple unicode case folding
  57. /// rules.
  58. int foldCharSimple(int C);
  59. /// Maps the name or the alias of a Unicode character to its associated
  60. /// codepoints.
  61. /// The names and aliases are derived from UnicodeData.txt and NameAliases.txt
  62. /// For compatibility with the semantics of named character escape sequences in
  63. /// C++, this mapping does an exact match sensitive to casing and spacing.
  64. /// \return The codepoint of the corresponding character, if any.
  65. std::optional<char32_t> nameToCodepointStrict(StringRef Name);
  66. struct LooseMatchingResult {
  67. char32_t CodePoint;
  68. SmallString<64> Name;
  69. };
  70. std::optional<LooseMatchingResult> nameToCodepointLooseMatching(StringRef Name);
  71. struct MatchForCodepointName {
  72. std::string Name;
  73. uint32_t Distance = 0;
  74. char32_t Value = 0;
  75. };
  76. SmallVector<MatchForCodepointName>
  77. nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount);
  78. } // namespace unicode
  79. } // namespace sys
  80. } // namespace llvm
  81. #endif
  82. #ifdef __GNUC__
  83. #pragma GCC diagnostic pop
  84. #endif