UnicodeCharRanges.h 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. #pragma once
  2. #ifdef __GNUC__
  3. #pragma GCC diagnostic push
  4. #pragma GCC diagnostic ignored "-Wunused-parameter"
  5. #endif
  6. //===--- UnicodeCharRanges.h - Types and functions for character ranges ---===//
  7. //
  8. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  9. // See https://llvm.org/LICENSE.txt for license information.
  10. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #ifndef LLVM_SUPPORT_UNICODECHARRANGES_H
  14. #define LLVM_SUPPORT_UNICODECHARRANGES_H
  15. #include "llvm/ADT/ArrayRef.h"
  16. #include "llvm/Support/Compiler.h"
  17. #include "llvm/Support/Debug.h"
  18. #include "llvm/Support/raw_ostream.h"
  19. #include <algorithm>
  20. #define DEBUG_TYPE "unicode"
  21. namespace llvm {
  22. namespace sys {
  23. /// Represents a closed range of Unicode code points [Lower, Upper].
  24. struct UnicodeCharRange {
  25. uint32_t Lower;
  26. uint32_t Upper;
  27. };
  28. inline bool operator<(uint32_t Value, UnicodeCharRange Range) {
  29. return Value < Range.Lower;
  30. }
  31. inline bool operator<(UnicodeCharRange Range, uint32_t Value) {
  32. return Range.Upper < Value;
  33. }
  34. /// Holds a reference to an ordered array of UnicodeCharRange and allows
  35. /// to quickly check if a code point is contained in the set represented by this
  36. /// array.
  37. class UnicodeCharSet {
  38. public:
  39. typedef ArrayRef<UnicodeCharRange> CharRanges;
  40. /// Constructs a UnicodeCharSet instance from an array of
  41. /// UnicodeCharRanges.
  42. ///
  43. /// Array pointed by \p Ranges should have the lifetime at least as long as
  44. /// the UnicodeCharSet instance, and should not change. Array is validated by
  45. /// the constructor, so it makes sense to create as few UnicodeCharSet
  46. /// instances per each array of ranges, as possible.
  47. #ifdef NDEBUG
  48. // FIXME: This could use constexpr + static_assert. This way we
  49. // may get rid of NDEBUG in this header. Unfortunately there are some
  50. // problems to get this working with MSVC 2013. Change this when
  51. // the support for MSVC 2013 is dropped.
  52. constexpr UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {}
  53. #else
  54. UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {
  55. assert(rangesAreValid());
  56. }
  57. #endif
  58. /// Returns true if the character set contains the Unicode code point
  59. /// \p C.
  60. bool contains(uint32_t C) const {
  61. return std::binary_search(Ranges.begin(), Ranges.end(), C);
  62. }
  63. private:
  64. /// Returns true if each of the ranges is a proper closed range
  65. /// [min, max], and if the ranges themselves are ordered and non-overlapping.
  66. bool rangesAreValid() const {
  67. uint32_t Prev = 0;
  68. for (CharRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
  69. I != E; ++I) {
  70. if (I != Ranges.begin() && Prev >= I->Lower) {
  71. LLVM_DEBUG(dbgs() << "Upper bound 0x");
  72. LLVM_DEBUG(dbgs().write_hex(Prev));
  73. LLVM_DEBUG(dbgs() << " should be less than succeeding lower bound 0x");
  74. LLVM_DEBUG(dbgs().write_hex(I->Lower) << "\n");
  75. return false;
  76. }
  77. if (I->Upper < I->Lower) {
  78. LLVM_DEBUG(dbgs() << "Upper bound 0x");
  79. LLVM_DEBUG(dbgs().write_hex(I->Lower));
  80. LLVM_DEBUG(dbgs() << " should not be less than lower bound 0x");
  81. LLVM_DEBUG(dbgs().write_hex(I->Upper) << "\n");
  82. return false;
  83. }
  84. Prev = I->Upper;
  85. }
  86. return true;
  87. }
  88. const CharRanges Ranges;
  89. };
  90. } // namespace sys
  91. } // namespace llvm
  92. #undef DEBUG_TYPE // "unicode"
  93. #endif // LLVM_SUPPORT_UNICODECHARRANGES_H
  94. #ifdef __GNUC__
  95. #pragma GCC diagnostic pop
  96. #endif