unicode_set.h 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. #pragma once
  2. #include <util/str_stl.h>
  3. #include <util/charset/unidata.h>
  4. #include <util/generic/algorithm.h>
  5. #include <util/generic/ptr.h>
  6. #include <util/generic/strbuf.h>
  7. #include <util/generic/string.h>
  8. #include <util/generic/utility.h>
  9. #include <util/generic/vector.h>
  10. class IInputStream;
  11. class IOutputStream;
  12. namespace NUnicode {
  13. namespace NPrivate {
  14. struct TCategoryRanges;
  15. }
  16. class TUnicodeSet {
  17. private:
  18. typedef TSimpleSharedPtr<wchar32, TDeleteArray> TDynamicBuffer;
  19. // Ranges can point to:
  20. // 1) ShortBuffer for short sets (not more than 2 ranges)
  21. // 2) static data (for predefined unicode categories)
  22. // 3) or DynBuffer for big sets
  23. const wchar32* Ranges;
  24. wchar32 ShortBuffer[5];
  25. TDynamicBuffer DynBuffer; // Can be shared between multiple sets
  26. size_t Length; // Number of slots in Ranges
  27. size_t Capacity; // Capacity of currently used buffer. Zero value means reference to static data
  28. private:
  29. Y_FORCE_INLINE bool IsShared() const {
  30. return Ranges == DynBuffer.Get() && DynBuffer.RefCount() > 1;
  31. }
  32. Y_FORCE_INLINE bool IsStatic() const {
  33. return 0 == Capacity;
  34. }
  35. size_t GetRangeItem(wchar32 c, size_t from = 0) const;
  36. // Extends buffer capacity if required and returns pointer to the writable buffer of slots
  37. wchar32* EnsureCapacity(size_t capacity);
  38. // Makes the copy of buffer if the unicode set points to the static or shared data, and returns pointer to the writable buffer of slots
  39. wchar32* EnsureWritable() {
  40. if (IsShared()) {
  41. // If multiple UnicodeSets refer to the same buffer then make the copy
  42. Capacity = 0;
  43. }
  44. if (IsStatic()) {
  45. // Copy static or shared data to own buffer before modifying
  46. return EnsureCapacity(Length);
  47. }
  48. return const_cast<wchar32*>(Ranges);
  49. }
  50. // Returns pointer to the first inserted slot
  51. wchar32* InsertRangeSlots(const size_t pos, const size_t count);
  52. void EraseRangeSlots(const size_t pos, const size_t count);
  53. void AddPredefRanges(const NPrivate::TCategoryRanges& ranges);
  54. void SetPredefRanges(const NPrivate::TCategoryRanges& ranges);
  55. public:
  56. enum {
  57. CODEPOINT_HIGH = 0x110000 // Next value after maximum valid code point
  58. };
  59. TUnicodeSet();
  60. TUnicodeSet(const TUnicodeSet& s);
  61. // Unicode set for specific character range. "from", "to" are inclusive
  62. TUnicodeSet(wchar32 from, wchar32 to);
  63. // Unicode set consists of all characters from the specified string
  64. TUnicodeSet(const TWtringBuf& s);
  65. // Unicode set for predefined category
  66. TUnicodeSet(WC_TYPE c);
  67. TUnicodeSet& operator=(const TUnicodeSet& s) {
  68. return Set(s);
  69. }
  70. inline bool operator==(const TUnicodeSet& s) const {
  71. return Length == s.Length && (Ranges == s.Ranges || ::Equal(Ranges, Ranges + Length, s.Ranges));
  72. }
  73. friend inline TUnicodeSet operator~(TUnicodeSet s) {
  74. return s.Invert();
  75. }
  76. friend inline TUnicodeSet operator+(const TUnicodeSet& s1, const TUnicodeSet& s2) {
  77. return TUnicodeSet(s1).Add(s2);
  78. }
  79. TUnicodeSet& Add(const TUnicodeSet& s);
  80. TUnicodeSet& Add(const TWtringBuf& s);
  81. TUnicodeSet& Add(wchar32 c);
  82. // from, to - inclusive
  83. TUnicodeSet& Add(wchar32 from, wchar32 to);
  84. TUnicodeSet& Add(WC_TYPE c);
  85. // Add unicode category by name (one- or two-letter)
  86. TUnicodeSet& AddCategory(const TStringBuf& catName);
  87. TUnicodeSet& Set(const TUnicodeSet& s);
  88. // from, to - inclusive
  89. TUnicodeSet& Set(wchar32 from, wchar32 to);
  90. TUnicodeSet& Set(const TWtringBuf& s);
  91. TUnicodeSet& Set(WC_TYPE c);
  92. TUnicodeSet& SetCategory(const TStringBuf& catName);
  93. TUnicodeSet& Invert();
  94. // Converts existing unicode set to the case-insensitive set
  95. TUnicodeSet& MakeCaseInsensitive();
  96. TUnicodeSet& Clear();
  97. size_t Hash() const;
  98. TString ToString(bool escapeAllChars = false) const;
  99. inline bool Valid() const {
  100. return Length > 0 && Ranges[Length - 1] == CODEPOINT_HIGH;
  101. }
  102. inline bool Has(wchar32 c) const {
  103. if (Y_UNLIKELY(c >= CODEPOINT_HIGH)) {
  104. return false;
  105. }
  106. const size_t item = GetRangeItem(c);
  107. return (item & 1);
  108. }
  109. inline bool Empty() const {
  110. Y_ASSERT(Valid());
  111. return Length < 2;
  112. }
  113. void Save(IOutputStream* out) const;
  114. void Load(IInputStream* in);
  115. TUnicodeSet& Parse(const TWtringBuf& data);
  116. };
  117. using TUnicodeSetPtr = TSimpleSharedPtr<TUnicodeSet>;
  118. }
  119. template <>
  120. struct THash<NUnicode::TUnicodeSet> {
  121. size_t operator()(const NUnicode::TUnicodeSet& s) const {
  122. return s.Hash();
  123. }
  124. };