#pragma once #include #include #include #include #include #include #include #include class IInputStream; class IOutputStream; namespace NUnicode { namespace NPrivate { struct TCategoryRanges; } class TUnicodeSet { private: typedef TSimpleSharedPtr TDynamicBuffer; // Ranges can point to: // 1) ShortBuffer for short sets (not more than 2 ranges) // 2) static data (for predefined unicode categories) // 3) or DynBuffer for big sets const wchar32* Ranges; wchar32 ShortBuffer[5]; TDynamicBuffer DynBuffer; // Can be shared between multiple sets size_t Length; // Number of slots in Ranges size_t Capacity; // Capacity of currently used buffer. Zero value means reference to static data private: Y_FORCE_INLINE bool IsShared() const { return Ranges == DynBuffer.Get() && DynBuffer.RefCount() > 1; } Y_FORCE_INLINE bool IsStatic() const { return 0 == Capacity; } size_t GetRangeItem(wchar32 c, size_t from = 0) const; // Extends buffer capacity if required and returns pointer to the writable buffer of slots wchar32* EnsureCapacity(size_t capacity); // Makes the copy of buffer if the unicode set points to the static or shared data, and returns pointer to the writable buffer of slots wchar32* EnsureWritable() { if (IsShared()) { // If multiple UnicodeSets refer to the same buffer then make the copy Capacity = 0; } if (IsStatic()) { // Copy static or shared data to own buffer before modifying return EnsureCapacity(Length); } return const_cast(Ranges); } // Returns pointer to the first inserted slot wchar32* InsertRangeSlots(const size_t pos, const size_t count); void EraseRangeSlots(const size_t pos, const size_t count); void AddPredefRanges(const NPrivate::TCategoryRanges& ranges); void SetPredefRanges(const NPrivate::TCategoryRanges& ranges); public: enum { CODEPOINT_HIGH = 0x110000 // Next value after maximum valid code point }; TUnicodeSet(); TUnicodeSet(const TUnicodeSet& s); // Unicode set for specific character range. "from", "to" are inclusive TUnicodeSet(wchar32 from, wchar32 to); // Unicode set consists of all characters from the specified string TUnicodeSet(const TWtringBuf& s); // Unicode set for predefined category TUnicodeSet(WC_TYPE c); TUnicodeSet& operator=(const TUnicodeSet& s) { return Set(s); } inline bool operator==(const TUnicodeSet& s) const { return Length == s.Length && (Ranges == s.Ranges || ::Equal(Ranges, Ranges + Length, s.Ranges)); } friend inline TUnicodeSet operator~(TUnicodeSet s) { return s.Invert(); } friend inline TUnicodeSet operator+(const TUnicodeSet& s1, const TUnicodeSet& s2) { return TUnicodeSet(s1).Add(s2); } TUnicodeSet& Add(const TUnicodeSet& s); TUnicodeSet& Add(const TWtringBuf& s); TUnicodeSet& Add(wchar32 c); // from, to - inclusive TUnicodeSet& Add(wchar32 from, wchar32 to); TUnicodeSet& Add(WC_TYPE c); // Add unicode category by name (one- or two-letter) TUnicodeSet& AddCategory(const TStringBuf& catName); TUnicodeSet& Set(const TUnicodeSet& s); // from, to - inclusive TUnicodeSet& Set(wchar32 from, wchar32 to); TUnicodeSet& Set(const TWtringBuf& s); TUnicodeSet& Set(WC_TYPE c); TUnicodeSet& SetCategory(const TStringBuf& catName); TUnicodeSet& Invert(); // Converts existing unicode set to the case-insensitive set TUnicodeSet& MakeCaseInsensitive(); TUnicodeSet& Clear(); size_t Hash() const; TString ToString(bool escapeAllChars = false) const; inline bool Valid() const { return Length > 0 && Ranges[Length - 1] == CODEPOINT_HIGH; } inline bool Has(wchar32 c) const { if (Y_UNLIKELY(c >= CODEPOINT_HIGH)) { return false; } const size_t item = GetRangeItem(c); return (item & 1); } inline bool Empty() const { Y_ASSERT(Valid()); return Length < 2; } void Save(IOutputStream* out) const; void Load(IInputStream* in); TUnicodeSet& Parse(const TWtringBuf& data); }; using TUnicodeSetPtr = TSimpleSharedPtr; } template <> struct THash { size_t operator()(const NUnicode::TUnicodeSet& s) const { return s.Hash(); } };