#pragma once #include "comptrie_impl.h" #include "comptrie_packer.h" #include "opaque_trie_iterator.h" #include "leaf_skipper.h" #include "key_selector.h" #include #include #include #include #include #include #include template class TCompactTrieBuilder; namespace NCompactTrie { template class TFirstSymbolIterator; } template class TSearchIterator; template class TPrefixIterator; // in case of specialization cannot distinguish between "" and "\0" keys template > class TCompactTrie { public: typedef T TSymbol; typedef D TData; typedef S TPacker; typedef typename TCompactTrieKeySelector::TKey TKey; typedef typename TCompactTrieKeySelector::TKeyBuf TKeyBuf; typedef std::pair TValueType; typedef std::pair TPhraseMatch; typedef TVector TPhraseMatchVector; typedef TCompactTrieBuilder TBuilder; protected: TBlob DataHolder; const char* EmptyValue = nullptr; TPacker Packer; NCompactTrie::TPackerLeafSkipper Skipper = &Packer; // This should be true for every constructor. public: TCompactTrie() = default; TCompactTrie(const char* d, size_t len, TPacker packer); TCompactTrie(const char* d, size_t len) : TCompactTrie{d, len, TPacker{}} { } TCompactTrie(TBlob data, TPacker packer); explicit TCompactTrie(TBlob data) : TCompactTrie{std::move(data), TPacker{}} { } // Skipper should be initialized with &Packer, not with &other.Packer, so you have to redefine these. TCompactTrie(const TCompactTrie& other); TCompactTrie(TCompactTrie&& other) noexcept; TCompactTrie& operator=(const TCompactTrie& other); TCompactTrie& operator=(TCompactTrie&& other) noexcept; explicit operator bool() const { return !IsEmpty(); } void Init(const char* d, size_t len, TPacker packer = TPacker()); void Init(TBlob data, TPacker packer = TPacker()); bool IsInitialized() const; bool IsEmpty() const; bool Find(const TSymbol* key, size_t keylen, TData* value = nullptr) const; bool Find(const TKeyBuf& key, TData* value = nullptr) const { return Find(key.data(), key.size(), value); } TData Get(const TSymbol* key, size_t keylen) const { TData value; if (!Find(key, keylen, &value)) ythrow yexception() << "key " << TKey(key, keylen).Quote() << " not found in trie"; return value; } TData Get(const TKeyBuf& key) const { return Get(key.data(), key.size()); } TData GetDefault(const TKeyBuf& key, const TData& def) const { TData value; if (!Find(key.data(), key.size(), &value)) return def; else return value; } const TBlob& Data() const { return DataHolder; } const NCompactTrie::ILeafSkipper& GetSkipper() const { return Skipper; } TPacker GetPacker() const { return Packer; } bool HasCorrectSkipper() const { return Skipper.GetPacker() == &Packer; } void FindPhrases(const TSymbol* key, size_t keylen, TPhraseMatchVector& matches, TSymbol separator = TSymbol(' ')) const; void FindPhrases(const TKeyBuf& key, TPhraseMatchVector& matches, TSymbol separator = TSymbol(' ')) const { return FindPhrases(key.data(), key.size(), matches, separator); } bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const; bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const { return FindLongestPrefix(key.data(), key.size(), prefixLen, value, hasNext); } // Return trie, containing all tails for the given key inline TCompactTrie FindTails(const TSymbol* key, size_t keylen) const; TCompactTrie FindTails(const TKeyBuf& key) const { return FindTails(key.data(), key.size()); } bool FindTails(const TSymbol* key, size_t keylen, TCompactTrie& res) const; bool FindTails(const TKeyBuf& key, TCompactTrie& res) const { return FindTails(key.data(), key.size(), res); } // same as FindTails(&key, 1), a bit faster // return false, if no arc with @label exists inline bool FindTails(TSymbol label, TCompactTrie& res) const; class TConstIterator { private: typedef NCompactTrie::TOpaqueTrieIterator TOpaqueTrieIterator; typedef NCompactTrie::TOpaqueTrie TOpaqueTrie; friend class TCompactTrie; TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer); // only usable from Begin() and End() methods TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer); // only usable from UpperBound() method public: TConstIterator() = default; bool IsEmpty() const { return !Impl; } // Almost no other method can be called. bool operator==(const TConstIterator& other) const; bool operator!=(const TConstIterator& other) const; TConstIterator& operator++(); TConstIterator operator++(int /*unused*/); TConstIterator& operator--(); TConstIterator operator--(int /*unused*/); TValueType operator*(); TKey GetKey() const; size_t GetKeySize() const; TData GetValue() const; void GetValue(TData& data) const; const char* GetValuePtr() const; private: TPacker Packer; TCopyPtr Impl; }; TConstIterator Begin() const; TConstIterator begin() const; TConstIterator End() const; TConstIterator end() const; // Returns an iterator pointing to the smallest key in the trie >= the argument. // TODO: misleading name. Should be called LowerBound for consistency with stl. // No. It is the STL that has a misleading name. // LowerBound of X cannot be greater than X. TConstIterator UpperBound(const TKeyBuf& key) const; void Print(IOutputStream& os); size_t Size() const; friend class NCompactTrie::TFirstSymbolIterator; friend class TSearchIterator; friend class TPrefixIterator; protected: explicit TCompactTrie(const char* emptyValue); TCompactTrie(TBlob data, const char* emptyValue, TPacker packer = TPacker()); bool LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos, bool& hasNext) const; bool LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos) const { bool hasNext; return LookupLongestPrefix(key, keylen, prefixLen, valuepos, hasNext); } void LookupPhrases(const char* datapos, size_t len, const TSymbol* key, size_t keylen, TVector& matches, TSymbol separator) const; }; template > class TCompactTrieHolder: public TCompactTrie, NNonCopyable::TNonCopyable { private: typedef TCompactTrie TBase; TArrayHolder Storage; public: TCompactTrieHolder(IInputStream& is, size_t len); }; //------------------------// // Implementation section // //------------------------// // TCompactTrie template TCompactTrie::TCompactTrie(TBlob data, TPacker packer) { Init(std::move(data), packer); } template TCompactTrie::TCompactTrie(const char* d, size_t len, TPacker packer) { Init(d, len, packer); } template TCompactTrie::TCompactTrie(const char* emptyValue) : EmptyValue(emptyValue) { } template TCompactTrie::TCompactTrie(TBlob data, const char* emptyValue, TPacker packer) : DataHolder(std::move(data)) , EmptyValue(emptyValue) , Packer(packer) { } template TCompactTrie::TCompactTrie(const TCompactTrie& other) : DataHolder(other.DataHolder) , EmptyValue(other.EmptyValue) , Packer(other.Packer) { } template TCompactTrie::TCompactTrie(TCompactTrie&& other) noexcept : DataHolder(std::move(other.DataHolder)) , EmptyValue(std::move(other.EmptyValue)) , Packer(std::move(other.Packer)) { } template TCompactTrie& TCompactTrie::operator=(const TCompactTrie& other) { if (this != &other) { DataHolder = other.DataHolder; EmptyValue = other.EmptyValue; Packer = other.Packer; } return *this; } template TCompactTrie& TCompactTrie::operator=(TCompactTrie&& other) noexcept { if (this != &other) { DataHolder = std::move(other.DataHolder); EmptyValue = std::move(other.EmptyValue); Packer = std::move(other.Packer); } return *this; } template void TCompactTrie::Init(const char* d, size_t len, TPacker packer) { Init(TBlob::NoCopy(d, len), packer); } template void TCompactTrie::Init(TBlob data, TPacker packer) { using namespace NCompactTrie; DataHolder = std::move(data); Packer = packer; const char* datapos = DataHolder.AsCharPtr(); size_t len = DataHolder.Length(); if (!len) return; const char* const dataend = datapos + len; const char* emptypos = datapos; char flags = LeapByte(emptypos, dataend, 0); if (emptypos && (flags & MT_FINAL)) { Y_ASSERT(emptypos <= dataend); EmptyValue = emptypos; } } template bool TCompactTrie::IsInitialized() const { return DataHolder.Data() != nullptr; } template bool TCompactTrie::IsEmpty() const { return DataHolder.Size() == 0 && EmptyValue == nullptr; } template bool TCompactTrie::Find(const TSymbol* key, size_t keylen, TData* value) const { size_t prefixLen = 0; const char* valuepos = nullptr; bool hasNext; if (!LookupLongestPrefix(key, keylen, prefixLen, valuepos, hasNext) || prefixLen != keylen) return false; if (value) Packer.UnpackLeaf(valuepos, *value); return true; } template void TCompactTrie::FindPhrases(const TSymbol* key, size_t keylen, TPhraseMatchVector& matches, TSymbol separator) const { LookupPhrases(DataHolder.AsCharPtr(), DataHolder.Length(), key, keylen, matches, separator); } template inline TCompactTrie TCompactTrie::FindTails(const TSymbol* key, size_t keylen) const { TCompactTrie ret; FindTails(key, keylen, ret); return ret; } template bool TCompactTrie::FindTails(const TSymbol* key, size_t keylen, TCompactTrie& res) const { using namespace NCompactTrie; size_t len = DataHolder.Length(); if (!key || !len) return false; if (!keylen) { res = *this; return true; } const char* datastart = DataHolder.AsCharPtr(); const char* datapos = datastart; const char* const dataend = datapos + len; const TSymbol* keyend = key + keylen; const char* value = nullptr; while (key != keyend) { T label = *(key++); if (!NCompactTrie::Advance(datapos, dataend, value, label, Packer)) return false; if (key == keyend) { if (datapos) { Y_ASSERT(datapos >= datastart); res = TCompactTrie(TBlob::NoCopy(datapos, dataend - datapos), value); } else { res = TCompactTrie(value); } return true; } else if (!datapos) { return false; // No further way } } return false; } template inline bool TCompactTrie::FindTails(TSymbol label, TCompactTrie& res) const { using namespace NCompactTrie; const size_t len = DataHolder.Length(); if (!len) return false; const char* datastart = DataHolder.AsCharPtr(); const char* dataend = datastart + len; const char* datapos = datastart; const char* value = nullptr; if (!NCompactTrie::Advance(datapos, dataend, value, label, Packer)) return false; if (datapos) { Y_ASSERT(datapos >= datastart); res = TCompactTrie(TBlob::NoCopy(datapos, dataend - datapos), value); } else { res = TCompactTrie(value); } return true; } template typename TCompactTrie::TConstIterator TCompactTrie::Begin() const { NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper); return TConstIterator(self, EmptyValue, false, Packer); } template typename TCompactTrie::TConstIterator TCompactTrie::begin() const { return Begin(); } template typename TCompactTrie::TConstIterator TCompactTrie::End() const { NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper); return TConstIterator(self, EmptyValue, true, Packer); } template typename TCompactTrie::TConstIterator TCompactTrie::end() const { return End(); } template size_t TCompactTrie::Size() const { size_t res = 0; for (TConstIterator it = Begin(); it != End(); ++it) ++res; return res; } template typename TCompactTrie::TConstIterator TCompactTrie::UpperBound(const TKeyBuf& key) const { NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper); return TConstIterator(self, EmptyValue, key, Packer); } template void TCompactTrie::Print(IOutputStream& os) { typedef typename ::TCompactTrieKeySelector::TKeyBuf TSBuffer; for (TConstIterator it = Begin(); it != End(); ++it) { os << TSBuffer((*it).first.data(), (*it).first.size()) << "\t" << (*it).second << Endl; } } template bool TCompactTrie::FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value, bool* hasNext) const { const char* valuepos = nullptr; size_t tempPrefixLen = 0; bool tempHasNext; bool found = LookupLongestPrefix(key, keylen, tempPrefixLen, valuepos, tempHasNext); if (prefixLen) *prefixLen = tempPrefixLen; if (found && value) Packer.UnpackLeaf(valuepos, *value); if (hasNext) *hasNext = tempHasNext; return found; } template bool TCompactTrie::LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos, bool& hasNext) const { using namespace NCompactTrie; const char* datapos = DataHolder.AsCharPtr(); size_t len = DataHolder.Length(); prefixLen = 0; hasNext = false; bool found = false; if (EmptyValue) { valuepos = EmptyValue; found = true; } if (!key || !len) return found; const char* const dataend = datapos + len; const T* keyend = key + keylen; while (key != keyend) { T label = *(key++); for (i64 i = (i64)ExtraBits(); i >= 0; i -= 8) { const char flags = LeapByte(datapos, dataend, (char)(label >> i)); if (!datapos) { return found; // no such arc } Y_ASSERT(datapos <= dataend); if ((flags & MT_FINAL)) { prefixLen = keylen - (keyend - key) - (i ? 1 : 0); valuepos = datapos; hasNext = flags & MT_NEXT; found = true; if (!i && key == keyend) { // last byte, and got a match return found; } datapos += Packer.SkipLeaf(datapos); // skip intermediate leaf nodes } if (!(flags & MT_NEXT)) { return found; // no further way } } } return found; } template void TCompactTrie::LookupPhrases( const char* datapos, size_t len, const TSymbol* key, size_t keylen, TVector& matches, TSymbol separator) const { using namespace NCompactTrie; matches.clear(); if (!key || !len) return; const T* const keystart = key; const T* const keyend = key + keylen; const char* const dataend = datapos + len; while (datapos && key != keyend) { T label = *(key++); const char* value = nullptr; if (!Advance(datapos, dataend, value, label, Packer)) { return; } if (value && (key == keyend || *key == separator)) { size_t matchlength = (size_t)(key - keystart); D data; Packer.UnpackLeaf(value, data); matches.push_back(TPhraseMatch(matchlength, data)); } } } // TCompactTrieHolder template TCompactTrieHolder::TCompactTrieHolder(IInputStream& is, size_t len) : Storage(new char[len]) { if (is.Load(Storage.Get(), len) != len) { ythrow yexception() << "bad data load"; } TBase::Init(Storage.Get(), len); } //---------------------------------------------------------------------------------------------------------------- // TCompactTrie::TConstIterator template TCompactTrie::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer) : Packer(packer) , Impl(new TOpaqueTrieIterator(trie, emptyValue, atend)) { } template TCompactTrie::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer) : Packer(packer) , Impl(new TOpaqueTrieIterator(trie, emptyValue, true)) { Impl->UpperBound(key); } template bool TCompactTrie::TConstIterator::operator==(const TConstIterator& other) const { if (!Impl) return !other.Impl; if (!other.Impl) return false; return *Impl == *other.Impl; } template bool TCompactTrie::TConstIterator::operator!=(const TConstIterator& other) const { return !operator==(other); } template typename TCompactTrie::TConstIterator& TCompactTrie::TConstIterator::operator++() { Impl->Forward(); return *this; } template typename TCompactTrie::TConstIterator TCompactTrie::TConstIterator::operator++(int /*unused*/) { TConstIterator copy(*this); Impl->Forward(); return copy; } template typename TCompactTrie::TConstIterator& TCompactTrie::TConstIterator::operator--() { Impl->Backward(); return *this; } template typename TCompactTrie::TConstIterator TCompactTrie::TConstIterator::operator--(int /*unused*/) { TConstIterator copy(*this); Impl->Backward(); return copy; } template typename TCompactTrie::TValueType TCompactTrie::TConstIterator::operator*() { return TValueType(GetKey(), GetValue()); } template typename TCompactTrie::TKey TCompactTrie::TConstIterator::GetKey() const { return Impl->GetKey(); } template size_t TCompactTrie::TConstIterator::GetKeySize() const { return Impl->MeasureKey(); } template const char* TCompactTrie::TConstIterator::GetValuePtr() const { return Impl->GetValuePtr(); } template typename TCompactTrie::TData TCompactTrie::TConstIterator::GetValue() const { D data; GetValue(data); return data; } template void TCompactTrie::TConstIterator::GetValue(typename TCompactTrie::TData& data) const { const char* ptr = GetValuePtr(); if (ptr) { Packer.UnpackLeaf(ptr, data); } else { data = typename TCompactTrie::TData(); } }