123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660 |
- #pragma once
- #include "comptrie_impl.h"
- #include "comptrie_packer.h"
- #include "opaque_trie_iterator.h"
- #include "leaf_skipper.h"
- #include "key_selector.h"
- #include <util/generic/buffer.h>
- #include <util/generic/ptr.h>
- #include <util/generic/vector.h>
- #include <util/generic/yexception.h>
- #include <util/memory/blob.h>
- #include <util/stream/input.h>
- #include <utility>
- template <class T, class D, class S>
- class TCompactTrieBuilder;
- namespace NCompactTrie {
- template <class TTrie>
- class TFirstSymbolIterator;
- }
- template <class TTrie>
- class TSearchIterator;
- template <class TTrie>
- class TPrefixIterator;
- // in case of <char> specialization cannot distinguish between "" and "\0" keys
- template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
- class TCompactTrie {
- public:
- typedef T TSymbol;
- typedef D TData;
- typedef S TPacker;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
- typedef std::pair<TKey, TData> TValueType;
- typedef std::pair<size_t, TData> TPhraseMatch;
- typedef TVector<TPhraseMatch> TPhraseMatchVector;
- typedef TCompactTrieBuilder<T, D, S> TBuilder;
- protected:
- TBlob DataHolder;
- const char* EmptyValue = nullptr;
- TPacker Packer;
- NCompactTrie::TPackerLeafSkipper<TPacker> Skipper = &Packer; // This should be true for every constructor.
- public:
- TCompactTrie() = default;
- TCompactTrie(const char* d, size_t len, TPacker packer);
- TCompactTrie(const char* d, size_t len)
- : TCompactTrie{d, len, TPacker{}} {
- }
- TCompactTrie(TBlob data, TPacker packer);
- explicit TCompactTrie(TBlob data)
- : TCompactTrie{std::move(data), TPacker{}} {
- }
- // Skipper should be initialized with &Packer, not with &other.Packer, so you have to redefine these.
- TCompactTrie(const TCompactTrie& other);
- TCompactTrie(TCompactTrie&& other) noexcept;
- TCompactTrie& operator=(const TCompactTrie& other);
- TCompactTrie& operator=(TCompactTrie&& other) noexcept;
- explicit operator bool() const {
- return !IsEmpty();
- }
- void Init(const char* d, size_t len, TPacker packer = TPacker());
- void Init(TBlob data, TPacker packer = TPacker());
- bool IsInitialized() const;
- bool IsEmpty() const;
- bool Find(const TSymbol* key, size_t keylen, TData* value = nullptr) const;
- bool Find(const TKeyBuf& key, TData* value = nullptr) const {
- return Find(key.data(), key.size(), value);
- }
- TData Get(const TSymbol* key, size_t keylen) const {
- TData value;
- if (!Find(key, keylen, &value))
- ythrow yexception() << "key " << TKey(key, keylen).Quote() << " not found in trie";
- return value;
- }
- TData Get(const TKeyBuf& key) const {
- return Get(key.data(), key.size());
- }
- TData GetDefault(const TKeyBuf& key, const TData& def) const {
- TData value;
- if (!Find(key.data(), key.size(), &value))
- return def;
- else
- return value;
- }
- const TBlob& Data() const {
- return DataHolder;
- }
- const NCompactTrie::ILeafSkipper& GetSkipper() const {
- return Skipper;
- }
- TPacker GetPacker() const {
- return Packer;
- }
- bool HasCorrectSkipper() const {
- return Skipper.GetPacker() == &Packer;
- }
- void FindPhrases(const TSymbol* key, size_t keylen, TPhraseMatchVector& matches, TSymbol separator = TSymbol(' ')) const;
- void FindPhrases(const TKeyBuf& key, TPhraseMatchVector& matches, TSymbol separator = TSymbol(' ')) const {
- return FindPhrases(key.data(), key.size(), matches, separator);
- }
- bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const;
- bool FindLongestPrefix(const TKeyBuf& key, size_t* prefixLen, TData* value = nullptr, bool* hasNext = nullptr) const {
- return FindLongestPrefix(key.data(), key.size(), prefixLen, value, hasNext);
- }
- // Return trie, containing all tails for the given key
- inline TCompactTrie<T, D, S> FindTails(const TSymbol* key, size_t keylen) const;
- TCompactTrie<T, D, S> FindTails(const TKeyBuf& key) const {
- return FindTails(key.data(), key.size());
- }
- bool FindTails(const TSymbol* key, size_t keylen, TCompactTrie<T, D, S>& res) const;
- bool FindTails(const TKeyBuf& key, TCompactTrie<T, D, S>& res) const {
- return FindTails(key.data(), key.size(), res);
- }
- // same as FindTails(&key, 1), a bit faster
- // return false, if no arc with @label exists
- inline bool FindTails(TSymbol label, TCompactTrie<T, D, S>& res) const;
- class TConstIterator {
- private:
- typedef NCompactTrie::TOpaqueTrieIterator TOpaqueTrieIterator;
- typedef NCompactTrie::TOpaqueTrie TOpaqueTrie;
- friend class TCompactTrie;
- TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer); // only usable from Begin() and End() methods
- TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer); // only usable from UpperBound() method
- public:
- TConstIterator() = default;
- bool IsEmpty() const {
- return !Impl;
- } // Almost no other method can be called.
- bool operator==(const TConstIterator& other) const;
- bool operator!=(const TConstIterator& other) const;
- TConstIterator& operator++();
- TConstIterator operator++(int /*unused*/);
- TConstIterator& operator--();
- TConstIterator operator--(int /*unused*/);
- TValueType operator*();
- TKey GetKey() const;
- size_t GetKeySize() const;
- TData GetValue() const;
- void GetValue(TData& data) const;
- const char* GetValuePtr() const;
- private:
- TPacker Packer;
- TCopyPtr<TOpaqueTrieIterator> Impl;
- };
- TConstIterator Begin() const;
- TConstIterator begin() const;
- TConstIterator End() const;
- TConstIterator end() const;
- // Returns an iterator pointing to the smallest key in the trie >= the argument.
- // TODO: misleading name. Should be called LowerBound for consistency with stl.
- // No. It is the STL that has a misleading name.
- // LowerBound of X cannot be greater than X.
- TConstIterator UpperBound(const TKeyBuf& key) const;
- void Print(IOutputStream& os);
- size_t Size() const;
- friend class NCompactTrie::TFirstSymbolIterator<TCompactTrie>;
- friend class TSearchIterator<TCompactTrie>;
- friend class TPrefixIterator<TCompactTrie>;
- protected:
- explicit TCompactTrie(const char* emptyValue);
- TCompactTrie(TBlob data, const char* emptyValue, TPacker packer = TPacker());
- bool LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos, bool& hasNext) const;
- bool LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos) const {
- bool hasNext;
- return LookupLongestPrefix(key, keylen, prefixLen, valuepos, hasNext);
- }
- void LookupPhrases(const char* datapos, size_t len, const TSymbol* key, size_t keylen, TVector<TPhraseMatch>& matches, TSymbol separator) const;
- };
- template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
- class TCompactTrieHolder: public TCompactTrie<T, D, S>, NNonCopyable::TNonCopyable {
- private:
- typedef TCompactTrie<T, D, S> TBase;
- TArrayHolder<char> Storage;
- public:
- TCompactTrieHolder(IInputStream& is, size_t len);
- };
- //------------------------//
- // Implementation section //
- //------------------------//
- // TCompactTrie
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TCompactTrie(TBlob data, TPacker packer)
- {
- Init(std::move(data), packer);
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TCompactTrie(const char* d, size_t len, TPacker packer)
- {
- Init(d, len, packer);
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TCompactTrie(const char* emptyValue)
- : EmptyValue(emptyValue)
- {
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TCompactTrie(TBlob data, const char* emptyValue, TPacker packer)
- : DataHolder(std::move(data))
- , EmptyValue(emptyValue)
- , Packer(packer)
- {
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TCompactTrie(const TCompactTrie& other)
- : DataHolder(other.DataHolder)
- , EmptyValue(other.EmptyValue)
- , Packer(other.Packer)
- {
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TCompactTrie(TCompactTrie&& other) noexcept
- : DataHolder(std::move(other.DataHolder))
- , EmptyValue(std::move(other.EmptyValue))
- , Packer(std::move(other.Packer))
- {
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>& TCompactTrie<T, D, S>::operator=(const TCompactTrie& other) {
- if (this != &other) {
- DataHolder = other.DataHolder;
- EmptyValue = other.EmptyValue;
- Packer = other.Packer;
- }
- return *this;
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>& TCompactTrie<T, D, S>::operator=(TCompactTrie&& other) noexcept {
- if (this != &other) {
- DataHolder = std::move(other.DataHolder);
- EmptyValue = std::move(other.EmptyValue);
- Packer = std::move(other.Packer);
- }
- return *this;
- }
- template <class T, class D, class S>
- void TCompactTrie<T, D, S>::Init(const char* d, size_t len, TPacker packer) {
- Init(TBlob::NoCopy(d, len), packer);
- }
- template <class T, class D, class S>
- void TCompactTrie<T, D, S>::Init(TBlob data, TPacker packer) {
- using namespace NCompactTrie;
- DataHolder = std::move(data);
- Packer = packer;
- const char* datapos = DataHolder.AsCharPtr();
- size_t len = DataHolder.Length();
- if (!len)
- return;
- const char* const dataend = datapos + len;
- const char* emptypos = datapos;
- char flags = LeapByte(emptypos, dataend, 0);
- if (emptypos && (flags & MT_FINAL)) {
- Y_ASSERT(emptypos <= dataend);
- EmptyValue = emptypos;
- }
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::IsInitialized() const {
- return DataHolder.Data() != nullptr;
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::IsEmpty() const {
- return DataHolder.Size() == 0 && EmptyValue == nullptr;
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::Find(const TSymbol* key, size_t keylen, TData* value) const {
- size_t prefixLen = 0;
- const char* valuepos = nullptr;
- bool hasNext;
- if (!LookupLongestPrefix(key, keylen, prefixLen, valuepos, hasNext) || prefixLen != keylen)
- return false;
- if (value)
- Packer.UnpackLeaf(valuepos, *value);
- return true;
- }
- template <class T, class D, class S>
- void TCompactTrie<T, D, S>::FindPhrases(const TSymbol* key, size_t keylen, TPhraseMatchVector& matches, TSymbol separator) const {
- LookupPhrases(DataHolder.AsCharPtr(), DataHolder.Length(), key, keylen, matches, separator);
- }
- template <class T, class D, class S>
- inline TCompactTrie<T, D, S> TCompactTrie<T, D, S>::FindTails(const TSymbol* key, size_t keylen) const {
- TCompactTrie<T, D, S> ret;
- FindTails(key, keylen, ret);
- return ret;
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::FindTails(const TSymbol* key, size_t keylen, TCompactTrie<T, D, S>& res) const {
- using namespace NCompactTrie;
- size_t len = DataHolder.Length();
- if (!key || !len)
- return false;
- if (!keylen) {
- res = *this;
- return true;
- }
- const char* datastart = DataHolder.AsCharPtr();
- const char* datapos = datastart;
- const char* const dataend = datapos + len;
- const TSymbol* keyend = key + keylen;
- const char* value = nullptr;
- while (key != keyend) {
- T label = *(key++);
- if (!NCompactTrie::Advance(datapos, dataend, value, label, Packer))
- return false;
- if (key == keyend) {
- if (datapos) {
- Y_ASSERT(datapos >= datastart);
- res = TCompactTrie<T, D, S>(TBlob::NoCopy(datapos, dataend - datapos), value);
- } else {
- res = TCompactTrie<T, D, S>(value);
- }
- return true;
- } else if (!datapos) {
- return false; // No further way
- }
- }
- return false;
- }
- template <class T, class D, class S>
- inline bool TCompactTrie<T, D, S>::FindTails(TSymbol label, TCompactTrie<T, D, S>& res) const {
- using namespace NCompactTrie;
- const size_t len = DataHolder.Length();
- if (!len)
- return false;
- const char* datastart = DataHolder.AsCharPtr();
- const char* dataend = datastart + len;
- const char* datapos = datastart;
- const char* value = nullptr;
- if (!NCompactTrie::Advance(datapos, dataend, value, label, Packer))
- return false;
- if (datapos) {
- Y_ASSERT(datapos >= datastart);
- res = TCompactTrie<T, D, S>(TBlob::NoCopy(datapos, dataend - datapos), value);
- } else {
- res = TCompactTrie<T, D, S>(value);
- }
- return true;
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::Begin() const {
- NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper);
- return TConstIterator(self, EmptyValue, false, Packer);
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::begin() const {
- return Begin();
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::End() const {
- NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper);
- return TConstIterator(self, EmptyValue, true, Packer);
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::end() const {
- return End();
- }
- template <class T, class D, class S>
- size_t TCompactTrie<T, D, S>::Size() const {
- size_t res = 0;
- for (TConstIterator it = Begin(); it != End(); ++it)
- ++res;
- return res;
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::UpperBound(const TKeyBuf& key) const {
- NCompactTrie::TOpaqueTrie self(DataHolder.AsCharPtr(), DataHolder.Length(), Skipper);
- return TConstIterator(self, EmptyValue, key, Packer);
- }
- template <class T, class D, class S>
- void TCompactTrie<T, D, S>::Print(IOutputStream& os) {
- typedef typename ::TCompactTrieKeySelector<T>::TKeyBuf TSBuffer;
- for (TConstIterator it = Begin(); it != End(); ++it) {
- os << TSBuffer((*it).first.data(), (*it).first.size()) << "\t" << (*it).second << Endl;
- }
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixLen, TData* value, bool* hasNext) const {
- const char* valuepos = nullptr;
- size_t tempPrefixLen = 0;
- bool tempHasNext;
- bool found = LookupLongestPrefix(key, keylen, tempPrefixLen, valuepos, tempHasNext);
- if (prefixLen)
- *prefixLen = tempPrefixLen;
- if (found && value)
- Packer.UnpackLeaf(valuepos, *value);
- if (hasNext)
- *hasNext = tempHasNext;
- return found;
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::LookupLongestPrefix(const TSymbol* key, size_t keylen, size_t& prefixLen, const char*& valuepos, bool& hasNext) const {
- using namespace NCompactTrie;
- const char* datapos = DataHolder.AsCharPtr();
- size_t len = DataHolder.Length();
- prefixLen = 0;
- hasNext = false;
- bool found = false;
- if (EmptyValue) {
- valuepos = EmptyValue;
- found = true;
- }
- if (!key || !len)
- return found;
- const char* const dataend = datapos + len;
- const T* keyend = key + keylen;
- while (key != keyend) {
- T label = *(key++);
- for (i64 i = (i64)ExtraBits<TSymbol>(); i >= 0; i -= 8) {
- const char flags = LeapByte(datapos, dataend, (char)(label >> i));
- if (!datapos) {
- return found; // no such arc
- }
- Y_ASSERT(datapos <= dataend);
- if ((flags & MT_FINAL)) {
- prefixLen = keylen - (keyend - key) - (i ? 1 : 0);
- valuepos = datapos;
- hasNext = flags & MT_NEXT;
- found = true;
- if (!i && key == keyend) { // last byte, and got a match
- return found;
- }
- datapos += Packer.SkipLeaf(datapos); // skip intermediate leaf nodes
- }
- if (!(flags & MT_NEXT)) {
- return found; // no further way
- }
- }
- }
- return found;
- }
- template <class T, class D, class S>
- void TCompactTrie<T, D, S>::LookupPhrases(
- const char* datapos, size_t len, const TSymbol* key, size_t keylen,
- TVector<TPhraseMatch>& matches, TSymbol separator) const {
- using namespace NCompactTrie;
- matches.clear();
- if (!key || !len)
- return;
- const T* const keystart = key;
- const T* const keyend = key + keylen;
- const char* const dataend = datapos + len;
- while (datapos && key != keyend) {
- T label = *(key++);
- const char* value = nullptr;
- if (!Advance(datapos, dataend, value, label, Packer)) {
- return;
- }
- if (value && (key == keyend || *key == separator)) {
- size_t matchlength = (size_t)(key - keystart);
- D data;
- Packer.UnpackLeaf(value, data);
- matches.push_back(TPhraseMatch(matchlength, data));
- }
- }
- }
- // TCompactTrieHolder
- template <class T, class D, class S>
- TCompactTrieHolder<T, D, S>::TCompactTrieHolder(IInputStream& is, size_t len)
- : Storage(new char[len])
- {
- if (is.Load(Storage.Get(), len) != len) {
- ythrow yexception() << "bad data load";
- }
- TBase::Init(Storage.Get(), len);
- }
- //----------------------------------------------------------------------------------------------------------------
- // TCompactTrie::TConstIterator
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, bool atend, TPacker packer)
- : Packer(packer)
- , Impl(new TOpaqueTrieIterator(trie, emptyValue, atend))
- {
- }
- template <class T, class D, class S>
- TCompactTrie<T, D, S>::TConstIterator::TConstIterator(const TOpaqueTrie& trie, const char* emptyValue, const TKeyBuf& key, TPacker packer)
- : Packer(packer)
- , Impl(new TOpaqueTrieIterator(trie, emptyValue, true))
- {
- Impl->UpperBound<TSymbol>(key);
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::TConstIterator::operator==(const TConstIterator& other) const {
- if (!Impl)
- return !other.Impl;
- if (!other.Impl)
- return false;
- return *Impl == *other.Impl;
- }
- template <class T, class D, class S>
- bool TCompactTrie<T, D, S>::TConstIterator::operator!=(const TConstIterator& other) const {
- return !operator==(other);
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator& TCompactTrie<T, D, S>::TConstIterator::operator++() {
- Impl->Forward();
- return *this;
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::TConstIterator::operator++(int /*unused*/) {
- TConstIterator copy(*this);
- Impl->Forward();
- return copy;
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator& TCompactTrie<T, D, S>::TConstIterator::operator--() {
- Impl->Backward();
- return *this;
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TConstIterator TCompactTrie<T, D, S>::TConstIterator::operator--(int /*unused*/) {
- TConstIterator copy(*this);
- Impl->Backward();
- return copy;
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TValueType TCompactTrie<T, D, S>::TConstIterator::operator*() {
- return TValueType(GetKey(), GetValue());
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TKey TCompactTrie<T, D, S>::TConstIterator::GetKey() const {
- return Impl->GetKey<TSymbol>();
- }
- template <class T, class D, class S>
- size_t TCompactTrie<T, D, S>::TConstIterator::GetKeySize() const {
- return Impl->MeasureKey<TSymbol>();
- }
- template <class T, class D, class S>
- const char* TCompactTrie<T, D, S>::TConstIterator::GetValuePtr() const {
- return Impl->GetValuePtr();
- }
- template <class T, class D, class S>
- typename TCompactTrie<T, D, S>::TData TCompactTrie<T, D, S>::TConstIterator::GetValue() const {
- D data;
- GetValue(data);
- return data;
- }
- template <class T, class D, class S>
- void TCompactTrie<T, D, S>::TConstIterator::GetValue(typename TCompactTrie<T, D, S>::TData& data) const {
- const char* ptr = GetValuePtr();
- if (ptr) {
- Packer.UnpackLeaf(ptr, data);
- } else {
- data = typename TCompactTrie<T, D, S>::TData();
- }
- }
|