#pragma once #include "comptrie_impl.h" #include "comptrie_trie.h" #include "make_fast_layout.h" #include "array_with_size.h" #include #include #include #include #include #include #include #include #include #include #include #define CONSTEXPR_MAX2(a, b) (a) > (b) ? (a) : (b) #define CONSTEXPR_MAX3(a, b, c) CONSTEXPR_MAX2(CONSTEXPR_MAX2(a, b), c) // TCompactTrieBuilder::TCompactTrieBuilderImpl template class TCompactTrieBuilder::TCompactTrieBuilderImpl { protected: TMemoryPool Pool; size_t PayloadSize; THolder NodeAllocator; class TNode; class TArc; TNode* Root; TCompactTrieBuilderFlags Flags; size_t EntryCount; size_t NodeCount; TPacker Packer; enum EPayload { DATA_ABSENT, DATA_INSIDE, DATA_MALLOCED, DATA_IN_MEMPOOL, }; protected: void ConvertSymbolArrayToChar(const TSymbol* key, size_t keylen, TTempBuf& buf, size_t ckeylen) const; void NodeLinkTo(TNode* thiz, const TBlob& label, TNode* node); TNode* NodeForwardAdd(TNode* thiz, const char* label, size_t len, size_t& passed, size_t* nodeCount); bool FindEntryImpl(const char* key, size_t keylen, TData* value) const; bool FindLongestPrefixImpl(const char* keyptr, size_t keylen, size_t* prefixLen, TData* value) const; size_t NodeMeasureSubtree(TNode* thiz) const; ui64 NodeSaveSubtree(TNode* thiz, IOutputStream& os) const; ui64 NodeSaveSubtreeAndDestroy(TNode* thiz, IOutputStream& osy); void NodeBufferSubtree(TNode* thiz); size_t NodeMeasureLeafValue(TNode* thiz) const; ui64 NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const; virtual ui64 ArcMeasure(const TArc* thiz, size_t leftsize, size_t rightsize) const; virtual ui64 ArcSaveSelf(const TArc* thiz, IOutputStream& os) const; ui64 ArcSave(const TArc* thiz, IOutputStream& os) const; ui64 ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os); public: TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc); virtual ~TCompactTrieBuilderImpl(); void DestroyNode(TNode* node); void NodeReleasePayload(TNode* thiz); char* AddEntryForData(const TSymbol* key, size_t keylen, size_t dataLen, bool& isNewAddition); TNode* AddEntryForSomething(const TSymbol* key, size_t keylen, bool& isNewAddition); bool AddEntry(const TSymbol* key, size_t keylen, const TData& value); bool AddEntryPtr(const TSymbol* key, size_t keylen, const char* value); bool AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& fileName); bool AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder&& buffer); bool FindEntry(const TSymbol* key, size_t keylen, TData* value) const; bool FindLongestPrefix(const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const; size_t Save(IOutputStream& os) const; size_t SaveAndDestroy(IOutputStream& os); void Clear(); // lies if some key was added at least twice size_t GetEntryCount() const; size_t GetNodeCount() const; size_t MeasureByteSize() const { return NodeMeasureSubtree(Root); } }; template class TCompactTrieBuilder::TCompactTrieBuilderImpl::TArc { public: TBlob Label; TNode* Node; mutable size_t LeftOffset; mutable size_t RightOffset; TArc(const TBlob& lbl, TNode* nd); }; template class TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode { public: typedef typename TCompactTrieBuilder::TCompactTrieBuilderImpl TBuilderImpl; typedef typename TBuilderImpl::TArc TArc; struct ISubtree { virtual ~ISubtree() = default; virtual bool IsLast() const = 0; virtual ui64 Measure(const TBuilderImpl* builder) const = 0; virtual ui64 Save(const TBuilderImpl* builder, IOutputStream& os) const = 0; virtual ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) = 0; virtual void Destroy(TBuilderImpl*) { } // Tries to find key in subtree. // Returns next node to find the key in (to avoid recursive calls) // If it has end result, writes it to @value and @result arguments and returns nullptr virtual const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const = 0; virtual const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const = 0; }; class TArcSet: public ISubtree, public TCompactVector { public: typedef typename TCompactVector::iterator iterator; typedef typename TCompactVector::const_iterator const_iterator; TArcSet() { Y_ASSERT(reinterpret_cast(this) == static_cast(this)); // This assumption is used in TNode::Subtree() } iterator Find(char ch); const_iterator Find(char ch) const; void Add(const TBlob& s, TNode* node); bool IsLast() const override { return this->Empty(); } const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override; const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override { return Find(key, value, result, packer); } ui64 Measure(const TBuilderImpl* builder) const override { return MeasureRange(builder, 0, this->size()); } ui64 MeasureRange(const TBuilderImpl* builder, size_t from, size_t to) const { if (from >= to) return 0; size_t median = (from + to) / 2; size_t leftsize = (size_t)MeasureRange(builder, from, median); size_t rightsize = (size_t)MeasureRange(builder, median + 1, to); return builder->ArcMeasure(&(*this)[median], leftsize, rightsize); } ui64 Save(const TBuilderImpl* builder, IOutputStream& os) const override { return SaveRange(builder, 0, this->size(), os); } ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override { ui64 result = SaveRangeAndDestroy(builder, 0, this->size(), os); Destroy(builder); return result; } ui64 SaveRange(const TBuilderImpl* builder, size_t from, size_t to, IOutputStream& os) const { if (from >= to) return 0; size_t median = (from + to) / 2; ui64 written = builder->ArcSave(&(*this)[median], os); written += SaveRange(builder, from, median, os); written += SaveRange(builder, median + 1, to, os); return written; } ui64 SaveRangeAndDestroy(TBuilderImpl* builder, size_t from, size_t to, IOutputStream& os) { if (from >= to) return 0; size_t median = (from + to) / 2; ui64 written = builder->ArcSaveAndDestroy(&(*this)[median], os); written += SaveRangeAndDestroy(builder, from, median, os); written += SaveRangeAndDestroy(builder, median + 1, to, os); return written; } void Destroy(TBuilderImpl* builder) override { // Delete all nodes down the stream. for (iterator it = this->begin(); it != this->end(); ++it) { builder->DestroyNode(it->Node); } this->clear(); } ~TArcSet() override { Y_ASSERT(this->empty()); } }; struct TBufferedSubtree: public ISubtree { TArrayWithSizeHolder Buffer; TBufferedSubtree() { Y_ASSERT(reinterpret_cast(this) == static_cast(this)); // This assumption is used in TNode::Subtree() } bool IsLast() const override { return Buffer.Empty(); } const TNode* Find(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override { if (Buffer.Empty()) { result = false; return nullptr; } TCompactTrie trie(Buffer.Get(), Buffer.Size(), packer); result = trie.Find(key.data(), key.size(), value); return nullptr; } const TNode* FindLongestPrefix(TStringBuf& key, TData* value, bool& result, const TPacker& packer) const override { if (Buffer.Empty()) { result = false; return nullptr; } TCompactTrie trie(Buffer.Get(), Buffer.Size(), packer); size_t prefixLen = 0; result = trie.FindLongestPrefix(key.data(), key.size(), &prefixLen, value); key = key.SubStr(prefixLen); return nullptr; } ui64 Measure(const TBuilderImpl*) const override { return Buffer.Size(); } ui64 Save(const TBuilderImpl*, IOutputStream& os) const override { os.Write(Buffer.Get(), Buffer.Size()); return Buffer.Size(); } ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override { ui64 result = Save(builder, os); TArrayWithSizeHolder().Swap(Buffer); return result; } }; struct TSubtreeInFile: public ISubtree { struct TData { TString FileName; ui64 Size; }; THolder Data; TSubtreeInFile(const TString& fileName) { // stupid API TFile file(fileName, RdOnly); i64 size = file.GetLength(); if (size < 0) ythrow yexception() << "unable to get file " << fileName.Quote() << " size for unknown reason"; Data.Reset(new TData); Data->FileName = fileName; Data->Size = size; Y_ASSERT(reinterpret_cast(this) == static_cast(this)); // This assumption is used in TNode::Subtree() } bool IsLast() const override { return Data->Size == 0; } const TNode* Find(TStringBuf& key, typename TCompactTrieBuilder::TData* value, bool& result, const TPacker& packer) const override { if (!Data) { result = false; return nullptr; } TCompactTrie trie(TBlob::FromFile(Data->FileName), packer); result = trie.Find(key.data(), key.size(), value); return nullptr; } const TNode* FindLongestPrefix(TStringBuf& key, typename TCompactTrieBuilder::TData* value, bool& result, const TPacker& packer) const override { if (!Data) { result = false; return nullptr; } TCompactTrie trie(TBlob::FromFile(Data->FileName), packer); size_t prefixLen = 0; result = trie.FindLongestPrefix(key.data(), key.size(), &prefixLen, value); key = key.SubStr(prefixLen); return nullptr; } ui64 Measure(const TBuilderImpl*) const override { return Data->Size; } ui64 Save(const TBuilderImpl*, IOutputStream& os) const override { TUnbufferedFileInput is(Data->FileName); ui64 written = TransferData(&is, &os); if (written != Data->Size) ythrow yexception() << "file " << Data->FileName.Quote() << " size changed"; return written; } ui64 SaveAndDestroy(TBuilderImpl* builder, IOutputStream& os) override { return Save(builder, os); } }; union { char ArcsData[CONSTEXPR_MAX3(sizeof(TArcSet), sizeof(TBufferedSubtree), sizeof(TSubtreeInFile))]; union { void* Data1; long long int Data2; } Aligner; }; inline ISubtree* Subtree() { return reinterpret_cast(ArcsData); } inline const ISubtree* Subtree() const { return reinterpret_cast(ArcsData); } EPayload PayloadType; inline const char* PayloadPtr() const { return ((const char*) this) + sizeof(TNode); } inline char* PayloadPtr() { return ((char*) this) + sizeof(TNode); } // *Payload() inline const char*& PayloadAsPtr() const { const char** payload = (const char**) PayloadPtr(); return *payload; } inline char*& PayloadAsPtr() { char** payload = (char**) PayloadPtr(); return *payload; } inline const char* GetPayload() const { switch (PayloadType) { case DATA_INSIDE: return PayloadPtr(); case DATA_MALLOCED: case DATA_IN_MEMPOOL: return PayloadAsPtr(); case DATA_ABSENT: default: abort(); } } inline char* GetPayload() { const TNode* thiz = this; return const_cast(thiz->GetPayload()); // const_cast is to avoid copy-paste style } bool IsFinal() const { return PayloadType != DATA_ABSENT; } bool IsLast() const { return Subtree()->IsLast(); } inline void* operator new(size_t, TFixedSizeAllocator& pool) { return pool.Allocate(); } inline void operator delete(void* ptr, TFixedSizeAllocator& pool) noexcept { pool.Release(ptr); } TNode() : PayloadType(DATA_ABSENT) { new (Subtree()) TArcSet; } ~TNode() { Subtree()->~ISubtree(); Y_ASSERT(PayloadType == DATA_ABSENT); } }; // TCompactTrieBuilder template TCompactTrieBuilder::TCompactTrieBuilder(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc) : Impl(new TCompactTrieBuilderImpl(flags, packer, alloc)) { } template bool TCompactTrieBuilder::Add(const TSymbol* key, size_t keylen, const TData& value) { return Impl->AddEntry(key, keylen, value); } template bool TCompactTrieBuilder::AddPtr(const TSymbol* key, size_t keylen, const char* value) { return Impl->AddEntryPtr(key, keylen, value); } template bool TCompactTrieBuilder::AddSubtreeInFile(const TSymbol* key, size_t keylen, const TString& fileName) { return Impl->AddSubtreeInFile(key, keylen, fileName); } template bool TCompactTrieBuilder::AddSubtreeInBuffer(const TSymbol* key, size_t keylen, TArrayWithSizeHolder&& buffer) { return Impl->AddSubtreeInBuffer(key, keylen, std::move(buffer)); } template bool TCompactTrieBuilder::Find(const TSymbol* key, size_t keylen, TData* value) const { return Impl->FindEntry(key, keylen, value); } template bool TCompactTrieBuilder::FindLongestPrefix( const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const { return Impl->FindLongestPrefix(key, keylen, prefixlen, value); } template size_t TCompactTrieBuilder::Save(IOutputStream& os) const { return Impl->Save(os); } template size_t TCompactTrieBuilder::SaveAndDestroy(IOutputStream& os) { return Impl->SaveAndDestroy(os); } template void TCompactTrieBuilder::Clear() { Impl->Clear(); } template size_t TCompactTrieBuilder::GetEntryCount() const { return Impl->GetEntryCount(); } template size_t TCompactTrieBuilder::GetNodeCount() const { return Impl->GetNodeCount(); } // TCompactTrieBuilder::TCompactTrieBuilderImpl template TCompactTrieBuilder::TCompactTrieBuilderImpl::TCompactTrieBuilderImpl(TCompactTrieBuilderFlags flags, TPacker packer, IAllocator* alloc) : Pool(1000000, TMemoryPool::TLinearGrow::Instance(), alloc) , PayloadSize(sizeof(void*)) // XXX: find better value , NodeAllocator(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, alloc)) , Flags(flags) , EntryCount(0) , NodeCount(1) , Packer(packer) { Root = new (*NodeAllocator) TNode; } template TCompactTrieBuilder::TCompactTrieBuilderImpl::~TCompactTrieBuilderImpl() { DestroyNode(Root); } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::ConvertSymbolArrayToChar( const TSymbol* key, size_t keylen, TTempBuf& buf, size_t buflen) const { char* ckeyptr = buf.Data(); for (size_t i = 0; i < keylen; ++i) { TSymbol label = key[i]; for (int j = (int)NCompactTrie::ExtraBits(); j >= 0; j -= 8) { Y_ASSERT(ckeyptr < buf.Data() + buflen); *(ckeyptr++) = (char)(label >> j); } } buf.Proceed(buflen); Y_ASSERT(ckeyptr == buf.Data() + buf.Filled()); } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::DestroyNode(TNode* thiz) { thiz->Subtree()->Destroy(this); NodeReleasePayload(thiz); thiz->~TNode(); NodeAllocator->Release(thiz); } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeReleasePayload(TNode* thiz) { switch (thiz->PayloadType) { case DATA_ABSENT: case DATA_INSIDE: case DATA_IN_MEMPOOL: break; case DATA_MALLOCED: delete[] thiz->PayloadAsPtr(); break; default: abort(); } thiz->PayloadType = DATA_ABSENT; } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::AddEntry( const TSymbol* key, size_t keylen, const TData& value) { size_t datalen = Packer.MeasureLeaf(value); bool isNewAddition = false; char* place = AddEntryForData(key, keylen, datalen, isNewAddition); Packer.PackLeaf(place, value, datalen); return isNewAddition; } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::AddEntryPtr( const TSymbol* key, size_t keylen, const char* value) { size_t datalen = Packer.SkipLeaf(value); bool isNewAddition = false; char* place = AddEntryForData(key, keylen, datalen, isNewAddition); memcpy(place, value, datalen); return isNewAddition; } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::AddSubtreeInFile( const TSymbol* key, size_t keylen, const TString& fileName) { typedef typename TNode::ISubtree ISubtree; typedef typename TNode::TSubtreeInFile TSubtreeInFile; bool isNewAddition = false; TNode* node = AddEntryForSomething(key, keylen, isNewAddition); node->Subtree()->Destroy(this); node->Subtree()->~ISubtree(); new (node->Subtree()) TSubtreeInFile(fileName); return isNewAddition; } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::AddSubtreeInBuffer( const TSymbol* key, size_t keylen, TArrayWithSizeHolder&& buffer) { typedef typename TNode::TBufferedSubtree TBufferedSubtree; bool isNewAddition = false; TNode* node = AddEntryForSomething(key, keylen, isNewAddition); node->Subtree()->Destroy(this); node->Subtree()->~ISubtree(); auto subtree = new (node->Subtree()) TBufferedSubtree(); subtree->Buffer.Swap(buffer); return isNewAddition; } template typename TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode* TCompactTrieBuilder::TCompactTrieBuilderImpl::AddEntryForSomething( const TSymbol* key, size_t keylen, bool& isNewAddition) { using namespace NCompactTrie; EntryCount++; if (Flags & CTBF_VERBOSE) ShowProgress(EntryCount); TNode* current = Root; size_t passed; // Special case of empty key: replace it by 1-byte "\0" key. size_t ckeylen = keylen ? keylen * sizeof(TSymbol) : 1; TTempBuf ckeybuf(ckeylen); if (keylen == 0) { ckeybuf.Append("\0", 1); } else { ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen); } char* ckey = ckeybuf.Data(); TNode* next; while ((ckeylen > 0) && (next = NodeForwardAdd(current, ckey, ckeylen, passed, &NodeCount)) != nullptr) { current = next; ckeylen -= passed; ckey += passed; } if (ckeylen != 0) { //new leaf NodeCount++; TNode* leaf = new (*NodeAllocator) TNode(); NodeLinkTo(current, TBlob::Copy(ckey, ckeylen), leaf); current = leaf; } isNewAddition = (current->PayloadType == DATA_ABSENT); if ((Flags & CTBF_UNIQUE) && !isNewAddition) ythrow yexception() << "Duplicate key"; return current; } template char* TCompactTrieBuilder::TCompactTrieBuilderImpl::AddEntryForData(const TSymbol* key, size_t keylen, size_t datalen, bool& isNewAddition) { TNode* current = AddEntryForSomething(key, keylen, isNewAddition); NodeReleasePayload(current); if (datalen <= PayloadSize) { current->PayloadType = DATA_INSIDE; } else if (Flags & CTBF_PREFIX_GROUPED) { current->PayloadType = DATA_MALLOCED; current->PayloadAsPtr() = new char[datalen]; } else { current->PayloadType = DATA_IN_MEMPOOL; current->PayloadAsPtr() = (char*) Pool.Allocate(datalen); // XXX: allocate unaligned } return current->GetPayload(); } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::FindEntry(const TSymbol* key, size_t keylen, TData* value) const { using namespace NCompactTrie; if (!keylen) { const char zero = '\0'; return FindEntryImpl(&zero, 1, value); } else { size_t ckeylen = keylen * sizeof(TSymbol); TTempBuf ckeybuf(ckeylen); ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen); return FindEntryImpl(ckeybuf.Data(), ckeylen, value); } } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::FindEntryImpl(const char* keyptr, size_t keylen, TData* value) const { const TNode* node = Root; bool result = false; TStringBuf key(keyptr, keylen); while (key && (node = node->Subtree()->Find(key, value, result, Packer))) { } return result; } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::FindLongestPrefix( const TSymbol* key, size_t keylen, size_t* prefixlen, TData* value) const { using namespace NCompactTrie; if (!keylen) { const char zero = '\0'; const bool ret = FindLongestPrefixImpl(&zero, 1, prefixlen, value); if (ret && prefixlen) *prefixlen = 0; // empty key found return ret; } else { size_t ckeylen = keylen * sizeof(TSymbol); TTempBuf ckeybuf(ckeylen); ConvertSymbolArrayToChar(key, keylen, ckeybuf, ckeylen); bool ret = FindLongestPrefixImpl(ckeybuf.Data(), ckeylen, prefixlen, value); if (ret && prefixlen && *prefixlen == 1 && ckeybuf.Data()[0] == '\0') *prefixlen = 0; // if we have found empty key, set prefixlen to zero else if (!ret) // try to find value with empty key, because empty key is prefix of a every key ret = FindLongestPrefix(nullptr, 0, prefixlen, value); if (ret && prefixlen) *prefixlen /= sizeof(TSymbol); return ret; } } template bool TCompactTrieBuilder::TCompactTrieBuilderImpl::FindLongestPrefixImpl(const char* keyptr, size_t keylen, size_t* prefixLen, TData* value) const { const TNode* node = Root; const TNode* lastFinalNode = nullptr; bool endResult = false; TStringBuf key(keyptr, keylen); TStringBuf keyTail = key; TStringBuf lastFinalKeyTail; while (keyTail && (node = node->Subtree()->FindLongestPrefix(keyTail, value, endResult, Packer))) { if (endResult) // no more ways to find prefix and prefix has been found break; if (node->IsFinal()) { lastFinalNode = node; lastFinalKeyTail = keyTail; } } if (!endResult && lastFinalNode) { if (value) Packer.UnpackLeaf(lastFinalNode->GetPayload(), *value); keyTail = lastFinalKeyTail; endResult = true; } if (endResult && prefixLen) *prefixLen = keyTail ? key.size() - keyTail.size() : key.size(); return endResult; } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::Clear() { DestroyNode(Root); Pool.Clear(); NodeAllocator.Reset(new TFixedSizeAllocator(sizeof(TNode) + PayloadSize, TDefaultAllocator::Instance())); Root = new (*NodeAllocator) TNode; EntryCount = 0; NodeCount = 1; } template size_t TCompactTrieBuilder::TCompactTrieBuilderImpl::Save(IOutputStream& os) const { const size_t len = NodeMeasureSubtree(Root); if (len != NodeSaveSubtree(Root, os)) ythrow yexception() << "something wrong"; return len; } template size_t TCompactTrieBuilder::TCompactTrieBuilderImpl::SaveAndDestroy(IOutputStream& os) { const size_t len = NodeMeasureSubtree(Root); if (len != NodeSaveSubtreeAndDestroy(Root, os)) ythrow yexception() << "something wrong"; return len; } template size_t TCompactTrieBuilder::TCompactTrieBuilderImpl::GetEntryCount() const { return EntryCount; } template size_t TCompactTrieBuilder::TCompactTrieBuilderImpl::GetNodeCount() const { return NodeCount; } template typename TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode* TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeForwardAdd( TNode* thiz, const char* label, size_t len, size_t& passed, size_t* nodeCount) { typename TNode::TArcSet* arcSet = dynamic_cast(thiz->Subtree()); if (!arcSet) ythrow yexception() << "Bad input order - expected input strings to be prefix-grouped."; typename TNode::TArcSet::iterator it = arcSet->Find(*label); if (it != arcSet->end()) { const char* arcLabel = it->Label.AsCharPtr(); size_t arcLabelLen = it->Label.Length(); for (passed = 0; (passed < len) && (passed < arcLabelLen) && (label[passed] == arcLabel[passed]); ++passed) { //just count } if (passed < arcLabelLen) { (*nodeCount)++; TNode* node = new (*NodeAllocator) TNode(); NodeLinkTo(node, it->Label.SubBlob(passed, arcLabelLen), it->Node); it->Node = node; it->Label = it->Label.SubBlob(passed); } return it->Node; } return nullptr; } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeLinkTo(TNode* thiz, const TBlob& label, TNode* node) { typename TNode::TArcSet* arcSet = dynamic_cast(thiz->Subtree()); if (!arcSet) ythrow yexception() << "Bad input order - expected input strings to be prefix-grouped."; // Buffer the node at the last arc if ((Flags & CTBF_PREFIX_GROUPED) && !arcSet->empty()) NodeBufferSubtree(arcSet->back().Node); arcSet->Add(label, node); } template size_t TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeMeasureSubtree(TNode* thiz) const { return (size_t)thiz->Subtree()->Measure(this); } template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeSaveSubtree(TNode* thiz, IOutputStream& os) const { return thiz->Subtree()->Save(this, os); } template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeSaveSubtreeAndDestroy(TNode* thiz, IOutputStream& os) { return thiz->Subtree()->SaveAndDestroy(this, os); } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeBufferSubtree(TNode* thiz) { typedef typename TNode::TArcSet TArcSet; TArcSet* arcSet = dynamic_cast(thiz->Subtree()); if (!arcSet) return; size_t bufferLength = (size_t)arcSet->Measure(this); TArrayWithSizeHolder buffer; buffer.Resize(bufferLength); TMemoryOutput bufout(buffer.Get(), buffer.Size()); ui64 written = arcSet->Save(this, bufout); Y_ASSERT(written == bufferLength); arcSet->Destroy(this); arcSet->~TArcSet(); typename TNode::TBufferedSubtree* bufferedArcSet = new (thiz->Subtree()) typename TNode::TBufferedSubtree; bufferedArcSet->Buffer.Swap(buffer); } template size_t TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeMeasureLeafValue(TNode* thiz) const { if (!thiz->IsFinal()) return 0; return Packer.SkipLeaf(thiz->GetPayload()); } template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::NodeSaveLeafValue(TNode* thiz, IOutputStream& os) const { if (!thiz->IsFinal()) return 0; size_t len = Packer.SkipLeaf(thiz->GetPayload()); os.Write(thiz->GetPayload(), len); return len; } // TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArc template TCompactTrieBuilder::TCompactTrieBuilderImpl::TArc::TArc(const TBlob& lbl, TNode* nd) : Label(lbl) , Node(nd) , LeftOffset(0) , RightOffset(0) {} template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::ArcMeasure( const TArc* thiz, size_t leftsize, size_t rightsize) const { using namespace NCompactTrie; size_t coresize = 2 + NodeMeasureLeafValue(thiz->Node); // 2 == (char + flags) size_t treesize = NodeMeasureSubtree(thiz->Node); if (thiz->Label.Length() > 0) treesize += 2 * (thiz->Label.Length() - 1); // Triple measurements are needed because the space needed to store the offset // shall be added to the offset itself. Hence three iterations. size_t leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize) : 0; size_t rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize) : 0; leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0; rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0; leftoffsetsize = leftsize ? MeasureOffset(coresize + treesize + leftoffsetsize + rightoffsetsize) : 0; rightoffsetsize = rightsize ? MeasureOffset(coresize + treesize + leftsize + leftoffsetsize + rightoffsetsize) : 0; coresize += leftoffsetsize + rightoffsetsize; thiz->LeftOffset = leftsize ? coresize + treesize : 0; thiz->RightOffset = rightsize ? coresize + treesize + leftsize : 0; return coresize + treesize + leftsize + rightsize; } template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::ArcSaveSelf(const TArc* thiz, IOutputStream& os) const { using namespace NCompactTrie; ui64 written = 0; size_t leftoffsetsize = MeasureOffset(thiz->LeftOffset); size_t rightoffsetsize = MeasureOffset(thiz->RightOffset); size_t labelLen = thiz->Label.Length(); for (size_t i = 0; i < labelLen; ++i) { char flags = 0; if (i == 0) { flags |= (leftoffsetsize << MT_LEFTSHIFT); flags |= (rightoffsetsize << MT_RIGHTSHIFT); } if (i == labelLen-1) { if (thiz->Node->IsFinal()) flags |= MT_FINAL; if (!thiz->Node->IsLast()) flags |= MT_NEXT; } else { flags |= MT_NEXT; } os.Write(&flags, 1); os.Write(&thiz->Label.AsCharPtr()[i], 1); written += 2; if (i == 0) { written += ArcSaveOffset(thiz->LeftOffset, os); written += ArcSaveOffset(thiz->RightOffset, os); } } written += NodeSaveLeafValue(thiz->Node, os); return written; } template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::ArcSave(const TArc* thiz, IOutputStream& os) const { ui64 written = ArcSaveSelf(thiz, os); written += NodeSaveSubtree(thiz->Node, os); return written; } template ui64 TCompactTrieBuilder::TCompactTrieBuilderImpl::ArcSaveAndDestroy(const TArc* thiz, IOutputStream& os) { ui64 written = ArcSaveSelf(thiz, os); written += NodeSaveSubtreeAndDestroy(thiz->Node, os); return written; } // TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet template typename TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet::iterator TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) { using namespace NCompTriePrivate; iterator it = LowerBound(this->begin(), this->end(), ch, TCmp()); if (it != this->end() && it->Label[0] == (unsigned char)ch) { return it; } return this->end(); } template typename TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet::const_iterator TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet::Find(char ch) const { using namespace NCompTriePrivate; const_iterator it = LowerBound(this->begin(), this->end(), ch, TCmp()); if (it != this->end() && it->Label[0] == (unsigned char)ch) { return it; } return this->end(); } template void TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet::Add(const TBlob& s, TNode* node) { using namespace NCompTriePrivate; this->insert(LowerBound(this->begin(), this->end(), s[0], TCmp()), TArc(s, node)); } template const typename TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode* TCompactTrieBuilder::TCompactTrieBuilderImpl::TNode::TArcSet::Find( TStringBuf& key, TData* value, bool& result, const TPacker& packer) const { result = false; if (!key) return nullptr; const const_iterator it = Find(key[0]); if (it != this->end()) { const char* const arcLabel = it->Label.AsCharPtr(); const size_t arcLabelLen = it->Label.Length(); if (key.size() >= arcLabelLen && memcmp(key.data(), arcLabel, arcLabelLen) == 0) { const TStringBuf srcKey = key; key = key.SubStr(arcLabelLen); const TNode* const node = it->Node; if (srcKey.size() == arcLabelLen) { // unpack value of it->Node, if it has value if (!node->IsFinal()) return nullptr; if (value) packer.UnpackLeaf(node->GetPayload(), *value); result = true; return nullptr; } // find in subtree return node; } } return nullptr; } // Different //---------------------------------------------------------------------------------------------------------------------- // Minimize the trie. The result is equivalent to the original // trie, except that it takes less space (and has marginally lower // performance, because of eventual epsilon links). // The algorithm is as follows: starting from the largest pieces, we find // nodes that have identical continuations (Daciuk's right language), // and repack the trie. Repacking is done in-place, so memory is less // of an issue; however, it may take considerable time. // IMPORTANT: never try to reminimize an already minimized trie or a trie with fast layout. // Because of non-local structure and epsilon links, it won't work // as you expect it to, and can destroy the trie in the making. template size_t CompactTrieMinimize(IOutputStream& os, const char* data, size_t datalength, bool verbose /*= false*/, const TPacker& packer /*= TPacker()*/, NCompactTrie::EMinimizeMode mode) { using namespace NCompactTrie; return CompactTrieMinimizeImpl(os, data, datalength, verbose, &packer, mode); } template size_t CompactTrieMinimize(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) { TBufferStream buftmp; size_t len = builder.Save(buftmp); return CompactTrieMinimize(os, buftmp.Buffer().Data(), len, verbose); } //---------------------------------------------------------------------------------------------------------------- // Lay the trie in memory in such a way that there are less cache misses when jumping from root to leaf. // The trie becomes about 2% larger, but the access became about 25% faster in our experiments. // Can be called on minimized and non-minimized tries, in the first case in requires half a trie more memory. // Calling it the second time on the same trie does nothing. // // The algorithm is based on van Emde Boas layout as described in the yandex data school lectures on external memory algoritms // by Maxim Babenko and Ivan Puzyrevsky. The difference is that when we cut the tree into levels // two nodes connected by a forward link are put into the same level (because they usually lie near each other in the original tree). // The original paper (describing the layout in Section 2.1) is: // Michael A. Bender, Erik D. Demaine, Martin Farach-Colton. Cache-Oblivious B-Trees // SIAM Journal on Computing, volume 35, number 2, 2005, pages 341-358. // Available on the web: http://erikdemaine.org/papers/CacheObliviousBTrees_SICOMP/ // Or: Michael A. Bender, Erik D. Demaine, and Martin Farach-Colton. Cache-Oblivious B-Trees // Proceedings of the 41st Annual Symposium // on Foundations of Computer Science (FOCS 2000), Redondo Beach, California, November 12-14, 2000, pages 399-409. // Available on the web: http://erikdemaine.org/papers/FOCS2000b/ // (there is not much difference between these papers, actually). // template size_t CompactTrieMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose /*= false*/, const TPacker& packer /*= TPacker()*/) { using namespace NCompactTrie; return CompactTrieMakeFastLayoutImpl(os, data, datalength, verbose, &packer); } template size_t CompactTrieMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) { TBufferStream buftmp; size_t len = builder.Save(buftmp); return CompactTrieMakeFastLayout(os, buftmp.Buffer().Data(), len, verbose); } template size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const char* data, size_t datalength, bool verbose/*=false*/, const TPacker& packer/*= TPacker()*/) { TBufferStream buftmp; size_t len = CompactTrieMinimize(buftmp, data, datalength, verbose, packer); return CompactTrieMakeFastLayout(os, buftmp.Buffer().Data(), len, verbose, packer); } template size_t CompactTrieMinimizeAndMakeFastLayout(IOutputStream& os, const TTrieBuilder& builder, bool verbose /*=false*/) { TBufferStream buftmp; size_t len = CompactTrieMinimize(buftmp, builder, verbose); return CompactTrieMakeFastLayout(os, buftmp.Buffer().Data(), len, verbose); }