#pragma once #include #include #include #include #include #include #include #include #include #include #include "common.h" template struct meta_iterator_pair { typedef typename std::iterator_traits::value_type::first_type first_type; typedef typename std::iterator_traits::value_type::second_type second_type; }; /* * Builder implementation */ /* * Builder declaration */ template class TDefaultContainerBuilder { TGeneralVectorWriter List_; public: void AddOut(const O& o) { List_.PushBack(o); } bool IsEmpty() const { return List_.Size() == 0; } void SaveContent(IOutputStream* stream) const { List_.Save(*stream); } }; template class TSingleContainerBuilder { bool Empty; O Out_; public: TSingleContainerBuilder() : Empty(true) { } void AddOut(const O& o) { Empty = false; Out_ = o; } bool IsEmpty() const { return Empty; } void SaveContent(IOutputStream* stream) const { if (IsEmpty()) { WriteBin(stream, 0); } else { TBuffer buf; { TBufferOutput tempStream(buf); TSaveLoadVectorNonPodElement::Save(&tempStream, Out_); } WriteBin(stream, buf.Size()); stream->Write(buf.Data(), buf.Size()); } } }; template class TAhoCorasickBuilder; template class TAhoVertex : TNonCopyable { typedef TAhoVertex TMyself; typedef TAhoCorasickBuilder TParent; typedef typename TStringType::value_type TCharType; friend class TAhoCorasickBuilder; private: typedef THashMap TGotoMap; TGotoMap GotoMap_; C Output_; TMyself* FailVertex_; TMyself* SuffixVertex_; protected: const C& Output() const { return Output_; } void AddVertex(TMyself* v, TCharType c) { GotoMap_.insert(std::make_pair(c, v)); } TMyself* Fail() const { return FailVertex_; } TMyself* Suffix() const { return SuffixVertex_; } TMyself* GotoFunction(const TCharType c) { typename TGotoMap::iterator it; it = GotoMap_.find(c); return it != GotoMap_.end() ? it->second : nullptr; } TMyself const* GotoFunction(const TCharType c) const { typename TGotoMap::const_iterator it = GotoMap_.find(c); return it != GotoMap_.end() ? it->second : NULL; } TMyself* AddString(TParent* ahoCorasick, const TStringType& s, const ui32 position, const O& o) { if (position >= s.size()) { Output_.AddOut(o); return nullptr; } else { TCharType c = s[position]; TMyself* v = GotoFunction(c); if (!v) { v = ahoCorasick->CreateAhoVertex(); AddVertex(v, c); } return v; } } void SetFail(TMyself* v) { FailVertex_ = v; } void SetSuffix(TMyself* v) { SuffixVertex_ = v; } const TGotoMap& GotoMap() const { return GotoMap_; } public: TAhoVertex() : FailVertex_(nullptr) , SuffixVertex_(nullptr) { } }; template > class TAhoCorasickBuilder : TNonCopyable { public: typedef TAhoVertex TAhoVertexType; typedef typename TStringType::value_type TCharType; friend class TAhoVertex; friend class TTestMappedAhoCorasick; private: TDeque AhoVertexes; private: TAhoVertexType* GetRoot() { return AhoVertexes.front(); } TAhoVertexType const* GetRoot() const { return AhoVertexes.front(); } TAhoVertexType* CreateAhoVertex() { AhoVertexes.push_back(new TAhoVertexType()); return AhoVertexes.back(); } void ConstructFail(); public: TAhoCorasickBuilder() : AhoVertexes(1, new TAhoVertexType()) { } ~TAhoCorasickBuilder() { for (size_t i = 0; i < AhoVertexes.size(); ++i) { delete AhoVertexes[i]; } } void AddString(const TStringType& s, const O& value) { TAhoVertexType* c = GetRoot(); for (ui32 i = 0; i <= s.size(); ++i) { c = c->AddString(this, s, i, value); } } const TBlob Save(); const TBlob AtomicSave(); void SaveToStream(IOutputStream* stream); }; using TSimpleAhoCorasickBuilder = TAhoCorasickBuilder>; using TDefaultAhoCorasickBuilder = TAhoCorasickBuilder; template const TBlob BuildAho(AhoCorasick& ahoCorasick, Iterator begin, Iterator end) { for (Iterator it = begin; it != end; ++it) ahoCorasick.AddString(*it, it->size()); return ahoCorasick.Save(); } template const TBlob BuildAhoIndex(TAhoCorasickBuilder& ahoCorasick, Iterator begin, Iterator end) { ui32 index = 0; for (Iterator it = begin; it != end; ++it, ++index) ahoCorasick.AddString(*it, index); return ahoCorasick.Save(); } template const TBlob BuildAhoObject(TAhoCorasickBuilder::second_type>& ahoCorasick, Iterator begin, Iterator end) { for (Iterator it = begin; it != end; ++it) ahoCorasick.AddString(it->first, it->second); return ahoCorasick.Save(); } template void TAhoCorasickBuilder::ConstructFail() { TAhoVertexType* root = GetRoot(); root->SetFail(root); TDeque q; typename TAhoVertexType::TGotoMap::const_iterator it; for (it = root->GotoMap().begin(); it != root->GotoMap().end(); ++it) { TAhoVertexType* v = it->second; v->SetFail(root); q.push_back(v); } while (!q.empty()) { TAhoVertexType* c = q.front(); q.pop_front(); for (it = c->GotoMap().begin(); it != c->GotoMap().end(); ++it) { TAhoVertexType* v = it->second; TCharType a = it->first; q.push_back(v); TAhoVertexType* h = c->Fail(); bool outer = false; while (!h->GotoFunction(a)) { if (h->Fail() == h) { v->SetFail(h); outer = true; break; } h = h->Fail(); } if (outer) continue; TAhoVertexType* fail = h->GotoFunction(a); v->SetFail(fail); if (!fail->Output().IsEmpty()) v->SetSuffix(fail); else v->SetSuffix(fail->Suffix()); } } } template void TAhoCorasickBuilder::SaveToStream(IOutputStream* out) { ConstructFail(); /// the reason of non-const declaration Y_ASSERT(AhoVertexes.size() < Max()); const ui32 vertexAmount = (ui32)AhoVertexes.size(); TChunkedDataWriter writer(*out); { TSingleValueWriter versionWriter(TAhoCorasickCommon::GetVersion()); WriteBlock(writer, versionWriter); } writer.NewBlock(); TVector q(1, GetRoot()); THashMap vertex2id(vertexAmount + 1); TVector id2offset(vertexAmount, 0); TAhoVertexType* vt = nullptr; vertex2id[vt] = (ui32)-1; q.reserve(vertexAmount); for (ui32 curId = 0; curId < vertexAmount; ++curId) { TAhoVertexType const* c = q[curId]; vertex2id[c] = curId; id2offset[curId] = (ui32)writer.GetCurrentBlockOffset(); WriteBin(&writer, vertex2id[c->Fail()]); WriteBin(&writer, vertex2id[c->Suffix()]); typedef TVector> TChildren; TChildren children(c->GotoMap().begin(), c->GotoMap().end()); WriteBin(&writer, static_cast(children.size())); if (!children.empty()) { TPlainHashWriter hashWriter; const ui32 id = static_cast(q.size()); for (size_t i = 0; i < children.size(); ++i) { hashWriter.Add(children[i].first, ui32(id + i)); q.push_back(children[i].second); } hashWriter.Save(writer); } c->Output().SaveContent(&writer); } { Y_ASSERT(id2offset.size() < Max()); TSingleValueWriter lenWriter((ui32)id2offset.size()); WriteBlock(writer, lenWriter); } writer.NewBlock(); writer.Write((const char*)id2offset.data(), id2offset.size() * sizeof(ui32)); writer.WriteFooter(); Y_ASSERT(TAhoCorasickCommon::GetBlockCount() == writer.GetBlockCount()); } template const TBlob TAhoCorasickBuilder::Save() { TBufferStream buffer; SaveToStream(&buffer); return TBlob::FromStream(buffer); } template const TBlob TAhoCorasickBuilder::AtomicSave() { TBufferStream buffer; SaveToStream(&buffer); return TBlob::FromStream(buffer); }