123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606 |
- #pragma once
- #include "comptrie_builder.h"
- #include "comptrie_trie.h"
- #include "comptrie_impl.h"
- #include <library/cpp/packers/packers.h>
- #include <util/system/yassert.h>
- #include <util/generic/vector.h>
- #include <util/generic/deque.h>
- #include <util/stream/str.h>
- // Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie
- namespace NCompactTrie {
- struct TSuffixLink {
- ui64 NextSuffixOffset;
- ui64 NextSuffixWithDataOffset;
- TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0)
- : NextSuffixOffset(nextSuffixOffset)
- , NextSuffixWithDataOffset(nextSuffixWithDataOffset)
- {
- }
- };
- const size_t FLAGS_SIZE = sizeof(char);
- const size_t SYMBOL_SIZE = sizeof(char);
- }
- template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
- class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> {
- public:
- typedef T TSymbol;
- typedef D TData;
- typedef S TPacker;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
- typedef TCompactTrieBuilder<T, D, S> TBase;
- public:
- TCompactPatternSearcherBuilder() {
- TBase::Impl = MakeHolder<TCompactPatternSearcherBuilderImpl>();
- }
- bool Add(const TSymbol* key, size_t keyLength, const TData& value) {
- return TBase::Impl->AddEntry(key, keyLength, value);
- }
- bool Add(const TKeyBuf& key, const TData& value) {
- return Add(key.data(), key.size(), value);
- }
- bool Find(const TSymbol* key, size_t keyLength, TData* value) const {
- return TBase::Impl->FindEntry(key, keyLength, value);
- }
- bool Find(const TKeyBuf& key, TData* value = nullptr) const {
- return Find(key.data(), key.size(), value);
- }
- size_t Save(IOutputStream& os) const {
- size_t trieSize = TBase::Impl->MeasureByteSize();
- TBufferOutput serializedTrie(trieSize);
- TBase::Impl->Save(serializedTrie);
- auto serializedTrieBuffer = serializedTrie.Buffer();
- CalculateSuffixLinks(
- serializedTrieBuffer.Data(),
- serializedTrieBuffer.Data() + serializedTrieBuffer.Size()
- );
- os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size());
- return trieSize;
- }
- TBlob Save() const {
- TBufferStream buffer;
- Save(buffer);
- return TBlob::FromStream(buffer);
- }
- size_t SaveToFile(const TString& fileName) const {
- TFileOutput out(fileName);
- return Save(out);
- }
- size_t MeasureByteSize() const {
- return TBase::Impl->MeasureByteSize();
- }
- private:
- void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const;
- protected:
- class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl {
- public:
- typedef typename TBase::TCompactTrieBuilderImpl TImplBase;
- TCompactPatternSearcherBuilderImpl(
- TCompactTrieBuilderFlags flags = CTBF_NONE,
- TPacker packer = TPacker(),
- IAllocator* alloc = TDefaultAllocator::Instance()
- ) : TImplBase(flags, packer, alloc) {
- }
- ui64 ArcMeasure(
- const typename TImplBase::TArc* arc,
- size_t leftSize,
- size_t rightSize
- ) const override {
- using namespace NCompactTrie;
- size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE +
- sizeof(TSuffixLink) +
- this->NodeMeasureLeafValue(arc->Node);
- size_t treeSize = this->NodeMeasureSubtree(arc->Node);
- if (arc->Label.Length() > 0)
- treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) *
- (arc->Label.Length() - 1);
- // Triple measurements are needed because the space needed to store the offset
- // shall be added to the offset itself. Hence three iterations.
- size_t leftOffsetSize = 0;
- size_t rightOffsetSize = 0;
- for (size_t iteration = 0; iteration < 3; ++iteration) {
- leftOffsetSize = leftSize ? MeasureOffset(
- coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0;
- rightOffsetSize = rightSize ? MeasureOffset(
- coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0;
- }
- coreSize += leftOffsetSize + rightOffsetSize;
- arc->LeftOffset = leftSize ? coreSize + treeSize : 0;
- arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0;
- return coreSize + treeSize + leftSize + rightSize;
- }
- ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override {
- using namespace NCompactTrie;
- ui64 written = 0;
- size_t leftOffsetSize = MeasureOffset(arc->LeftOffset);
- size_t rightOffsetSize = MeasureOffset(arc->RightOffset);
- size_t labelLen = arc->Label.Length();
- for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) {
- char flags = 0;
- if (labelPos == 0) {
- flags |= (leftOffsetSize << MT_LEFTSHIFT);
- flags |= (rightOffsetSize << MT_RIGHTSHIFT);
- }
- if (labelPos == labelLen - 1) {
- if (arc->Node->IsFinal())
- flags |= MT_FINAL;
- if (!arc->Node->IsLast())
- flags |= MT_NEXT;
- } else {
- flags |= MT_NEXT;
- }
- os.Write(&flags, 1);
- os.Write(&arc->Label.AsCharPtr()[labelPos], 1);
- written += 2;
- TSuffixLink suffixlink;
- os.Write(&suffixlink, sizeof(TSuffixLink));
- written += sizeof(TSuffixLink);
- if (labelPos == 0) {
- written += ArcSaveOffset(arc->LeftOffset, os);
- written += ArcSaveOffset(arc->RightOffset, os);
- }
- }
- written += this->NodeSaveLeafValue(arc->Node, os);
- return written;
- }
- };
- };
- template <class T>
- struct TPatternMatch {
- ui64 End;
- T Data;
- TPatternMatch(ui64 end, const T& data)
- : End(end)
- , Data(data)
- {
- }
- };
- template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
- class TCompactPatternSearcher {
- public:
- typedef T TSymbol;
- typedef D TData;
- typedef S TPacker;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
- typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
- typedef TCompactTrie<TSymbol, TData, TPacker> TTrie;
- public:
- TCompactPatternSearcher()
- {
- }
- explicit TCompactPatternSearcher(const TBlob& data)
- : Trie(data)
- {
- }
- TCompactPatternSearcher(const char* data, size_t size)
- : Trie(data, size)
- {
- }
- TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const;
- TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const {
- return SearchMatches(text.data(), text.size());
- }
- private:
- TTrie Trie;
- };
- ////////////////////
- // Implementation //
- ////////////////////
- namespace {
- template <class TData, class TPacker>
- char ReadNode(
- char* nodeStart,
- char*& leftSibling,
- char*& rightSibling,
- char*& directChild,
- NCompactTrie::TSuffixLink*& suffixLink,
- TPacker packer = TPacker()
- ) {
- char* dataPos = nodeStart;
- char flags = *(dataPos++);
- Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
- char label = *(dataPos++);
- suffixLink = (NCompactTrie::TSuffixLink*)dataPos;
- dataPos += sizeof(NCompactTrie::TSuffixLink);
- { // Left branch
- size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
- size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
- dataPos += offsetLength;
- }
- { // Right branch
- size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
- size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
- dataPos += offsetLength;
- }
- directChild = nullptr;
- if (flags & NCompactTrie::MT_NEXT) {
- directChild = dataPos;
- if (flags & NCompactTrie::MT_FINAL) {
- directChild += packer.SkipLeaf(directChild);
- }
- }
- return label;
- }
- template <class TData, class TPacker>
- char ReadNodeConst(
- const char* nodeStart,
- const char*& leftSibling,
- const char*& rightSibling,
- const char*& directChild,
- const char*& data,
- NCompactTrie::TSuffixLink& suffixLink,
- TPacker packer = TPacker()
- ) {
- const char* dataPos = nodeStart;
- char flags = *(dataPos++);
- Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
- char label = *(dataPos++);
- suffixLink = *((NCompactTrie::TSuffixLink*)dataPos);
- dataPos += sizeof(NCompactTrie::TSuffixLink);
- { // Left branch
- size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
- size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
- dataPos += offsetLength;
- }
- { // Right branch
- size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
- size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
- dataPos += offsetLength;
- }
- data = nullptr;
- if (flags & NCompactTrie::MT_FINAL) {
- data = dataPos;
- }
- directChild = nullptr;
- if (flags & NCompactTrie::MT_NEXT) {
- directChild = dataPos;
- if (flags & NCompactTrie::MT_FINAL) {
- directChild += packer.SkipLeaf(directChild);
- }
- }
- return label;
- }
- Y_FORCE_INLINE bool Advance(
- const char*& dataPos,
- const char* const dataEnd,
- char label
- ) {
- if (dataPos == nullptr) {
- return false;
- }
- while (dataPos < dataEnd) {
- size_t offsetLength, offset;
- const char* startPos = dataPos;
- char flags = *(dataPos++);
- char symbol = *(dataPos++);
- dataPos += sizeof(NCompactTrie::TSuffixLink);
- // Left branch
- offsetLength = NCompactTrie::LeftOffsetLen(flags);
- if ((unsigned char)label < (unsigned char)symbol) {
- offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- if (!offset)
- break;
- dataPos = startPos + offset;
- continue;
- }
- dataPos += offsetLength;
- // Right branch
- offsetLength = NCompactTrie::RightOffsetLen(flags);
- if ((unsigned char)label > (unsigned char)symbol) {
- offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
- if (!offset)
- break;
- dataPos = startPos + offset;
- continue;
- }
- dataPos = startPos;
- return true;
- }
- // if we got here, we're past the dataend - bail out ASAP
- dataPos = nullptr;
- return false;
- }
- } // anonymous
- template <class T, class D, class S>
- void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks(
- char* trieStart,
- const char* trieEnd
- ) const {
- struct TBfsElement {
- char* Node;
- const char* Parent;
- TBfsElement(char* node, const char* parent)
- : Node(node)
- , Parent(parent)
- {
- }
- };
- TDeque<TBfsElement> bfsQueue;
- if (trieStart && trieStart != trieEnd) {
- bfsQueue.emplace_back(trieStart, nullptr);
- }
- while (!bfsQueue.empty()) {
- auto front = bfsQueue.front();
- char* node = front.Node;
- const char* parent = front.Parent;
- bfsQueue.pop_front();
- char* leftSibling;
- char* rightSibling;
- char* directChild;
- NCompactTrie::TSuffixLink* suffixLink;
- char label = ReadNode<TData, TPacker>(
- node,
- leftSibling,
- rightSibling,
- directChild,
- suffixLink
- );
- const char* suffix;
- if (parent == nullptr) {
- suffix = node;
- } else {
- const char* parentOfSuffix = parent;
- const char* temp;
- do {
- NCompactTrie::TSuffixLink parentOfSuffixSuffixLink;
- ReadNodeConst<TData, TPacker>(
- parentOfSuffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- /*data*/temp,
- parentOfSuffixSuffixLink
- );
- if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) {
- suffix = trieStart;
- if (!Advance(suffix, trieEnd, label)) {
- suffix = node;
- }
- break;
- }
- parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset;
- NCompactTrie::TSuffixLink tempSuffixLink;
- ReadNodeConst<TData, TPacker>(
- parentOfSuffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/suffix,
- /*data*/temp,
- tempSuffixLink
- );
- if (suffix == nullptr) {
- continue;
- }
- } while (!Advance(suffix, trieEnd, label));
- }
- suffixLink->NextSuffixOffset = suffix - node;
- NCompactTrie::TSuffixLink suffixSuffixLink;
- const char* suffixData;
- const char* temp;
- ReadNodeConst<TData, TPacker>(
- suffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- suffixData,
- suffixSuffixLink
- );
- suffixLink->NextSuffixWithDataOffset = suffix - node;
- if (suffixData == nullptr) {
- suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset;
- }
- if (directChild) {
- bfsQueue.emplace_back(directChild, node);
- }
- if (leftSibling) {
- bfsQueue.emplace_front(leftSibling, parent);
- }
- if (rightSibling) {
- bfsQueue.emplace_front(rightSibling, parent);
- }
- }
- }
- template<class T, class D, class S>
- TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches(
- const TSymbol* text,
- size_t textSize
- ) const {
- const char* temp;
- NCompactTrie::TSuffixLink tempSuffixLink;
- const auto& trieData = Trie.Data();
- const char* trieStart = trieData.AsCharPtr();
- size_t dataSize = trieData.Length();
- const char* trieEnd = trieStart + dataSize;
- const char* lastNode = nullptr;
- const char* currentSubtree = trieStart;
- TVector<TPatternMatch<TData>> matches;
- for (const TSymbol* position = text; position < text + textSize; ++position) {
- TSymbol symbol = *position;
- for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) {
- char label = (char)(symbol >> i);
- // Find first suffix extendable by label
- while (true) {
- const char* nextLastNode = currentSubtree;
- if (Advance(nextLastNode, trieEnd, label)) {
- lastNode = nextLastNode;
- ReadNodeConst<TData, TPacker>(
- lastNode,
- /*left*/temp,
- /*right*/temp,
- currentSubtree,
- /*data*/temp,
- tempSuffixLink
- );
- break;
- } else {
- if (lastNode == nullptr) {
- break;
- }
- }
- NCompactTrie::TSuffixLink suffixLink;
- ReadNodeConst<TData, TPacker>(
- lastNode,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- /*data*/temp,
- suffixLink
- );
- if (suffixLink.NextSuffixOffset == 0) {
- lastNode = nullptr;
- currentSubtree = trieStart;
- continue;
- }
- lastNode += suffixLink.NextSuffixOffset;
- ReadNodeConst<TData, TPacker>(
- lastNode,
- /*left*/temp,
- /*right*/temp,
- currentSubtree,
- /*data*/temp,
- tempSuffixLink
- );
- }
- // Iterate through all suffixes
- const char* suffix = lastNode;
- while (suffix != nullptr) {
- const char* nodeData;
- NCompactTrie::TSuffixLink suffixLink;
- ReadNodeConst<TData, TPacker>(
- suffix,
- /*left*/temp,
- /*right*/temp,
- /*direct*/temp,
- nodeData,
- suffixLink
- );
- if (nodeData != nullptr) {
- TData data;
- Trie.GetPacker().UnpackLeaf(nodeData, data);
- matches.emplace_back(
- position - text,
- data
- );
- }
- if (suffixLink.NextSuffixOffset == 0) {
- break;
- }
- suffix += suffixLink.NextSuffixWithDataOffset;
- }
- }
- }
- return matches;
- }
|