pattern_searcher.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. #pragma once
  2. #include "comptrie_builder.h"
  3. #include "comptrie_trie.h"
  4. #include "comptrie_impl.h"
  5. #include <library/cpp/packers/packers.h>
  6. #include <util/system/yassert.h>
  7. #include <util/generic/vector.h>
  8. #include <util/generic/deque.h>
  9. #include <util/stream/str.h>
  10. // Aho-Corasick algorithm implementation using CompactTrie implementation of Sedgewick's T-trie
  11. namespace NCompactTrie {
  12. struct TSuffixLink {
  13. ui64 NextSuffixOffset;
  14. ui64 NextSuffixWithDataOffset;
  15. TSuffixLink(ui64 nextSuffixOffset = 0, ui64 nextSuffixWithDataOffset = 0)
  16. : NextSuffixOffset(nextSuffixOffset)
  17. , NextSuffixWithDataOffset(nextSuffixWithDataOffset)
  18. {
  19. }
  20. };
  21. const size_t FLAGS_SIZE = sizeof(char);
  22. const size_t SYMBOL_SIZE = sizeof(char);
  23. }
  24. template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
  25. class TCompactPatternSearcherBuilder : protected TCompactTrieBuilder<T, D, S> {
  26. public:
  27. typedef T TSymbol;
  28. typedef D TData;
  29. typedef S TPacker;
  30. typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
  31. typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
  32. typedef TCompactTrieBuilder<T, D, S> TBase;
  33. public:
  34. TCompactPatternSearcherBuilder() {
  35. TBase::Impl = MakeHolder<TCompactPatternSearcherBuilderImpl>();
  36. }
  37. bool Add(const TSymbol* key, size_t keyLength, const TData& value) {
  38. return TBase::Impl->AddEntry(key, keyLength, value);
  39. }
  40. bool Add(const TKeyBuf& key, const TData& value) {
  41. return Add(key.data(), key.size(), value);
  42. }
  43. bool Find(const TSymbol* key, size_t keyLength, TData* value) const {
  44. return TBase::Impl->FindEntry(key, keyLength, value);
  45. }
  46. bool Find(const TKeyBuf& key, TData* value = nullptr) const {
  47. return Find(key.data(), key.size(), value);
  48. }
  49. size_t Save(IOutputStream& os) const {
  50. size_t trieSize = TBase::Impl->MeasureByteSize();
  51. TBufferOutput serializedTrie(trieSize);
  52. TBase::Impl->Save(serializedTrie);
  53. auto serializedTrieBuffer = serializedTrie.Buffer();
  54. CalculateSuffixLinks(
  55. serializedTrieBuffer.Data(),
  56. serializedTrieBuffer.Data() + serializedTrieBuffer.Size()
  57. );
  58. os.Write(serializedTrieBuffer.Data(), serializedTrieBuffer.Size());
  59. return trieSize;
  60. }
  61. TBlob Save() const {
  62. TBufferStream buffer;
  63. Save(buffer);
  64. return TBlob::FromStream(buffer);
  65. }
  66. size_t SaveToFile(const TString& fileName) const {
  67. TFileOutput out(fileName);
  68. return Save(out);
  69. }
  70. size_t MeasureByteSize() const {
  71. return TBase::Impl->MeasureByteSize();
  72. }
  73. private:
  74. void CalculateSuffixLinks(char* trieStart, const char* trieEnd) const;
  75. protected:
  76. class TCompactPatternSearcherBuilderImpl : public TBase::TCompactTrieBuilderImpl {
  77. public:
  78. typedef typename TBase::TCompactTrieBuilderImpl TImplBase;
  79. TCompactPatternSearcherBuilderImpl(
  80. TCompactTrieBuilderFlags flags = CTBF_NONE,
  81. TPacker packer = TPacker(),
  82. IAllocator* alloc = TDefaultAllocator::Instance()
  83. ) : TImplBase(flags, packer, alloc) {
  84. }
  85. ui64 ArcMeasure(
  86. const typename TImplBase::TArc* arc,
  87. size_t leftSize,
  88. size_t rightSize
  89. ) const override {
  90. using namespace NCompactTrie;
  91. size_t coreSize = SYMBOL_SIZE + FLAGS_SIZE +
  92. sizeof(TSuffixLink) +
  93. this->NodeMeasureLeafValue(arc->Node);
  94. size_t treeSize = this->NodeMeasureSubtree(arc->Node);
  95. if (arc->Label.Length() > 0)
  96. treeSize += (SYMBOL_SIZE + FLAGS_SIZE + sizeof(TSuffixLink)) *
  97. (arc->Label.Length() - 1);
  98. // Triple measurements are needed because the space needed to store the offset
  99. // shall be added to the offset itself. Hence three iterations.
  100. size_t leftOffsetSize = 0;
  101. size_t rightOffsetSize = 0;
  102. for (size_t iteration = 0; iteration < 3; ++iteration) {
  103. leftOffsetSize = leftSize ? MeasureOffset(
  104. coreSize + treeSize + leftOffsetSize + rightOffsetSize) : 0;
  105. rightOffsetSize = rightSize ? MeasureOffset(
  106. coreSize + treeSize + leftSize + leftOffsetSize + rightOffsetSize) : 0;
  107. }
  108. coreSize += leftOffsetSize + rightOffsetSize;
  109. arc->LeftOffset = leftSize ? coreSize + treeSize : 0;
  110. arc->RightOffset = rightSize ? coreSize + treeSize + leftSize : 0;
  111. return coreSize + treeSize + leftSize + rightSize;
  112. }
  113. ui64 ArcSaveSelf(const typename TImplBase::TArc* arc, IOutputStream& os) const override {
  114. using namespace NCompactTrie;
  115. ui64 written = 0;
  116. size_t leftOffsetSize = MeasureOffset(arc->LeftOffset);
  117. size_t rightOffsetSize = MeasureOffset(arc->RightOffset);
  118. size_t labelLen = arc->Label.Length();
  119. for (size_t labelPos = 0; labelPos < labelLen; ++labelPos) {
  120. char flags = 0;
  121. if (labelPos == 0) {
  122. flags |= (leftOffsetSize << MT_LEFTSHIFT);
  123. flags |= (rightOffsetSize << MT_RIGHTSHIFT);
  124. }
  125. if (labelPos == labelLen - 1) {
  126. if (arc->Node->IsFinal())
  127. flags |= MT_FINAL;
  128. if (!arc->Node->IsLast())
  129. flags |= MT_NEXT;
  130. } else {
  131. flags |= MT_NEXT;
  132. }
  133. os.Write(&flags, 1);
  134. os.Write(&arc->Label.AsCharPtr()[labelPos], 1);
  135. written += 2;
  136. TSuffixLink suffixlink;
  137. os.Write(&suffixlink, sizeof(TSuffixLink));
  138. written += sizeof(TSuffixLink);
  139. if (labelPos == 0) {
  140. written += ArcSaveOffset(arc->LeftOffset, os);
  141. written += ArcSaveOffset(arc->RightOffset, os);
  142. }
  143. }
  144. written += this->NodeSaveLeafValue(arc->Node, os);
  145. return written;
  146. }
  147. };
  148. };
  149. template <class T>
  150. struct TPatternMatch {
  151. ui64 End;
  152. T Data;
  153. TPatternMatch(ui64 end, const T& data)
  154. : End(end)
  155. , Data(data)
  156. {
  157. }
  158. };
  159. template <class T = char, class D = ui64, class S = TCompactTriePacker<D>>
  160. class TCompactPatternSearcher {
  161. public:
  162. typedef T TSymbol;
  163. typedef D TData;
  164. typedef S TPacker;
  165. typedef typename TCompactTrieKeySelector<TSymbol>::TKey TKey;
  166. typedef typename TCompactTrieKeySelector<TSymbol>::TKeyBuf TKeyBuf;
  167. typedef TCompactTrie<TSymbol, TData, TPacker> TTrie;
  168. public:
  169. TCompactPatternSearcher()
  170. {
  171. }
  172. explicit TCompactPatternSearcher(const TBlob& data)
  173. : Trie(data)
  174. {
  175. }
  176. TCompactPatternSearcher(const char* data, size_t size)
  177. : Trie(data, size)
  178. {
  179. }
  180. TVector<TPatternMatch<TData>> SearchMatches(const TSymbol* text, size_t textSize) const;
  181. TVector<TPatternMatch<TData>> SearchMatches(const TKeyBuf& text) const {
  182. return SearchMatches(text.data(), text.size());
  183. }
  184. private:
  185. TTrie Trie;
  186. };
  187. ////////////////////
  188. // Implementation //
  189. ////////////////////
  190. namespace {
  191. template <class TData, class TPacker>
  192. char ReadNode(
  193. char* nodeStart,
  194. char*& leftSibling,
  195. char*& rightSibling,
  196. char*& directChild,
  197. NCompactTrie::TSuffixLink*& suffixLink,
  198. TPacker packer = TPacker()
  199. ) {
  200. char* dataPos = nodeStart;
  201. char flags = *(dataPos++);
  202. Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
  203. char label = *(dataPos++);
  204. suffixLink = (NCompactTrie::TSuffixLink*)dataPos;
  205. dataPos += sizeof(NCompactTrie::TSuffixLink);
  206. { // Left branch
  207. size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
  208. size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
  209. leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
  210. dataPos += offsetLength;
  211. }
  212. { // Right branch
  213. size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
  214. size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
  215. rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
  216. dataPos += offsetLength;
  217. }
  218. directChild = nullptr;
  219. if (flags & NCompactTrie::MT_NEXT) {
  220. directChild = dataPos;
  221. if (flags & NCompactTrie::MT_FINAL) {
  222. directChild += packer.SkipLeaf(directChild);
  223. }
  224. }
  225. return label;
  226. }
  227. template <class TData, class TPacker>
  228. char ReadNodeConst(
  229. const char* nodeStart,
  230. const char*& leftSibling,
  231. const char*& rightSibling,
  232. const char*& directChild,
  233. const char*& data,
  234. NCompactTrie::TSuffixLink& suffixLink,
  235. TPacker packer = TPacker()
  236. ) {
  237. const char* dataPos = nodeStart;
  238. char flags = *(dataPos++);
  239. Y_ASSERT(!NCompactTrie::IsEpsilonLink(flags)); // Epsilon links are not allowed
  240. char label = *(dataPos++);
  241. suffixLink = *((NCompactTrie::TSuffixLink*)dataPos);
  242. dataPos += sizeof(NCompactTrie::TSuffixLink);
  243. { // Left branch
  244. size_t offsetLength = NCompactTrie::LeftOffsetLen(flags);
  245. size_t leftOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
  246. leftSibling = leftOffset ? (nodeStart + leftOffset) : nullptr;
  247. dataPos += offsetLength;
  248. }
  249. { // Right branch
  250. size_t offsetLength = NCompactTrie::RightOffsetLen(flags);
  251. size_t rightOffset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
  252. rightSibling = rightOffset ? (nodeStart + rightOffset) : nullptr;
  253. dataPos += offsetLength;
  254. }
  255. data = nullptr;
  256. if (flags & NCompactTrie::MT_FINAL) {
  257. data = dataPos;
  258. }
  259. directChild = nullptr;
  260. if (flags & NCompactTrie::MT_NEXT) {
  261. directChild = dataPos;
  262. if (flags & NCompactTrie::MT_FINAL) {
  263. directChild += packer.SkipLeaf(directChild);
  264. }
  265. }
  266. return label;
  267. }
  268. Y_FORCE_INLINE bool Advance(
  269. const char*& dataPos,
  270. const char* const dataEnd,
  271. char label
  272. ) {
  273. if (dataPos == nullptr) {
  274. return false;
  275. }
  276. while (dataPos < dataEnd) {
  277. size_t offsetLength, offset;
  278. const char* startPos = dataPos;
  279. char flags = *(dataPos++);
  280. char symbol = *(dataPos++);
  281. dataPos += sizeof(NCompactTrie::TSuffixLink);
  282. // Left branch
  283. offsetLength = NCompactTrie::LeftOffsetLen(flags);
  284. if ((unsigned char)label < (unsigned char)symbol) {
  285. offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
  286. if (!offset)
  287. break;
  288. dataPos = startPos + offset;
  289. continue;
  290. }
  291. dataPos += offsetLength;
  292. // Right branch
  293. offsetLength = NCompactTrie::RightOffsetLen(flags);
  294. if ((unsigned char)label > (unsigned char)symbol) {
  295. offset = NCompactTrie::UnpackOffset(dataPos, offsetLength);
  296. if (!offset)
  297. break;
  298. dataPos = startPos + offset;
  299. continue;
  300. }
  301. dataPos = startPos;
  302. return true;
  303. }
  304. // if we got here, we're past the dataend - bail out ASAP
  305. dataPos = nullptr;
  306. return false;
  307. }
  308. } // anonymous
  309. template <class T, class D, class S>
  310. void TCompactPatternSearcherBuilder<T, D, S>::CalculateSuffixLinks(
  311. char* trieStart,
  312. const char* trieEnd
  313. ) const {
  314. struct TBfsElement {
  315. char* Node;
  316. const char* Parent;
  317. TBfsElement(char* node, const char* parent)
  318. : Node(node)
  319. , Parent(parent)
  320. {
  321. }
  322. };
  323. TDeque<TBfsElement> bfsQueue;
  324. if (trieStart && trieStart != trieEnd) {
  325. bfsQueue.emplace_back(trieStart, nullptr);
  326. }
  327. while (!bfsQueue.empty()) {
  328. auto front = bfsQueue.front();
  329. char* node = front.Node;
  330. const char* parent = front.Parent;
  331. bfsQueue.pop_front();
  332. char* leftSibling;
  333. char* rightSibling;
  334. char* directChild;
  335. NCompactTrie::TSuffixLink* suffixLink;
  336. char label = ReadNode<TData, TPacker>(
  337. node,
  338. leftSibling,
  339. rightSibling,
  340. directChild,
  341. suffixLink
  342. );
  343. const char* suffix;
  344. if (parent == nullptr) {
  345. suffix = node;
  346. } else {
  347. const char* parentOfSuffix = parent;
  348. const char* temp;
  349. do {
  350. NCompactTrie::TSuffixLink parentOfSuffixSuffixLink;
  351. ReadNodeConst<TData, TPacker>(
  352. parentOfSuffix,
  353. /*left*/temp,
  354. /*right*/temp,
  355. /*direct*/temp,
  356. /*data*/temp,
  357. parentOfSuffixSuffixLink
  358. );
  359. if (parentOfSuffixSuffixLink.NextSuffixOffset == 0) {
  360. suffix = trieStart;
  361. if (!Advance(suffix, trieEnd, label)) {
  362. suffix = node;
  363. }
  364. break;
  365. }
  366. parentOfSuffix += parentOfSuffixSuffixLink.NextSuffixOffset;
  367. NCompactTrie::TSuffixLink tempSuffixLink;
  368. ReadNodeConst<TData, TPacker>(
  369. parentOfSuffix,
  370. /*left*/temp,
  371. /*right*/temp,
  372. /*direct*/suffix,
  373. /*data*/temp,
  374. tempSuffixLink
  375. );
  376. if (suffix == nullptr) {
  377. continue;
  378. }
  379. } while (!Advance(suffix, trieEnd, label));
  380. }
  381. suffixLink->NextSuffixOffset = suffix - node;
  382. NCompactTrie::TSuffixLink suffixSuffixLink;
  383. const char* suffixData;
  384. const char* temp;
  385. ReadNodeConst<TData, TPacker>(
  386. suffix,
  387. /*left*/temp,
  388. /*right*/temp,
  389. /*direct*/temp,
  390. suffixData,
  391. suffixSuffixLink
  392. );
  393. suffixLink->NextSuffixWithDataOffset = suffix - node;
  394. if (suffixData == nullptr) {
  395. suffixLink->NextSuffixWithDataOffset += suffixSuffixLink.NextSuffixWithDataOffset;
  396. }
  397. if (directChild) {
  398. bfsQueue.emplace_back(directChild, node);
  399. }
  400. if (leftSibling) {
  401. bfsQueue.emplace_front(leftSibling, parent);
  402. }
  403. if (rightSibling) {
  404. bfsQueue.emplace_front(rightSibling, parent);
  405. }
  406. }
  407. }
  408. template<class T, class D, class S>
  409. TVector<TPatternMatch<D>> TCompactPatternSearcher<T, D, S>::SearchMatches(
  410. const TSymbol* text,
  411. size_t textSize
  412. ) const {
  413. const char* temp;
  414. NCompactTrie::TSuffixLink tempSuffixLink;
  415. const auto& trieData = Trie.Data();
  416. const char* trieStart = trieData.AsCharPtr();
  417. size_t dataSize = trieData.Length();
  418. const char* trieEnd = trieStart + dataSize;
  419. const char* lastNode = nullptr;
  420. const char* currentSubtree = trieStart;
  421. TVector<TPatternMatch<TData>> matches;
  422. for (const TSymbol* position = text; position < text + textSize; ++position) {
  423. TSymbol symbol = *position;
  424. for (i64 i = (i64)NCompactTrie::ExtraBits<TSymbol>(); i >= 0; i -= 8) {
  425. char label = (char)(symbol >> i);
  426. // Find first suffix extendable by label
  427. while (true) {
  428. const char* nextLastNode = currentSubtree;
  429. if (Advance(nextLastNode, trieEnd, label)) {
  430. lastNode = nextLastNode;
  431. ReadNodeConst<TData, TPacker>(
  432. lastNode,
  433. /*left*/temp,
  434. /*right*/temp,
  435. currentSubtree,
  436. /*data*/temp,
  437. tempSuffixLink
  438. );
  439. break;
  440. } else {
  441. if (lastNode == nullptr) {
  442. break;
  443. }
  444. }
  445. NCompactTrie::TSuffixLink suffixLink;
  446. ReadNodeConst<TData, TPacker>(
  447. lastNode,
  448. /*left*/temp,
  449. /*right*/temp,
  450. /*direct*/temp,
  451. /*data*/temp,
  452. suffixLink
  453. );
  454. if (suffixLink.NextSuffixOffset == 0) {
  455. lastNode = nullptr;
  456. currentSubtree = trieStart;
  457. continue;
  458. }
  459. lastNode += suffixLink.NextSuffixOffset;
  460. ReadNodeConst<TData, TPacker>(
  461. lastNode,
  462. /*left*/temp,
  463. /*right*/temp,
  464. currentSubtree,
  465. /*data*/temp,
  466. tempSuffixLink
  467. );
  468. }
  469. // Iterate through all suffixes
  470. const char* suffix = lastNode;
  471. while (suffix != nullptr) {
  472. const char* nodeData;
  473. NCompactTrie::TSuffixLink suffixLink;
  474. ReadNodeConst<TData, TPacker>(
  475. suffix,
  476. /*left*/temp,
  477. /*right*/temp,
  478. /*direct*/temp,
  479. nodeData,
  480. suffixLink
  481. );
  482. if (nodeData != nullptr) {
  483. TData data;
  484. Trie.GetPacker().UnpackLeaf(nodeData, data);
  485. matches.emplace_back(
  486. position - text,
  487. data
  488. );
  489. }
  490. if (suffixLink.NextSuffixOffset == 0) {
  491. break;
  492. }
  493. suffix += suffixLink.NextSuffixWithDataOffset;
  494. }
  495. }
  496. }
  497. return matches;
  498. }