writer.h 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. #pragma once
  2. #include <util/generic/deque.h>
  3. #include <util/generic/hash.h>
  4. #include <util/generic/string.h>
  5. #include <util/generic/vector.h>
  6. #include <util/generic/yexception.h>
  7. #include <util/stream/buffer.h>
  8. #include <util/memory/blob.h>
  9. #include <utility>
  10. #include <library/cpp/on_disk/chunks/writer.h>
  11. #include <library/cpp/on_disk/chunks/chunked_helpers.h>
  12. #include "common.h"
  13. template <class I>
  14. struct meta_iterator_pair {
  15. typedef typename std::iterator_traits<I>::value_type::first_type first_type;
  16. typedef typename std::iterator_traits<I>::value_type::second_type second_type;
  17. };
  18. /*
  19. * Builder implementation
  20. */
  21. /*
  22. * Builder declaration
  23. */
  24. template <class O>
  25. class TDefaultContainerBuilder {
  26. TGeneralVectorWriter<O> List_;
  27. public:
  28. void AddOut(const O& o) {
  29. List_.PushBack(o);
  30. }
  31. bool IsEmpty() const {
  32. return List_.Size() == 0;
  33. }
  34. void SaveContent(IOutputStream* stream) const {
  35. List_.Save(*stream);
  36. }
  37. };
  38. template <class O>
  39. class TSingleContainerBuilder {
  40. bool Empty;
  41. O Out_;
  42. public:
  43. TSingleContainerBuilder()
  44. : Empty(true)
  45. {
  46. }
  47. void AddOut(const O& o) {
  48. Empty = false;
  49. Out_ = o;
  50. }
  51. bool IsEmpty() const {
  52. return Empty;
  53. }
  54. void SaveContent(IOutputStream* stream) const {
  55. if (IsEmpty()) {
  56. WriteBin<ui32>(stream, 0);
  57. } else {
  58. TBuffer buf;
  59. {
  60. TBufferOutput tempStream(buf);
  61. TSaveLoadVectorNonPodElement<O>::Save(&tempStream, Out_);
  62. }
  63. WriteBin<ui32>(stream, buf.Size());
  64. stream->Write(buf.Data(), buf.Size());
  65. }
  66. }
  67. };
  68. template <class TStringType, class O, class C>
  69. class TAhoCorasickBuilder;
  70. template <class TStringType, class O, class C>
  71. class TAhoVertex : TNonCopyable {
  72. typedef TAhoVertex<TStringType, O, C> TMyself;
  73. typedef TAhoCorasickBuilder<TStringType, O, C> TParent;
  74. typedef typename TStringType::value_type TCharType;
  75. friend class TAhoCorasickBuilder<TStringType, O, C>;
  76. private:
  77. typedef THashMap<TCharType, TMyself*> TGotoMap;
  78. TGotoMap GotoMap_;
  79. C Output_;
  80. TMyself* FailVertex_;
  81. TMyself* SuffixVertex_;
  82. protected:
  83. const C& Output() const {
  84. return Output_;
  85. }
  86. void AddVertex(TMyself* v, TCharType c) {
  87. GotoMap_.insert(std::make_pair(c, v));
  88. }
  89. TMyself* Fail() const {
  90. return FailVertex_;
  91. }
  92. TMyself* Suffix() const {
  93. return SuffixVertex_;
  94. }
  95. TMyself* GotoFunction(const TCharType c) {
  96. typename TGotoMap::iterator it;
  97. it = GotoMap_.find(c);
  98. return it != GotoMap_.end() ? it->second : nullptr;
  99. }
  100. TMyself const* GotoFunction(const TCharType c) const {
  101. typename TGotoMap::const_iterator it = GotoMap_.find(c);
  102. return it != GotoMap_.end() ? it->second : NULL;
  103. }
  104. TMyself* AddString(TParent* ahoCorasick, const TStringType& s, const ui32 position, const O& o) {
  105. if (position >= s.size()) {
  106. Output_.AddOut(o);
  107. return nullptr;
  108. } else {
  109. TCharType c = s[position];
  110. TMyself* v = GotoFunction(c);
  111. if (!v) {
  112. v = ahoCorasick->CreateAhoVertex();
  113. AddVertex(v, c);
  114. }
  115. return v;
  116. }
  117. }
  118. void SetFail(TMyself* v) {
  119. FailVertex_ = v;
  120. }
  121. void SetSuffix(TMyself* v) {
  122. SuffixVertex_ = v;
  123. }
  124. const TGotoMap& GotoMap() const {
  125. return GotoMap_;
  126. }
  127. public:
  128. TAhoVertex()
  129. : FailVertex_(nullptr)
  130. , SuffixVertex_(nullptr)
  131. {
  132. }
  133. };
  134. template <class TStringType, class O, class C = TDefaultContainerBuilder<O>>
  135. class TAhoCorasickBuilder : TNonCopyable {
  136. public:
  137. typedef TAhoVertex<TStringType, O, C> TAhoVertexType;
  138. typedef typename TStringType::value_type TCharType;
  139. friend class TAhoVertex<TStringType, O, C>;
  140. friend class TTestMappedAhoCorasick;
  141. private:
  142. TDeque<TAhoVertexType*> AhoVertexes;
  143. private:
  144. TAhoVertexType* GetRoot() {
  145. return AhoVertexes.front();
  146. }
  147. TAhoVertexType const* GetRoot() const {
  148. return AhoVertexes.front();
  149. }
  150. TAhoVertexType* CreateAhoVertex() {
  151. AhoVertexes.push_back(new TAhoVertexType());
  152. return AhoVertexes.back();
  153. }
  154. void ConstructFail();
  155. public:
  156. TAhoCorasickBuilder()
  157. : AhoVertexes(1, new TAhoVertexType())
  158. {
  159. }
  160. ~TAhoCorasickBuilder() {
  161. for (size_t i = 0; i < AhoVertexes.size(); ++i) {
  162. delete AhoVertexes[i];
  163. }
  164. }
  165. void AddString(const TStringType& s, const O& value) {
  166. TAhoVertexType* c = GetRoot();
  167. for (ui32 i = 0; i <= s.size(); ++i) {
  168. c = c->AddString(this, s, i, value);
  169. }
  170. }
  171. const TBlob Save();
  172. const TBlob AtomicSave();
  173. void SaveToStream(IOutputStream* stream);
  174. };
  175. using TSimpleAhoCorasickBuilder = TAhoCorasickBuilder<TString, ui32, TSingleContainerBuilder<ui32>>;
  176. using TDefaultAhoCorasickBuilder = TAhoCorasickBuilder<TString, ui32>;
  177. template <class AhoCorasick, class Iterator>
  178. const TBlob BuildAho(AhoCorasick& ahoCorasick, Iterator begin, Iterator end) {
  179. for (Iterator it = begin; it != end; ++it)
  180. ahoCorasick.AddString(*it, it->size());
  181. return ahoCorasick.Save();
  182. }
  183. template <class TStringType, class Iterator>
  184. const TBlob BuildAhoIndex(TAhoCorasickBuilder<TStringType, ui32>& ahoCorasick, Iterator begin, Iterator end) {
  185. ui32 index = 0;
  186. for (Iterator it = begin; it != end; ++it, ++index)
  187. ahoCorasick.AddString(*it, index);
  188. return ahoCorasick.Save();
  189. }
  190. template <class TStringType, class Iterator>
  191. const TBlob BuildAhoObject(TAhoCorasickBuilder<TStringType, typename meta_iterator_pair<Iterator>::second_type>& ahoCorasick, Iterator begin, Iterator end) {
  192. for (Iterator it = begin; it != end; ++it)
  193. ahoCorasick.AddString(it->first, it->second);
  194. return ahoCorasick.Save();
  195. }
  196. template <class TStringType, class O, class C>
  197. void TAhoCorasickBuilder<TStringType, O, C>::ConstructFail() {
  198. TAhoVertexType* root = GetRoot();
  199. root->SetFail(root);
  200. TDeque<TAhoVertexType*> q;
  201. typename TAhoVertexType::TGotoMap::const_iterator it;
  202. for (it = root->GotoMap().begin(); it != root->GotoMap().end(); ++it) {
  203. TAhoVertexType* v = it->second;
  204. v->SetFail(root);
  205. q.push_back(v);
  206. }
  207. while (!q.empty()) {
  208. TAhoVertexType* c = q.front();
  209. q.pop_front();
  210. for (it = c->GotoMap().begin(); it != c->GotoMap().end(); ++it) {
  211. TAhoVertexType* v = it->second;
  212. TCharType a = it->first;
  213. q.push_back(v);
  214. TAhoVertexType* h = c->Fail();
  215. bool outer = false;
  216. while (!h->GotoFunction(a)) {
  217. if (h->Fail() == h) {
  218. v->SetFail(h);
  219. outer = true;
  220. break;
  221. }
  222. h = h->Fail();
  223. }
  224. if (outer)
  225. continue;
  226. TAhoVertexType* fail = h->GotoFunction(a);
  227. v->SetFail(fail);
  228. if (!fail->Output().IsEmpty())
  229. v->SetSuffix(fail);
  230. else
  231. v->SetSuffix(fail->Suffix());
  232. }
  233. }
  234. }
  235. template <class TStringType, class O, class C>
  236. void TAhoCorasickBuilder<TStringType, O, C>::SaveToStream(IOutputStream* out) {
  237. ConstructFail(); /// the reason of non-const declaration
  238. Y_ASSERT(AhoVertexes.size() < Max<ui32>());
  239. const ui32 vertexAmount = (ui32)AhoVertexes.size();
  240. TChunkedDataWriter writer(*out);
  241. {
  242. TSingleValueWriter<ui32> versionWriter(TAhoCorasickCommon::GetVersion());
  243. WriteBlock(writer, versionWriter);
  244. }
  245. writer.NewBlock();
  246. TVector<TAhoVertexType const*> q(1, GetRoot());
  247. THashMap<TAhoVertexType const*, ui32> vertex2id(vertexAmount + 1);
  248. TVector<ui32> id2offset(vertexAmount, 0);
  249. TAhoVertexType* vt = nullptr;
  250. vertex2id[vt] = (ui32)-1;
  251. q.reserve(vertexAmount);
  252. for (ui32 curId = 0; curId < vertexAmount; ++curId) {
  253. TAhoVertexType const* c = q[curId];
  254. vertex2id[c] = curId;
  255. id2offset[curId] = (ui32)writer.GetCurrentBlockOffset();
  256. WriteBin<ui32>(&writer, vertex2id[c->Fail()]);
  257. WriteBin<ui32>(&writer, vertex2id[c->Suffix()]);
  258. typedef TVector<std::pair<TCharType, TAhoVertexType const*>> TChildren;
  259. TChildren children(c->GotoMap().begin(), c->GotoMap().end());
  260. WriteBin<ui32>(&writer, static_cast<ui32>(children.size()));
  261. if (!children.empty()) {
  262. TPlainHashWriter<TCharType, ui32> hashWriter;
  263. const ui32 id = static_cast<ui32>(q.size());
  264. for (size_t i = 0; i < children.size(); ++i) {
  265. hashWriter.Add(children[i].first, ui32(id + i));
  266. q.push_back(children[i].second);
  267. }
  268. hashWriter.Save(writer);
  269. }
  270. c->Output().SaveContent(&writer);
  271. }
  272. {
  273. Y_ASSERT(id2offset.size() < Max<ui32>());
  274. TSingleValueWriter<ui32> lenWriter((ui32)id2offset.size());
  275. WriteBlock(writer, lenWriter);
  276. }
  277. writer.NewBlock();
  278. writer.Write((const char*)id2offset.data(), id2offset.size() * sizeof(ui32));
  279. writer.WriteFooter();
  280. Y_ASSERT(TAhoCorasickCommon::GetBlockCount() == writer.GetBlockCount());
  281. }
  282. template <class TStringType, class O, class C>
  283. const TBlob TAhoCorasickBuilder<TStringType, O, C>::Save() {
  284. TBufferStream buffer;
  285. SaveToStream(&buffer);
  286. return TBlob::FromStream(buffer);
  287. }
  288. template <class TStringType, class O, class C>
  289. const TBlob TAhoCorasickBuilder<TStringType, O, C>::AtomicSave() {
  290. TBufferStream buffer;
  291. SaveToStream(&buffer);
  292. return TBlob::FromStream(buffer);
  293. }