reader.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. #pragma once
  2. #include <util/generic/deque.h>
  3. #include <util/generic/strbuf.h>
  4. #include <util/generic/yexception.h>
  5. #include <util/memory/blob.h>
  6. #include <util/stream/buffer.h>
  7. #include <util/stream/mem.h>
  8. #include <util/system/unaligned_mem.h>
  9. #include <utility>
  10. #include <library/cpp/on_disk/chunks/chunked_helpers.h>
  11. #include "common.h"
  12. template <class O>
  13. class TAhoSearchResult: public TDeque<std::pair<ui32, O>> {
  14. };
  15. /*
  16. * Mapped-declaraion
  17. */
  18. template <class O>
  19. class TMappedDefaultOutputContainer {
  20. private:
  21. TGeneralVector<O> List_;
  22. public:
  23. TMappedDefaultOutputContainer(const char* data)
  24. : List_(TBlob::NoCopy(data, (size_t)-1))
  25. {
  26. }
  27. bool IsEmpty() const {
  28. return List_.GetSize() == 0;
  29. }
  30. void FillAnswer(TAhoSearchResult<O>& answer, ui32 pos) const {
  31. for (ui32 i = 0; i < List_.GetSize(); ++i) {
  32. answer.push_back(std::make_pair(pos, O()));
  33. List_.Get(i, answer.back().second);
  34. }
  35. }
  36. size_t CheckData() const {
  37. return List_.RealSize();
  38. }
  39. };
  40. template <class O>
  41. class TMappedSingleOutputContainer {
  42. const ui32* Data;
  43. ui32 Size() const {
  44. return ReadUnaligned<ui32>(Data);
  45. }
  46. public:
  47. TMappedSingleOutputContainer(const char* data)
  48. : Data((const ui32*)data)
  49. {
  50. }
  51. bool IsEmpty() const {
  52. return Size() == 0;
  53. }
  54. void FillAnswer(TAhoSearchResult<O>& answer, ui32 pos) const {
  55. if (!IsEmpty()) {
  56. answer.push_back(std::make_pair(pos, O()));
  57. TMemoryInput input(Data + 1, Size());
  58. TSaveLoadVectorNonPodElement<O>::Load(&input, answer.back().second, Size());
  59. }
  60. }
  61. size_t CheckData() const {
  62. return sizeof(ui32) + ReadUnaligned<ui32>(Data);
  63. }
  64. };
  65. template <class TStringType, class O, class C>
  66. class TMappedAhoCorasick;
  67. template <typename TKey, typename TValue>
  68. class TEmptyMapData : TNonCopyable {
  69. private:
  70. TBufferStream Stream;
  71. public:
  72. const char* P;
  73. TEmptyMapData() {
  74. TPlainHashWriter<TKey, TValue> hash;
  75. hash.Save(Stream);
  76. P = Stream.Buffer().Begin();
  77. }
  78. };
  79. /*
  80. * каждая вершина имеет свой ui32-номер
  81. * блок данных для вершины:
  82. * ui32, ui32, ui32, ui32, степень*char, данные контейнера
  83. * fail, suff, степень, самый левый сын, лексикографический список меток исходящих рёбер.
  84. * если степень нулевая, то в блоке только 3 инта
  85. */
  86. template <class TStringType, class O, class C>
  87. class TMappedAhoVertex {
  88. public:
  89. typedef typename TStringType::value_type TCharType;
  90. friend class TMappedAhoCorasick<TStringType, O, C>;
  91. private:
  92. const char* Data;
  93. typedef TPlainHash<TCharType, ui32> TGotoMap;
  94. TGotoMap GotoMap;
  95. static const TEmptyMapData<TCharType, ui32> EmptyData;
  96. static const size_t GENERAL_SHIFT = 3 * sizeof(ui32);
  97. private:
  98. const ui32* DataAsInt() const {
  99. return (const ui32*)Data;
  100. }
  101. ui32 Power() const {
  102. return ReadUnaligned<ui32>(DataAsInt() + 2);
  103. }
  104. protected:
  105. const C Output() const {
  106. return C(Power() ? GotoMap.ByteEnd() : Data + GENERAL_SHIFT);
  107. }
  108. ui32 Fail() const {
  109. return ReadUnaligned<ui32>(DataAsInt());
  110. }
  111. ui32 Suffix() const {
  112. return ReadUnaligned<ui32>(DataAsInt() + 1);
  113. }
  114. bool GotoFunction(const TCharType c, ui32* result) const {
  115. if (0 == Power())
  116. return false;
  117. return GotoMap.Find(c, result);
  118. }
  119. bool operator==(const TMappedAhoVertex& rhs) const {
  120. return Data == rhs.Data;
  121. }
  122. size_t CheckData(ui32 totalVertices) const; /// throws yexception in case of bad data
  123. public:
  124. TMappedAhoVertex(const char* data)
  125. : Data(data)
  126. , GotoMap(Power() ? (Data + GENERAL_SHIFT) : EmptyData.P)
  127. {
  128. }
  129. };
  130. /*
  131. * блок данных для бора:
  132. * количество вершин N, ui32
  133. * суммарный размер блоков для вершин, ui32
  134. * блоки данных для каждой вершины
  135. * отображение id->offset для блока вершины id, N*ui32
  136. */
  137. template <class TStringType, class O, class C = TMappedDefaultOutputContainer<O>>
  138. class TMappedAhoCorasick : TNonCopyable {
  139. public:
  140. typedef TAhoSearchResult<O> TSearchResult;
  141. typedef TMappedAhoVertex<TStringType, O, C> TAhoVertexType;
  142. typedef typename TStringType::value_type TCharType;
  143. typedef TBasicStringBuf<TCharType> TSample;
  144. private:
  145. const TBlob Blob;
  146. const char* const AhoVertexes;
  147. const ui32 VertexAmount;
  148. const ui32* const Id2Offset;
  149. const TAhoVertexType Root;
  150. private:
  151. bool ValidVertex(ui32 id) const {
  152. return id < VertexAmount;
  153. }
  154. TAhoVertexType GetVertexAt(ui32 id) const {
  155. if (!ValidVertex(id))
  156. ythrow yexception() << "TMappedAhoCorasick fatal error: invalid id " << id;
  157. return TAhoVertexType(AhoVertexes + Id2Offset[id]);
  158. }
  159. public:
  160. TMappedAhoCorasick(const TBlob& blob)
  161. : Blob(blob)
  162. , AhoVertexes(GetBlock(blob, 1).AsCharPtr())
  163. , VertexAmount(TSingleValue<ui32>(GetBlock(blob, 2)).Get())
  164. , Id2Offset((const ui32*)(GetBlock(Blob, 3).AsCharPtr()))
  165. , Root(GetVertexAt(0))
  166. {
  167. {
  168. const ui32 version = TSingleValue<ui32>(GetBlock(blob, 0)).Get();
  169. if (version != TAhoCorasickCommon::GetVersion())
  170. ythrow yexception() << "Unknown version " << version << " instead of " << TAhoCorasickCommon::GetVersion();
  171. }
  172. {
  173. TChunkedDataReader reader(blob);
  174. if (reader.GetBlocksCount() != TAhoCorasickCommon::GetBlockCount())
  175. ythrow yexception() << "wrong block count " << reader.GetBlocksCount();
  176. }
  177. }
  178. bool AhoContains(const TSample& str) const;
  179. TSearchResult AhoSearch(const TSample& str) const;
  180. void AhoSearch(const TSample& str, TSearchResult* result) const;
  181. size_t CheckData() const; /// throws yexception in case of bad data
  182. };
  183. using TSimpleMappedAhoCorasick = TMappedAhoCorasick<TString, ui32, TMappedSingleOutputContainer<ui32>>;
  184. using TDefaultMappedAhoCorasick = TMappedAhoCorasick<TString, ui32>;
  185. /*
  186. * Mapped-implementation
  187. */
  188. template <class TStringType, class O, class C>
  189. bool TMappedAhoCorasick<TStringType, O, C>::AhoContains(const TSample& str) const {
  190. TAhoVertexType current = Root;
  191. const size_t len = str.size();
  192. for (size_t i = 0; i < len; ++i) {
  193. bool outer = false;
  194. ui32 gotoVertex;
  195. while (!current.GotoFunction(str[i], &gotoVertex)) {
  196. if (current == Root) { /// nowhere to go
  197. outer = true;
  198. break;
  199. }
  200. current = GetVertexAt(current.Fail());
  201. }
  202. if (outer)
  203. continue;
  204. current = GetVertexAt(gotoVertex);
  205. TAhoVertexType v = current;
  206. while (true) {
  207. if (!v.Output().IsEmpty())
  208. return true;
  209. if ((ui32)-1 == v.Suffix())
  210. break;
  211. v = GetVertexAt(v.Suffix());
  212. }
  213. }
  214. return false;
  215. }
  216. template <class TStringType, class O, class C>
  217. void TMappedAhoCorasick<TStringType, O, C>::AhoSearch(const TSample& str, typename TMappedAhoCorasick<TStringType, O, C>::TSearchResult* answer) const {
  218. answer->clear();
  219. TAhoVertexType current = Root;
  220. const size_t len = str.length();
  221. for (size_t i = 0; i < len; ++i) {
  222. bool outer = false;
  223. ui32 gotoVertex;
  224. while (!current.GotoFunction(str[i], &gotoVertex)) {
  225. if (current == Root) { /// nowhere to go
  226. outer = true;
  227. break;
  228. }
  229. current = GetVertexAt(current.Fail());
  230. }
  231. if (outer)
  232. continue;
  233. current = GetVertexAt(gotoVertex);
  234. TAhoVertexType v = current;
  235. while (true) {
  236. v.Output().FillAnswer(*answer, (ui32)i);
  237. if ((ui32)-1 == v.Suffix())
  238. break;
  239. v = GetVertexAt(v.Suffix());
  240. }
  241. }
  242. }
  243. template <class TStringType, class O, class C>
  244. typename TMappedAhoCorasick<TStringType, O, C>::TSearchResult TMappedAhoCorasick<TStringType, O, C>::AhoSearch(const TSample& str) const {
  245. TAhoSearchResult<O> answer;
  246. AhoSearch(str, &answer);
  247. return answer;
  248. }
  249. /*
  250. * implementation of CheckData in Mapped-classes
  251. */
  252. static inline void CheckRange(ui32 id, ui32 strictUpperBound) {
  253. if (id >= strictUpperBound) {
  254. throw yexception() << id << " of " << strictUpperBound << " - index is invalid";
  255. }
  256. }
  257. template <class TStringType, class O, class C>
  258. const TEmptyMapData<typename TStringType::value_type, ui32> TMappedAhoVertex<TStringType, O, C>::EmptyData;
  259. template <class TStringType, class O, class C>
  260. size_t TMappedAhoVertex<TStringType, O, C>::CheckData(ui32 totalVertices) const {
  261. size_t bytesNeeded = GENERAL_SHIFT;
  262. CheckRange(Fail(), totalVertices);
  263. if (Suffix() != (ui32)(-1))
  264. CheckRange(Suffix(), totalVertices);
  265. if (Power()) {
  266. for (typename TGotoMap::TConstIterator toItem = GotoMap.Begin(); toItem != GotoMap.End(); ++toItem)
  267. CheckRange(toItem->Second(), totalVertices);
  268. bytesNeeded += GotoMap.ByteSize();
  269. }
  270. bytesNeeded += Output().CheckData();
  271. return bytesNeeded;
  272. }
  273. template <class TStringType, class O, class C>
  274. size_t TMappedAhoCorasick<TStringType, O, C>::CheckData() const {
  275. try {
  276. size_t bytesNeeded = 0;
  277. for (ui32 id = 0; id < VertexAmount; ++id) {
  278. if (Id2Offset[id] != bytesNeeded) {
  279. ythrow yexception() << "wrong offset[" << id << "]: " << Id2Offset[id];
  280. }
  281. bytesNeeded += GetVertexAt(id).CheckData(VertexAmount);
  282. }
  283. bytesNeeded += VertexAmount * sizeof(ui32);
  284. const size_t realsize = GetBlock(Blob, 1).Size() + GetBlock(Blob, 3).Size();
  285. if (realsize != bytesNeeded) {
  286. ythrow yexception() << "extra information " << bytesNeeded << " " << realsize;
  287. }
  288. return bytesNeeded;
  289. } catch (const yexception& e) {
  290. ythrow yexception() << "Bad data: " << e.what();
  291. }
  292. }