main.cpp 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. #include <library/cpp/testing/benchmark/bench.h>
  2. #include <library/cpp/containers/comptrie/comptrie_trie.h>
  3. #include <library/cpp/containers/comptrie/comptrie_builder.h>
  4. #include <library/cpp/containers/comptrie/search_iterator.h>
  5. #include <library/cpp/containers/comptrie/pattern_searcher.h>
  6. #include <library/cpp/on_disk/aho_corasick/writer.h>
  7. #include <library/cpp/on_disk/aho_corasick/reader.h>
  8. #include <library/cpp/on_disk/aho_corasick/helpers.h>
  9. #include <library/cpp/containers/dense_hash/dense_hash.h>
  10. #include <util/stream/file.h>
  11. #include <util/generic/algorithm.h>
  12. #include <util/random/fast.h>
  13. #include <util/random/shuffle.h>
  14. /////////////////
  15. // COMMON DATA //
  16. /////////////////
  17. const size_t MAX_PATTERN_LENGTH = 11;
  18. TVector<TString> letters = {
  19. "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й",
  20. "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф",
  21. "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я"
  22. };
  23. TString GenerateOneString(
  24. TFastRng<ui64>& rng,
  25. size_t maxLength,
  26. const TVector<TString>& sequences
  27. ) {
  28. size_t length = rng.GenRand() % maxLength + 1;
  29. TString result;
  30. while (result.size() < length) {
  31. result += sequences[rng.GenRand() % sequences.size()];
  32. }
  33. return result;
  34. }
  35. TVector<TString> GenerateStrings(
  36. TFastRng<ui64>& rng,
  37. size_t num,
  38. size_t maxLength,
  39. const TVector<TString>& sequences
  40. ) {
  41. TVector<TString> strings;
  42. while (strings.size() < num) {
  43. strings.push_back(GenerateOneString(rng, maxLength, sequences));
  44. }
  45. return strings;
  46. }
  47. struct TDatasetInstance {
  48. TDatasetInstance(const TVector<TString>& sequences) {
  49. TFastRng<ui64> rng(0);
  50. TVector<TString> prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
  51. prefixes.push_back("");
  52. TVector<TString> roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences);
  53. TVector<TString> suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences);
  54. suffixes.push_back("");
  55. TVector<TString> dictionary;
  56. for (const auto& root : roots) {
  57. for (const auto& prefix : prefixes) {
  58. for (const auto& suffix : suffixes) {
  59. dictionary.push_back(prefix + root + suffix);
  60. Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH);
  61. }
  62. }
  63. }
  64. Shuffle(dictionary.begin(), dictionary.end());
  65. Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000);
  66. for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) {
  67. Samples.emplace_back();
  68. size_t wordsNum = rng.GenRand() % 10;
  69. for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) {
  70. if (wordIdx > 0) {
  71. Samples.back() += " ";
  72. }
  73. Samples.back() += dictionary[rng.GenRand() % dictionary.size()];
  74. }
  75. }
  76. }
  77. TString GetSample(size_t iteration) const {
  78. TFastRng<ui64> rng(iteration);
  79. return Samples[rng.GenRand() % Samples.size()];
  80. }
  81. TVector<TString> Patterns;
  82. TVector<TString> Samples;
  83. };
  84. static const TDatasetInstance dataset(letters);
  85. //////////////////////////
  86. // NEW PATTERN SEARCHER //
  87. //////////////////////////
  88. struct TPatternSearcherInstance {
  89. TPatternSearcherInstance() {
  90. TCompactPatternSearcherBuilder<char, ui32> builder;
  91. for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
  92. builder.Add(dataset.Patterns[patternId], patternId);
  93. }
  94. TBufferOutput buffer;
  95. builder.Save(buffer);
  96. Instance.Reset(
  97. new TCompactPatternSearcher<char, ui32>(
  98. buffer.Buffer().Data(),
  99. buffer.Buffer().Size()
  100. )
  101. );
  102. }
  103. THolder<TCompactPatternSearcher<char, ui32>> Instance;
  104. };
  105. static const TPatternSearcherInstance patternSearcherInstance;
  106. Y_CPU_BENCHMARK(PatternSearcher, iface) {
  107. TVector<TVector<std::pair<ui32, ui32>>> result;
  108. for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
  109. result.emplace_back();
  110. TString testString = dataset.GetSample(iteration);
  111. auto matches = patternSearcherInstance.Instance->SearchMatches(testString);
  112. for (auto& match : matches) {
  113. result.back().emplace_back(match.End, match.Data);
  114. }
  115. }
  116. }
  117. //////////////////////
  118. // OLD AHO CORASICK //
  119. //////////////////////
  120. struct TAhoCorasickInstance {
  121. TAhoCorasickInstance() {
  122. TAhoCorasickBuilder<TString, ui32> builder;
  123. for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
  124. builder.AddString(dataset.Patterns[patternId], patternId);
  125. }
  126. TBufferOutput buffer;
  127. builder.SaveToStream(&buffer);
  128. Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer())));
  129. }
  130. THolder<TDefaultMappedAhoCorasick> Instance;
  131. };
  132. static const TAhoCorasickInstance ahoCorasickInstance;
  133. Y_CPU_BENCHMARK(AhoCorasick, iface) {
  134. TVector<TDeque<std::pair<ui32, ui32>>> result;
  135. for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
  136. result.emplace_back();
  137. TString testString = dataset.GetSample(iteration);
  138. auto matches = ahoCorasickInstance.Instance->AhoSearch(testString);
  139. result.push_back(matches);
  140. }
  141. }
  142. ////////////////////////////////
  143. // COMPTRIE + SIMPLE MATCHING //
  144. ////////////////////////////////
  145. struct TCompactTrieInstance {
  146. TCompactTrieInstance() {
  147. TCompactTrieBuilder<char, ui32> builder;
  148. for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
  149. builder.Add(dataset.Patterns[patternId], patternId);
  150. }
  151. TBufferOutput buffer;
  152. CompactTrieMinimizeAndMakeFastLayout(buffer, builder);
  153. Instance.Reset(new TCompactTrie<char, ui32>(
  154. buffer.Buffer().Data(),
  155. buffer.Buffer().Size()
  156. ));
  157. }
  158. THolder<TCompactTrie<char, ui32>> Instance;
  159. };
  160. static const TCompactTrieInstance compactTrieInstance;
  161. Y_CPU_BENCHMARK(ComptrieSimple, iface) {
  162. TVector<TVector<std::pair<ui32, ui32>>> result;
  163. for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
  164. result.emplace_back();
  165. TString testString = dataset.GetSample(iteration);
  166. for (ui32 startPos = 0; startPos < testString.size(); ++startPos) {
  167. TSearchIterator<TCompactTrie<char, ui32>> iter(*(compactTrieInstance.Instance));
  168. for (ui32 position = startPos; position < testString.size(); ++position) {
  169. if (!iter.Advance(testString[position])) {
  170. break;
  171. }
  172. ui32 answer;
  173. if (iter.GetValue(&answer)) {
  174. result.back().emplace_back(position, answer);
  175. }
  176. }
  177. }
  178. }
  179. }
  180. ////////////////
  181. // DENSE_HASH //
  182. ////////////////
  183. struct TDenseHashInstance {
  184. TDenseHashInstance() {
  185. for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) {
  186. Instance[dataset.Patterns[patternId]] = patternId;
  187. }
  188. }
  189. TDenseHash<TString, ui32> Instance;
  190. };
  191. static const TDenseHashInstance denseHashInstance;
  192. Y_CPU_BENCHMARK(DenseHash, iface) {
  193. TVector<TVector<std::pair<ui32, ui32>>> result;
  194. for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) {
  195. result.emplace_back();
  196. TString testString = dataset.GetSample(iteration);
  197. for (size_t start = 0; start < testString.size(); ++start) {
  198. for (
  199. size_t length = 1;
  200. length <= MAX_PATTERN_LENGTH && start + length <= testString.size();
  201. ++length
  202. ) {
  203. auto value = denseHashInstance.Instance.find(testString.substr(start, length));
  204. if (value != denseHashInstance.Instance.end()) {
  205. result.back().emplace_back(start + length - 1, value->second);
  206. }
  207. }
  208. }
  209. }
  210. }