#include #include #include #include #include #include #include #include #include #include #include #include #include ///////////////// // COMMON DATA // ///////////////// const size_t MAX_PATTERN_LENGTH = 11; TVector letters = { "а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й", "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ж", "щ", "ъ", "ы", "ь", "э", "ю", "я" }; TString GenerateOneString( TFastRng& rng, size_t maxLength, const TVector& sequences ) { size_t length = rng.GenRand() % maxLength + 1; TString result; while (result.size() < length) { result += sequences[rng.GenRand() % sequences.size()]; } return result; } TVector GenerateStrings( TFastRng& rng, size_t num, size_t maxLength, const TVector& sequences ) { TVector strings; while (strings.size() < num) { strings.push_back(GenerateOneString(rng, maxLength, sequences)); } return strings; } struct TDatasetInstance { TDatasetInstance(const TVector& sequences) { TFastRng rng(0); TVector prefixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences); prefixes.push_back(""); TVector roots = GenerateStrings(rng, /*num*/1000, /*maxLength*/5, sequences); TVector suffixes = GenerateStrings(rng, /*num*/10, /*maxLength*/3, sequences); suffixes.push_back(""); TVector dictionary; for (const auto& root : roots) { for (const auto& prefix : prefixes) { for (const auto& suffix : suffixes) { dictionary.push_back(prefix + root + suffix); Y_ASSERT(dictionary.back().size() < MAX_PATTERN_LENGTH); } } } Shuffle(dictionary.begin(), dictionary.end()); Patterns.assign(dictionary.begin(), dictionary.begin() + 10'000); for (size_t sampleIdx = 0; sampleIdx < /*samplesNum*/1'000'000; ++sampleIdx) { Samples.emplace_back(); size_t wordsNum = rng.GenRand() % 10; for (size_t wordIdx = 0; wordIdx < wordsNum; ++wordIdx) { if (wordIdx > 0) { Samples.back() += " "; } Samples.back() += dictionary[rng.GenRand() % dictionary.size()]; } } } TString GetSample(size_t iteration) const { TFastRng rng(iteration); return Samples[rng.GenRand() % Samples.size()]; } TVector Patterns; TVector Samples; }; static const TDatasetInstance dataset(letters); ////////////////////////// // NEW PATTERN SEARCHER // ////////////////////////// struct TPatternSearcherInstance { TPatternSearcherInstance() { TCompactPatternSearcherBuilder builder; for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { builder.Add(dataset.Patterns[patternId], patternId); } TBufferOutput buffer; builder.Save(buffer); Instance.Reset( new TCompactPatternSearcher( buffer.Buffer().Data(), buffer.Buffer().Size() ) ); } THolder> Instance; }; static const TPatternSearcherInstance patternSearcherInstance; Y_CPU_BENCHMARK(PatternSearcher, iface) { TVector>> result; for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { result.emplace_back(); TString testString = dataset.GetSample(iteration); auto matches = patternSearcherInstance.Instance->SearchMatches(testString); for (auto& match : matches) { result.back().emplace_back(match.End, match.Data); } } } ////////////////////// // OLD AHO CORASICK // ////////////////////// struct TAhoCorasickInstance { TAhoCorasickInstance() { TAhoCorasickBuilder builder; for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { builder.AddString(dataset.Patterns[patternId], patternId); } TBufferOutput buffer; builder.SaveToStream(&buffer); Instance.Reset(new TDefaultMappedAhoCorasick(TBlob::FromBuffer(buffer.Buffer()))); } THolder Instance; }; static const TAhoCorasickInstance ahoCorasickInstance; Y_CPU_BENCHMARK(AhoCorasick, iface) { TVector>> result; for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { result.emplace_back(); TString testString = dataset.GetSample(iteration); auto matches = ahoCorasickInstance.Instance->AhoSearch(testString); result.push_back(matches); } } //////////////////////////////// // COMPTRIE + SIMPLE MATCHING // //////////////////////////////// struct TCompactTrieInstance { TCompactTrieInstance() { TCompactTrieBuilder builder; for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { builder.Add(dataset.Patterns[patternId], patternId); } TBufferOutput buffer; CompactTrieMinimizeAndMakeFastLayout(buffer, builder); Instance.Reset(new TCompactTrie( buffer.Buffer().Data(), buffer.Buffer().Size() )); } THolder> Instance; }; static const TCompactTrieInstance compactTrieInstance; Y_CPU_BENCHMARK(ComptrieSimple, iface) { TVector>> result; for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { result.emplace_back(); TString testString = dataset.GetSample(iteration); for (ui32 startPos = 0; startPos < testString.size(); ++startPos) { TSearchIterator> iter(*(compactTrieInstance.Instance)); for (ui32 position = startPos; position < testString.size(); ++position) { if (!iter.Advance(testString[position])) { break; } ui32 answer; if (iter.GetValue(&answer)) { result.back().emplace_back(position, answer); } } } } } //////////////// // DENSE_HASH // //////////////// struct TDenseHashInstance { TDenseHashInstance() { for (ui32 patternId = 0; patternId < dataset.Patterns.size(); ++patternId) { Instance[dataset.Patterns[patternId]] = patternId; } } TDenseHash Instance; }; static const TDenseHashInstance denseHashInstance; Y_CPU_BENCHMARK(DenseHash, iface) { TVector>> result; for (size_t iteration = 0; iteration < iface.Iterations(); ++iteration) { result.emplace_back(); TString testString = dataset.GetSample(iteration); for (size_t start = 0; start < testString.size(); ++start) { for ( size_t length = 1; length <= MAX_PATTERN_LENGTH && start + length <= testString.size(); ++length ) { auto value = denseHashInstance.Instance.find(testString.substr(start, length)); if (value != denseHashInstance.Instance.end()) { result.back().emplace_back(start + length - 1, value->second); } } } } }