#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "comptrie.h" #include "set.h" #include "first_symbol_iterator.h" #include "search_iterator.h" #include "pattern_searcher.h" #include #include class TCompactTrieTest: public TTestBase { private: UNIT_TEST_SUITE(TCompactTrieTest); UNIT_TEST(TestTrie8); UNIT_TEST(TestTrie16); UNIT_TEST(TestTrie32); UNIT_TEST(TestFastTrie8); UNIT_TEST(TestFastTrie16); UNIT_TEST(TestFastTrie32); UNIT_TEST(TestMinimizedTrie8); UNIT_TEST(TestMinimizedTrie16); UNIT_TEST(TestMinimizedTrie32); UNIT_TEST(TestFastMinimizedTrie8); UNIT_TEST(TestFastMinimizedTrie16); UNIT_TEST(TestFastMinimizedTrie32); UNIT_TEST(TestTrieIterator8); UNIT_TEST(TestTrieIterator16); UNIT_TEST(TestTrieIterator32); UNIT_TEST(TestMinimizedTrieIterator8); UNIT_TEST(TestMinimizedTrieIterator16); UNIT_TEST(TestMinimizedTrieIterator32); UNIT_TEST(TestPhraseSearch); UNIT_TEST(TestAddGet); UNIT_TEST(TestEmpty); UNIT_TEST(TestUninitializedNonEmpty); UNIT_TEST(TestRandom); UNIT_TEST(TestFindTails); UNIT_TEST(TestPrefixGrouped); UNIT_TEST(CrashTestPrefixGrouped); UNIT_TEST(TestMergeFromFile); UNIT_TEST(TestMergeFromBuffer); UNIT_TEST(TestUnique); UNIT_TEST(TestAddRetValue); UNIT_TEST(TestClear); UNIT_TEST(TestIterateEmptyKey); UNIT_TEST(TestTrieSet); UNIT_TEST(TestTrieForVectorInt64); UNIT_TEST(TestTrieForListInt64); UNIT_TEST(TestTrieForSetInt64); UNIT_TEST(TestTrieForVectorStroka); UNIT_TEST(TestTrieForListStroka); UNIT_TEST(TestTrieForSetStroka); UNIT_TEST(TestTrieForVectorWtroka); UNIT_TEST(TestTrieForVectorFloat); UNIT_TEST(TestTrieForVectorDouble); UNIT_TEST(TestTrieForListVectorInt64); UNIT_TEST(TestTrieForPairWtrokaVectorInt64); UNIT_TEST(TestEmptyValueOutOfOrder); UNIT_TEST(TestFindLongestPrefixWithEmptyValue); UNIT_TEST(TestSearchIterChar); UNIT_TEST(TestSearchIterWchar); UNIT_TEST(TestSearchIterWchar32) UNIT_TEST(TestCopyAndAssignment); UNIT_TEST(TestFirstSymbolIterator8); UNIT_TEST(TestFirstSymbolIterator16); UNIT_TEST(TestFirstSymbolIterator32); UNIT_TEST(TestFirstSymbolIteratorChar32); UNIT_TEST(TestArrayPacker); UNIT_TEST(TestBuilderFindLongestPrefix); UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue); UNIT_TEST(TestPatternSearcherEmpty); UNIT_TEST(TestPatternSearcherSimple); UNIT_TEST(TestPatternSearcherRandom); UNIT_TEST_SUITE_END(); static const char* SampleData[]; template void CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout); template void CheckData(const char* src, size_t len); template void CheckUpperBound(const char* src, size_t len); template void CheckIterator(const char* src, size_t len); template void TestTrie(bool minimize, bool useFastLayout); template void TestTrieIterator(bool minimize); template void TestRandom(const size_t n, const size_t maxKeySize); void TestFindTailsImpl(const TString& prefix); void TestUniqueImpl(bool isPrefixGrouped); TVector GetSampleKeys(size_t nKeys) const; template TVector GetSampleVectorData(size_t nValues); template TVector GetSampleTextVectorData(size_t nValues); template void CheckEquality(const T& value1, const T& value2) const; template void TestTrieWithContainers(const TVector& keys, const TVector& sampleData, TString methodName); template void TestSearchIterImpl(); template void TestFirstSymbolIteratorForTrie(const TTrie& trie, const TStringBuf& narrowAnswers); template void TestFirstSymbolIterator(); template class TIntPacker; template class TDummyPacker; class TStrokaPacker; public: void TestPackers(); void TestTrie8(); void TestTrie16(); void TestTrie32(); void TestFastTrie8(); void TestFastTrie16(); void TestFastTrie32(); void TestMinimizedTrie8(); void TestMinimizedTrie16(); void TestMinimizedTrie32(); void TestFastMinimizedTrie8(); void TestFastMinimizedTrie16(); void TestFastMinimizedTrie32(); void TestTrieIterator8(); void TestTrieIterator16(); void TestTrieIterator32(); void TestMinimizedTrieIterator8(); void TestMinimizedTrieIterator16(); void TestMinimizedTrieIterator32(); void TestPhraseSearch(); void TestAddGet(); void TestEmpty(); void TestUninitializedNonEmpty(); void TestRandom(); void TestFindTails(); void TestPrefixGrouped(); void CrashTestPrefixGrouped(); void TestMergeFromFile(); void TestMergeFromBuffer(); void TestUnique(); void TestAddRetValue(); void TestClear(); void TestIterateEmptyKey(); void TestTrieSet(); void TestTrieForVectorInt64(); void TestTrieForListInt64(); void TestTrieForSetInt64(); void TestTrieForVectorStroka(); void TestTrieForListStroka(); void TestTrieForSetStroka(); void TestTrieForVectorWtroka(); void TestTrieForVectorFloat(); void TestTrieForVectorDouble(); void TestTrieForListVectorInt64(); void TestTrieForPairWtrokaVectorInt64(); void TestEmptyValueOutOfOrder(); void TestFindLongestPrefixWithEmptyValue(); void TestSearchIterChar(); void TestSearchIterWchar(); void TestSearchIterWchar32(); void TestCopyAndAssignment(); void TestFirstSymbolIterator8(); void TestFirstSymbolIterator16(); void TestFirstSymbolIterator32(); void TestFirstSymbolIteratorChar32(); void TestArrayPacker(); void TestBuilderFindLongestPrefix(); void TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey); void TestBuilderFindLongestPrefixWithEmptyValue(); void TestPatternSearcherOnDataset( const TVector& patterns, const TVector& samples ); void TestPatternSearcherEmpty(); void TestPatternSearcherSimple(); void TestPatternSearcherRandom( size_t patternsNum, size_t patternMaxLength, size_t strMaxLength, int maxChar, TFastRng& rng ); void TestPatternSearcherRandom(); }; UNIT_TEST_SUITE_REGISTRATION(TCompactTrieTest); const char* TCompactTrieTest::SampleData[] = { "", "a", "b", "c", "d", "aa", "ab", "ac", "ad", "aaa", "aab", "aac", "aad", "aba", "abb", "abc", "abd", "fba", "fbb", "fbc", "fbd", "fbbaa", "c\x85\xA4\xBF" // Just something outside ASCII. }; template typename TCompactTrie::TKey MakeWideKey(const char* str, size_t len) { typename TCompactTrie::TKey buffer; for (size_t i = 0; i < len; i++) { unsigned int ch = (str[i] & 0xFF); buffer.push_back((T)(ch | ch << 8 | ch << 16 | ch << 24)); } return buffer; } template typename TCompactTrie::TKey MakeWideKey(const TString& str) { return MakeWideKey(str.c_str(), str.length()); } template typename TCompactTrie::TKey MakeWideKey(const TStringBuf& buf) { return MakeWideKey(buf.data(), buf.length()); } template void TCompactTrieTest::CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout) { TCompactTrieBuilder builder; for (auto& i : SampleData) { size_t len = strlen(i); builder.Add(MakeWideKey(i, len), len * 2); } TBufferOutput tmp2; IOutputStream& currentOutput = useFastLayout ? tmp2 : out; if (minimize) { TBufferOutput buftmp; builder.Save(buftmp); CompactTrieMinimize>(currentOutput, buftmp.Buffer().Data(), buftmp.Buffer().Size(), false); } else { builder.Save(currentOutput); } if (useFastLayout) { CompactTrieMakeFastLayout>(out, tmp2.Buffer().Data(), tmp2.Buffer().Size(), false); } } // Iterates over all strings of length <= 4 made of letters a-g. static bool LexicographicStep(TString& s) { if (s.length() < 4) { s += "a"; return true; } while (!s.empty() && s.back() == 'g') s.pop_back(); if (s.empty()) return false; char last = s.back(); last++; s.pop_back(); s.push_back(last); return true; } template void TCompactTrieTest::CheckUpperBound(const char* data, size_t datalen) { TCompactTrie trie(data, datalen); typedef typename TCompactTrie::TKey TKey; typedef typename TCompactTrie::TData TData; TString key; do { const TKey wideKey = MakeWideKey(key); typename TCompactTrie::TConstIterator it = trie.UpperBound(wideKey); UNIT_ASSERT_C(it == trie.End() || it.GetKey() >= wideKey, "key=" + key); TData data; const bool found = trie.Find(wideKey, &data); if (found) UNIT_ASSERT_C(it.GetKey() == wideKey && it.GetValue() == data, "key=" + key); if (it != trie.Begin()) UNIT_ASSERT_C((--it).GetKey() < wideKey, "key=" + key); } while (LexicographicStep(key)); } template void TCompactTrieTest::CheckData(const char* data, size_t datalen) { TCompactTrie trie(data, datalen); UNIT_ASSERT_VALUES_EQUAL(Y_ARRAY_SIZE(SampleData), trie.Size()); for (auto& i : SampleData) { size_t len = strlen(i); ui64 value = 0; size_t prefixLen = 0; typename TCompactTrie::TKey key = MakeWideKey(i, len); UNIT_ASSERT(trie.Find(key, &value)); UNIT_ASSERT_EQUAL(len * 2, value); UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value)); UNIT_ASSERT_EQUAL(len, prefixLen); UNIT_ASSERT_EQUAL(len * 2, value); TString badkey("bb"); badkey += i; key = MakeWideKey(badkey); UNIT_ASSERT(!trie.Find(key)); value = 123; UNIT_ASSERT(!trie.Find(key, &value)); UNIT_ASSERT_EQUAL(123, value); UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value)); UNIT_ASSERT_EQUAL(1, prefixLen); UNIT_ASSERT_EQUAL(2, value); badkey = i; badkey += "x"; key = MakeWideKey(badkey); UNIT_ASSERT(!trie.Find(key)); value = 1234; UNIT_ASSERT(!trie.Find(key, &value)); UNIT_ASSERT_EQUAL(1234, value); UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value)); UNIT_ASSERT_EQUAL(len, prefixLen); UNIT_ASSERT_EQUAL(len * 2, value); UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, nullptr)); UNIT_ASSERT_EQUAL(len, prefixLen); } TString testkey("fbbaa"); typename TCompactTrie::TKey key = MakeWideKey(testkey); ui64 value = 0; size_t prefixLen = 0; UNIT_ASSERT(trie.FindLongestPrefix(key.data(), testkey.length() - 1, &prefixLen, &value)); UNIT_ASSERT_EQUAL(prefixLen, 3); UNIT_ASSERT_EQUAL(6, value); testkey = "fbbax"; key = MakeWideKey(testkey); UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value)); UNIT_ASSERT_EQUAL(prefixLen, 3); UNIT_ASSERT_EQUAL(6, value); value = 12345678; UNIT_ASSERT(!trie.Find(key, &value)); UNIT_ASSERT_EQUAL(12345678, value); //Failed Find() should not change value } template void TCompactTrieTest::CheckIterator(const char* data, size_t datalen) { typedef typename TCompactTrie::TKey TKey; typedef typename TCompactTrie::TValueType TValue; TMap stored; for (auto& i : SampleData) { size_t len = strlen(i); stored[MakeWideKey(i, len)] = len * 2; } TCompactTrie trie(data, datalen); TVector items; typename TCompactTrie::TConstIterator it = trie.Begin(); size_t entry_count = 0; TMap received; while (it != trie.End()) { UNIT_ASSERT_VALUES_EQUAL(it.GetKeySize(), it.GetKey().size()); received.insert(*it); items.push_back(*it); entry_count++; it++; } TMap received2; for (std::pair x : trie) { received2.insert(x); } UNIT_ASSERT(entry_count == stored.size()); UNIT_ASSERT(received == stored); UNIT_ASSERT(received2 == stored); std::reverse(items.begin(), items.end()); typename TCompactTrie::TConstIterator revIt = trie.End(); typename TCompactTrie::TConstIterator const begin = trie.Begin(); typename TCompactTrie::TConstIterator emptyIt; size_t pos = 0; while (revIt != begin) { revIt--; UNIT_ASSERT(*revIt == items[pos]); pos++; } // Checking the assignment operator. revIt = begin; UNIT_ASSERT(revIt == trie.Begin()); UNIT_ASSERT(!revIt.IsEmpty()); UNIT_ASSERT(revIt != emptyIt); UNIT_ASSERT(revIt != trie.End()); ++revIt; // Call a method that uses Skipper. revIt = emptyIt; UNIT_ASSERT(revIt == emptyIt); UNIT_ASSERT(revIt.IsEmpty()); UNIT_ASSERT(revIt != trie.End()); // Checking the move assignment operator. revIt = trie.Begin(); UNIT_ASSERT(revIt == trie.Begin()); UNIT_ASSERT(!revIt.IsEmpty()); UNIT_ASSERT(revIt != emptyIt); UNIT_ASSERT(revIt != trie.End()); ++revIt; // Call a method that uses Skipper. revIt = typename TCompactTrie::TConstIterator(); UNIT_ASSERT(revIt == emptyIt); UNIT_ASSERT(revIt.IsEmpty()); UNIT_ASSERT(revIt != trie.End()); } template void TCompactTrieTest::TestTrie(bool minimize, bool useFastLayout) { TBufferOutput bufout; CreateTrie(bufout, minimize, useFastLayout); CheckData(bufout.Buffer().Data(), bufout.Buffer().Size()); CheckUpperBound(bufout.Buffer().Data(), bufout.Buffer().Size()); } template void TCompactTrieTest::TestTrieIterator(bool minimize) { TBufferOutput bufout; CreateTrie(bufout, minimize, false); CheckIterator(bufout.Buffer().Data(), bufout.Buffer().Size()); } void TCompactTrieTest::TestTrie8() { TestTrie(false, false); } void TCompactTrieTest::TestTrie16() { TestTrie(false, false); } void TCompactTrieTest::TestTrie32() { TestTrie(false, false); } void TCompactTrieTest::TestFastTrie8() { TestTrie(false, true); } void TCompactTrieTest::TestFastTrie16() { TestTrie(false, true); } void TCompactTrieTest::TestFastTrie32() { TestTrie(false, true); } void TCompactTrieTest::TestMinimizedTrie8() { TestTrie(true, false); } void TCompactTrieTest::TestMinimizedTrie16() { TestTrie(true, false); } void TCompactTrieTest::TestMinimizedTrie32() { TestTrie(true, false); } void TCompactTrieTest::TestFastMinimizedTrie8() { TestTrie(true, true); } void TCompactTrieTest::TestFastMinimizedTrie16() { TestTrie(true, true); } void TCompactTrieTest::TestFastMinimizedTrie32() { TestTrie(true, true); } void TCompactTrieTest::TestTrieIterator8() { TestTrieIterator(false); } void TCompactTrieTest::TestTrieIterator16() { TestTrieIterator(false); } void TCompactTrieTest::TestTrieIterator32() { TestTrieIterator(false); } void TCompactTrieTest::TestMinimizedTrieIterator8() { TestTrieIterator(true); } void TCompactTrieTest::TestMinimizedTrieIterator16() { TestTrieIterator(true); } void TCompactTrieTest::TestMinimizedTrieIterator32() { TestTrieIterator(true); } void TCompactTrieTest::TestPhraseSearch() { static const char* phrases[] = {"ab", "ab cd", "ab cd ef"}; static const char* const goodphrase = "ab cd ef gh"; static const char* const badphrase = "cd ef gh ab"; TBufferOutput bufout; TCompactTrieBuilder builder; for (size_t i = 0; i < Y_ARRAY_SIZE(phrases); i++) { builder.Add(phrases[i], strlen(phrases[i]), i); } builder.Save(bufout); TCompactTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size()); TVector::TPhraseMatch> matches; trie.FindPhrases(goodphrase, strlen(goodphrase), matches); UNIT_ASSERT(matches.size() == Y_ARRAY_SIZE(phrases)); for (size_t i = 0; i < Y_ARRAY_SIZE(phrases); i++) { UNIT_ASSERT(matches[i].first == strlen(phrases[i])); UNIT_ASSERT(matches[i].second == i); } trie.FindPhrases(badphrase, strlen(badphrase), matches); UNIT_ASSERT(matches.size() == 0); } void TCompactTrieTest::TestAddGet() { TCompactTrieBuilder builder; builder.Add("abcd", 4, 1); builder.Add("acde", 4, 2); ui64 dummy; UNIT_ASSERT(builder.Find("abcd", 4, &dummy)); UNIT_ASSERT(1 == dummy); UNIT_ASSERT(builder.Find("acde", 4, &dummy)); UNIT_ASSERT(2 == dummy); UNIT_ASSERT(!builder.Find("fgdgfacde", 9, &dummy)); UNIT_ASSERT(!builder.Find("ab", 2, &dummy)); } void TCompactTrieTest::TestEmpty() { TCompactTrieBuilder builder; ui64 dummy = 12345; size_t prefixLen; UNIT_ASSERT(!builder.Find("abc", 3, &dummy)); TBufferOutput bufout; builder.Save(bufout); TCompactTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size()); UNIT_ASSERT(!trie.Find("abc", 3, &dummy)); UNIT_ASSERT(!trie.Find("", 0, &dummy)); UNIT_ASSERT(!trie.FindLongestPrefix("abc", 3, &prefixLen, &dummy)); UNIT_ASSERT(!trie.FindLongestPrefix("", 0, &prefixLen, &dummy)); UNIT_ASSERT_EQUAL(12345, dummy); UNIT_ASSERT(trie.Begin() == trie.End()); TCompactTrie<> trieNull; UNIT_ASSERT(!trieNull.Find(" ", 1)); TCompactTrie<>::TPhraseMatchVector matches; trieNull.FindPhrases(" ", 1, matches); // just to be sure it doesn't crash UNIT_ASSERT(trieNull.Begin() == trieNull.End()); } void TCompactTrieTest::TestUninitializedNonEmpty() { TBufferOutput bufout; CreateTrie(bufout, false, false); TCompactTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size()); typedef TCompactTrie::TKey TKey; typedef TCompactTrie::TConstIterator TIter; TCompactTrie tails = trie.FindTails("abd", 3); // A trie that has empty value and no data. UNIT_ASSERT(!tails.IsEmpty()); UNIT_ASSERT(!tails.IsInitialized()); const TKey wideKey = MakeWideKey("c", 1); TIter it = tails.UpperBound(wideKey); UNIT_ASSERT(it == tails.End()); UNIT_ASSERT(it != tails.Begin()); --it; UNIT_ASSERT(it == tails.Begin()); ++it; UNIT_ASSERT(it == tails.End()); } static char RandChar() { return char(RandomNumber() % 256); } static TString RandStr(const size_t max) { size_t len = RandomNumber() % max; TString key; for (size_t j = 0; j < len; ++j) key += RandChar(); return key; } template void TCompactTrieTest::TestRandom(const size_t n, const size_t maxKeySize) { const TStringBuf EMPTY_KEY = TStringBuf("", 1); TCompactTrieBuilder builder; typedef TMap TKeys; TKeys keys; typename T::TData dummy; for (size_t i = 0; i < n; ++i) { const TString key = RandStr(maxKeySize); if (key != EMPTY_KEY && keys.find(key) == keys.end()) { const typename T::TData val = T::Data(key); keys[key] = val; UNIT_ASSERT_C(!builder.Find(key.data(), key.size(), &dummy), "key = " << HexEncode(TString(key))); builder.Add(key.data(), key.size(), val); UNIT_ASSERT_C(builder.Find(key.data(), key.size(), &dummy), "key = " << HexEncode(TString(key))); UNIT_ASSERT(dummy == val); } } TBufferStream stream; size_t len = builder.Save(stream); TCompactTrie trie(stream.Buffer().Data(), len); TBufferStream buftmp; if (minimize) { CompactTrieMinimize(buftmp, stream.Buffer().Data(), len, false); } TCompactTrie trieMin(buftmp.Buffer().Data(), buftmp.Buffer().Size()); TCompactTrieBuilder prefixGroupedBuilder(CTBF_PREFIX_GROUPED); for (typename TKeys::const_iterator i = keys.begin(), mi = keys.end(); i != mi; ++i) { UNIT_ASSERT(!prefixGroupedBuilder.Find(i->first.c_str(), i->first.size(), &dummy)); UNIT_ASSERT(trie.Find(i->first.c_str(), i->first.size(), &dummy)); UNIT_ASSERT(dummy == i->second); if (minimize) { UNIT_ASSERT(trieMin.Find(i->first.c_str(), i->first.size(), &dummy)); UNIT_ASSERT(dummy == i->second); } prefixGroupedBuilder.Add(i->first.c_str(), i->first.size(), dummy); UNIT_ASSERT(prefixGroupedBuilder.Find(i->first.c_str(), i->first.size(), &dummy)); for (typename TKeys::const_iterator j = keys.begin(), end = keys.end(); j != end; ++j) { typename T::TData valFound; if (j->first <= i->first) { UNIT_ASSERT(prefixGroupedBuilder.Find(j->first.c_str(), j->first.size(), &valFound)); UNIT_ASSERT_VALUES_EQUAL(j->second, valFound); } else { UNIT_ASSERT(!prefixGroupedBuilder.Find(j->first.c_str(), j->first.size(), &valFound)); } } } TBufferStream prefixGroupedBuffer; prefixGroupedBuilder.Save(prefixGroupedBuffer); UNIT_ASSERT_VALUES_EQUAL(stream.Buffer().Size(), prefixGroupedBuffer.Buffer().Size()); UNIT_ASSERT(0 == memcmp(stream.Buffer().Data(), prefixGroupedBuffer.Buffer().Data(), stream.Buffer().Size())); } void TCompactTrieTest::TestRandom() { TestRandom, true>(1000, 1000); TestRandom, true>(100, 100); TestRandom, true>(0, 0); TestRandom, true>(100, 3); TestRandom, true>(100, 100); TestRandom(100, 100); } void TCompactTrieTest::TestFindTailsImpl(const TString& prefix) { TCompactTrieBuilder<> builder; TMap input; for (auto& i : SampleData) { TString temp = i; ui64 val = temp.size() * 2; builder.Add(temp.data(), temp.size(), val); if (temp.StartsWith(prefix)) { input[temp.substr(prefix.size())] = val; } } typedef TCompactTrie<> TTrie; TBufferStream stream; size_t len = builder.Save(stream); TTrie trie(stream.Buffer().Data(), len); TTrie subtrie = trie.FindTails(prefix.data(), prefix.size()); TMap output; for (TTrie::TConstIterator i = subtrie.Begin(), mi = subtrie.End(); i != mi; ++i) { TTrie::TValueType val = *i; output[TString(val.first.data(), val.first.size())] = val.second; } UNIT_ASSERT(input.size() == output.size()); UNIT_ASSERT(input == output); TBufferStream buftmp; CompactTrieMinimize(buftmp, stream.Buffer().Data(), len, false); TTrie trieMin(buftmp.Buffer().Data(), buftmp.Buffer().Size()); subtrie = trieMin.FindTails(prefix.data(), prefix.size()); output.clear(); for (TTrie::TConstIterator i = subtrie.Begin(), mi = subtrie.End(); i != mi; ++i) { TTrie::TValueType val = *i; output[TString(val.first.data(), val.first.size())] = val.second; } UNIT_ASSERT(input.size() == output.size()); UNIT_ASSERT(input == output); } void TCompactTrieTest::TestPrefixGrouped() { TBuffer b1b; TCompactTrieBuilder b1(CTBF_PREFIX_GROUPED); const char* data[] = { "Kazan", "Moscow", "Monino", "Murmansk", "Fryanovo", "Fryazino", "Fryazevo", "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { ui32 val = strlen(data[i]) + 1; b1.Add(data[i], strlen(data[i]), val); for (size_t j = 0; j < Y_ARRAY_SIZE(data); ++j) { ui32 mustHave = strlen(data[j]) + 1; ui32 found = 0; if (j <= i) { UNIT_ASSERT(b1.Find(data[j], strlen(data[j]), &found)); UNIT_ASSERT_VALUES_EQUAL(mustHave, found); } else { UNIT_ASSERT(!b1.Find(data[j], strlen(data[j]), &found)); } } } { TBufferOutput b1bo(b1b); b1.Save(b1bo); } TCompactTrie t1(TBlob::FromBuffer(b1b)); //t1.Print(Cerr); for (auto& i : data) { ui32 v; UNIT_ASSERT(t1.Find(i, strlen(i), &v)); UNIT_ASSERT_VALUES_EQUAL(strlen(i) + 1, v); } } void TCompactTrieTest::CrashTestPrefixGrouped() { TCompactTrieBuilder builder(CTBF_PREFIX_GROUPED); const char* data[] = { "Fryazino", "Fryanovo", "Monino", "", "Fryazevo", }; bool wasException = false; try { for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { builder.Add(data[i], strlen(data[i]), i + 1); } } catch (const yexception& e) { wasException = true; UNIT_ASSERT(strstr(e.what(), "Bad input order - expected input strings to be prefix-grouped.")); } UNIT_ASSERT_C(wasException, "CrashTestPrefixGrouped"); } void TCompactTrieTest::TestMergeFromFile() { { TCompactTrieBuilder<> b; b.Add("yandex", 12); b.Add("google", 13); b.Add("mail", 14); TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru"); b.Save(out); } { TCompactTrieBuilder<> b; b.Add("yandex", 112); b.Add("google", 113); b.Add("yahoo", 114); TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com"); b.Save(out); } { TCompactTrieBuilder<> b; UNIT_ASSERT(b.AddSubtreeInFile("com.", GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com")); UNIT_ASSERT(b.Add("org.kernel", 22)); UNIT_ASSERT(b.AddSubtreeInFile("ru.", GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru")); TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res"); b.Save(out); } TCompactTrie<> trie(TBlob::FromFileSingleThreaded(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res")); UNIT_ASSERT_VALUES_EQUAL(12u, trie.Get("ru.yandex")); UNIT_ASSERT_VALUES_EQUAL(13u, trie.Get("ru.google")); UNIT_ASSERT_VALUES_EQUAL(14u, trie.Get("ru.mail")); UNIT_ASSERT_VALUES_EQUAL(22u, trie.Get("org.kernel")); UNIT_ASSERT_VALUES_EQUAL(112u, trie.Get("com.yandex")); UNIT_ASSERT_VALUES_EQUAL(113u, trie.Get("com.google")); UNIT_ASSERT_VALUES_EQUAL(114u, trie.Get("com.yahoo")); unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res").data()); unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com").data()); unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru").data()); } void TCompactTrieTest::TestMergeFromBuffer() { TArrayWithSizeHolder buffer1; { TCompactTrieBuilder<> b; b.Add("aaaaa", 1); b.Add("bbbbb", 2); b.Add("ccccc", 3); buffer1.Resize(b.MeasureByteSize()); TMemoryOutput out(buffer1.Get(), buffer1.Size()); b.Save(out); } TArrayWithSizeHolder buffer2; { TCompactTrieBuilder<> b; b.Add("aaaaa", 10); b.Add("bbbbb", 20); b.Add("ccccc", 30); b.Add("xxxxx", 40); b.Add("yyyyy", 50); buffer2.Resize(b.MeasureByteSize()); TMemoryOutput out(buffer2.Get(), buffer2.Size()); b.Save(out); } { TCompactTrieBuilder<> b; UNIT_ASSERT(b.AddSubtreeInBuffer("com.", std::move(buffer1))); UNIT_ASSERT(b.Add("org.upyachka", 42)); UNIT_ASSERT(b.AddSubtreeInBuffer("ru.", std::move(buffer2))); TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res"); b.Save(out); } TCompactTrie<> trie(TBlob::FromFileSingleThreaded(GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res")); UNIT_ASSERT_VALUES_EQUAL(10u, trie.Get("ru.aaaaa")); UNIT_ASSERT_VALUES_EQUAL(20u, trie.Get("ru.bbbbb")); UNIT_ASSERT_VALUES_EQUAL(40u, trie.Get("ru.xxxxx")); UNIT_ASSERT_VALUES_EQUAL(42u, trie.Get("org.upyachka")); UNIT_ASSERT_VALUES_EQUAL(1u, trie.Get("com.aaaaa")); UNIT_ASSERT_VALUES_EQUAL(2u, trie.Get("com.bbbbb")); UNIT_ASSERT_VALUES_EQUAL(3u, trie.Get("com.ccccc")); unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res").data()); } void TCompactTrieTest::TestUnique() { TestUniqueImpl(false); TestUniqueImpl(true); } void TCompactTrieTest::TestUniqueImpl(bool isPrefixGrouped) { TCompactTrieBuilder builder(CTBF_UNIQUE | (isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE)); const char* data[] = { "Kazan", "Moscow", "Monino", "Murmansk", "Fryanovo", "Fryazino", "Fryazevo", "Fry", "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { UNIT_ASSERT_C(builder.Add(data[i], strlen(data[i]), i + 1), i); } bool wasException = false; try { builder.Add(data[4], strlen(data[4]), 20); } catch (const yexception& e) { wasException = true; UNIT_ASSERT(strstr(e.what(), "Duplicate key")); } UNIT_ASSERT_C(wasException, "TestUnique"); } void TCompactTrieTest::TestAddRetValue() { TCompactTrieBuilder builder; const char* data[] = { "Kazan", "Moscow", "Monino", "Murmansk", "Fryanovo", "Fryazino", "Fryazevo", "Fry", "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { UNIT_ASSERT(builder.Add(data[i], strlen(data[i]), i + 1)); UNIT_ASSERT(!builder.Add(data[i], strlen(data[i]), i + 2)); ui32 value; UNIT_ASSERT(builder.Find(data[i], strlen(data[i]), &value)); UNIT_ASSERT(value == i + 2); } } void TCompactTrieTest::TestClear() { TCompactTrieBuilder builder; const char* data[] = { "Kazan", "Moscow", "Monino", "Murmansk", "Fryanovo", "Fryazino", "Fryazevo", "Fry", "Tumen", }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { builder.Add(data[i], strlen(data[i]), i + 1); } UNIT_ASSERT(builder.GetEntryCount() == Y_ARRAY_SIZE(data)); builder.Clear(); UNIT_ASSERT(builder.GetEntryCount() == 0); UNIT_ASSERT(builder.GetNodeCount() == 1); } void TCompactTrieTest::TestFindTails() { TestFindTailsImpl("aa"); TestFindTailsImpl("bb"); TestFindTailsImpl("fb"); TestFindTailsImpl("fbc"); TestFindTailsImpl("fbbaa"); } template class TCompactTrieTest::TDummyPacker: public TNullPacker { public: static T Data(const TString&) { T data; TNullPacker().UnpackLeaf(nullptr, data); return data; } typedef T TData; }; class TCompactTrieTest::TStrokaPacker: public TCompactTriePacker { public: typedef TString TData; static TString Data(const TString& str) { return str; } }; template class TCompactTrieTest::TIntPacker: public TCompactTriePacker { public: typedef T TData; static TData Data(const TString&) { return RandomNumber>(); } }; void TCompactTrieTest::TestIterateEmptyKey() { TBuffer trieBuffer; { TCompactTrieBuilder builder; UNIT_ASSERT(builder.Add("", 1)); TBufferStream trieBufferO(trieBuffer); builder.Save(trieBufferO); } TCompactTrie trie(TBlob::FromBuffer(trieBuffer)); ui32 val; UNIT_ASSERT(trie.Find("", &val)); UNIT_ASSERT(val == 1); TCompactTrie::TConstIterator it = trie.Begin(); UNIT_ASSERT(it.GetKey().empty()); UNIT_ASSERT(it.GetValue() == 1); } void TCompactTrieTest::TestTrieSet() { TBuffer buffer; { TCompactTrieSet::TBuilder builder; UNIT_ASSERT(builder.Add("a", 0)); UNIT_ASSERT(builder.Add("ab", 1)); UNIT_ASSERT(builder.Add("abc", 1)); UNIT_ASSERT(builder.Add("abcd", 0)); UNIT_ASSERT(!builder.Add("abcd", 1)); TBufferStream stream(buffer); builder.Save(stream); } TCompactTrieSet set(TBlob::FromBuffer(buffer)); UNIT_ASSERT(set.Has("a")); UNIT_ASSERT(set.Has("ab")); UNIT_ASSERT(set.Has("abc")); UNIT_ASSERT(set.Has("abcd")); UNIT_ASSERT(!set.Has("abcde")); UNIT_ASSERT(!set.Has("aa")); UNIT_ASSERT(!set.Has("b")); UNIT_ASSERT(!set.Has("")); TCompactTrieSet tails; UNIT_ASSERT(set.FindTails("a", tails)); UNIT_ASSERT(tails.Has("b")); UNIT_ASSERT(tails.Has("bcd")); UNIT_ASSERT(!tails.Has("ab")); UNIT_ASSERT(!set.Has("")); TCompactTrieSet empty; UNIT_ASSERT(set.FindTails("abcd", empty)); UNIT_ASSERT(!empty.Has("a")); UNIT_ASSERT(!empty.Has("b")); UNIT_ASSERT(!empty.Has("c")); UNIT_ASSERT(!empty.Has("d")); UNIT_ASSERT(!empty.Has("d")); UNIT_ASSERT(empty.Has("")); // contains only empty string } // Tests for trie with vector (list, set) values TVector TCompactTrieTest::GetSampleKeys(size_t nKeys) const { Y_ASSERT(nKeys <= 10); TString sampleKeys[] = {"a", "b", "ac", "bd", "abe", "bcf", "deg", "ah", "xy", "abc"}; TVector result; for (size_t i = 0; i < nKeys; i++) result.push_back(ASCIIToWide(sampleKeys[i])); return result; } template TVector TCompactTrieTest::GetSampleVectorData(size_t nValues) { TVector data; for (size_t i = 0; i < nValues; i++) { data.push_back(TContainer()); for (size_t j = 0; j < i; j++) data[i].insert(data[i].end(), (typename TContainer::value_type)((j == 3) ? 0 : (1 << (j * 5)))); } return data; } template TVector TCompactTrieTest::GetSampleTextVectorData(size_t nValues) { TVector data; for (size_t i = 0; i < nValues; i++) { data.push_back(TContainer()); for (size_t j = 0; j < i; j++) data[i].insert(data[i].end(), TString("abc") + ToString(j)); } return data; } template void TCompactTrieTest::CheckEquality(const T& value1, const T& value2) const { UNIT_ASSERT_VALUES_EQUAL(value1, value2); } template <> void TCompactTrieTest::CheckEquality>(const TVector& value1, const TVector& value2) const { UNIT_ASSERT_VALUES_EQUAL(value1.size(), value2.size()); for (size_t i = 0; i < value1.size(); i++) UNIT_ASSERT_VALUES_EQUAL(value1[i], value2[i]); } template void TCompactTrieTest::TestTrieWithContainers(const TVector& keys, const TVector& sampleData, TString methodName) { TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName; TCompactTrieBuilder b; for (size_t i = 0; i < keys.size(); i++) { b.Add(keys[i], sampleData[i]); } TUnbufferedFileOutput out(fileName); b.Save(out); TCompactTrie trie(TBlob::FromFileSingleThreaded(fileName)); for (size_t i = 0; i < keys.size(); i++) { TContainer value = trie.Get(keys[i]); UNIT_ASSERT_VALUES_EQUAL(value.size(), sampleData[i].size()); typename TContainer::const_iterator p = value.begin(); typename TContainer::const_iterator p1 = sampleData[i].begin(); for (; p != value.end(); p++, p1++) CheckEquality(*p, *p1); } unlink(fileName.data()); } template <> void TCompactTrieTest::TestTrieWithContainers>>(const TVector& keys, const TVector>>& sampleData, TString methodName) { typedef std::pair> TContainer; TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName; TCompactTrieBuilder b; for (size_t i = 0; i < keys.size(); i++) { b.Add(keys[i], sampleData[i]); } TUnbufferedFileOutput out(fileName); b.Save(out); TCompactTrie trie(TBlob::FromFileSingleThreaded(fileName)); for (size_t i = 0; i < keys.size(); i++) { TContainer value = trie.Get(keys[i]); CheckEquality(value.first, sampleData[i].first); CheckEquality(value.second, sampleData[i].second); } unlink(fileName.data()); } void TCompactTrieTest::TestTrieForVectorInt64() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleVectorData>(10), "v-i64"); } void TCompactTrieTest::TestTrieForListInt64() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleVectorData>(10), "l-i64"); } void TCompactTrieTest::TestTrieForSetInt64() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleVectorData>(10), "s-i64"); } void TCompactTrieTest::TestTrieForVectorStroka() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleTextVectorData>(10), "v-str"); } void TCompactTrieTest::TestTrieForListStroka() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleTextVectorData>(10), "l-str"); } void TCompactTrieTest::TestTrieForSetStroka() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleTextVectorData>(10), "s-str"); } void TCompactTrieTest::TestTrieForVectorWtroka() { TVector> data = GetSampleTextVectorData>(10); TVector> wData; for (size_t i = 0; i < data.size(); i++) { wData.push_back(TVector()); for (size_t j = 0; j < data[i].size(); j++) wData[i].push_back(UTF8ToWide(data[i][j])); } TestTrieWithContainers>(GetSampleKeys(10), wData, "v-wtr"); } void TCompactTrieTest::TestTrieForVectorFloat() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleVectorData>(10), "v-float"); } void TCompactTrieTest::TestTrieForVectorDouble() { TestTrieWithContainers>(GetSampleKeys(10), GetSampleVectorData>(10), "v-double"); } void TCompactTrieTest::TestTrieForListVectorInt64() { TVector tmp; tmp.push_back(0); TList> dataElement(5, tmp); TVector>> data(10, dataElement); TestTrieWithContainers>>(GetSampleKeys(10), data, "l-v-i64"); } void TCompactTrieTest::TestTrieForPairWtrokaVectorInt64() { TVector keys = GetSampleKeys(10); TVector> values = GetSampleVectorData>(10); TVector>> data; for (size_t i = 0; i < 10; i++) data.push_back(std::pair>(keys[i] + u"_v", values[i])); TestTrieWithContainers>>(keys, data, "pair-str-v-i64"); } void TCompactTrieTest::TestEmptyValueOutOfOrder() { TBufferOutput buffer; using TSymbol = ui32; { TCompactTrieBuilder builder; TSymbol key = 1; builder.Add(&key, 1, 10); builder.Add(nullptr, 0, 14); builder.Save(buffer); } { TCompactTrie trie(buffer.Buffer().Data(), buffer.Buffer().Size()); UNIT_ASSERT(trie.Find(nullptr, 0)); } } void TCompactTrieTest::TestFindLongestPrefixWithEmptyValue() { TBufferOutput buffer; { TCompactTrieBuilder builder; builder.Add(u"", 42); builder.Add(u"yandex", 271828); builder.Add(u"ya", 31415); builder.Save(buffer); } { TCompactTrie trie(buffer.Buffer().Data(), buffer.Buffer().Size()); size_t prefixLen = 123; ui32 value = 0; UNIT_ASSERT(trie.FindLongestPrefix(u"google", &prefixLen, &value)); UNIT_ASSERT(prefixLen == 0); UNIT_ASSERT(value == 42); UNIT_ASSERT(trie.FindLongestPrefix(u"yahoo", &prefixLen, &value)); UNIT_ASSERT(prefixLen == 2); UNIT_ASSERT(value == 31415); } } template struct TConvertKey { static inline TString Convert(const TStringBuf& key) { return ToString(key); } }; template <> struct TConvertKey { static inline TUtf16String Convert(const TStringBuf& key) { return UTF8ToWide(key); } }; template <> struct TConvertKey { static inline TUtf32String Convert(const TStringBuf& key) { return TUtf32String::FromUtf8(key); } }; template static void MoveIter(TSearchIter& iter, const TKeyBuf& key) { for (size_t i = 0; i < key.length(); ++i) { UNIT_ASSERT(iter.Advance(key[i])); } } template void TCompactTrieTest::TestSearchIterImpl() { TBufferOutput buffer; { TCompactTrieBuilder builder; TStringBuf data[] = { TStringBuf("abaab"), TStringBuf("abcdef"), TStringBuf("abbbc"), TStringBuf("bdfaa"), }; for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) { builder.Add(TConvertKey::Convert(data[i]), i + 1); } builder.Save(buffer); } TCompactTrie trie(buffer.Buffer().Data(), buffer.Buffer().Size()); ui32 value = 0; auto iter(MakeSearchIterator(trie)); MoveIter(iter, TConvertKey::Convert(TStringBuf("abc"))); UNIT_ASSERT(!iter.GetValue(&value)); iter = MakeSearchIterator(trie); MoveIter(iter, TConvertKey::Convert(TStringBuf("abbbc"))); UNIT_ASSERT(iter.GetValue(&value)); UNIT_ASSERT_EQUAL(value, 3); iter = MakeSearchIterator(trie); UNIT_ASSERT(iter.Advance(TConvertKey::Convert(TStringBuf("bdfa")))); UNIT_ASSERT(!iter.GetValue(&value)); iter = MakeSearchIterator(trie); UNIT_ASSERT(iter.Advance(TConvertKey::Convert(TStringBuf("bdfaa")))); UNIT_ASSERT(iter.GetValue(&value)); UNIT_ASSERT_EQUAL(value, 4); UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TChar('z'))); UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TConvertKey::Convert(TStringBuf("cdf")))); UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TConvertKey::Convert(TStringBuf("abca")))); } void TCompactTrieTest::TestSearchIterChar() { TestSearchIterImpl(); } void TCompactTrieTest::TestSearchIterWchar() { TestSearchIterImpl(); } void TCompactTrieTest::TestSearchIterWchar32() { TestSearchIterImpl(); } void TCompactTrieTest::TestCopyAndAssignment() { TBufferOutput bufout; typedef TCompactTrie<> TTrie; CreateTrie(bufout, false, false); TTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size()); TTrie copy(trie); UNIT_ASSERT(copy.HasCorrectSkipper()); TTrie assign; assign = trie; UNIT_ASSERT(assign.HasCorrectSkipper()); TTrie move(std::move(trie)); UNIT_ASSERT(move.HasCorrectSkipper()); TTrie moveAssign; moveAssign = TTrie(bufout.Buffer().Data(), bufout.Buffer().Size()); UNIT_ASSERT(moveAssign.HasCorrectSkipper()); } template void TCompactTrieTest::TestFirstSymbolIteratorForTrie(const TTrie& trie, const TStringBuf& narrowAnswers) { NCompactTrie::TFirstSymbolIterator it; it.SetTrie(trie, trie.GetSkipper()); typename TTrie::TKey answers = MakeWideKey(narrowAnswers); auto answer = answers.begin(); for (; !it.AtEnd(); it.MakeStep(), ++answer) { UNIT_ASSERT(answer != answers.end()); UNIT_ASSERT(it.GetKey() == *answer); } UNIT_ASSERT(answer == answers.end()); } template void TCompactTrieTest::TestFirstSymbolIterator() { TBufferOutput bufout; typedef TCompactTrie TTrie; CreateTrie(bufout, false, false); TTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size()); TStringBuf rootAnswers = "abcdf"; TestFirstSymbolIteratorForTrie(trie, rootAnswers); TStringBuf aAnswers = "abcd"; TestFirstSymbolIteratorForTrie(trie.FindTails(MakeWideKey("a", 1)), aAnswers); } void TCompactTrieTest::TestFirstSymbolIterator8() { TestFirstSymbolIterator(); } void TCompactTrieTest::TestFirstSymbolIterator16() { TestFirstSymbolIterator(); } void TCompactTrieTest::TestFirstSymbolIterator32() { TestFirstSymbolIterator(); } void TCompactTrieTest::TestFirstSymbolIteratorChar32() { TestFirstSymbolIterator(); } void TCompactTrieTest::TestArrayPacker() { using TDataInt = std::array; const std::pair dataXxx{"xxx", {{15, 16}}}; const std::pair dataYyy{"yyy", {{20, 30}}}; TCompactTrieBuilder trieBuilderOne; trieBuilderOne.Add(dataXxx.first, dataXxx.second); trieBuilderOne.Add(dataYyy.first, dataYyy.second); TBufferOutput bufferOne; trieBuilderOne.Save(bufferOne); const TCompactTrie trieOne(bufferOne.Buffer().Data(), bufferOne.Buffer().Size()); UNIT_ASSERT_VALUES_EQUAL(dataXxx.second, trieOne.Get(dataXxx.first)); UNIT_ASSERT_VALUES_EQUAL(dataYyy.second, trieOne.Get(dataYyy.first)); using TDataStroka = std::array; const std::pair dataZzz{"zzz", {{"hello", "there"}}}; const std::pair dataWww{"www", {{"half", "life"}}}; TCompactTrieBuilder trieBuilderTwo; trieBuilderTwo.Add(dataZzz.first, dataZzz.second); trieBuilderTwo.Add(dataWww.first, dataWww.second); TBufferOutput bufferTwo; trieBuilderTwo.Save(bufferTwo); const TCompactTrie trieTwo(bufferTwo.Buffer().Data(), bufferTwo.Buffer().Size()); UNIT_ASSERT_VALUES_EQUAL(dataZzz.second, trieTwo.Get(dataZzz.first)); UNIT_ASSERT_VALUES_EQUAL(dataWww.second, trieTwo.Get(dataWww.first)); } void TCompactTrieTest::TestBuilderFindLongestPrefix() { const size_t sizes[] = {10, 100}; const double branchProbabilities[] = {0.01, 0.1, 0.5, 0.9, 0.99}; for (size_t size : sizes) { for (double branchProbability : branchProbabilities) { TestBuilderFindLongestPrefix(size, branchProbability, false, false); TestBuilderFindLongestPrefix(size, branchProbability, false, true); TestBuilderFindLongestPrefix(size, branchProbability, true, false); TestBuilderFindLongestPrefix(size, branchProbability, true, true); } } } void TCompactTrieTest::TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey) { TVector keys; TString keyToAdd; for (size_t i = 0; i < keysCount; ++i) { const size_t prevKeyLen = keyToAdd.Size(); // add two random chars to prev key keyToAdd += RandChar(); keyToAdd += RandChar(); const bool changeBranch = prevKeyLen && RandomNumber() < branchProbability; if (changeBranch) { const size_t branchPlace = RandomNumber(prevKeyLen + 1); // random place in [0, prevKeyLen] *(keyToAdd.begin() + branchPlace) = RandChar(); } keys.push_back(keyToAdd); } if (isPrefixGrouped) Sort(keys.begin(), keys.end()); else Shuffle(keys.begin(), keys.end()); TCompactTrieBuilder builder(isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE); const TString EMPTY_VALUE = "empty"; if (hasEmptyKey) builder.Add(nullptr, 0, EMPTY_VALUE); for (size_t i = 0; i < keysCount; ++i) { const TString& key = keys[i]; for (size_t j = 0; j < keysCount; ++j) { const TString& otherKey = keys[j]; const bool exists = j < i; size_t expectedSize = 0; if (exists) { expectedSize = otherKey.size(); } else { size_t max = 0; for (size_t k = 0; k < i; ++k) if (keys[k].Size() < otherKey.Size() && keys[k].Size() > max && otherKey.StartsWith(keys[k])) max = keys[k].Size(); expectedSize = max; } size_t prefixSize = 0xfcfcfc; TString value = "abcd"; const bool expectedResult = hasEmptyKey || expectedSize != 0; UNIT_ASSERT_VALUES_EQUAL_C(expectedResult, builder.FindLongestPrefix(otherKey.data(), otherKey.size(), &prefixSize, &value), "otherKey = " << HexEncode(otherKey)); if (expectedResult) { UNIT_ASSERT_VALUES_EQUAL(expectedSize, prefixSize); if (expectedSize) { UNIT_ASSERT_VALUES_EQUAL(TStringBuf(otherKey).SubStr(0, prefixSize), value); } else { UNIT_ASSERT_VALUES_EQUAL(EMPTY_VALUE, value); } } else { UNIT_ASSERT_VALUES_EQUAL("abcd", value); UNIT_ASSERT_VALUES_EQUAL(0xfcfcfc, prefixSize); } for (int c = 0; c < 10; ++c) { TString extendedKey = otherKey; extendedKey += RandChar(); size_t extendedPrefixSize = 0xdddddd; TString extendedValue = "dcba"; UNIT_ASSERT_VALUES_EQUAL(expectedResult, builder.FindLongestPrefix(extendedKey.data(), extendedKey.size(), &extendedPrefixSize, &extendedValue)); if (expectedResult) { UNIT_ASSERT_VALUES_EQUAL(value, extendedValue); UNIT_ASSERT_VALUES_EQUAL(prefixSize, extendedPrefixSize); } else { UNIT_ASSERT_VALUES_EQUAL("dcba", extendedValue); UNIT_ASSERT_VALUES_EQUAL(0xdddddd, extendedPrefixSize); } } } builder.Add(key.data(), key.size(), key); } TBufferOutput buffer; builder.Save(buffer); } void TCompactTrieTest::TestBuilderFindLongestPrefixWithEmptyValue() { TCompactTrieBuilder builder; builder.Add(u"", 42); builder.Add(u"yandex", 271828); builder.Add(u"ya", 31415); size_t prefixLen = 123; ui32 value = 0; UNIT_ASSERT(builder.FindLongestPrefix(u"google", &prefixLen, &value)); UNIT_ASSERT_VALUES_EQUAL(prefixLen, 0); UNIT_ASSERT_VALUES_EQUAL(value, 42); UNIT_ASSERT(builder.FindLongestPrefix(u"yahoo", &prefixLen, &value)); UNIT_ASSERT_VALUES_EQUAL(prefixLen, 2); UNIT_ASSERT_VALUES_EQUAL(value, 31415); TBufferOutput buffer; builder.Save(buffer); } void TCompactTrieTest::TestPatternSearcherEmpty() { TCompactPatternSearcherBuilder builder; TBufferOutput searcherData; builder.Save(searcherData); TCompactPatternSearcher searcher( searcherData.Buffer().Data(), searcherData.Buffer().Size() ); UNIT_ASSERT(searcher.SearchMatches("a").empty()); UNIT_ASSERT(searcher.SearchMatches("").empty()); UNIT_ASSERT(searcher.SearchMatches("abc").empty()); } void TCompactTrieTest::TestPatternSearcherOnDataset( const TVector& patterns, const TVector& samples ) { TCompactPatternSearcherBuilder builder; for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) { builder.Add(patterns[patternIdx], patternIdx); } TBufferOutput searcherData; builder.Save(searcherData); TCompactPatternSearcher searcher( searcherData.Buffer().Data(), searcherData.Buffer().Size() ); for (const auto& sample : samples) { const auto matches = searcher.SearchMatches(sample); size_t matchesNum = 0; THashSet processedPatterns; for (const auto& pattern : patterns) { if (pattern.Empty() || processedPatterns.contains(pattern)) { continue; } for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) { matchesNum += (pattern == sample.substr(start, pattern.Size())); } processedPatterns.insert(pattern); } UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size()); TSet> foundMatches; for (const auto& match : matches) { std::pair matchParams(match.End, match.Data); UNIT_ASSERT(!foundMatches.contains(matchParams)); foundMatches.insert(matchParams); const auto& pattern = patterns[match.Data]; UNIT_ASSERT_VALUES_EQUAL( sample.substr(match.End - pattern.size() + 1, pattern.size()), pattern ); } } } void TCompactTrieTest::TestPatternSearcherSimple() { TestPatternSearcherOnDataset( { // patterns "abcd", "abc", "ab", "a", "" }, { // samples "abcde", "abcd", "abc", "ab", "a", "" } ); TestPatternSearcherOnDataset( { // patterns "a" "ab", "abcd", }, { // samples "abcde", "abcd", "abc", "ab", "a", "" } ); TestPatternSearcherOnDataset( { // patterns "aaaa", "aaa", "aa", "a", }, { // samples "aaaaaaaaaaaa" } ); TestPatternSearcherOnDataset( { // patterns "aa", "ab", "ac", "ad", "ae", "af", "ba", "bb", "bc", "bd", "be", "bf", "ca", "cb", "cc", "cd", "ce", "cf", "da", "db", "dc", "dd", "de", "df", "ea", "eb", "ec", "ed", "ee", "ef", "fa", "fb", "fc", "fd", "fe", "ff" }, { // samples "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb", "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef", "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba", "", "a", "b", "c", "d", "e", "f", "aa", "ab", "ac", "ad", "ae", "af", "ba", "bb", "bc", "bd", "be", "bf", "ca", "cb", "cc", "cd", "ce", "cf", "da", "db", "dc", "dd", "de", "df", "ea", "eb", "ec", "ed", "ee", "ef", "fa", "fb", "fc", "fd", "fe", "ff" } ); } static char RandChar( TFastRng& rng, int maxChar ) { return static_cast(rng.GenRand() % (maxChar + 1)); } static TString RandStr( TFastRng& rng, size_t maxLength, int maxChar, bool nonEmpty = false ) { Y_ASSERT(maxLength > 0); size_t length; if (nonEmpty) { length = rng.GenRand() % maxLength + 1; } else { length = rng.GenRand() % (maxLength + 1); } TString result; while (result.size() < length) { result += RandChar(rng, maxChar); } return result; } void TCompactTrieTest::TestPatternSearcherRandom( size_t patternsNum, size_t patternMaxLength, size_t strMaxLength, int maxChar, TFastRng& rng ) { auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true); TVector patterns = {patternToSearch}; while (patterns.size() < patternsNum) { patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true)); } auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar); size_t leftFillerSize = rng.GenRand() % (filler.size() + 1); auto leftFiller = filler.substr(0, leftFillerSize); auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize); auto sample = leftFiller + patternToSearch + rightFiller; TestPatternSearcherOnDataset(patterns, {sample}); } void TCompactTrieTest::TestPatternSearcherRandom() { TFastRng rng(0); for (size_t patternMaxLen : {1, 2, 10}) { for (size_t strMaxLen : TVector{patternMaxLen, 2 * patternMaxLen, 10}) { for (int maxChar : {0, 1, 5, 255}) { for (size_t patternsNum : {1, 10}) { for (size_t testIdx = 0; testIdx < 3; ++testIdx) { TestPatternSearcherRandom( patternsNum, patternMaxLen, strMaxLen, maxChar, rng ); } } } } } }