comptrie_ut.cpp 57 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791
  1. #include <util/random/shuffle.h>
  2. #include <library/cpp/testing/unittest/registar.h>
  3. #include <util/stream/output.h>
  4. #include <utility>
  5. #include <util/charset/wide.h>
  6. #include <util/generic/algorithm.h>
  7. #include <util/generic/buffer.h>
  8. #include <util/generic/map.h>
  9. #include <util/generic/vector.h>
  10. #include <util/generic/ptr.h>
  11. #include <util/generic/ylimits.h>
  12. #include <util/folder/dirut.h>
  13. #include <util/random/random.h>
  14. #include <util/random/fast.h>
  15. #include <util/string/hex.h>
  16. #include <util/string/cast.h>
  17. #include "comptrie.h"
  18. #include "set.h"
  19. #include "first_symbol_iterator.h"
  20. #include "search_iterator.h"
  21. #include "pattern_searcher.h"
  22. #include <array>
  23. #include <iterator>
  24. class TCompactTrieTest: public TTestBase {
  25. private:
  26. UNIT_TEST_SUITE(TCompactTrieTest);
  27. UNIT_TEST(TestTrie8);
  28. UNIT_TEST(TestTrie16);
  29. UNIT_TEST(TestTrie32);
  30. UNIT_TEST(TestFastTrie8);
  31. UNIT_TEST(TestFastTrie16);
  32. UNIT_TEST(TestFastTrie32);
  33. UNIT_TEST(TestMinimizedTrie8);
  34. UNIT_TEST(TestMinimizedTrie16);
  35. UNIT_TEST(TestMinimizedTrie32);
  36. UNIT_TEST(TestFastMinimizedTrie8);
  37. UNIT_TEST(TestFastMinimizedTrie16);
  38. UNIT_TEST(TestFastMinimizedTrie32);
  39. UNIT_TEST(TestTrieIterator8);
  40. UNIT_TEST(TestTrieIterator16);
  41. UNIT_TEST(TestTrieIterator32);
  42. UNIT_TEST(TestMinimizedTrieIterator8);
  43. UNIT_TEST(TestMinimizedTrieIterator16);
  44. UNIT_TEST(TestMinimizedTrieIterator32);
  45. UNIT_TEST(TestPhraseSearch);
  46. UNIT_TEST(TestAddGet);
  47. UNIT_TEST(TestEmpty);
  48. UNIT_TEST(TestUninitializedNonEmpty);
  49. UNIT_TEST(TestRandom);
  50. UNIT_TEST(TestFindTails);
  51. UNIT_TEST(TestPrefixGrouped);
  52. UNIT_TEST(CrashTestPrefixGrouped);
  53. UNIT_TEST(TestMergeFromFile);
  54. UNIT_TEST(TestMergeFromBuffer);
  55. UNIT_TEST(TestUnique);
  56. UNIT_TEST(TestAddRetValue);
  57. UNIT_TEST(TestClear);
  58. UNIT_TEST(TestIterateEmptyKey);
  59. UNIT_TEST(TestTrieSet);
  60. UNIT_TEST(TestTrieForVectorInt64);
  61. UNIT_TEST(TestTrieForListInt64);
  62. UNIT_TEST(TestTrieForSetInt64);
  63. UNIT_TEST(TestTrieForVectorStroka);
  64. UNIT_TEST(TestTrieForListStroka);
  65. UNIT_TEST(TestTrieForSetStroka);
  66. UNIT_TEST(TestTrieForVectorWtroka);
  67. UNIT_TEST(TestTrieForVectorFloat);
  68. UNIT_TEST(TestTrieForVectorDouble);
  69. UNIT_TEST(TestTrieForListVectorInt64);
  70. UNIT_TEST(TestTrieForPairWtrokaVectorInt64);
  71. UNIT_TEST(TestEmptyValueOutOfOrder);
  72. UNIT_TEST(TestFindLongestPrefixWithEmptyValue);
  73. UNIT_TEST(TestSearchIterChar);
  74. UNIT_TEST(TestSearchIterWchar);
  75. UNIT_TEST(TestSearchIterWchar32)
  76. UNIT_TEST(TestCopyAndAssignment);
  77. UNIT_TEST(TestFirstSymbolIterator8);
  78. UNIT_TEST(TestFirstSymbolIterator16);
  79. UNIT_TEST(TestFirstSymbolIterator32);
  80. UNIT_TEST(TestFirstSymbolIteratorChar32);
  81. UNIT_TEST(TestArrayPacker);
  82. UNIT_TEST(TestBuilderFindLongestPrefix);
  83. UNIT_TEST(TestBuilderFindLongestPrefixWithEmptyValue);
  84. UNIT_TEST(TestPatternSearcherEmpty);
  85. UNIT_TEST(TestPatternSearcherSimple);
  86. UNIT_TEST(TestPatternSearcherRandom);
  87. UNIT_TEST_SUITE_END();
  88. static const char* SampleData[];
  89. template <class T>
  90. void CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout);
  91. template <class T>
  92. void CheckData(const char* src, size_t len);
  93. template <class T>
  94. void CheckUpperBound(const char* src, size_t len);
  95. template <class T>
  96. void CheckIterator(const char* src, size_t len);
  97. template <class T>
  98. void TestTrie(bool minimize, bool useFastLayout);
  99. template <class T>
  100. void TestTrieIterator(bool minimize);
  101. template <class T, bool minimize>
  102. void TestRandom(const size_t n, const size_t maxKeySize);
  103. void TestFindTailsImpl(const TString& prefix);
  104. void TestUniqueImpl(bool isPrefixGrouped);
  105. TVector<TUtf16String> GetSampleKeys(size_t nKeys) const;
  106. template <class TContainer>
  107. TVector<TContainer> GetSampleVectorData(size_t nValues);
  108. template <class TContainer>
  109. TVector<TContainer> GetSampleTextVectorData(size_t nValues);
  110. template <class T>
  111. void CheckEquality(const T& value1, const T& value2) const;
  112. template <class TContainer>
  113. void TestTrieWithContainers(const TVector<TUtf16String>& keys, const TVector<TContainer>& sampleData, TString methodName);
  114. template <typename TChar>
  115. void TestSearchIterImpl();
  116. template <class TTrie>
  117. void TestFirstSymbolIteratorForTrie(const TTrie& trie, const TStringBuf& narrowAnswers);
  118. template <typename TSymbol>
  119. void TestFirstSymbolIterator();
  120. template <class T>
  121. class TIntPacker;
  122. template <class T>
  123. class TDummyPacker;
  124. class TStrokaPacker;
  125. public:
  126. void TestPackers();
  127. void TestTrie8();
  128. void TestTrie16();
  129. void TestTrie32();
  130. void TestFastTrie8();
  131. void TestFastTrie16();
  132. void TestFastTrie32();
  133. void TestMinimizedTrie8();
  134. void TestMinimizedTrie16();
  135. void TestMinimizedTrie32();
  136. void TestFastMinimizedTrie8();
  137. void TestFastMinimizedTrie16();
  138. void TestFastMinimizedTrie32();
  139. void TestTrieIterator8();
  140. void TestTrieIterator16();
  141. void TestTrieIterator32();
  142. void TestMinimizedTrieIterator8();
  143. void TestMinimizedTrieIterator16();
  144. void TestMinimizedTrieIterator32();
  145. void TestPhraseSearch();
  146. void TestAddGet();
  147. void TestEmpty();
  148. void TestUninitializedNonEmpty();
  149. void TestRandom();
  150. void TestFindTails();
  151. void TestPrefixGrouped();
  152. void CrashTestPrefixGrouped();
  153. void TestMergeFromFile();
  154. void TestMergeFromBuffer();
  155. void TestUnique();
  156. void TestAddRetValue();
  157. void TestClear();
  158. void TestIterateEmptyKey();
  159. void TestTrieSet();
  160. void TestTrieForVectorInt64();
  161. void TestTrieForListInt64();
  162. void TestTrieForSetInt64();
  163. void TestTrieForVectorStroka();
  164. void TestTrieForListStroka();
  165. void TestTrieForSetStroka();
  166. void TestTrieForVectorWtroka();
  167. void TestTrieForVectorFloat();
  168. void TestTrieForVectorDouble();
  169. void TestTrieForListVectorInt64();
  170. void TestTrieForPairWtrokaVectorInt64();
  171. void TestEmptyValueOutOfOrder();
  172. void TestFindLongestPrefixWithEmptyValue();
  173. void TestSearchIterChar();
  174. void TestSearchIterWchar();
  175. void TestSearchIterWchar32();
  176. void TestCopyAndAssignment();
  177. void TestFirstSymbolIterator8();
  178. void TestFirstSymbolIterator16();
  179. void TestFirstSymbolIterator32();
  180. void TestFirstSymbolIteratorChar32();
  181. void TestArrayPacker();
  182. void TestBuilderFindLongestPrefix();
  183. void TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey);
  184. void TestBuilderFindLongestPrefixWithEmptyValue();
  185. void TestPatternSearcherOnDataset(
  186. const TVector<TString>& patterns,
  187. const TVector<TString>& samples
  188. );
  189. void TestPatternSearcherEmpty();
  190. void TestPatternSearcherSimple();
  191. void TestPatternSearcherRandom(
  192. size_t patternsNum,
  193. size_t patternMaxLength,
  194. size_t strMaxLength,
  195. int maxChar,
  196. TFastRng<ui64>& rng
  197. );
  198. void TestPatternSearcherRandom();
  199. };
  200. UNIT_TEST_SUITE_REGISTRATION(TCompactTrieTest);
  201. const char* TCompactTrieTest::SampleData[] = {
  202. "",
  203. "a", "b", "c", "d",
  204. "aa", "ab", "ac", "ad",
  205. "aaa", "aab", "aac", "aad",
  206. "aba", "abb", "abc", "abd",
  207. "fba", "fbb", "fbc", "fbd",
  208. "fbbaa",
  209. "c\x85\xA4\xBF" // Just something outside ASCII.
  210. };
  211. template <class T>
  212. typename TCompactTrie<T>::TKey MakeWideKey(const char* str, size_t len) {
  213. typename TCompactTrie<T>::TKey buffer;
  214. for (size_t i = 0; i < len; i++) {
  215. unsigned int ch = (str[i] & 0xFF);
  216. buffer.push_back((T)(ch | ch << 8 | ch << 16 | ch << 24));
  217. }
  218. return buffer;
  219. }
  220. template <class T>
  221. typename TCompactTrie<T>::TKey MakeWideKey(const TString& str) {
  222. return MakeWideKey<T>(str.c_str(), str.length());
  223. }
  224. template <class T>
  225. typename TCompactTrie<T>::TKey MakeWideKey(const TStringBuf& buf) {
  226. return MakeWideKey<T>(buf.data(), buf.length());
  227. }
  228. template <class T>
  229. void TCompactTrieTest::CreateTrie(IOutputStream& out, bool minimize, bool useFastLayout) {
  230. TCompactTrieBuilder<T> builder;
  231. for (auto& i : SampleData) {
  232. size_t len = strlen(i);
  233. builder.Add(MakeWideKey<T>(i, len), len * 2);
  234. }
  235. TBufferOutput tmp2;
  236. IOutputStream& currentOutput = useFastLayout ? tmp2 : out;
  237. if (minimize) {
  238. TBufferOutput buftmp;
  239. builder.Save(buftmp);
  240. CompactTrieMinimize<TCompactTriePacker<ui64>>(currentOutput, buftmp.Buffer().Data(), buftmp.Buffer().Size(), false);
  241. } else {
  242. builder.Save(currentOutput);
  243. }
  244. if (useFastLayout) {
  245. CompactTrieMakeFastLayout<TCompactTriePacker<T>>(out, tmp2.Buffer().Data(), tmp2.Buffer().Size(), false);
  246. }
  247. }
  248. // Iterates over all strings of length <= 4 made of letters a-g.
  249. static bool LexicographicStep(TString& s) {
  250. if (s.length() < 4) {
  251. s += "a";
  252. return true;
  253. }
  254. while (!s.empty() && s.back() == 'g')
  255. s.pop_back();
  256. if (s.empty())
  257. return false;
  258. char last = s.back();
  259. last++;
  260. s.pop_back();
  261. s.push_back(last);
  262. return true;
  263. }
  264. template <class T>
  265. void TCompactTrieTest::CheckUpperBound(const char* data, size_t datalen) {
  266. TCompactTrie<T> trie(data, datalen);
  267. typedef typename TCompactTrie<T>::TKey TKey;
  268. typedef typename TCompactTrie<T>::TData TData;
  269. TString key;
  270. do {
  271. const TKey wideKey = MakeWideKey<T>(key);
  272. typename TCompactTrie<T>::TConstIterator it = trie.UpperBound(wideKey);
  273. UNIT_ASSERT_C(it == trie.End() || it.GetKey() >= wideKey, "key=" + key);
  274. TData data;
  275. const bool found = trie.Find(wideKey, &data);
  276. if (found)
  277. UNIT_ASSERT_C(it.GetKey() == wideKey && it.GetValue() == data, "key=" + key);
  278. if (it != trie.Begin())
  279. UNIT_ASSERT_C((--it).GetKey() < wideKey, "key=" + key);
  280. } while (LexicographicStep(key));
  281. }
  282. template <class T>
  283. void TCompactTrieTest::CheckData(const char* data, size_t datalen) {
  284. TCompactTrie<T> trie(data, datalen);
  285. UNIT_ASSERT_VALUES_EQUAL(Y_ARRAY_SIZE(SampleData), trie.Size());
  286. for (auto& i : SampleData) {
  287. size_t len = strlen(i);
  288. ui64 value = 0;
  289. size_t prefixLen = 0;
  290. typename TCompactTrie<T>::TKey key = MakeWideKey<T>(i, len);
  291. UNIT_ASSERT(trie.Find(key, &value));
  292. UNIT_ASSERT_EQUAL(len * 2, value);
  293. UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
  294. UNIT_ASSERT_EQUAL(len, prefixLen);
  295. UNIT_ASSERT_EQUAL(len * 2, value);
  296. TString badkey("bb");
  297. badkey += i;
  298. key = MakeWideKey<T>(badkey);
  299. UNIT_ASSERT(!trie.Find(key));
  300. value = 123;
  301. UNIT_ASSERT(!trie.Find(key, &value));
  302. UNIT_ASSERT_EQUAL(123, value);
  303. UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
  304. UNIT_ASSERT_EQUAL(1, prefixLen);
  305. UNIT_ASSERT_EQUAL(2, value);
  306. badkey = i;
  307. badkey += "x";
  308. key = MakeWideKey<T>(badkey);
  309. UNIT_ASSERT(!trie.Find(key));
  310. value = 1234;
  311. UNIT_ASSERT(!trie.Find(key, &value));
  312. UNIT_ASSERT_EQUAL(1234, value);
  313. UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
  314. UNIT_ASSERT_EQUAL(len, prefixLen);
  315. UNIT_ASSERT_EQUAL(len * 2, value);
  316. UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, nullptr));
  317. UNIT_ASSERT_EQUAL(len, prefixLen);
  318. }
  319. TString testkey("fbbaa");
  320. typename TCompactTrie<T>::TKey key = MakeWideKey<T>(testkey);
  321. ui64 value = 0;
  322. size_t prefixLen = 0;
  323. UNIT_ASSERT(trie.FindLongestPrefix(key.data(), testkey.length() - 1, &prefixLen, &value));
  324. UNIT_ASSERT_EQUAL(prefixLen, 3);
  325. UNIT_ASSERT_EQUAL(6, value);
  326. testkey = "fbbax";
  327. key = MakeWideKey<T>(testkey);
  328. UNIT_ASSERT(trie.FindLongestPrefix(key, &prefixLen, &value));
  329. UNIT_ASSERT_EQUAL(prefixLen, 3);
  330. UNIT_ASSERT_EQUAL(6, value);
  331. value = 12345678;
  332. UNIT_ASSERT(!trie.Find(key, &value));
  333. UNIT_ASSERT_EQUAL(12345678, value); //Failed Find() should not change value
  334. }
  335. template <class T>
  336. void TCompactTrieTest::CheckIterator(const char* data, size_t datalen) {
  337. typedef typename TCompactTrie<T>::TKey TKey;
  338. typedef typename TCompactTrie<T>::TValueType TValue;
  339. TMap<TKey, ui64> stored;
  340. for (auto& i : SampleData) {
  341. size_t len = strlen(i);
  342. stored[MakeWideKey<T>(i, len)] = len * 2;
  343. }
  344. TCompactTrie<T> trie(data, datalen);
  345. TVector<TValue> items;
  346. typename TCompactTrie<T>::TConstIterator it = trie.Begin();
  347. size_t entry_count = 0;
  348. TMap<TKey, ui64> received;
  349. while (it != trie.End()) {
  350. UNIT_ASSERT_VALUES_EQUAL(it.GetKeySize(), it.GetKey().size());
  351. received.insert(*it);
  352. items.push_back(*it);
  353. entry_count++;
  354. it++;
  355. }
  356. TMap<TKey, ui64> received2;
  357. for (std::pair<TKey, ui64> x : trie) {
  358. received2.insert(x);
  359. }
  360. UNIT_ASSERT(entry_count == stored.size());
  361. UNIT_ASSERT(received == stored);
  362. UNIT_ASSERT(received2 == stored);
  363. std::reverse(items.begin(), items.end());
  364. typename TCompactTrie<T>::TConstIterator revIt = trie.End();
  365. typename TCompactTrie<T>::TConstIterator const begin = trie.Begin();
  366. typename TCompactTrie<T>::TConstIterator emptyIt;
  367. size_t pos = 0;
  368. while (revIt != begin) {
  369. revIt--;
  370. UNIT_ASSERT(*revIt == items[pos]);
  371. pos++;
  372. }
  373. // Checking the assignment operator.
  374. revIt = begin;
  375. UNIT_ASSERT(revIt == trie.Begin());
  376. UNIT_ASSERT(!revIt.IsEmpty());
  377. UNIT_ASSERT(revIt != emptyIt);
  378. UNIT_ASSERT(revIt != trie.End());
  379. ++revIt; // Call a method that uses Skipper.
  380. revIt = emptyIt;
  381. UNIT_ASSERT(revIt == emptyIt);
  382. UNIT_ASSERT(revIt.IsEmpty());
  383. UNIT_ASSERT(revIt != trie.End());
  384. // Checking the move assignment operator.
  385. revIt = trie.Begin();
  386. UNIT_ASSERT(revIt == trie.Begin());
  387. UNIT_ASSERT(!revIt.IsEmpty());
  388. UNIT_ASSERT(revIt != emptyIt);
  389. UNIT_ASSERT(revIt != trie.End());
  390. ++revIt; // Call a method that uses Skipper.
  391. revIt = typename TCompactTrie<T>::TConstIterator();
  392. UNIT_ASSERT(revIt == emptyIt);
  393. UNIT_ASSERT(revIt.IsEmpty());
  394. UNIT_ASSERT(revIt != trie.End());
  395. }
  396. template <class T>
  397. void TCompactTrieTest::TestTrie(bool minimize, bool useFastLayout) {
  398. TBufferOutput bufout;
  399. CreateTrie<T>(bufout, minimize, useFastLayout);
  400. CheckData<T>(bufout.Buffer().Data(), bufout.Buffer().Size());
  401. CheckUpperBound<T>(bufout.Buffer().Data(), bufout.Buffer().Size());
  402. }
  403. template <class T>
  404. void TCompactTrieTest::TestTrieIterator(bool minimize) {
  405. TBufferOutput bufout;
  406. CreateTrie<T>(bufout, minimize, false);
  407. CheckIterator<T>(bufout.Buffer().Data(), bufout.Buffer().Size());
  408. }
  409. void TCompactTrieTest::TestTrie8() {
  410. TestTrie<char>(false, false);
  411. }
  412. void TCompactTrieTest::TestTrie16() {
  413. TestTrie<wchar16>(false, false);
  414. }
  415. void TCompactTrieTest::TestTrie32() {
  416. TestTrie<wchar32>(false, false);
  417. }
  418. void TCompactTrieTest::TestFastTrie8() {
  419. TestTrie<char>(false, true);
  420. }
  421. void TCompactTrieTest::TestFastTrie16() {
  422. TestTrie<wchar16>(false, true);
  423. }
  424. void TCompactTrieTest::TestFastTrie32() {
  425. TestTrie<wchar32>(false, true);
  426. }
  427. void TCompactTrieTest::TestMinimizedTrie8() {
  428. TestTrie<char>(true, false);
  429. }
  430. void TCompactTrieTest::TestMinimizedTrie16() {
  431. TestTrie<wchar16>(true, false);
  432. }
  433. void TCompactTrieTest::TestMinimizedTrie32() {
  434. TestTrie<wchar32>(true, false);
  435. }
  436. void TCompactTrieTest::TestFastMinimizedTrie8() {
  437. TestTrie<char>(true, true);
  438. }
  439. void TCompactTrieTest::TestFastMinimizedTrie16() {
  440. TestTrie<wchar16>(true, true);
  441. }
  442. void TCompactTrieTest::TestFastMinimizedTrie32() {
  443. TestTrie<wchar32>(true, true);
  444. }
  445. void TCompactTrieTest::TestTrieIterator8() {
  446. TestTrieIterator<char>(false);
  447. }
  448. void TCompactTrieTest::TestTrieIterator16() {
  449. TestTrieIterator<wchar16>(false);
  450. }
  451. void TCompactTrieTest::TestTrieIterator32() {
  452. TestTrieIterator<wchar32>(false);
  453. }
  454. void TCompactTrieTest::TestMinimizedTrieIterator8() {
  455. TestTrieIterator<char>(true);
  456. }
  457. void TCompactTrieTest::TestMinimizedTrieIterator16() {
  458. TestTrieIterator<wchar16>(true);
  459. }
  460. void TCompactTrieTest::TestMinimizedTrieIterator32() {
  461. TestTrieIterator<wchar32>(true);
  462. }
  463. void TCompactTrieTest::TestPhraseSearch() {
  464. static const char* phrases[] = {"ab", "ab cd", "ab cd ef"};
  465. static const char* const goodphrase = "ab cd ef gh";
  466. static const char* const badphrase = "cd ef gh ab";
  467. TBufferOutput bufout;
  468. TCompactTrieBuilder<char> builder;
  469. for (size_t i = 0; i < Y_ARRAY_SIZE(phrases); i++) {
  470. builder.Add(phrases[i], strlen(phrases[i]), i);
  471. }
  472. builder.Save(bufout);
  473. TCompactTrie<char> trie(bufout.Buffer().Data(), bufout.Buffer().Size());
  474. TVector<TCompactTrie<char>::TPhraseMatch> matches;
  475. trie.FindPhrases(goodphrase, strlen(goodphrase), matches);
  476. UNIT_ASSERT(matches.size() == Y_ARRAY_SIZE(phrases));
  477. for (size_t i = 0; i < Y_ARRAY_SIZE(phrases); i++) {
  478. UNIT_ASSERT(matches[i].first == strlen(phrases[i]));
  479. UNIT_ASSERT(matches[i].second == i);
  480. }
  481. trie.FindPhrases(badphrase, strlen(badphrase), matches);
  482. UNIT_ASSERT(matches.size() == 0);
  483. }
  484. void TCompactTrieTest::TestAddGet() {
  485. TCompactTrieBuilder<char> builder;
  486. builder.Add("abcd", 4, 1);
  487. builder.Add("acde", 4, 2);
  488. ui64 dummy;
  489. UNIT_ASSERT(builder.Find("abcd", 4, &dummy));
  490. UNIT_ASSERT(1 == dummy);
  491. UNIT_ASSERT(builder.Find("acde", 4, &dummy));
  492. UNIT_ASSERT(2 == dummy);
  493. UNIT_ASSERT(!builder.Find("fgdgfacde", 9, &dummy));
  494. UNIT_ASSERT(!builder.Find("ab", 2, &dummy));
  495. }
  496. void TCompactTrieTest::TestEmpty() {
  497. TCompactTrieBuilder<char> builder;
  498. ui64 dummy = 12345;
  499. size_t prefixLen;
  500. UNIT_ASSERT(!builder.Find("abc", 3, &dummy));
  501. TBufferOutput bufout;
  502. builder.Save(bufout);
  503. TCompactTrie<char> trie(bufout.Buffer().Data(), bufout.Buffer().Size());
  504. UNIT_ASSERT(!trie.Find("abc", 3, &dummy));
  505. UNIT_ASSERT(!trie.Find("", 0, &dummy));
  506. UNIT_ASSERT(!trie.FindLongestPrefix("abc", 3, &prefixLen, &dummy));
  507. UNIT_ASSERT(!trie.FindLongestPrefix("", 0, &prefixLen, &dummy));
  508. UNIT_ASSERT_EQUAL(12345, dummy);
  509. UNIT_ASSERT(trie.Begin() == trie.End());
  510. TCompactTrie<> trieNull;
  511. UNIT_ASSERT(!trieNull.Find(" ", 1));
  512. TCompactTrie<>::TPhraseMatchVector matches;
  513. trieNull.FindPhrases(" ", 1, matches); // just to be sure it doesn't crash
  514. UNIT_ASSERT(trieNull.Begin() == trieNull.End());
  515. }
  516. void TCompactTrieTest::TestUninitializedNonEmpty() {
  517. TBufferOutput bufout;
  518. CreateTrie<char>(bufout, false, false);
  519. TCompactTrie<char> trie(bufout.Buffer().Data(), bufout.Buffer().Size());
  520. typedef TCompactTrie<char>::TKey TKey;
  521. typedef TCompactTrie<char>::TConstIterator TIter;
  522. TCompactTrie<char> tails = trie.FindTails("abd", 3); // A trie that has empty value and no data.
  523. UNIT_ASSERT(!tails.IsEmpty());
  524. UNIT_ASSERT(!tails.IsInitialized());
  525. const TKey wideKey = MakeWideKey<char>("c", 1);
  526. TIter it = tails.UpperBound(wideKey);
  527. UNIT_ASSERT(it == tails.End());
  528. UNIT_ASSERT(it != tails.Begin());
  529. --it;
  530. UNIT_ASSERT(it == tails.Begin());
  531. ++it;
  532. UNIT_ASSERT(it == tails.End());
  533. }
  534. static char RandChar() {
  535. return char(RandomNumber<size_t>() % 256);
  536. }
  537. static TString RandStr(const size_t max) {
  538. size_t len = RandomNumber<size_t>() % max;
  539. TString key;
  540. for (size_t j = 0; j < len; ++j)
  541. key += RandChar();
  542. return key;
  543. }
  544. template <class T, bool minimize>
  545. void TCompactTrieTest::TestRandom(const size_t n, const size_t maxKeySize) {
  546. const TStringBuf EMPTY_KEY = TStringBuf("", 1);
  547. TCompactTrieBuilder<char, typename T::TData, T> builder;
  548. typedef TMap<TString, typename T::TData> TKeys;
  549. TKeys keys;
  550. typename T::TData dummy;
  551. for (size_t i = 0; i < n; ++i) {
  552. const TString key = RandStr(maxKeySize);
  553. if (key != EMPTY_KEY && keys.find(key) == keys.end()) {
  554. const typename T::TData val = T::Data(key);
  555. keys[key] = val;
  556. UNIT_ASSERT_C(!builder.Find(key.data(), key.size(), &dummy), "key = " << HexEncode(TString(key)));
  557. builder.Add(key.data(), key.size(), val);
  558. UNIT_ASSERT_C(builder.Find(key.data(), key.size(), &dummy), "key = " << HexEncode(TString(key)));
  559. UNIT_ASSERT(dummy == val);
  560. }
  561. }
  562. TBufferStream stream;
  563. size_t len = builder.Save(stream);
  564. TCompactTrie<char, typename T::TData, T> trie(stream.Buffer().Data(), len);
  565. TBufferStream buftmp;
  566. if (minimize) {
  567. CompactTrieMinimize<T>(buftmp, stream.Buffer().Data(), len, false);
  568. }
  569. TCompactTrie<char, typename T::TData, T> trieMin(buftmp.Buffer().Data(), buftmp.Buffer().Size());
  570. TCompactTrieBuilder<char, typename T::TData, T> prefixGroupedBuilder(CTBF_PREFIX_GROUPED);
  571. for (typename TKeys::const_iterator i = keys.begin(), mi = keys.end(); i != mi; ++i) {
  572. UNIT_ASSERT(!prefixGroupedBuilder.Find(i->first.c_str(), i->first.size(), &dummy));
  573. UNIT_ASSERT(trie.Find(i->first.c_str(), i->first.size(), &dummy));
  574. UNIT_ASSERT(dummy == i->second);
  575. if (minimize) {
  576. UNIT_ASSERT(trieMin.Find(i->first.c_str(), i->first.size(), &dummy));
  577. UNIT_ASSERT(dummy == i->second);
  578. }
  579. prefixGroupedBuilder.Add(i->first.c_str(), i->first.size(), dummy);
  580. UNIT_ASSERT(prefixGroupedBuilder.Find(i->first.c_str(), i->first.size(), &dummy));
  581. for (typename TKeys::const_iterator j = keys.begin(), end = keys.end(); j != end; ++j) {
  582. typename T::TData valFound;
  583. if (j->first <= i->first) {
  584. UNIT_ASSERT(prefixGroupedBuilder.Find(j->first.c_str(), j->first.size(), &valFound));
  585. UNIT_ASSERT_VALUES_EQUAL(j->second, valFound);
  586. } else {
  587. UNIT_ASSERT(!prefixGroupedBuilder.Find(j->first.c_str(), j->first.size(), &valFound));
  588. }
  589. }
  590. }
  591. TBufferStream prefixGroupedBuffer;
  592. prefixGroupedBuilder.Save(prefixGroupedBuffer);
  593. UNIT_ASSERT_VALUES_EQUAL(stream.Buffer().Size(), prefixGroupedBuffer.Buffer().Size());
  594. UNIT_ASSERT(0 == memcmp(stream.Buffer().Data(), prefixGroupedBuffer.Buffer().Data(), stream.Buffer().Size()));
  595. }
  596. void TCompactTrieTest::TestRandom() {
  597. TestRandom<TIntPacker<ui64>, true>(1000, 1000);
  598. TestRandom<TIntPacker<int>, true>(100, 100);
  599. TestRandom<TDummyPacker<ui64>, true>(0, 0);
  600. TestRandom<TDummyPacker<ui64>, true>(100, 3);
  601. TestRandom<TDummyPacker<ui64>, true>(100, 100);
  602. TestRandom<TStrokaPacker, true>(100, 100);
  603. }
  604. void TCompactTrieTest::TestFindTailsImpl(const TString& prefix) {
  605. TCompactTrieBuilder<> builder;
  606. TMap<TString, ui64> input;
  607. for (auto& i : SampleData) {
  608. TString temp = i;
  609. ui64 val = temp.size() * 2;
  610. builder.Add(temp.data(), temp.size(), val);
  611. if (temp.StartsWith(prefix)) {
  612. input[temp.substr(prefix.size())] = val;
  613. }
  614. }
  615. typedef TCompactTrie<> TTrie;
  616. TBufferStream stream;
  617. size_t len = builder.Save(stream);
  618. TTrie trie(stream.Buffer().Data(), len);
  619. TTrie subtrie = trie.FindTails(prefix.data(), prefix.size());
  620. TMap<TString, ui64> output;
  621. for (TTrie::TConstIterator i = subtrie.Begin(), mi = subtrie.End(); i != mi; ++i) {
  622. TTrie::TValueType val = *i;
  623. output[TString(val.first.data(), val.first.size())] = val.second;
  624. }
  625. UNIT_ASSERT(input.size() == output.size());
  626. UNIT_ASSERT(input == output);
  627. TBufferStream buftmp;
  628. CompactTrieMinimize<TTrie::TPacker>(buftmp, stream.Buffer().Data(), len, false);
  629. TTrie trieMin(buftmp.Buffer().Data(), buftmp.Buffer().Size());
  630. subtrie = trieMin.FindTails(prefix.data(), prefix.size());
  631. output.clear();
  632. for (TTrie::TConstIterator i = subtrie.Begin(), mi = subtrie.End(); i != mi; ++i) {
  633. TTrie::TValueType val = *i;
  634. output[TString(val.first.data(), val.first.size())] = val.second;
  635. }
  636. UNIT_ASSERT(input.size() == output.size());
  637. UNIT_ASSERT(input == output);
  638. }
  639. void TCompactTrieTest::TestPrefixGrouped() {
  640. TBuffer b1b;
  641. TCompactTrieBuilder<char, ui32> b1(CTBF_PREFIX_GROUPED);
  642. const char* data[] = {
  643. "Kazan",
  644. "Moscow",
  645. "Monino",
  646. "Murmansk",
  647. "Fryanovo",
  648. "Fryazino",
  649. "Fryazevo",
  650. "Tumen",
  651. };
  652. for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
  653. ui32 val = strlen(data[i]) + 1;
  654. b1.Add(data[i], strlen(data[i]), val);
  655. for (size_t j = 0; j < Y_ARRAY_SIZE(data); ++j) {
  656. ui32 mustHave = strlen(data[j]) + 1;
  657. ui32 found = 0;
  658. if (j <= i) {
  659. UNIT_ASSERT(b1.Find(data[j], strlen(data[j]), &found));
  660. UNIT_ASSERT_VALUES_EQUAL(mustHave, found);
  661. } else {
  662. UNIT_ASSERT(!b1.Find(data[j], strlen(data[j]), &found));
  663. }
  664. }
  665. }
  666. {
  667. TBufferOutput b1bo(b1b);
  668. b1.Save(b1bo);
  669. }
  670. TCompactTrie<char, ui32> t1(TBlob::FromBuffer(b1b));
  671. //t1.Print(Cerr);
  672. for (auto& i : data) {
  673. ui32 v;
  674. UNIT_ASSERT(t1.Find(i, strlen(i), &v));
  675. UNIT_ASSERT_VALUES_EQUAL(strlen(i) + 1, v);
  676. }
  677. }
  678. void TCompactTrieTest::CrashTestPrefixGrouped() {
  679. TCompactTrieBuilder<char, ui32> builder(CTBF_PREFIX_GROUPED);
  680. const char* data[] = {
  681. "Fryazino",
  682. "Fryanovo",
  683. "Monino",
  684. "",
  685. "Fryazevo",
  686. };
  687. bool wasException = false;
  688. try {
  689. for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
  690. builder.Add(data[i], strlen(data[i]), i + 1);
  691. }
  692. } catch (const yexception& e) {
  693. wasException = true;
  694. UNIT_ASSERT(strstr(e.what(), "Bad input order - expected input strings to be prefix-grouped."));
  695. }
  696. UNIT_ASSERT_C(wasException, "CrashTestPrefixGrouped");
  697. }
  698. void TCompactTrieTest::TestMergeFromFile() {
  699. {
  700. TCompactTrieBuilder<> b;
  701. b.Add("yandex", 12);
  702. b.Add("google", 13);
  703. b.Add("mail", 14);
  704. TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru");
  705. b.Save(out);
  706. }
  707. {
  708. TCompactTrieBuilder<> b;
  709. b.Add("yandex", 112);
  710. b.Add("google", 113);
  711. b.Add("yahoo", 114);
  712. TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com");
  713. b.Save(out);
  714. }
  715. {
  716. TCompactTrieBuilder<> b;
  717. UNIT_ASSERT(b.AddSubtreeInFile("com.", GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com"));
  718. UNIT_ASSERT(b.Add("org.kernel", 22));
  719. UNIT_ASSERT(b.AddSubtreeInFile("ru.", GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru"));
  720. TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res");
  721. b.Save(out);
  722. }
  723. TCompactTrie<> trie(TBlob::FromFileSingleThreaded(GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res"));
  724. UNIT_ASSERT_VALUES_EQUAL(12u, trie.Get("ru.yandex"));
  725. UNIT_ASSERT_VALUES_EQUAL(13u, trie.Get("ru.google"));
  726. UNIT_ASSERT_VALUES_EQUAL(14u, trie.Get("ru.mail"));
  727. UNIT_ASSERT_VALUES_EQUAL(22u, trie.Get("org.kernel"));
  728. UNIT_ASSERT_VALUES_EQUAL(112u, trie.Get("com.yandex"));
  729. UNIT_ASSERT_VALUES_EQUAL(113u, trie.Get("com.google"));
  730. UNIT_ASSERT_VALUES_EQUAL(114u, trie.Get("com.yahoo"));
  731. unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-res").data());
  732. unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-com").data());
  733. unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMerge-ru").data());
  734. }
  735. void TCompactTrieTest::TestMergeFromBuffer() {
  736. TArrayWithSizeHolder<char> buffer1;
  737. {
  738. TCompactTrieBuilder<> b;
  739. b.Add("aaaaa", 1);
  740. b.Add("bbbbb", 2);
  741. b.Add("ccccc", 3);
  742. buffer1.Resize(b.MeasureByteSize());
  743. TMemoryOutput out(buffer1.Get(), buffer1.Size());
  744. b.Save(out);
  745. }
  746. TArrayWithSizeHolder<char> buffer2;
  747. {
  748. TCompactTrieBuilder<> b;
  749. b.Add("aaaaa", 10);
  750. b.Add("bbbbb", 20);
  751. b.Add("ccccc", 30);
  752. b.Add("xxxxx", 40);
  753. b.Add("yyyyy", 50);
  754. buffer2.Resize(b.MeasureByteSize());
  755. TMemoryOutput out(buffer2.Get(), buffer2.Size());
  756. b.Save(out);
  757. }
  758. {
  759. TCompactTrieBuilder<> b;
  760. UNIT_ASSERT(b.AddSubtreeInBuffer("com.", std::move(buffer1)));
  761. UNIT_ASSERT(b.Add("org.upyachka", 42));
  762. UNIT_ASSERT(b.AddSubtreeInBuffer("ru.", std::move(buffer2)));
  763. TUnbufferedFileOutput out(GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res");
  764. b.Save(out);
  765. }
  766. TCompactTrie<> trie(TBlob::FromFileSingleThreaded(GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res"));
  767. UNIT_ASSERT_VALUES_EQUAL(10u, trie.Get("ru.aaaaa"));
  768. UNIT_ASSERT_VALUES_EQUAL(20u, trie.Get("ru.bbbbb"));
  769. UNIT_ASSERT_VALUES_EQUAL(40u, trie.Get("ru.xxxxx"));
  770. UNIT_ASSERT_VALUES_EQUAL(42u, trie.Get("org.upyachka"));
  771. UNIT_ASSERT_VALUES_EQUAL(1u, trie.Get("com.aaaaa"));
  772. UNIT_ASSERT_VALUES_EQUAL(2u, trie.Get("com.bbbbb"));
  773. UNIT_ASSERT_VALUES_EQUAL(3u, trie.Get("com.ccccc"));
  774. unlink((GetSystemTempDir() + "/TCompactTrieTest-TestMergeFromBuffer-res").data());
  775. }
  776. void TCompactTrieTest::TestUnique() {
  777. TestUniqueImpl(false);
  778. TestUniqueImpl(true);
  779. }
  780. void TCompactTrieTest::TestUniqueImpl(bool isPrefixGrouped) {
  781. TCompactTrieBuilder<char, ui32> builder(CTBF_UNIQUE | (isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE));
  782. const char* data[] = {
  783. "Kazan",
  784. "Moscow",
  785. "Monino",
  786. "Murmansk",
  787. "Fryanovo",
  788. "Fryazino",
  789. "Fryazevo",
  790. "Fry",
  791. "Tumen",
  792. };
  793. for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
  794. UNIT_ASSERT_C(builder.Add(data[i], strlen(data[i]), i + 1), i);
  795. }
  796. bool wasException = false;
  797. try {
  798. builder.Add(data[4], strlen(data[4]), 20);
  799. } catch (const yexception& e) {
  800. wasException = true;
  801. UNIT_ASSERT(strstr(e.what(), "Duplicate key"));
  802. }
  803. UNIT_ASSERT_C(wasException, "TestUnique");
  804. }
  805. void TCompactTrieTest::TestAddRetValue() {
  806. TCompactTrieBuilder<char, ui32> builder;
  807. const char* data[] = {
  808. "Kazan",
  809. "Moscow",
  810. "Monino",
  811. "Murmansk",
  812. "Fryanovo",
  813. "Fryazino",
  814. "Fryazevo",
  815. "Fry",
  816. "Tumen",
  817. };
  818. for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
  819. UNIT_ASSERT(builder.Add(data[i], strlen(data[i]), i + 1));
  820. UNIT_ASSERT(!builder.Add(data[i], strlen(data[i]), i + 2));
  821. ui32 value;
  822. UNIT_ASSERT(builder.Find(data[i], strlen(data[i]), &value));
  823. UNIT_ASSERT(value == i + 2);
  824. }
  825. }
  826. void TCompactTrieTest::TestClear() {
  827. TCompactTrieBuilder<char, ui32> builder;
  828. const char* data[] = {
  829. "Kazan",
  830. "Moscow",
  831. "Monino",
  832. "Murmansk",
  833. "Fryanovo",
  834. "Fryazino",
  835. "Fryazevo",
  836. "Fry",
  837. "Tumen",
  838. };
  839. for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
  840. builder.Add(data[i], strlen(data[i]), i + 1);
  841. }
  842. UNIT_ASSERT(builder.GetEntryCount() == Y_ARRAY_SIZE(data));
  843. builder.Clear();
  844. UNIT_ASSERT(builder.GetEntryCount() == 0);
  845. UNIT_ASSERT(builder.GetNodeCount() == 1);
  846. }
  847. void TCompactTrieTest::TestFindTails() {
  848. TestFindTailsImpl("aa");
  849. TestFindTailsImpl("bb");
  850. TestFindTailsImpl("fb");
  851. TestFindTailsImpl("fbc");
  852. TestFindTailsImpl("fbbaa");
  853. }
  854. template <class T>
  855. class TCompactTrieTest::TDummyPacker: public TNullPacker<T> {
  856. public:
  857. static T Data(const TString&) {
  858. T data;
  859. TNullPacker<T>().UnpackLeaf(nullptr, data);
  860. return data;
  861. }
  862. typedef T TData;
  863. };
  864. class TCompactTrieTest::TStrokaPacker: public TCompactTriePacker<TString> {
  865. public:
  866. typedef TString TData;
  867. static TString Data(const TString& str) {
  868. return str;
  869. }
  870. };
  871. template <class T>
  872. class TCompactTrieTest::TIntPacker: public TCompactTriePacker<T> {
  873. public:
  874. typedef T TData;
  875. static TData Data(const TString&) {
  876. return RandomNumber<std::make_unsigned_t<T>>();
  877. }
  878. };
  879. void TCompactTrieTest::TestIterateEmptyKey() {
  880. TBuffer trieBuffer;
  881. {
  882. TCompactTrieBuilder<char, ui32> builder;
  883. UNIT_ASSERT(builder.Add("", 1));
  884. TBufferStream trieBufferO(trieBuffer);
  885. builder.Save(trieBufferO);
  886. }
  887. TCompactTrie<char, ui32> trie(TBlob::FromBuffer(trieBuffer));
  888. ui32 val;
  889. UNIT_ASSERT(trie.Find("", &val));
  890. UNIT_ASSERT(val == 1);
  891. TCompactTrie<char, ui32>::TConstIterator it = trie.Begin();
  892. UNIT_ASSERT(it.GetKey().empty());
  893. UNIT_ASSERT(it.GetValue() == 1);
  894. }
  895. void TCompactTrieTest::TestTrieSet() {
  896. TBuffer buffer;
  897. {
  898. TCompactTrieSet<char>::TBuilder builder;
  899. UNIT_ASSERT(builder.Add("a", 0));
  900. UNIT_ASSERT(builder.Add("ab", 1));
  901. UNIT_ASSERT(builder.Add("abc", 1));
  902. UNIT_ASSERT(builder.Add("abcd", 0));
  903. UNIT_ASSERT(!builder.Add("abcd", 1));
  904. TBufferStream stream(buffer);
  905. builder.Save(stream);
  906. }
  907. TCompactTrieSet<char> set(TBlob::FromBuffer(buffer));
  908. UNIT_ASSERT(set.Has("a"));
  909. UNIT_ASSERT(set.Has("ab"));
  910. UNIT_ASSERT(set.Has("abc"));
  911. UNIT_ASSERT(set.Has("abcd"));
  912. UNIT_ASSERT(!set.Has("abcde"));
  913. UNIT_ASSERT(!set.Has("aa"));
  914. UNIT_ASSERT(!set.Has("b"));
  915. UNIT_ASSERT(!set.Has(""));
  916. TCompactTrieSet<char> tails;
  917. UNIT_ASSERT(set.FindTails("a", tails));
  918. UNIT_ASSERT(tails.Has("b"));
  919. UNIT_ASSERT(tails.Has("bcd"));
  920. UNIT_ASSERT(!tails.Has("ab"));
  921. UNIT_ASSERT(!set.Has(""));
  922. TCompactTrieSet<char> empty;
  923. UNIT_ASSERT(set.FindTails("abcd", empty));
  924. UNIT_ASSERT(!empty.Has("a"));
  925. UNIT_ASSERT(!empty.Has("b"));
  926. UNIT_ASSERT(!empty.Has("c"));
  927. UNIT_ASSERT(!empty.Has("d"));
  928. UNIT_ASSERT(!empty.Has("d"));
  929. UNIT_ASSERT(empty.Has("")); // contains only empty string
  930. }
  931. // Tests for trie with vector (list, set) values
  932. TVector<TUtf16String> TCompactTrieTest::GetSampleKeys(size_t nKeys) const {
  933. Y_ASSERT(nKeys <= 10);
  934. TString sampleKeys[] = {"a", "b", "ac", "bd", "abe", "bcf", "deg", "ah", "xy", "abc"};
  935. TVector<TUtf16String> result;
  936. for (size_t i = 0; i < nKeys; i++)
  937. result.push_back(ASCIIToWide(sampleKeys[i]));
  938. return result;
  939. }
  940. template <class TContainer>
  941. TVector<TContainer> TCompactTrieTest::GetSampleVectorData(size_t nValues) {
  942. TVector<TContainer> data;
  943. for (size_t i = 0; i < nValues; i++) {
  944. data.push_back(TContainer());
  945. for (size_t j = 0; j < i; j++)
  946. data[i].insert(data[i].end(), (typename TContainer::value_type)((j == 3) ? 0 : (1 << (j * 5))));
  947. }
  948. return data;
  949. }
  950. template <class TContainer>
  951. TVector<TContainer> TCompactTrieTest::GetSampleTextVectorData(size_t nValues) {
  952. TVector<TContainer> data;
  953. for (size_t i = 0; i < nValues; i++) {
  954. data.push_back(TContainer());
  955. for (size_t j = 0; j < i; j++)
  956. data[i].insert(data[i].end(), TString("abc") + ToString<size_t>(j));
  957. }
  958. return data;
  959. }
  960. template <class T>
  961. void TCompactTrieTest::CheckEquality(const T& value1, const T& value2) const {
  962. UNIT_ASSERT_VALUES_EQUAL(value1, value2);
  963. }
  964. template <>
  965. void TCompactTrieTest::CheckEquality<TVector<i64>>(const TVector<i64>& value1, const TVector<i64>& value2) const {
  966. UNIT_ASSERT_VALUES_EQUAL(value1.size(), value2.size());
  967. for (size_t i = 0; i < value1.size(); i++)
  968. UNIT_ASSERT_VALUES_EQUAL(value1[i], value2[i]);
  969. }
  970. template <class TContainer>
  971. void TCompactTrieTest::TestTrieWithContainers(const TVector<TUtf16String>& keys, const TVector<TContainer>& sampleData, TString methodName) {
  972. TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName;
  973. TCompactTrieBuilder<wchar16, TContainer> b;
  974. for (size_t i = 0; i < keys.size(); i++) {
  975. b.Add(keys[i], sampleData[i]);
  976. }
  977. TUnbufferedFileOutput out(fileName);
  978. b.Save(out);
  979. TCompactTrie<wchar16, TContainer> trie(TBlob::FromFileSingleThreaded(fileName));
  980. for (size_t i = 0; i < keys.size(); i++) {
  981. TContainer value = trie.Get(keys[i]);
  982. UNIT_ASSERT_VALUES_EQUAL(value.size(), sampleData[i].size());
  983. typename TContainer::const_iterator p = value.begin();
  984. typename TContainer::const_iterator p1 = sampleData[i].begin();
  985. for (; p != value.end(); p++, p1++)
  986. CheckEquality<typename TContainer::value_type>(*p, *p1);
  987. }
  988. unlink(fileName.data());
  989. }
  990. template <>
  991. void TCompactTrieTest::TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(const TVector<TUtf16String>& keys, const TVector<std::pair<TUtf16String, TVector<i64>>>& sampleData, TString methodName) {
  992. typedef std::pair<TUtf16String, TVector<i64>> TContainer;
  993. TString fileName = GetSystemTempDir() + "/TCompactTrieTest-TestTrieWithContainers-" + methodName;
  994. TCompactTrieBuilder<wchar16, TContainer> b;
  995. for (size_t i = 0; i < keys.size(); i++) {
  996. b.Add(keys[i], sampleData[i]);
  997. }
  998. TUnbufferedFileOutput out(fileName);
  999. b.Save(out);
  1000. TCompactTrie<wchar16, TContainer> trie(TBlob::FromFileSingleThreaded(fileName));
  1001. for (size_t i = 0; i < keys.size(); i++) {
  1002. TContainer value = trie.Get(keys[i]);
  1003. CheckEquality<TContainer::first_type>(value.first, sampleData[i].first);
  1004. CheckEquality<TContainer::second_type>(value.second, sampleData[i].second);
  1005. }
  1006. unlink(fileName.data());
  1007. }
  1008. void TCompactTrieTest::TestTrieForVectorInt64() {
  1009. TestTrieWithContainers<TVector<i64>>(GetSampleKeys(10), GetSampleVectorData<TVector<i64>>(10), "v-i64");
  1010. }
  1011. void TCompactTrieTest::TestTrieForListInt64() {
  1012. TestTrieWithContainers<TList<i64>>(GetSampleKeys(10), GetSampleVectorData<TList<i64>>(10), "l-i64");
  1013. }
  1014. void TCompactTrieTest::TestTrieForSetInt64() {
  1015. TestTrieWithContainers<TSet<i64>>(GetSampleKeys(10), GetSampleVectorData<TSet<i64>>(10), "s-i64");
  1016. }
  1017. void TCompactTrieTest::TestTrieForVectorStroka() {
  1018. TestTrieWithContainers<TVector<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TVector<TString>>(10), "v-str");
  1019. }
  1020. void TCompactTrieTest::TestTrieForListStroka() {
  1021. TestTrieWithContainers<TList<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TList<TString>>(10), "l-str");
  1022. }
  1023. void TCompactTrieTest::TestTrieForSetStroka() {
  1024. TestTrieWithContainers<TSet<TString>>(GetSampleKeys(10), GetSampleTextVectorData<TSet<TString>>(10), "s-str");
  1025. }
  1026. void TCompactTrieTest::TestTrieForVectorWtroka() {
  1027. TVector<TVector<TString>> data = GetSampleTextVectorData<TVector<TString>>(10);
  1028. TVector<TVector<TUtf16String>> wData;
  1029. for (size_t i = 0; i < data.size(); i++) {
  1030. wData.push_back(TVector<TUtf16String>());
  1031. for (size_t j = 0; j < data[i].size(); j++)
  1032. wData[i].push_back(UTF8ToWide(data[i][j]));
  1033. }
  1034. TestTrieWithContainers<TVector<TUtf16String>>(GetSampleKeys(10), wData, "v-wtr");
  1035. }
  1036. void TCompactTrieTest::TestTrieForVectorFloat() {
  1037. TestTrieWithContainers<TVector<float>>(GetSampleKeys(10), GetSampleVectorData<TVector<float>>(10), "v-float");
  1038. }
  1039. void TCompactTrieTest::TestTrieForVectorDouble() {
  1040. TestTrieWithContainers<TVector<double>>(GetSampleKeys(10), GetSampleVectorData<TVector<double>>(10), "v-double");
  1041. }
  1042. void TCompactTrieTest::TestTrieForListVectorInt64() {
  1043. TVector<i64> tmp;
  1044. tmp.push_back(0);
  1045. TList<TVector<i64>> dataElement(5, tmp);
  1046. TVector<TList<TVector<i64>>> data(10, dataElement);
  1047. TestTrieWithContainers<TList<TVector<i64>>>(GetSampleKeys(10), data, "l-v-i64");
  1048. }
  1049. void TCompactTrieTest::TestTrieForPairWtrokaVectorInt64() {
  1050. TVector<TUtf16String> keys = GetSampleKeys(10);
  1051. TVector<TVector<i64>> values = GetSampleVectorData<TVector<i64>>(10);
  1052. TVector<std::pair<TUtf16String, TVector<i64>>> data;
  1053. for (size_t i = 0; i < 10; i++)
  1054. data.push_back(std::pair<TUtf16String, TVector<i64>>(keys[i] + u"_v", values[i]));
  1055. TestTrieWithContainers<std::pair<TUtf16String, TVector<i64>>>(keys, data, "pair-str-v-i64");
  1056. }
  1057. void TCompactTrieTest::TestEmptyValueOutOfOrder() {
  1058. TBufferOutput buffer;
  1059. using TSymbol = ui32;
  1060. {
  1061. TCompactTrieBuilder<TSymbol, ui32> builder;
  1062. TSymbol key = 1;
  1063. builder.Add(&key, 1, 10);
  1064. builder.Add(nullptr, 0, 14);
  1065. builder.Save(buffer);
  1066. }
  1067. {
  1068. TCompactTrie<TSymbol, ui32> trie(buffer.Buffer().Data(), buffer.Buffer().Size());
  1069. UNIT_ASSERT(trie.Find(nullptr, 0));
  1070. }
  1071. }
  1072. void TCompactTrieTest::TestFindLongestPrefixWithEmptyValue() {
  1073. TBufferOutput buffer;
  1074. {
  1075. TCompactTrieBuilder<wchar16, ui32> builder;
  1076. builder.Add(u"", 42);
  1077. builder.Add(u"yandex", 271828);
  1078. builder.Add(u"ya", 31415);
  1079. builder.Save(buffer);
  1080. }
  1081. {
  1082. TCompactTrie<wchar16, ui32> trie(buffer.Buffer().Data(), buffer.Buffer().Size());
  1083. size_t prefixLen = 123;
  1084. ui32 value = 0;
  1085. UNIT_ASSERT(trie.FindLongestPrefix(u"google", &prefixLen, &value));
  1086. UNIT_ASSERT(prefixLen == 0);
  1087. UNIT_ASSERT(value == 42);
  1088. UNIT_ASSERT(trie.FindLongestPrefix(u"yahoo", &prefixLen, &value));
  1089. UNIT_ASSERT(prefixLen == 2);
  1090. UNIT_ASSERT(value == 31415);
  1091. }
  1092. }
  1093. template <typename TChar>
  1094. struct TConvertKey {
  1095. static inline TString Convert(const TStringBuf& key) {
  1096. return ToString(key);
  1097. }
  1098. };
  1099. template <>
  1100. struct TConvertKey<wchar16> {
  1101. static inline TUtf16String Convert(const TStringBuf& key) {
  1102. return UTF8ToWide(key);
  1103. }
  1104. };
  1105. template <>
  1106. struct TConvertKey<wchar32> {
  1107. static inline TUtf32String Convert(const TStringBuf& key) {
  1108. return TUtf32String::FromUtf8(key);
  1109. }
  1110. };
  1111. template <class TSearchIter, class TKeyBuf>
  1112. static void MoveIter(TSearchIter& iter, const TKeyBuf& key) {
  1113. for (size_t i = 0; i < key.length(); ++i) {
  1114. UNIT_ASSERT(iter.Advance(key[i]));
  1115. }
  1116. }
  1117. template <typename TChar>
  1118. void TCompactTrieTest::TestSearchIterImpl() {
  1119. TBufferOutput buffer;
  1120. {
  1121. TCompactTrieBuilder<TChar, ui32> builder;
  1122. TStringBuf data[] = {
  1123. TStringBuf("abaab"),
  1124. TStringBuf("abcdef"),
  1125. TStringBuf("abbbc"),
  1126. TStringBuf("bdfaa"),
  1127. };
  1128. for (size_t i = 0; i < Y_ARRAY_SIZE(data); ++i) {
  1129. builder.Add(TConvertKey<TChar>::Convert(data[i]), i + 1);
  1130. }
  1131. builder.Save(buffer);
  1132. }
  1133. TCompactTrie<TChar, ui32> trie(buffer.Buffer().Data(), buffer.Buffer().Size());
  1134. ui32 value = 0;
  1135. auto iter(MakeSearchIterator(trie));
  1136. MoveIter(iter, TConvertKey<TChar>::Convert(TStringBuf("abc")));
  1137. UNIT_ASSERT(!iter.GetValue(&value));
  1138. iter = MakeSearchIterator(trie);
  1139. MoveIter(iter, TConvertKey<TChar>::Convert(TStringBuf("abbbc")));
  1140. UNIT_ASSERT(iter.GetValue(&value));
  1141. UNIT_ASSERT_EQUAL(value, 3);
  1142. iter = MakeSearchIterator(trie);
  1143. UNIT_ASSERT(iter.Advance(TConvertKey<TChar>::Convert(TStringBuf("bdfa"))));
  1144. UNIT_ASSERT(!iter.GetValue(&value));
  1145. iter = MakeSearchIterator(trie);
  1146. UNIT_ASSERT(iter.Advance(TConvertKey<TChar>::Convert(TStringBuf("bdfaa"))));
  1147. UNIT_ASSERT(iter.GetValue(&value));
  1148. UNIT_ASSERT_EQUAL(value, 4);
  1149. UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TChar('z')));
  1150. UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TConvertKey<TChar>::Convert(TStringBuf("cdf"))));
  1151. UNIT_ASSERT(!MakeSearchIterator(trie).Advance(TConvertKey<TChar>::Convert(TStringBuf("abca"))));
  1152. }
  1153. void TCompactTrieTest::TestSearchIterChar() {
  1154. TestSearchIterImpl<char>();
  1155. }
  1156. void TCompactTrieTest::TestSearchIterWchar() {
  1157. TestSearchIterImpl<wchar16>();
  1158. }
  1159. void TCompactTrieTest::TestSearchIterWchar32() {
  1160. TestSearchIterImpl<wchar32>();
  1161. }
  1162. void TCompactTrieTest::TestCopyAndAssignment() {
  1163. TBufferOutput bufout;
  1164. typedef TCompactTrie<> TTrie;
  1165. CreateTrie<char>(bufout, false, false);
  1166. TTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size());
  1167. TTrie copy(trie);
  1168. UNIT_ASSERT(copy.HasCorrectSkipper());
  1169. TTrie assign;
  1170. assign = trie;
  1171. UNIT_ASSERT(assign.HasCorrectSkipper());
  1172. TTrie move(std::move(trie));
  1173. UNIT_ASSERT(move.HasCorrectSkipper());
  1174. TTrie moveAssign;
  1175. moveAssign = TTrie(bufout.Buffer().Data(), bufout.Buffer().Size());
  1176. UNIT_ASSERT(moveAssign.HasCorrectSkipper());
  1177. }
  1178. template <class TTrie>
  1179. void TCompactTrieTest::TestFirstSymbolIteratorForTrie(const TTrie& trie, const TStringBuf& narrowAnswers) {
  1180. NCompactTrie::TFirstSymbolIterator<TTrie> it;
  1181. it.SetTrie(trie, trie.GetSkipper());
  1182. typename TTrie::TKey answers = MakeWideKey<typename TTrie::TSymbol>(narrowAnswers);
  1183. auto answer = answers.begin();
  1184. for (; !it.AtEnd(); it.MakeStep(), ++answer) {
  1185. UNIT_ASSERT(answer != answers.end());
  1186. UNIT_ASSERT(it.GetKey() == *answer);
  1187. }
  1188. UNIT_ASSERT(answer == answers.end());
  1189. }
  1190. template <class TSymbol>
  1191. void TCompactTrieTest::TestFirstSymbolIterator() {
  1192. TBufferOutput bufout;
  1193. typedef TCompactTrie<TSymbol> TTrie;
  1194. CreateTrie<TSymbol>(bufout, false, false);
  1195. TTrie trie(bufout.Buffer().Data(), bufout.Buffer().Size());
  1196. TStringBuf rootAnswers = "abcdf";
  1197. TestFirstSymbolIteratorForTrie(trie, rootAnswers);
  1198. TStringBuf aAnswers = "abcd";
  1199. TestFirstSymbolIteratorForTrie(trie.FindTails(MakeWideKey<TSymbol>("a", 1)), aAnswers);
  1200. }
  1201. void TCompactTrieTest::TestFirstSymbolIterator8() {
  1202. TestFirstSymbolIterator<char>();
  1203. }
  1204. void TCompactTrieTest::TestFirstSymbolIterator16() {
  1205. TestFirstSymbolIterator<wchar16>();
  1206. }
  1207. void TCompactTrieTest::TestFirstSymbolIterator32() {
  1208. TestFirstSymbolIterator<ui32>();
  1209. }
  1210. void TCompactTrieTest::TestFirstSymbolIteratorChar32() {
  1211. TestFirstSymbolIterator<wchar32>();
  1212. }
  1213. void TCompactTrieTest::TestArrayPacker() {
  1214. using TDataInt = std::array<int, 2>;
  1215. const std::pair<TString, TDataInt> dataXxx{"xxx", {{15, 16}}};
  1216. const std::pair<TString, TDataInt> dataYyy{"yyy", {{20, 30}}};
  1217. TCompactTrieBuilder<char, TDataInt> trieBuilderOne;
  1218. trieBuilderOne.Add(dataXxx.first, dataXxx.second);
  1219. trieBuilderOne.Add(dataYyy.first, dataYyy.second);
  1220. TBufferOutput bufferOne;
  1221. trieBuilderOne.Save(bufferOne);
  1222. const TCompactTrie<char, TDataInt> trieOne(bufferOne.Buffer().Data(), bufferOne.Buffer().Size());
  1223. UNIT_ASSERT_VALUES_EQUAL(dataXxx.second, trieOne.Get(dataXxx.first));
  1224. UNIT_ASSERT_VALUES_EQUAL(dataYyy.second, trieOne.Get(dataYyy.first));
  1225. using TDataStroka = std::array<TString, 2>;
  1226. const std::pair<TString, TDataStroka> dataZzz{"zzz", {{"hello", "there"}}};
  1227. const std::pair<TString, TDataStroka> dataWww{"www", {{"half", "life"}}};
  1228. TCompactTrieBuilder<char, TDataStroka> trieBuilderTwo;
  1229. trieBuilderTwo.Add(dataZzz.first, dataZzz.second);
  1230. trieBuilderTwo.Add(dataWww.first, dataWww.second);
  1231. TBufferOutput bufferTwo;
  1232. trieBuilderTwo.Save(bufferTwo);
  1233. const TCompactTrie<char, TDataStroka> trieTwo(bufferTwo.Buffer().Data(), bufferTwo.Buffer().Size());
  1234. UNIT_ASSERT_VALUES_EQUAL(dataZzz.second, trieTwo.Get(dataZzz.first));
  1235. UNIT_ASSERT_VALUES_EQUAL(dataWww.second, trieTwo.Get(dataWww.first));
  1236. }
  1237. void TCompactTrieTest::TestBuilderFindLongestPrefix() {
  1238. const size_t sizes[] = {10, 100};
  1239. const double branchProbabilities[] = {0.01, 0.1, 0.5, 0.9, 0.99};
  1240. for (size_t size : sizes) {
  1241. for (double branchProbability : branchProbabilities) {
  1242. TestBuilderFindLongestPrefix(size, branchProbability, false, false);
  1243. TestBuilderFindLongestPrefix(size, branchProbability, false, true);
  1244. TestBuilderFindLongestPrefix(size, branchProbability, true, false);
  1245. TestBuilderFindLongestPrefix(size, branchProbability, true, true);
  1246. }
  1247. }
  1248. }
  1249. void TCompactTrieTest::TestBuilderFindLongestPrefix(size_t keysCount, double branchProbability, bool isPrefixGrouped, bool hasEmptyKey) {
  1250. TVector<TString> keys;
  1251. TString keyToAdd;
  1252. for (size_t i = 0; i < keysCount; ++i) {
  1253. const size_t prevKeyLen = keyToAdd.Size();
  1254. // add two random chars to prev key
  1255. keyToAdd += RandChar();
  1256. keyToAdd += RandChar();
  1257. const bool changeBranch = prevKeyLen && RandomNumber<double>() < branchProbability;
  1258. if (changeBranch) {
  1259. const size_t branchPlace = RandomNumber<size_t>(prevKeyLen + 1); // random place in [0, prevKeyLen]
  1260. *(keyToAdd.begin() + branchPlace) = RandChar();
  1261. }
  1262. keys.push_back(keyToAdd);
  1263. }
  1264. if (isPrefixGrouped)
  1265. Sort(keys.begin(), keys.end());
  1266. else
  1267. Shuffle(keys.begin(), keys.end());
  1268. TCompactTrieBuilder<char, TString> builder(isPrefixGrouped ? CTBF_PREFIX_GROUPED : CTBF_NONE);
  1269. const TString EMPTY_VALUE = "empty";
  1270. if (hasEmptyKey)
  1271. builder.Add(nullptr, 0, EMPTY_VALUE);
  1272. for (size_t i = 0; i < keysCount; ++i) {
  1273. const TString& key = keys[i];
  1274. for (size_t j = 0; j < keysCount; ++j) {
  1275. const TString& otherKey = keys[j];
  1276. const bool exists = j < i;
  1277. size_t expectedSize = 0;
  1278. if (exists) {
  1279. expectedSize = otherKey.size();
  1280. } else {
  1281. size_t max = 0;
  1282. for (size_t k = 0; k < i; ++k)
  1283. if (keys[k].Size() < otherKey.Size() && keys[k].Size() > max && otherKey.StartsWith(keys[k]))
  1284. max = keys[k].Size();
  1285. expectedSize = max;
  1286. }
  1287. size_t prefixSize = 0xfcfcfc;
  1288. TString value = "abcd";
  1289. const bool expectedResult = hasEmptyKey || expectedSize != 0;
  1290. UNIT_ASSERT_VALUES_EQUAL_C(expectedResult, builder.FindLongestPrefix(otherKey.data(), otherKey.size(), &prefixSize, &value), "otherKey = " << HexEncode(otherKey));
  1291. if (expectedResult) {
  1292. UNIT_ASSERT_VALUES_EQUAL(expectedSize, prefixSize);
  1293. if (expectedSize) {
  1294. UNIT_ASSERT_VALUES_EQUAL(TStringBuf(otherKey).SubStr(0, prefixSize), value);
  1295. } else {
  1296. UNIT_ASSERT_VALUES_EQUAL(EMPTY_VALUE, value);
  1297. }
  1298. } else {
  1299. UNIT_ASSERT_VALUES_EQUAL("abcd", value);
  1300. UNIT_ASSERT_VALUES_EQUAL(0xfcfcfc, prefixSize);
  1301. }
  1302. for (int c = 0; c < 10; ++c) {
  1303. TString extendedKey = otherKey;
  1304. extendedKey += RandChar();
  1305. size_t extendedPrefixSize = 0xdddddd;
  1306. TString extendedValue = "dcba";
  1307. UNIT_ASSERT_VALUES_EQUAL(expectedResult, builder.FindLongestPrefix(extendedKey.data(), extendedKey.size(), &extendedPrefixSize, &extendedValue));
  1308. if (expectedResult) {
  1309. UNIT_ASSERT_VALUES_EQUAL(value, extendedValue);
  1310. UNIT_ASSERT_VALUES_EQUAL(prefixSize, extendedPrefixSize);
  1311. } else {
  1312. UNIT_ASSERT_VALUES_EQUAL("dcba", extendedValue);
  1313. UNIT_ASSERT_VALUES_EQUAL(0xdddddd, extendedPrefixSize);
  1314. }
  1315. }
  1316. }
  1317. builder.Add(key.data(), key.size(), key);
  1318. }
  1319. TBufferOutput buffer;
  1320. builder.Save(buffer);
  1321. }
  1322. void TCompactTrieTest::TestBuilderFindLongestPrefixWithEmptyValue() {
  1323. TCompactTrieBuilder<wchar16, ui32> builder;
  1324. builder.Add(u"", 42);
  1325. builder.Add(u"yandex", 271828);
  1326. builder.Add(u"ya", 31415);
  1327. size_t prefixLen = 123;
  1328. ui32 value = 0;
  1329. UNIT_ASSERT(builder.FindLongestPrefix(u"google", &prefixLen, &value));
  1330. UNIT_ASSERT_VALUES_EQUAL(prefixLen, 0);
  1331. UNIT_ASSERT_VALUES_EQUAL(value, 42);
  1332. UNIT_ASSERT(builder.FindLongestPrefix(u"yahoo", &prefixLen, &value));
  1333. UNIT_ASSERT_VALUES_EQUAL(prefixLen, 2);
  1334. UNIT_ASSERT_VALUES_EQUAL(value, 31415);
  1335. TBufferOutput buffer;
  1336. builder.Save(buffer);
  1337. }
  1338. void TCompactTrieTest::TestPatternSearcherEmpty() {
  1339. TCompactPatternSearcherBuilder<char, ui32> builder;
  1340. TBufferOutput searcherData;
  1341. builder.Save(searcherData);
  1342. TCompactPatternSearcher<char, ui32> searcher(
  1343. searcherData.Buffer().Data(),
  1344. searcherData.Buffer().Size()
  1345. );
  1346. UNIT_ASSERT(searcher.SearchMatches("a").empty());
  1347. UNIT_ASSERT(searcher.SearchMatches("").empty());
  1348. UNIT_ASSERT(searcher.SearchMatches("abc").empty());
  1349. }
  1350. void TCompactTrieTest::TestPatternSearcherOnDataset(
  1351. const TVector<TString>& patterns,
  1352. const TVector<TString>& samples
  1353. ) {
  1354. TCompactPatternSearcherBuilder<char, ui32> builder;
  1355. for (size_t patternIdx = 0; patternIdx < patterns.size(); ++patternIdx) {
  1356. builder.Add(patterns[patternIdx], patternIdx);
  1357. }
  1358. TBufferOutput searcherData;
  1359. builder.Save(searcherData);
  1360. TCompactPatternSearcher<char, ui32> searcher(
  1361. searcherData.Buffer().Data(),
  1362. searcherData.Buffer().Size()
  1363. );
  1364. for (const auto& sample : samples) {
  1365. const auto matches = searcher.SearchMatches(sample);
  1366. size_t matchesNum = 0;
  1367. THashSet<TString> processedPatterns;
  1368. for (const auto& pattern : patterns) {
  1369. if (pattern.Empty() || processedPatterns.contains(pattern)) {
  1370. continue;
  1371. }
  1372. for (size_t start = 0; start + pattern.Size() <= sample.Size(); ++start) {
  1373. matchesNum += (pattern == sample.substr(start, pattern.Size()));
  1374. }
  1375. processedPatterns.insert(pattern);
  1376. }
  1377. UNIT_ASSERT_VALUES_EQUAL(matchesNum, matches.size());
  1378. TSet<std::pair<size_t, ui32>> foundMatches;
  1379. for (const auto& match : matches) {
  1380. std::pair<size_t, ui32> matchParams(match.End, match.Data);
  1381. UNIT_ASSERT(!foundMatches.contains(matchParams));
  1382. foundMatches.insert(matchParams);
  1383. const auto& pattern = patterns[match.Data];
  1384. UNIT_ASSERT_VALUES_EQUAL(
  1385. sample.substr(match.End - pattern.size() + 1, pattern.size()),
  1386. pattern
  1387. );
  1388. }
  1389. }
  1390. }
  1391. void TCompactTrieTest::TestPatternSearcherSimple() {
  1392. TestPatternSearcherOnDataset(
  1393. { // patterns
  1394. "abcd",
  1395. "abc",
  1396. "ab",
  1397. "a",
  1398. ""
  1399. },
  1400. { // samples
  1401. "abcde",
  1402. "abcd",
  1403. "abc",
  1404. "ab",
  1405. "a",
  1406. ""
  1407. }
  1408. );
  1409. TestPatternSearcherOnDataset(
  1410. { // patterns
  1411. "a"
  1412. "ab",
  1413. "abcd",
  1414. },
  1415. { // samples
  1416. "abcde",
  1417. "abcd",
  1418. "abc",
  1419. "ab",
  1420. "a",
  1421. ""
  1422. }
  1423. );
  1424. TestPatternSearcherOnDataset(
  1425. { // patterns
  1426. "aaaa",
  1427. "aaa",
  1428. "aa",
  1429. "a",
  1430. },
  1431. { // samples
  1432. "aaaaaaaaaaaa"
  1433. }
  1434. );
  1435. TestPatternSearcherOnDataset(
  1436. { // patterns
  1437. "aa", "ab", "ac", "ad", "ae", "af",
  1438. "ba", "bb", "bc", "bd", "be", "bf",
  1439. "ca", "cb", "cc", "cd", "ce", "cf",
  1440. "da", "db", "dc", "dd", "de", "df",
  1441. "ea", "eb", "ec", "ed", "ee", "ef",
  1442. "fa", "fb", "fc", "fd", "fe", "ff"
  1443. },
  1444. { // samples
  1445. "dcabafeebfdcbacddacadbaabecdbaeffecdbfabcdcabcfaefecdfebacfedacefbdcacfeb",
  1446. "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefancdefancdef",
  1447. "fedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcbafedcba",
  1448. "",
  1449. "a", "b", "c", "d", "e", "f",
  1450. "aa", "ab", "ac", "ad", "ae", "af",
  1451. "ba", "bb", "bc", "bd", "be", "bf",
  1452. "ca", "cb", "cc", "cd", "ce", "cf",
  1453. "da", "db", "dc", "dd", "de", "df",
  1454. "ea", "eb", "ec", "ed", "ee", "ef",
  1455. "fa", "fb", "fc", "fd", "fe", "ff"
  1456. }
  1457. );
  1458. }
  1459. static char RandChar(
  1460. TFastRng<ui64>& rng,
  1461. int maxChar
  1462. ) {
  1463. return static_cast<char>(rng.GenRand() % (maxChar + 1));
  1464. }
  1465. static TString RandStr(
  1466. TFastRng<ui64>& rng,
  1467. size_t maxLength,
  1468. int maxChar,
  1469. bool nonEmpty = false
  1470. ) {
  1471. Y_ASSERT(maxLength > 0);
  1472. size_t length;
  1473. if (nonEmpty) {
  1474. length = rng.GenRand() % maxLength + 1;
  1475. } else {
  1476. length = rng.GenRand() % (maxLength + 1);
  1477. }
  1478. TString result;
  1479. while (result.size() < length) {
  1480. result += RandChar(rng, maxChar);
  1481. }
  1482. return result;
  1483. }
  1484. void TCompactTrieTest::TestPatternSearcherRandom(
  1485. size_t patternsNum,
  1486. size_t patternMaxLength,
  1487. size_t strMaxLength,
  1488. int maxChar,
  1489. TFastRng<ui64>& rng
  1490. ) {
  1491. auto patternToSearch = RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true);
  1492. TVector<TString> patterns = {patternToSearch};
  1493. while (patterns.size() < patternsNum) {
  1494. patterns.push_back(RandStr(rng, patternMaxLength, maxChar, /*nonEmpty*/true));
  1495. }
  1496. auto filler = RandStr(rng, strMaxLength - patternToSearch.Size() + 1, maxChar);
  1497. size_t leftFillerSize = rng.GenRand() % (filler.size() + 1);
  1498. auto leftFiller = filler.substr(0, leftFillerSize);
  1499. auto rightFiller = filler.substr(leftFillerSize, filler.size() - leftFillerSize);
  1500. auto sample = leftFiller + patternToSearch + rightFiller;
  1501. TestPatternSearcherOnDataset(patterns, {sample});
  1502. }
  1503. void TCompactTrieTest::TestPatternSearcherRandom() {
  1504. TFastRng<ui64> rng(0);
  1505. for (size_t patternMaxLen : {1, 2, 10}) {
  1506. for (size_t strMaxLen : TVector<size_t>{patternMaxLen, 2 * patternMaxLen, 10}) {
  1507. for (int maxChar : {0, 1, 5, 255}) {
  1508. for (size_t patternsNum : {1, 10}) {
  1509. for (size_t testIdx = 0; testIdx < 3; ++testIdx) {
  1510. TestPatternSearcherRandom(
  1511. patternsNum,
  1512. patternMaxLen,
  1513. strMaxLen,
  1514. maxChar,
  1515. rng
  1516. );
  1517. }
  1518. }
  1519. }
  1520. }
  1521. }
  1522. }