mkql_rh_hash_ut.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. #include "../mkql_rh_hash.h"
  2. #include <library/cpp/testing/unittest/registar.h>
  3. #include <unordered_map>
  4. #include <unordered_set>
  5. namespace NKikimr {
  6. namespace NMiniKQL {
  7. Y_UNIT_TEST_SUITE(TMiniKQLRobinHoodHashTest) {
  8. Y_UNIT_TEST(Map) {
  9. TRobinHoodHashMap<i32> rh(sizeof(i64));
  10. std::unordered_map<i32, i64> h;
  11. for (ui64 i = 0; i < 10000; ++i) {
  12. auto k = i % 1000;
  13. auto [it, inserted] = h.emplace(k, 0);
  14. bool isNew;
  15. auto iter = rh.Insert(k, isNew);
  16. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  17. UNIT_ASSERT_VALUES_EQUAL(isNew, inserted);
  18. it->second += i;
  19. if (isNew) {
  20. *(i64*)rh.GetMutablePayload(iter) = i;
  21. rh.CheckGrow();
  22. } else {
  23. *(i64*)rh.GetMutablePayload(iter) += i;
  24. }
  25. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  26. }
  27. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  28. if (!rh.IsValid(it)) {
  29. continue;
  30. }
  31. auto key = rh.GetKey(it);
  32. auto hit = h.find(key);
  33. UNIT_ASSERT(hit != h.end());
  34. UNIT_ASSERT_VALUES_EQUAL(*(i64*)rh.GetPayload(it), hit->second);
  35. h.erase(key);
  36. }
  37. UNIT_ASSERT(h.empty());
  38. }
  39. Y_UNIT_TEST(FixedMap) {
  40. TRobinHoodHashFixedMap<i32, i64> rh;
  41. std::unordered_map<i32, i64> h;
  42. for (ui64 i = 0; i < 10000; ++i) {
  43. auto k = i % 1000;
  44. auto [it, inserted] = h.emplace(k, 0);
  45. bool isNew;
  46. auto iter = rh.Insert(k, isNew);
  47. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  48. UNIT_ASSERT_VALUES_EQUAL(isNew, inserted);
  49. it->second += i;
  50. if (isNew) {
  51. *(i64*)rh.GetMutablePayload(iter) = i;
  52. rh.CheckGrow();
  53. } else {
  54. *(i64*)rh.GetMutablePayload(iter) += i;
  55. }
  56. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  57. }
  58. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  59. if (!rh.IsValid(it)) {
  60. continue;
  61. }
  62. auto key = rh.GetKey(it);
  63. auto hit = h.find(key);
  64. UNIT_ASSERT(hit != h.end());
  65. UNIT_ASSERT_VALUES_EQUAL(*(i64*)rh.GetPayload(it), hit->second);
  66. h.erase(key);
  67. }
  68. UNIT_ASSERT(h.empty());
  69. }
  70. Y_UNIT_TEST(Set) {
  71. TRobinHoodHashSet<i32> rh;
  72. std::unordered_set<i32> h;
  73. for (ui64 i = 0; i < 10000; ++i) {
  74. auto k = i % 1000;
  75. auto[it, inserted] = h.emplace(k);
  76. bool isNew;
  77. auto iter = rh.Insert(k, isNew);
  78. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  79. UNIT_ASSERT_VALUES_EQUAL(isNew, inserted);
  80. if (isNew) {
  81. rh.CheckGrow();
  82. }
  83. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  84. }
  85. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  86. if (!rh.IsValid(it)) {
  87. continue;
  88. }
  89. auto key = rh.GetKey(it);
  90. auto hit = h.find(key);
  91. UNIT_ASSERT(hit != h.end());
  92. h.erase(key);
  93. }
  94. UNIT_ASSERT(h.empty());
  95. }
  96. Y_UNIT_TEST(MapBatch) {
  97. using THashTable = TRobinHoodHashMap<i32>;
  98. THashTable rh(sizeof(i64));
  99. std::unordered_map<i32, i64> h;
  100. std::array<TRobinHoodBatchRequestItem<i32>, PrefetchBatchSize> batch;
  101. std::array<bool, PrefetchBatchSize> batchInserted;
  102. std::array<ui64, PrefetchBatchSize> batchI;
  103. ui32 batchLen = 0;
  104. auto processBatch = [&]() {
  105. rh.BatchInsert({batch.data(), batchLen}, [&](size_t i, THashTable::iterator iter, bool isNew) {
  106. UNIT_ASSERT_VALUES_EQUAL(isNew, batchInserted[i]);
  107. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), batch[i].GetKey());
  108. if (isNew) {
  109. *(i64*)rh.GetMutablePayload(iter) = batchI[i];
  110. } else {
  111. *(i64*)rh.GetMutablePayload(iter) += batchI[i];
  112. }
  113. });
  114. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  115. };
  116. for (ui64 i = 0; i < 10000; ++i) {
  117. if (batchLen == batch.size()) {
  118. processBatch();
  119. batchLen = 0;
  120. }
  121. auto k = i % 1000;
  122. auto [it, inserted] = h.emplace(k, 0);
  123. batchI[batchLen] = i;
  124. batchInserted[batchLen] = inserted;
  125. batch[batchLen].ConstructKey(k);
  126. ++batchLen;
  127. it->second += i;
  128. }
  129. processBatch();
  130. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  131. if (!rh.IsValid(it)) {
  132. continue;
  133. }
  134. auto key = rh.GetKey(it);
  135. auto hit = h.find(key);
  136. UNIT_ASSERT(hit != h.end());
  137. UNIT_ASSERT_VALUES_EQUAL(*(i64*)rh.GetPayload(it), hit->second);
  138. h.erase(key);
  139. }
  140. UNIT_ASSERT(h.empty());
  141. }
  142. Y_UNIT_TEST(FixedMapBatch) {
  143. using THashTable = TRobinHoodHashFixedMap<i32, i64>;
  144. THashTable rh(sizeof(i64));
  145. std::unordered_map<i32, i64> h;
  146. std::array<TRobinHoodBatchRequestItem<i32>, PrefetchBatchSize> batch;
  147. std::array<bool, PrefetchBatchSize> batchInserted;
  148. std::array<ui64, PrefetchBatchSize> batchI;
  149. ui32 batchLen = 0;
  150. auto processBatch = [&]() {
  151. rh.BatchInsert({batch.data(), batchLen}, [&](size_t i, THashTable::iterator iter, bool isNew) {
  152. UNIT_ASSERT_VALUES_EQUAL(isNew, batchInserted[i]);
  153. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), batch[i].GetKey());
  154. if (isNew) {
  155. *(i64*)rh.GetMutablePayload(iter) = batchI[i];
  156. } else {
  157. *(i64*)rh.GetMutablePayload(iter) += batchI[i];
  158. }
  159. });
  160. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  161. };
  162. for (ui64 i = 0; i < 10000; ++i) {
  163. if (batchLen == batch.size()) {
  164. processBatch();
  165. batchLen = 0;
  166. }
  167. auto k = i % 1000;
  168. auto [it, inserted] = h.emplace(k, 0);
  169. batchI[batchLen] = i;
  170. batchInserted[batchLen] = inserted;
  171. batch[batchLen].ConstructKey(k);
  172. ++batchLen;
  173. it->second += i;
  174. }
  175. processBatch();
  176. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  177. if (!rh.IsValid(it)) {
  178. continue;
  179. }
  180. auto key = rh.GetKey(it);
  181. auto hit = h.find(key);
  182. UNIT_ASSERT(hit != h.end());
  183. UNIT_ASSERT_VALUES_EQUAL(*(i64*)rh.GetPayload(it), hit->second);
  184. h.erase(key);
  185. }
  186. UNIT_ASSERT(h.empty());
  187. }
  188. Y_UNIT_TEST(SetBatch) {
  189. using THashTable = TRobinHoodHashSet<i32>;
  190. THashTable rh(sizeof(i64));
  191. std::unordered_set<i32> h;
  192. std::array<TRobinHoodBatchRequestItem<i32>, PrefetchBatchSize> batch;
  193. std::array<bool, PrefetchBatchSize> batchInserted;
  194. ui32 batchLen = 0;
  195. auto processBatch = [&]() {
  196. rh.BatchInsert({batch.data(), batchLen}, [&](size_t i, THashTable::iterator iter, bool isNew) {
  197. UNIT_ASSERT_VALUES_EQUAL(isNew, batchInserted[i]);
  198. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), batch[i].GetKey());
  199. });
  200. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  201. };
  202. for (ui64 i = 0; i < 10000; ++i) {
  203. if (batchLen == batch.size()) {
  204. processBatch();
  205. batchLen = 0;
  206. }
  207. auto k = i % 1000;
  208. auto [it, inserted] = h.emplace(k);
  209. batchInserted[batchLen] = inserted;
  210. batch[batchLen].ConstructKey(k);
  211. ++batchLen;
  212. }
  213. processBatch();
  214. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  215. if (!rh.IsValid(it)) {
  216. continue;
  217. }
  218. auto key = rh.GetKey(it);
  219. auto hit = h.find(key);
  220. UNIT_ASSERT(hit != h.end());
  221. h.erase(key);
  222. }
  223. UNIT_ASSERT(h.empty());
  224. }
  225. Y_UNIT_TEST(Power2Collisions) {
  226. TRobinHoodHashSet<ui64> rh;
  227. for (ui64 i = 0; i < 10000; ++i) {
  228. auto k = i << 32;
  229. bool isNew;
  230. auto iter = rh.Insert(k, isNew);
  231. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  232. if (isNew) {
  233. rh.CheckGrow();
  234. }
  235. UNIT_ASSERT_VALUES_EQUAL(i + 1, rh.GetSize());
  236. }
  237. i32 maxDistance = 0;
  238. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  239. if (!rh.IsValid(it)) {
  240. continue;
  241. }
  242. auto distance = rh.GetPSL(it).Distance;
  243. maxDistance = Max(maxDistance, distance);
  244. }
  245. Cerr << "maxDistance: " << maxDistance << "\n";
  246. UNIT_ASSERT(maxDistance < 10);
  247. }
  248. Y_UNIT_TEST(RehashCollisions) {
  249. TRobinHoodHashSet<ui64> rh;
  250. TVector<ui64> values;
  251. const ui64 N = 1500000;
  252. values.reserve(N);
  253. for (ui64 i = 1; i <= N; ++i) {
  254. auto k = 64 * (i >> 4) + ((i & 8) ? 32 : 0) + (i & 7);
  255. values.push_back(k);
  256. }
  257. /*
  258. for (ui64 i = 0; i < 32; ++i) {
  259. Cerr << values[i] << "\n";
  260. }
  261. for (ui64 i = 0; i < 32; ++i) {
  262. Cerr << values[values.size() - 32 + i] << "\n";
  263. }*/
  264. for (ui64 i = 0; i < values.size(); ++i) {
  265. auto k = values[i];
  266. bool isNew;
  267. auto iter = rh.Insert(k, isNew);
  268. if (rh.GetKey(iter) != k) {
  269. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  270. }
  271. if (isNew) {
  272. rh.CheckGrow();
  273. }
  274. if (i + 1 != rh.GetSize()) {
  275. UNIT_ASSERT_VALUES_EQUAL(i + 1, rh.GetSize());
  276. }
  277. }
  278. TRobinHoodHashSet<ui64> rh2;
  279. i32 maxDistance1 = 0;
  280. ui64 j = 0;
  281. for (auto it = rh.Begin(); it != rh.End(); rh.Advance(it)) {
  282. if (!rh.IsValid(it)) {
  283. continue;
  284. }
  285. auto distance = rh.GetPSL(it).Distance;
  286. maxDistance1 = Max(maxDistance1, distance);
  287. auto k = rh.GetKey(it);
  288. bool isNew;
  289. auto iter = rh2.Insert(k, isNew);
  290. if (rh2.GetKey(iter) != k) {
  291. UNIT_ASSERT_VALUES_EQUAL(rh2.GetKey(iter), k);
  292. }
  293. if (isNew) {
  294. rh2.CheckGrow();
  295. }
  296. if (j + 1 != rh2.GetSize()) {
  297. UNIT_ASSERT_VALUES_EQUAL(j + 1, rh2.GetSize());
  298. }
  299. ++j;
  300. }
  301. Cerr << "maxDistance1: " << maxDistance1 << "\n";
  302. UNIT_ASSERT(maxDistance1 < 20);
  303. i32 maxDistance2 = 0;
  304. for (auto it = rh2.Begin(); it != rh2.End(); rh2.Advance(it)) {
  305. if (!rh2.IsValid(it)) {
  306. continue;
  307. }
  308. auto distance = rh2.GetPSL(it).Distance;
  309. maxDistance2 = Max(maxDistance2, distance);
  310. }
  311. Cerr << "maxDistance2: " << maxDistance2 << "\n";
  312. UNIT_ASSERT(maxDistance2 < 20);
  313. }
  314. Y_UNIT_TEST(Lookup) {
  315. TRobinHoodHashMap<i32> rh(sizeof(i64));
  316. std::unordered_map<i32, i64> h;
  317. for (ui64 i = 0; i < 10000; ++i) {
  318. auto k = i % 1000;
  319. auto [it, inserted] = h.emplace(k, 0);
  320. bool isNew;
  321. auto iter = rh.Insert(k, isNew);
  322. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  323. UNIT_ASSERT_VALUES_EQUAL(isNew, inserted);
  324. it->second += i;
  325. if (isNew) {
  326. *(i64*)rh.GetMutablePayload(iter) = i;
  327. rh.CheckGrow();
  328. } else {
  329. *(i64*)rh.GetMutablePayload(iter) += i;
  330. }
  331. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  332. }
  333. for (ui64 i = 0; i < 1000; ++i) {
  334. auto iter = rh.Lookup(i);
  335. auto hit = h.find(i);
  336. UNIT_ASSERT_VALUES_EQUAL(*(i64*)rh.GetPayload(iter), hit->second);
  337. h.erase(hit);
  338. }
  339. UNIT_ASSERT(h.empty());
  340. }
  341. Y_UNIT_TEST(BatchLookup) {
  342. using THashTable = TRobinHoodHashMap<i32>;
  343. THashTable rh(sizeof(i64));
  344. std::unordered_map<i32, i64> h;
  345. for (ui64 i = 0; i < 10000; ++i) {
  346. auto k = i % 1000;
  347. auto [it, inserted] = h.emplace(k, 0);
  348. bool isNew;
  349. auto iter = rh.Insert(k, isNew);
  350. UNIT_ASSERT_VALUES_EQUAL(rh.GetKey(iter), k);
  351. UNIT_ASSERT_VALUES_EQUAL(isNew, inserted);
  352. it->second += i;
  353. if (isNew) {
  354. *(i64*)rh.GetMutablePayload(iter) = i;
  355. rh.CheckGrow();
  356. } else {
  357. *(i64*)rh.GetMutablePayload(iter) += i;
  358. }
  359. UNIT_ASSERT_VALUES_EQUAL(h.size(), rh.GetSize());
  360. }
  361. std::array<TRobinHoodBatchRequestItem<i32>, PrefetchBatchSize> batch;
  362. std::array<ui64, PrefetchBatchSize> batchI;
  363. ui32 batchLen = 0;
  364. auto processBatch = [&]() {
  365. rh.BatchLookup({batch.data(), batchLen}, [&](size_t i, THashTable::iterator iter) {
  366. auto key = batchI[i];
  367. auto hit = h.find(key);
  368. UNIT_ASSERT_VALUES_EQUAL(*(i64*)rh.GetPayload(iter), hit->second);
  369. h.erase(hit);
  370. });
  371. };
  372. for (ui64 i = 0; i < 1000; ++i) {
  373. if (batchLen == batch.size()) {
  374. processBatch();
  375. batchLen = 0;
  376. }
  377. batch[batchLen].ConstructKey(i);
  378. batchI[batchLen] = i;
  379. ++batchLen;
  380. }
  381. if (batchLen > 0) {
  382. processBatch();
  383. }
  384. UNIT_ASSERT(h.empty());
  385. }
  386. }
  387. }
  388. }