dense_hash.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653
  1. #pragma once
  2. #include "fwd.h"
  3. #include <util/generic/bitops.h>
  4. #include <util/generic/utility.h>
  5. #include <util/generic/vector.h>
  6. #include <util/generic/mapfindptr.h>
  7. #include <util/str_stl.h>
  8. #include <util/ysaveload.h>
  9. /*
  10. * There are 2 classes in this file:
  11. * - TDenseHash - analog of THashMap
  12. * - TDenseHashSet - analog of THashSet
  13. */
  14. /*
  15. * Implements dense-hash, in some circumstances it is a lot (2x) faster than THashMap.
  16. * We support only adding new elements.
  17. * TKey value equal to EmptyMarker (by default, it is TKey())
  18. * can not be inserted into hash - it is used as marker of empty element.
  19. * TValue type must be default constructible
  20. */
  21. template <class TKey,
  22. class TValue,
  23. class TKeyHash,
  24. size_t MaxLoadFactor,
  25. size_t LogInitSize>
  26. class TDenseHash : public TMapOps<TDenseHash<TKey, TValue, TKeyHash, MaxLoadFactor, LogInitSize>> {
  27. private:
  28. template <class THash, class TVal>
  29. class TIteratorBase {
  30. friend class TDenseHash;
  31. template <class THash2, class TVal2>
  32. friend class TIteratorBase;
  33. THash* Hash;
  34. size_t Idx;
  35. // used only to implement end()
  36. TIteratorBase(THash* hash, size_t initIdx)
  37. : Hash(hash)
  38. , Idx(initIdx)
  39. {
  40. }
  41. public:
  42. TIteratorBase(THash& hash)
  43. : Hash(&hash)
  44. , Idx(0)
  45. {
  46. if (Hash->EmptyMarker == Hash->Buckets[Idx].first) {
  47. Next();
  48. }
  49. }
  50. template <class THash2, class TVal2>
  51. TIteratorBase(const TIteratorBase<THash2, TVal2>& it)
  52. : Hash(it.Hash)
  53. , Idx(it.Idx)
  54. {
  55. }
  56. TIteratorBase(const TIteratorBase&) = default;
  57. static TIteratorBase CreateEmpty() {
  58. return TIteratorBase(nullptr, 0);
  59. }
  60. TIteratorBase& operator=(const TIteratorBase&) = default;
  61. void Next() {
  62. ++Idx;
  63. while (Idx < Hash->Buckets.size() && Hash->EmptyMarker == Hash->Buckets[Idx].first) {
  64. ++Idx;
  65. }
  66. }
  67. TIteratorBase& operator++() {
  68. Next();
  69. return *this;
  70. }
  71. TVal& operator*() {
  72. return Hash->Buckets[Idx];
  73. }
  74. TVal* operator->() {
  75. return &Hash->Buckets[Idx];
  76. }
  77. const TVal* operator->() const {
  78. return &Hash->Buckets[Idx];
  79. }
  80. THash* GetHash() {
  81. return Hash;
  82. }
  83. bool operator==(const TIteratorBase& rhs) const {
  84. Y_ASSERT(Hash == rhs.Hash);
  85. return Idx == rhs.Idx;
  86. }
  87. bool operator!=(const TIteratorBase& rhs) const {
  88. return !(*this == rhs);
  89. }
  90. };
  91. public:
  92. using key_type = TKey;
  93. using mapped_type = TValue;
  94. using value_type = std::pair<const key_type, mapped_type>;
  95. using size_type = std::size_t;
  96. using difference_type = std::ptrdiff_t;
  97. using hasher = TKeyHash;
  98. using key_equal = std::equal_to<key_type>; // TODO(tender-bum): template argument
  99. // using allocator_type = ...
  100. using reference = value_type&;
  101. using const_reference = const value_type&;
  102. using pointer = value_type*; // TODO(tender-bum): std::allocator_traits<Alloc>::pointer;
  103. using const_pointer = const value_type*; // TODO(tender-bum):
  104. // std::allocator_traits<Alloc>::const_pointer;
  105. using iterator = TIteratorBase<TDenseHash, value_type>;
  106. using const_iterator = TIteratorBase<const TDenseHash, const value_type>;
  107. public:
  108. TDenseHash(const key_type& emptyMarker = key_type{}, size_type initSize = 0)
  109. : EmptyMarker(emptyMarker)
  110. {
  111. MakeEmpty(initSize);
  112. }
  113. TDenseHash(const TDenseHash&) = default;
  114. TDenseHash(TDenseHash&&) = default;
  115. TDenseHash& operator=(const TDenseHash& rhs) {
  116. TDenseHash tmp{ rhs };
  117. return *this = std::move(tmp);
  118. }
  119. TDenseHash& operator=(TDenseHash&&) = default;
  120. friend bool operator==(const TDenseHash& lhs, const TDenseHash& rhs) {
  121. return lhs.Size() == rhs.Size() &&
  122. AllOf(lhs, [&rhs](const auto& v) {
  123. auto it = rhs.find(v.first);
  124. return it != rhs.end() && *it == v;
  125. });
  126. }
  127. void Clear() {
  128. for (auto& bucket : Buckets) {
  129. if (bucket.first != EmptyMarker) {
  130. SetValue(bucket, EmptyMarker, mapped_type{});
  131. }
  132. }
  133. NumFilled = 0;
  134. }
  135. void MakeEmpty(size_type initSize = 0) {
  136. if (!initSize) {
  137. initSize = 1 << LogInitSize;
  138. } else {
  139. initSize = FastClp2(initSize);
  140. }
  141. BucketMask = initSize - 1;
  142. NumFilled = 0;
  143. TVector<value_type> tmp;
  144. for (size_type i = 0; i < initSize; ++i) {
  145. tmp.emplace_back(EmptyMarker, mapped_type{});
  146. }
  147. tmp.swap(Buckets);
  148. GrowThreshold = Max<size_type>(1, initSize * MaxLoadFactor / 100) - 1;
  149. }
  150. template <class K>
  151. bool Has(const K& key) const {
  152. return ProcessKey<bool>(
  153. key,
  154. [](size_type) { return true; },
  155. [](size_type) { return false; });
  156. }
  157. size_type Capacity() const {
  158. return Buckets.capacity();
  159. }
  160. bool Empty() const {
  161. return Size() == 0;
  162. }
  163. size_type Size() const {
  164. return NumFilled;
  165. }
  166. template <size_type maxFillPercents, size_type logInitSize>
  167. void Swap(TDenseHash<key_type, mapped_type, hasher, maxFillPercents, logInitSize>& other) {
  168. Buckets.swap(other.Buckets);
  169. DoSwap(BucketMask, other.BucketMask);
  170. DoSwap(NumFilled, other.NumFilled);
  171. DoSwap(GrowThreshold, other.GrowThreshold);
  172. DoSwap(EmptyMarker, other.EmptyMarker);
  173. }
  174. void Save(IOutputStream* s) const {
  175. // TODO(tender-bum): make SaveLoad great again
  176. ::SaveMany(s, BucketMask, NumFilled, GrowThreshold);
  177. // We need to do so because Buckets may be serialized as a pod-array
  178. // that doesn't correspond to the previous behaviour
  179. ::SaveSize(s, Buckets.size());
  180. for (const auto& b : Buckets) {
  181. ::Save(s, b.first);
  182. ::Save(s, b.second);
  183. }
  184. mapped_type defaultValue{};
  185. ::SaveMany(s, EmptyMarker, defaultValue);
  186. }
  187. void Load(IInputStream* s) {
  188. // TODO(tender-bum): make SaveLoad great again
  189. ::LoadMany(s, BucketMask, NumFilled, GrowThreshold);
  190. // We need to do so because we can't load const fields
  191. struct TPairMimic {
  192. key_type First;
  193. mapped_type Second;
  194. Y_SAVELOAD_DEFINE(First, Second);
  195. };
  196. TVector<TPairMimic> tmp;
  197. ::Load(s, tmp);
  198. Buckets.clear();
  199. for (auto& v : tmp) {
  200. Buckets.emplace_back(std::move(v.First), std::move(v.Second));
  201. }
  202. ::Load(s, EmptyMarker);
  203. mapped_type defaultValue;
  204. ::Load(s, defaultValue);
  205. }
  206. public:
  207. iterator begin() {
  208. return iterator(*this);
  209. }
  210. iterator end() {
  211. return iterator(this, Buckets.size());
  212. }
  213. const_iterator begin() const {
  214. return const_iterator(*this);
  215. }
  216. const_iterator end() const {
  217. return const_iterator(this, Buckets.size());
  218. }
  219. template <class K>
  220. iterator find(const K& key) {
  221. return ProcessKey<iterator>(
  222. key,
  223. [&](size_type idx) { return iterator(this, idx); },
  224. [&](size_type) { return end(); });
  225. }
  226. template <class K>
  227. const_iterator find(const K& key) const {
  228. return ProcessKey<const_iterator>(
  229. key,
  230. [&](size_type idx) { return const_iterator(this, idx); },
  231. [&](size_type) { return end(); });
  232. }
  233. template <class K>
  234. const TValue& at(const K& key) const {
  235. return ProcessKey<const TValue&>(
  236. key,
  237. [&](size_type idx) -> const TValue& { return Buckets[idx].second; },
  238. [&](size_type) -> const TValue& { throw std::out_of_range("TDenseHash: missing key"); });
  239. }
  240. template <class K>
  241. TValue& at(const K& key) {
  242. return ProcessKey<TValue&>(
  243. key,
  244. [&](size_type idx) -> TValue& { return Buckets[idx].second; },
  245. [&](size_type) -> TValue& { throw std::out_of_range("TDenseHash: missing key"); });
  246. }
  247. bool Grow(size_type to = 0, bool force = false) {
  248. if (!to) {
  249. to = Buckets.size() * 2;
  250. } else {
  251. to = FastClp2(to);
  252. if (to <= Buckets.size() && !force) {
  253. return false;
  254. }
  255. }
  256. TVector<value_type> oldBuckets(Reserve(to));
  257. for (size_type i = 0; i < to; ++i) {
  258. oldBuckets.emplace_back(EmptyMarker, mapped_type{});
  259. }
  260. oldBuckets.swap(Buckets);
  261. BucketMask = Buckets.size() - 1;
  262. GrowThreshold = Max<size_type>(1, Buckets.size() * (MaxLoadFactor / 100.f)) - 1;
  263. for (auto& item : oldBuckets) {
  264. if (EmptyMarker != item.first) {
  265. SetValue(FindProperBucket(item.first), std::move(item));
  266. }
  267. }
  268. return true;
  269. }
  270. // Grow to size with which GrowThreshold will be higher then passed value
  271. //
  272. // (to) = (desired_num_filled + 2) * (100.f / MaxLoadFactor) + 2 after conversion to size_type
  273. // is not less than x := (desired_num_filled + 2) * (100.f / MaxLoadFactor) + 1 and FastClp2(to) is not less that (to)
  274. // (to) * (MaxLoadFactor / 100.f) >= x * (MaxLoadFactor / 100.f) = (desired_num_filled + 2) + (MaxLoadFactor / 100.f).
  275. // This require calculations with two or more significand decimal places
  276. // to have no less than (desired_num_filled + 2) after second conversion to size_type.
  277. // In that case after substracting 1 we got GrowThreshold >= desired_num_filled + 1
  278. //
  279. bool ReserveSpace(size_type desired_num_filled, bool force = false) {
  280. size_type to = Max<size_type>(1, (desired_num_filled + 2) * (100.f / MaxLoadFactor) + 2);
  281. return Grow(to, force);
  282. }
  283. // We need this overload because we want to optimize insertion when somebody inserts value_type.
  284. // So we don't need to extract the key.
  285. // This overload also allows brace enclosed initializer to be inserted.
  286. std::pair<iterator, bool> insert(const value_type& t) {
  287. size_type hs = hasher{}(t.first);
  288. auto p = GetBucketInfo(hs & BucketMask, t.first);
  289. if (p.second) {
  290. ++NumFilled;
  291. if (NumFilled >= GrowThreshold) {
  292. Grow();
  293. p.first = FindProperBucket(hs & BucketMask, t.first);
  294. }
  295. SetValue(p.first, t);
  296. return { iterator{ this, p.first }, true };
  297. }
  298. return { iterator{ this, p.first }, false };
  299. }
  300. // We need this overload because we want to optimize insertion when somebody inserts value_type.
  301. // So we don't need to extract the key.
  302. // This overload also allows brace enclosed initializer to be inserted.
  303. std::pair<iterator, bool> insert(value_type&& t) {
  304. size_type hs = hasher{}(t.first);
  305. auto p = GetBucketInfo(hs & BucketMask, t.first);
  306. if (p.second) {
  307. ++NumFilled;
  308. if (NumFilled >= GrowThreshold) {
  309. Grow();
  310. p.first = FindProperBucket(hs & BucketMask, t.first);
  311. }
  312. SetValue(p.first, std::move(t));
  313. return { iterator{ this, p.first }, true };
  314. }
  315. return { iterator{ this, p.first }, false };
  316. }
  317. // Standart integration. This overload is equivalent to emplace(std::forward<P>(p)).
  318. template <class P>
  319. std::enable_if_t<!std::is_same<std::decay_t<P>, value_type>::value,
  320. std::pair<iterator, bool>> insert(P&& p) {
  321. return emplace(std::forward<P>(p));
  322. }
  323. // Not really emplace because we need to know the key anyway. So we need to construct value_type.
  324. template <class... Args>
  325. std::pair<iterator, bool> emplace(Args&&... args) {
  326. return insert(value_type{ std::forward<Args>(args)... });
  327. }
  328. template <class K>
  329. mapped_type& operator[](K&& key) {
  330. size_type hs = hasher{}(key);
  331. auto p = GetBucketInfo(hs & BucketMask, key);
  332. if (p.second) {
  333. ++NumFilled;
  334. if (NumFilled >= GrowThreshold) {
  335. Grow();
  336. p.first = FindProperBucket(hs & BucketMask, key);
  337. }
  338. SetValue(p.first, std::forward<K>(key), mapped_type{});
  339. }
  340. return Buckets[p.first].second;
  341. }
  342. private:
  343. key_type EmptyMarker;
  344. size_type NumFilled;
  345. size_type BucketMask;
  346. size_type GrowThreshold;
  347. TVector<value_type> Buckets;
  348. private:
  349. // Tricky way to set value of type with const fields
  350. template <class... Args>
  351. void SetValue(value_type& bucket, Args&&... args) {
  352. bucket.~value_type();
  353. new (&bucket) value_type(std::forward<Args>(args)...);
  354. }
  355. template <class... Args>
  356. void SetValue(size_type idx, Args&&... args) {
  357. SetValue(Buckets[idx], std::forward<Args>(args)...);
  358. }
  359. template <class K>
  360. size_type FindProperBucket(size_type idx, const K& key) const {
  361. return ProcessIndex<size_type>(
  362. idx,
  363. key,
  364. [](size_type idx) { return idx; },
  365. [](size_type idx) { return idx; });
  366. }
  367. template <class K>
  368. size_type FindProperBucket(const K& key) const {
  369. return FindProperBucket(hasher{}(key) & BucketMask, key);
  370. }
  371. // { idx, is_empty }
  372. template <class K>
  373. std::pair<size_type, bool> GetBucketInfo(size_type idx, const K& key) const {
  374. return ProcessIndex<std::pair<size_type, bool>>(
  375. idx,
  376. key,
  377. [](size_type idx) { return std::make_pair(idx, false); },
  378. [](size_type idx) { return std::make_pair(idx, true); });
  379. }
  380. template <class R, class K, class OnFound, class OnEmpty>
  381. R ProcessIndex(size_type idx, const K& key, OnFound f0, OnEmpty f1) const {
  382. for (size_type numProbes = 1; EmptyMarker != Buckets[idx].first; ++numProbes) {
  383. if (Buckets[idx].first == key) {
  384. return f0(idx);
  385. }
  386. idx = (idx + numProbes) & BucketMask;
  387. }
  388. return f1(idx);
  389. }
  390. template <class R, class K, class OnFound, class OnEmpty>
  391. R ProcessKey(const K& key, OnFound&& f0, OnEmpty&& f1) const {
  392. return ProcessIndex<R>(
  393. hasher{}(key) & BucketMask, key, std::forward<OnFound>(f0), std::forward<OnEmpty>(f1));
  394. }
  395. };
  396. template <class TKey,
  397. class TKeyHash,
  398. size_t MaxLoadFactor,
  399. size_t LogInitSize>
  400. class TDenseHashSet {
  401. public:
  402. TDenseHashSet(const TKey& emptyMarker = TKey(), size_t initSize = 0)
  403. : EmptyMarker(emptyMarker)
  404. {
  405. MakeEmpty(initSize);
  406. }
  407. void Clear() {
  408. size_t currentSize = Buckets.size();
  409. Buckets.clear();
  410. Buckets.resize(currentSize, EmptyMarker);
  411. NumFilled = 0;
  412. }
  413. void MakeEmpty(size_t initSize = 0) {
  414. if (!initSize) {
  415. initSize = 1 << LogInitSize;
  416. } else {
  417. initSize = FastClp2(initSize);
  418. }
  419. BucketMask = initSize - 1;
  420. NumFilled = 0;
  421. TVector<TKey>(initSize, EmptyMarker).swap(Buckets);
  422. GrowThreshold = Max<size_t>(1, initSize * MaxLoadFactor / 100) - 1;
  423. }
  424. template <class K>
  425. bool Has(const K& key) const {
  426. return Buckets[FindBucket(key)] != EmptyMarker;
  427. }
  428. // gets existing item or inserts new
  429. template <class K>
  430. bool Insert(const K& key) {
  431. bool inserted = InsertNoGrow(key);
  432. if (inserted) {
  433. MaybeGrow();
  434. }
  435. return inserted;
  436. }
  437. size_t Capacity() const {
  438. return Buckets.capacity();
  439. }
  440. bool Empty() const {
  441. return Size() == 0;
  442. }
  443. size_t Size() const {
  444. return NumFilled;
  445. }
  446. template <size_t maxFillPercents, size_t logInitSize>
  447. void Swap(TDenseHashSet<TKey, TKeyHash, maxFillPercents, logInitSize>& other) {
  448. Buckets.swap(other.Buckets);
  449. DoSwap(BucketMask, other.BucketMask);
  450. DoSwap(NumFilled, other.NumFilled);
  451. DoSwap(GrowThreshold, other.GrowThreshold);
  452. DoSwap(EmptyMarker, other.EmptyMarker);
  453. }
  454. Y_SAVELOAD_DEFINE(BucketMask, NumFilled, GrowThreshold, Buckets, EmptyMarker);
  455. private:
  456. template <class THash>
  457. class TIteratorBase {
  458. friend class TDenseHashSet;
  459. THash* Hash;
  460. size_t Idx;
  461. // used only to implement end()
  462. TIteratorBase(THash* hash, size_t initIdx)
  463. : Hash(hash)
  464. , Idx(initIdx)
  465. {
  466. }
  467. public:
  468. TIteratorBase(THash& hash)
  469. : Hash(&hash)
  470. , Idx(0)
  471. {
  472. if (Hash->Buckets[Idx] == Hash->EmptyMarker) {
  473. Next();
  474. }
  475. }
  476. void Next() {
  477. ++Idx;
  478. while (Idx < Hash->Buckets.size() && Hash->Buckets[Idx] == Hash->EmptyMarker) {
  479. ++Idx;
  480. }
  481. }
  482. TIteratorBase& operator++() {
  483. Next();
  484. return *this;
  485. }
  486. bool Initialized() const {
  487. return Hash != nullptr;
  488. }
  489. bool Ok() const {
  490. return Idx < Hash->Buckets.size();
  491. }
  492. const TKey& operator*() const {
  493. return Key();
  494. }
  495. const TKey& Key() const {
  496. return Hash->Buckets[Idx];
  497. }
  498. bool operator==(const TIteratorBase& rhs) const {
  499. Y_ASSERT(Hash == rhs.Hash);
  500. return Idx == rhs.Idx;
  501. }
  502. bool operator!=(const TIteratorBase& rhs) const {
  503. return !(*this == rhs);
  504. }
  505. };
  506. public:
  507. typedef TIteratorBase<const TDenseHashSet> TConstIterator;
  508. TConstIterator begin() const {
  509. return TConstIterator(*this);
  510. }
  511. TConstIterator end() const {
  512. return TConstIterator(this, Buckets.size());
  513. }
  514. private:
  515. size_t BucketMask;
  516. size_t NumFilled;
  517. size_t GrowThreshold;
  518. TVector<TKey> Buckets;
  519. TKey EmptyMarker;
  520. template <class K>
  521. bool InsertNoGrow(const K& key) {
  522. size_t idx = FindBucket(key);
  523. if (Buckets[idx] == EmptyMarker) {
  524. ++NumFilled;
  525. Buckets[idx] = key;
  526. return true;
  527. }
  528. return false;
  529. }
  530. bool MaybeGrow() {
  531. if (NumFilled < GrowThreshold) {
  532. return false;
  533. }
  534. TVector<TKey> oldBuckets(Buckets.size() * 2, EmptyMarker);
  535. oldBuckets.swap(Buckets);
  536. BucketMask = Buckets.size() - 1;
  537. GrowThreshold = Max<size_t>(1, Buckets.size() * (MaxLoadFactor / 100.f)) - 1;
  538. NumFilled = 0;
  539. for (const TKey& key : oldBuckets) {
  540. if (key != EmptyMarker) {
  541. InsertNoGrow(key);
  542. }
  543. }
  544. return true;
  545. }
  546. template <class K>
  547. size_t FindBucket(const K& key) const {
  548. size_t idx = TKeyHash()(key) & BucketMask;
  549. for (size_t numProbes = 1; Buckets[idx] != EmptyMarker; ++numProbes) {
  550. if (Buckets[idx] == key) {
  551. return idx;
  552. }
  553. idx = (idx + numProbes) & BucketMask;
  554. }
  555. return idx;
  556. }
  557. };