dense_hash.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. #pragma once
  2. #include "fwd.h"
  3. #include <util/generic/utility.h>
  4. #include <util/generic/vector.h>
  5. #include <util/generic/mapfindptr.h>
  6. #include <util/str_stl.h>
  7. #include <util/ysaveload.h>
  8. /*
  9. * There are 2 classes in this file:
  10. * - TDenseHash - analog of THashMap
  11. * - TDenseHashSet - analog of THashSet
  12. */
  13. /*
  14. * Implements dense-hash, in some circumstances it is a lot (2x) faster than THashMap.
  15. * We support only adding new elements.
  16. * TKey value equal to EmptyMarker (by default, it is TKey())
  17. * can not be inserted into hash - it is used as marker of empty element.
  18. * TValue type must be default constructible
  19. */
  20. template <class TKey,
  21. class TValue,
  22. class TKeyHash,
  23. size_t MaxLoadFactor,
  24. size_t LogInitSize>
  25. class TDenseHash : public TMapOps<TDenseHash<TKey, TValue, TKeyHash, MaxLoadFactor, LogInitSize>> {
  26. private:
  27. template <class THash, class TVal>
  28. class TIteratorBase {
  29. friend class TDenseHash;
  30. template <class THash2, class TVal2>
  31. friend class TIteratorBase;
  32. THash* Hash;
  33. size_t Idx;
  34. // used only to implement end()
  35. TIteratorBase(THash* hash, size_t initIdx)
  36. : Hash(hash)
  37. , Idx(initIdx)
  38. {
  39. }
  40. public:
  41. TIteratorBase(THash& hash)
  42. : Hash(&hash)
  43. , Idx(0)
  44. {
  45. if (Hash->EmptyMarker == Hash->Buckets[Idx].first) {
  46. Next();
  47. }
  48. }
  49. template <class THash2, class TVal2>
  50. TIteratorBase(const TIteratorBase<THash2, TVal2>& it)
  51. : Hash(it.Hash)
  52. , Idx(it.Idx)
  53. {
  54. }
  55. TIteratorBase(const TIteratorBase&) = default;
  56. static TIteratorBase CreateEmpty() {
  57. return TIteratorBase(nullptr, 0);
  58. }
  59. TIteratorBase& operator=(const TIteratorBase&) = default;
  60. void Next() {
  61. ++Idx;
  62. while (Idx < Hash->Buckets.size() && Hash->EmptyMarker == Hash->Buckets[Idx].first) {
  63. ++Idx;
  64. }
  65. }
  66. TIteratorBase& operator++() {
  67. Next();
  68. return *this;
  69. }
  70. TVal& operator*() {
  71. return Hash->Buckets[Idx];
  72. }
  73. TVal* operator->() {
  74. return &Hash->Buckets[Idx];
  75. }
  76. const TVal* operator->() const {
  77. return &Hash->Buckets[Idx];
  78. }
  79. THash* GetHash() {
  80. return Hash;
  81. }
  82. bool operator==(const TIteratorBase& rhs) const {
  83. Y_ASSERT(Hash == rhs.Hash);
  84. return Idx == rhs.Idx;
  85. }
  86. bool operator!=(const TIteratorBase& rhs) const {
  87. return !(*this == rhs);
  88. }
  89. };
  90. public:
  91. using key_type = TKey;
  92. using mapped_type = TValue;
  93. using value_type = std::pair<const key_type, mapped_type>;
  94. using size_type = std::size_t;
  95. using difference_type = std::ptrdiff_t;
  96. using hasher = TKeyHash;
  97. using key_equal = std::equal_to<key_type>; // TODO(tender-bum): template argument
  98. // using allocator_type = ...
  99. using reference = value_type&;
  100. using const_reference = const value_type&;
  101. using pointer = value_type*; // TODO(tender-bum): std::allocator_traits<Alloc>::pointer;
  102. using const_pointer = const value_type*; // TODO(tender-bum):
  103. // std::allocator_traits<Alloc>::const_pointer;
  104. using iterator = TIteratorBase<TDenseHash, value_type>;
  105. using const_iterator = TIteratorBase<const TDenseHash, const value_type>;
  106. public:
  107. TDenseHash(const key_type& emptyMarker = key_type{}, size_type initSize = 0)
  108. : EmptyMarker(emptyMarker)
  109. {
  110. MakeEmpty(initSize);
  111. }
  112. TDenseHash(const TDenseHash&) = default;
  113. TDenseHash(TDenseHash&&) = default;
  114. TDenseHash& operator=(const TDenseHash& rhs) {
  115. TDenseHash tmp{ rhs };
  116. return *this = std::move(tmp);
  117. }
  118. TDenseHash& operator=(TDenseHash&&) = default;
  119. friend bool operator==(const TDenseHash& lhs, const TDenseHash& rhs) {
  120. return lhs.Size() == rhs.Size() &&
  121. AllOf(lhs, [&rhs](const auto& v) {
  122. auto it = rhs.find(v.first);
  123. return it != rhs.end() && *it == v;
  124. });
  125. }
  126. void Clear() {
  127. for (auto& bucket : Buckets) {
  128. if (bucket.first != EmptyMarker) {
  129. SetValue(bucket, EmptyMarker, mapped_type{});
  130. }
  131. }
  132. NumFilled = 0;
  133. }
  134. void MakeEmpty(size_type initSize = 0) {
  135. if (!initSize) {
  136. initSize = 1 << LogInitSize;
  137. } else {
  138. initSize = FastClp2(initSize);
  139. }
  140. BucketMask = initSize - 1;
  141. NumFilled = 0;
  142. TVector<value_type> tmp;
  143. for (size_type i = 0; i < initSize; ++i) {
  144. tmp.emplace_back(EmptyMarker, mapped_type{});
  145. }
  146. tmp.swap(Buckets);
  147. GrowThreshold = Max<size_type>(1, initSize * MaxLoadFactor / 100) - 1;
  148. }
  149. template <class K>
  150. bool Has(const K& key) const {
  151. return ProcessKey<bool>(
  152. key,
  153. [](size_type) { return true; },
  154. [](size_type) { return false; });
  155. }
  156. size_type Capacity() const {
  157. return Buckets.capacity();
  158. }
  159. bool Empty() const {
  160. return Size() == 0;
  161. }
  162. size_type Size() const {
  163. return NumFilled;
  164. }
  165. template <size_type maxFillPercents, size_type logInitSize>
  166. void Swap(TDenseHash<key_type, mapped_type, hasher, maxFillPercents, logInitSize>& other) {
  167. Buckets.swap(other.Buckets);
  168. DoSwap(BucketMask, other.BucketMask);
  169. DoSwap(NumFilled, other.NumFilled);
  170. DoSwap(GrowThreshold, other.GrowThreshold);
  171. DoSwap(EmptyMarker, other.EmptyMarker);
  172. }
  173. void Save(IOutputStream* s) const {
  174. // TODO(tender-bum): make SaveLoad great again
  175. ::SaveMany(s, BucketMask, NumFilled, GrowThreshold);
  176. // We need to do so because Buckets may be serialized as a pod-array
  177. // that doesn't correspond to the previous behaviour
  178. ::SaveSize(s, Buckets.size());
  179. for (const auto& b : Buckets) {
  180. ::Save(s, b.first);
  181. ::Save(s, b.second);
  182. }
  183. mapped_type defaultValue{};
  184. ::SaveMany(s, EmptyMarker, defaultValue);
  185. }
  186. void Load(IInputStream* s) {
  187. // TODO(tender-bum): make SaveLoad great again
  188. ::LoadMany(s, BucketMask, NumFilled, GrowThreshold);
  189. // We need to do so because we can't load const fields
  190. struct TPairMimic {
  191. key_type First;
  192. mapped_type Second;
  193. Y_SAVELOAD_DEFINE(First, Second);
  194. };
  195. TVector<TPairMimic> tmp;
  196. ::Load(s, tmp);
  197. Buckets.clear();
  198. for (auto& v : tmp) {
  199. Buckets.emplace_back(std::move(v.First), std::move(v.Second));
  200. }
  201. ::Load(s, EmptyMarker);
  202. mapped_type defaultValue;
  203. ::Load(s, defaultValue);
  204. }
  205. public:
  206. iterator begin() {
  207. return iterator(*this);
  208. }
  209. iterator end() {
  210. return iterator(this, Buckets.size());
  211. }
  212. const_iterator begin() const {
  213. return const_iterator(*this);
  214. }
  215. const_iterator end() const {
  216. return const_iterator(this, Buckets.size());
  217. }
  218. template <class K>
  219. iterator find(const K& key) {
  220. return ProcessKey<iterator>(
  221. key,
  222. [&](size_type idx) { return iterator(this, idx); },
  223. [&](size_type) { return end(); });
  224. }
  225. template <class K>
  226. const_iterator find(const K& key) const {
  227. return ProcessKey<const_iterator>(
  228. key,
  229. [&](size_type idx) { return const_iterator(this, idx); },
  230. [&](size_type) { return end(); });
  231. }
  232. template <class K>
  233. const TValue& at(const K& key) const {
  234. return ProcessKey<const TValue&>(
  235. key,
  236. [&](size_type idx) -> const TValue& { return Buckets[idx].second; },
  237. [&](size_type) -> const TValue& { throw std::out_of_range("TDenseHash: missing key"); });
  238. }
  239. template <class K>
  240. TValue& at(const K& key) {
  241. return ProcessKey<TValue&>(
  242. key,
  243. [&](size_type idx) -> TValue& { return Buckets[idx].second; },
  244. [&](size_type) -> TValue& { throw std::out_of_range("TDenseHash: missing key"); });
  245. }
  246. bool Grow(size_type to = 0, bool force = false) {
  247. if (!to) {
  248. to = Buckets.size() * 2;
  249. } else {
  250. to = FastClp2(to);
  251. if (to <= Buckets.size() && !force) {
  252. return false;
  253. }
  254. }
  255. TVector<value_type> oldBuckets(Reserve(to));
  256. for (size_type i = 0; i < to; ++i) {
  257. oldBuckets.emplace_back(EmptyMarker, mapped_type{});
  258. }
  259. oldBuckets.swap(Buckets);
  260. BucketMask = Buckets.size() - 1;
  261. GrowThreshold = Max<size_type>(1, Buckets.size() * (MaxLoadFactor / 100.f)) - 1;
  262. for (auto& item : oldBuckets) {
  263. if (EmptyMarker != item.first) {
  264. SetValue(FindProperBucket(item.first), std::move(item));
  265. }
  266. }
  267. return true;
  268. }
  269. // Grow to size with which GrowThreshold will be higher then passed value
  270. //
  271. // (to) = (desired_num_filled + 2) * (100.f / MaxLoadFactor) + 2 after conversion to size_type
  272. // is not less than x := (desired_num_filled + 2) * (100.f / MaxLoadFactor) + 1 and FastClp2(to) is not less that (to)
  273. // (to) * (MaxLoadFactor / 100.f) >= x * (MaxLoadFactor / 100.f) = (desired_num_filled + 2) + (MaxLoadFactor / 100.f).
  274. // This require calculations with two or more significand decimal places
  275. // to have no less than (desired_num_filled + 2) after second conversion to size_type.
  276. // In that case after substracting 1 we got GrowThreshold >= desired_num_filled + 1
  277. //
  278. bool ReserveSpace(size_type desired_num_filled, bool force = false) {
  279. size_type to = Max<size_type>(1, (desired_num_filled + 2) * (100.f / MaxLoadFactor) + 2);
  280. return Grow(to, force);
  281. }
  282. // We need this overload because we want to optimize insertion when somebody inserts value_type.
  283. // So we don't need to extract the key.
  284. // This overload also allows brace enclosed initializer to be inserted.
  285. std::pair<iterator, bool> insert(const value_type& t) {
  286. size_type hs = hasher{}(t.first);
  287. auto p = GetBucketInfo(hs & BucketMask, t.first);
  288. if (p.second) {
  289. ++NumFilled;
  290. if (NumFilled >= GrowThreshold) {
  291. Grow();
  292. p.first = FindProperBucket(hs & BucketMask, t.first);
  293. }
  294. SetValue(p.first, t);
  295. return { iterator{ this, p.first }, true };
  296. }
  297. return { iterator{ this, p.first }, false };
  298. }
  299. // We need this overload because we want to optimize insertion when somebody inserts value_type.
  300. // So we don't need to extract the key.
  301. // This overload also allows brace enclosed initializer to be inserted.
  302. std::pair<iterator, bool> insert(value_type&& t) {
  303. size_type hs = hasher{}(t.first);
  304. auto p = GetBucketInfo(hs & BucketMask, t.first);
  305. if (p.second) {
  306. ++NumFilled;
  307. if (NumFilled >= GrowThreshold) {
  308. Grow();
  309. p.first = FindProperBucket(hs & BucketMask, t.first);
  310. }
  311. SetValue(p.first, std::move(t));
  312. return { iterator{ this, p.first }, true };
  313. }
  314. return { iterator{ this, p.first }, false };
  315. }
  316. // Standart integration. This overload is equivalent to emplace(std::forward<P>(p)).
  317. template <class P>
  318. std::enable_if_t<!std::is_same<std::decay_t<P>, value_type>::value,
  319. std::pair<iterator, bool>> insert(P&& p) {
  320. return emplace(std::forward<P>(p));
  321. }
  322. // Not really emplace because we need to know the key anyway. So we need to construct value_type.
  323. template <class... Args>
  324. std::pair<iterator, bool> emplace(Args&&... args) {
  325. return insert(value_type{ std::forward<Args>(args)... });
  326. }
  327. template <class K>
  328. mapped_type& operator[](K&& key) {
  329. size_type hs = hasher{}(key);
  330. auto p = GetBucketInfo(hs & BucketMask, key);
  331. if (p.second) {
  332. ++NumFilled;
  333. if (NumFilled >= GrowThreshold) {
  334. Grow();
  335. p.first = FindProperBucket(hs & BucketMask, key);
  336. }
  337. SetValue(p.first, std::forward<K>(key), mapped_type{});
  338. }
  339. return Buckets[p.first].second;
  340. }
  341. private:
  342. key_type EmptyMarker;
  343. size_type NumFilled;
  344. size_type BucketMask;
  345. size_type GrowThreshold;
  346. TVector<value_type> Buckets;
  347. private:
  348. // Tricky way to set value of type with const fields
  349. template <class... Args>
  350. void SetValue(value_type& bucket, Args&&... args) {
  351. bucket.~value_type();
  352. new (&bucket) value_type(std::forward<Args>(args)...);
  353. }
  354. template <class... Args>
  355. void SetValue(size_type idx, Args&&... args) {
  356. SetValue(Buckets[idx], std::forward<Args>(args)...);
  357. }
  358. template <class K>
  359. size_type FindProperBucket(size_type idx, const K& key) const {
  360. return ProcessIndex<size_type>(
  361. idx,
  362. key,
  363. [](size_type idx) { return idx; },
  364. [](size_type idx) { return idx; });
  365. }
  366. template <class K>
  367. size_type FindProperBucket(const K& key) const {
  368. return FindProperBucket(hasher{}(key) & BucketMask, key);
  369. }
  370. // { idx, is_empty }
  371. template <class K>
  372. std::pair<size_type, bool> GetBucketInfo(size_type idx, const K& key) const {
  373. return ProcessIndex<std::pair<size_type, bool>>(
  374. idx,
  375. key,
  376. [](size_type idx) { return std::make_pair(idx, false); },
  377. [](size_type idx) { return std::make_pair(idx, true); });
  378. }
  379. template <class R, class K, class OnFound, class OnEmpty>
  380. R ProcessIndex(size_type idx, const K& key, OnFound f0, OnEmpty f1) const {
  381. for (size_type numProbes = 1; EmptyMarker != Buckets[idx].first; ++numProbes) {
  382. if (Buckets[idx].first == key) {
  383. return f0(idx);
  384. }
  385. idx = (idx + numProbes) & BucketMask;
  386. }
  387. return f1(idx);
  388. }
  389. template <class R, class K, class OnFound, class OnEmpty>
  390. R ProcessKey(const K& key, OnFound&& f0, OnEmpty&& f1) const {
  391. return ProcessIndex<R>(
  392. hasher{}(key) & BucketMask, key, std::forward<OnFound>(f0), std::forward<OnEmpty>(f1));
  393. }
  394. };
  395. template <class TKey,
  396. class TKeyHash,
  397. size_t MaxLoadFactor,
  398. size_t LogInitSize>
  399. class TDenseHashSet {
  400. public:
  401. TDenseHashSet(const TKey& emptyMarker = TKey(), size_t initSize = 0)
  402. : EmptyMarker(emptyMarker)
  403. {
  404. MakeEmpty(initSize);
  405. }
  406. void Clear() {
  407. size_t currentSize = Buckets.size();
  408. Buckets.clear();
  409. Buckets.resize(currentSize, EmptyMarker);
  410. NumFilled = 0;
  411. }
  412. void MakeEmpty(size_t initSize = 0) {
  413. if (!initSize) {
  414. initSize = 1 << LogInitSize;
  415. } else {
  416. initSize = FastClp2(initSize);
  417. }
  418. BucketMask = initSize - 1;
  419. NumFilled = 0;
  420. TVector<TKey>(initSize, EmptyMarker).swap(Buckets);
  421. GrowThreshold = Max<size_t>(1, initSize * MaxLoadFactor / 100) - 1;
  422. }
  423. template <class K>
  424. bool Has(const K& key) const {
  425. return Buckets[FindBucket(key)] != EmptyMarker;
  426. }
  427. // gets existing item or inserts new
  428. template <class K>
  429. bool Insert(const K& key) {
  430. bool inserted = InsertNoGrow(key);
  431. if (inserted) {
  432. MaybeGrow();
  433. }
  434. return inserted;
  435. }
  436. size_t Capacity() const {
  437. return Buckets.capacity();
  438. }
  439. bool Empty() const {
  440. return Size() == 0;
  441. }
  442. size_t Size() const {
  443. return NumFilled;
  444. }
  445. template <size_t maxFillPercents, size_t logInitSize>
  446. void Swap(TDenseHashSet<TKey, TKeyHash, maxFillPercents, logInitSize>& other) {
  447. Buckets.swap(other.Buckets);
  448. DoSwap(BucketMask, other.BucketMask);
  449. DoSwap(NumFilled, other.NumFilled);
  450. DoSwap(GrowThreshold, other.GrowThreshold);
  451. DoSwap(EmptyMarker, other.EmptyMarker);
  452. }
  453. Y_SAVELOAD_DEFINE(BucketMask, NumFilled, GrowThreshold, Buckets, EmptyMarker);
  454. private:
  455. template <class THash>
  456. class TIteratorBase {
  457. friend class TDenseHashSet;
  458. THash* Hash;
  459. size_t Idx;
  460. // used only to implement end()
  461. TIteratorBase(THash* hash, size_t initIdx)
  462. : Hash(hash)
  463. , Idx(initIdx)
  464. {
  465. }
  466. public:
  467. TIteratorBase(THash& hash)
  468. : Hash(&hash)
  469. , Idx(0)
  470. {
  471. if (Hash->Buckets[Idx] == Hash->EmptyMarker) {
  472. Next();
  473. }
  474. }
  475. void Next() {
  476. ++Idx;
  477. while (Idx < Hash->Buckets.size() && Hash->Buckets[Idx] == Hash->EmptyMarker) {
  478. ++Idx;
  479. }
  480. }
  481. TIteratorBase& operator++() {
  482. Next();
  483. return *this;
  484. }
  485. bool Initialized() const {
  486. return Hash != nullptr;
  487. }
  488. bool Ok() const {
  489. return Idx < Hash->Buckets.size();
  490. }
  491. const TKey& operator*() const {
  492. return Key();
  493. }
  494. const TKey& Key() const {
  495. return Hash->Buckets[Idx];
  496. }
  497. bool operator==(const TIteratorBase& rhs) const {
  498. Y_ASSERT(Hash == rhs.Hash);
  499. return Idx == rhs.Idx;
  500. }
  501. bool operator!=(const TIteratorBase& rhs) const {
  502. return !(*this == rhs);
  503. }
  504. };
  505. public:
  506. typedef TIteratorBase<const TDenseHashSet> TConstIterator;
  507. TConstIterator begin() const {
  508. return TConstIterator(*this);
  509. }
  510. TConstIterator end() const {
  511. return TConstIterator(this, Buckets.size());
  512. }
  513. private:
  514. size_t BucketMask;
  515. size_t NumFilled;
  516. size_t GrowThreshold;
  517. TVector<TKey> Buckets;
  518. TKey EmptyMarker;
  519. template <class K>
  520. bool InsertNoGrow(const K& key) {
  521. size_t idx = FindBucket(key);
  522. if (Buckets[idx] == EmptyMarker) {
  523. ++NumFilled;
  524. Buckets[idx] = key;
  525. return true;
  526. }
  527. return false;
  528. }
  529. bool MaybeGrow() {
  530. if (NumFilled < GrowThreshold) {
  531. return false;
  532. }
  533. TVector<TKey> oldBuckets(Buckets.size() * 2, EmptyMarker);
  534. oldBuckets.swap(Buckets);
  535. BucketMask = Buckets.size() - 1;
  536. GrowThreshold = Max<size_t>(1, Buckets.size() * (MaxLoadFactor / 100.f)) - 1;
  537. NumFilled = 0;
  538. for (const TKey& key : oldBuckets) {
  539. if (key != EmptyMarker) {
  540. InsertNoGrow(key);
  541. }
  542. }
  543. return true;
  544. }
  545. template <class K>
  546. size_t FindBucket(const K& key) const {
  547. size_t idx = TKeyHash()(key) & BucketMask;
  548. for (size_t numProbes = 1; Buckets[idx] != EmptyMarker; ++numProbes) {
  549. if (Buckets[idx] == key) {
  550. return idx;
  551. }
  552. idx = (idx + numProbes) & BucketMask;
  553. }
  554. return idx;
  555. }
  556. };