chunked_helpers.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674
  1. #pragma once
  2. #include <util/generic/vector.h>
  3. #include <util/generic/buffer.h>
  4. #include <util/generic/hash_set.h>
  5. #include <util/generic/cast.h>
  6. #include <util/generic/ymath.h>
  7. #include <util/memory/blob.h>
  8. #include <util/stream/buffer.h>
  9. #include <util/stream/mem.h>
  10. #include <util/system/unaligned_mem.h>
  11. #include <util/ysaveload.h>
  12. #include "reader.h"
  13. #include "writer.h"
  14. #include <cmath>
  15. #include <cstddef>
  16. template <typename T>
  17. class TYVector {
  18. private:
  19. ui32 Size;
  20. const T* Data;
  21. public:
  22. TYVector(const TBlob& blob)
  23. : Size(IntegerCast<ui32>(ReadUnaligned<ui64>(blob.Data())))
  24. , Data((const T*)((const char*)blob.Data() + sizeof(ui64)))
  25. {
  26. }
  27. void Get(size_t idx, T& t) const {
  28. assert(idx < (size_t)Size);
  29. t = ReadUnaligned<T>(Data + idx);
  30. }
  31. const T& At(size_t idx) const {
  32. assert(idx < (size_t)Size);
  33. return Data[idx];
  34. }
  35. size_t GetSize() const {
  36. return Size;
  37. }
  38. size_t RealSize() const {
  39. return sizeof(ui64) + Size * sizeof(T);
  40. }
  41. ~TYVector() = default;
  42. };
  43. template <typename T>
  44. class TYVectorWriter {
  45. private:
  46. TVector<T> Vector;
  47. public:
  48. TYVectorWriter() = default;
  49. void PushBack(const T& value) {
  50. Vector.push_back(value);
  51. }
  52. void Save(IOutputStream& out) const {
  53. ui64 uSize = (ui64)Vector.size();
  54. out.Write(&uSize, sizeof(uSize));
  55. out.Write(Vector.data(), Vector.size() * sizeof(T));
  56. }
  57. const T& At(size_t idx) const {
  58. assert(idx < Size());
  59. return Vector[idx];
  60. }
  61. T& At(size_t idx) {
  62. assert(idx < Size());
  63. return Vector[idx];
  64. }
  65. void Clear() {
  66. Vector.clear();
  67. }
  68. size_t Size() const {
  69. return Vector.size();
  70. }
  71. void Resize(size_t size) {
  72. Vector.resize(size);
  73. }
  74. void Resize(size_t size, const T& value) {
  75. Vector.resize(size, value);
  76. }
  77. };
  78. template <typename T, bool>
  79. struct TYVectorG;
  80. template <typename X>
  81. struct TYVectorG<X, false> {
  82. typedef TYVector<X> T;
  83. };
  84. template <typename X>
  85. struct TYVectorG<X, true> {
  86. typedef TYVectorWriter<X> T;
  87. };
  88. template <typename T>
  89. struct TIsMemsetThisWithZeroesSupported {
  90. enum {
  91. Result = TTypeTraits<T>::IsPod
  92. };
  93. };
  94. #define MEMSET_THIS_WITH_ZEROES_SUPPORTED(type) \
  95. template <> \
  96. struct TIsMemsetThisWithZeroesSupported<type> { \
  97. enum { \
  98. Result = true \
  99. }; \
  100. };
  101. class TPlainHashCommon {
  102. protected:
  103. #pragma pack(push, 8)
  104. template <typename TKey, typename TValue>
  105. class TPackedPair {
  106. private:
  107. typedef TPackedPair<TKey, TValue> TThis;
  108. TKey Key;
  109. TValue Value;
  110. private:
  111. static_assert(TIsMemsetThisWithZeroesSupported<TKey>::Result, "expect TIsMemsetThisWithZeroesSupported<TKey>::Result");
  112. static_assert(TIsMemsetThisWithZeroesSupported<TValue>::Result, "expect TIsMemsetThisWithZeroesSupported<TValue>::Result");
  113. /// to aviod uninitialized bytes
  114. void Init(const TKey& key, const TValue& value) {
  115. memset(static_cast<TThis*>(this), 0, sizeof(TThis));
  116. Key = key;
  117. Value = value;
  118. }
  119. public:
  120. TPackedPair(typename TTypeTraits<TKey>::TFuncParam key, typename TTypeTraits<TValue>::TFuncParam value) {
  121. Init(key, value);
  122. }
  123. TPackedPair(const TThis& rhs) {
  124. Init(rhs.Key, rhs.Value);
  125. }
  126. TPackedPair& operator=(const TThis& rhs) {
  127. if (this != &rhs) {
  128. Init(rhs.Key, rhs.Value);
  129. }
  130. return *this;
  131. }
  132. TPackedPair() {
  133. Init(TKey(), TValue());
  134. }
  135. typename TTypeTraits<TKey>::TFuncParam First() const {
  136. return Key;
  137. }
  138. typename TTypeTraits<TValue>::TFuncParam Second() const {
  139. return Value;
  140. }
  141. static TKey GetFirst(const void* self) {
  142. static constexpr size_t offset = offsetof(TThis, Key);
  143. return ReadUnaligned<TKey>(reinterpret_cast<const char*>(self) + offset);
  144. }
  145. static TValue GetSecond(const void* self) {
  146. static constexpr size_t offset = offsetof(TThis, Value);
  147. return ReadUnaligned<TValue>(reinterpret_cast<const char*>(self) + offset);
  148. }
  149. };
  150. #pragma pack(pop)
  151. protected:
  152. static const ui16 VERSION_ID = 2;
  153. #pragma pack(push, 8)
  154. struct TInterval {
  155. static const ui32 INVALID = (ui32)-1;
  156. ui32 Offset;
  157. ui32 Length;
  158. TInterval()
  159. : Offset(INVALID)
  160. , Length(INVALID)
  161. {
  162. }
  163. TInterval(ui32 offset, ui32 length)
  164. : Offset(offset)
  165. , Length(length)
  166. {
  167. }
  168. static inline ui32 GetOffset(const TInterval* self) {
  169. static constexpr size_t offset = offsetof(TInterval, Offset);
  170. return ReadUnaligned<ui32>(reinterpret_cast<const char*>(self) + offset);
  171. }
  172. static inline ui32 GetLength(const TInterval* self) {
  173. static constexpr size_t offset = offsetof(TInterval, Length);
  174. return ReadUnaligned<ui32>(reinterpret_cast<const char*>(self) + offset);
  175. }
  176. };
  177. #pragma pack(pop)
  178. static_assert(8 == sizeof(TInterval), "expect 8 == sizeof(TInterval)");
  179. template <typename TKey>
  180. static ui32 KeyHash(typename TTypeTraits<TKey>::TFuncParam key, ui16 bits) {
  181. Y_ASSERT(bits < 32);
  182. const ui32 res = ui32(key) & ((ui32(1) << bits) - 1);
  183. Y_ASSERT(res < (ui32(1) << bits));
  184. return res;
  185. }
  186. };
  187. template <typename TKey, typename TValue>
  188. class TPlainHashWriter : TPlainHashCommon {
  189. private:
  190. typedef TPackedPair<TKey, TValue> TKeyValuePair;
  191. typedef TVector<TKeyValuePair> TData;
  192. TData Data;
  193. typedef TVector<TData> TData2;
  194. bool IsPlainEnought(ui16 bits) const {
  195. TVector<size_t> counts(1LL << bits, 0);
  196. for (size_t i = 0; i < Data.size(); ++i) {
  197. size_t& count = counts[KeyHash<TKey>(TKeyValuePair::GetFirst(&Data[i]), bits)];
  198. ++count;
  199. if (count > 2)
  200. return false;
  201. }
  202. return true;
  203. }
  204. public:
  205. void Add(const TKey& key, const TValue& value) {
  206. Data.push_back(TKeyValuePair(key, value));
  207. }
  208. void Save(IOutputStream& out) const {
  209. Y_ASSERT(Data.size() < Max<ui32>());
  210. WriteBin<ui16>(&out, VERSION_ID);
  211. static const ui32 PAIR_SIZE = sizeof(TKeyValuePair);
  212. WriteBin<ui32>(&out, PAIR_SIZE);
  213. ui16 bits;
  214. if (!Data.empty()) {
  215. bits = (ui16)(log((float)Data.size()) / log(2.f));
  216. while ((bits < 22) && !IsPlainEnought(bits))
  217. ++bits;
  218. } else {
  219. bits = 0;
  220. }
  221. WriteBin<ui16>(&out, bits);
  222. WriteBin<ui32>(&out, (ui32)Data.size());
  223. const ui32 nBuckets = ui32(1) << bits;
  224. TData2 data2(nBuckets);
  225. for (size_t i = 0; i < Data.size(); ++i)
  226. data2[KeyHash<TKey>(TKeyValuePair::GetFirst(&Data[i]), bits)].push_back(Data[i]);
  227. typedef TVector<TInterval> TIntervals;
  228. TIntervals intervals(nBuckets);
  229. ui32 offset = 0;
  230. for (ui32 i = 0; i < nBuckets; ++i) {
  231. intervals[i].Offset = offset;
  232. intervals[i].Length = (ui32)data2[i].size();
  233. offset += (ui32)data2[i].size();
  234. }
  235. #ifndef NDEBUG
  236. for (ui32 i = 0; i < nBuckets; ++i) {
  237. for (size_t j = 0; j < data2[i].size(); ++j)
  238. for (size_t k = j + 1; k < data2[i].size(); ++k)
  239. if (TKeyValuePair::GetFirst(&data2[i][j]) == TKeyValuePair::GetFirst(&data2[i][k]))
  240. ythrow yexception() << "key clash";
  241. }
  242. #endif
  243. out.Write(intervals.data(), intervals.size() * sizeof(intervals[0]));
  244. for (ui32 i = 0; i < nBuckets; ++i)
  245. out.Write(data2[i].data(), data2[i].size() * sizeof(data2[i][0]));
  246. }
  247. };
  248. template <typename TKey, typename TValue>
  249. class TPlainHash : TPlainHashCommon {
  250. private:
  251. typedef TPackedPair<TKey, TValue> TKeyValuePair;
  252. const char* P;
  253. ui16 GetBits() const {
  254. return ReadUnaligned<ui16>(P + 6);
  255. }
  256. ui32 GetSize() const {
  257. return ReadUnaligned<ui32>(P + 8);
  258. }
  259. const TInterval* GetIntervals() const {
  260. return (const TInterval*)(P + 12);
  261. }
  262. const TKeyValuePair* GetData() const {
  263. return (const TKeyValuePair*)(GetIntervals() + (1ULL << GetBits()));
  264. }
  265. template <typename T>
  266. void Init(const T* p) {
  267. static_assert(sizeof(T) == 1, "expect sizeof(T) == 1");
  268. P = reinterpret_cast<const char*>(p);
  269. #ifndef NDEBUG
  270. ui16 version = ReadUnaligned<ui16>(p);
  271. if (version != VERSION_ID)
  272. ythrow yexception() << "bad version: " << version;
  273. static const ui32 PAIR_SIZE = sizeof(TKeyValuePair);
  274. const ui32 size = ReadUnaligned<ui32>(p + 2);
  275. if (size != PAIR_SIZE)
  276. ythrow yexception() << "bad size " << size << " instead of " << PAIR_SIZE;
  277. #endif
  278. }
  279. public:
  280. typedef const TKeyValuePair* TConstIterator;
  281. TPlainHash(const char* p) {
  282. Init(p);
  283. }
  284. TPlainHash(const TBlob& blob) {
  285. Init(blob.Begin());
  286. }
  287. bool Find(typename TTypeTraits<TKey>::TFuncParam key, TValue* res) const {
  288. // Cerr << GetBits() << "\t" << (1 << GetBits()) << "\t" << GetSize() << Endl;
  289. const ui32 hash = KeyHash<TKey>(key, GetBits());
  290. const TInterval* intervalPtr = GetIntervals();
  291. const TKeyValuePair* pair = GetData() + TInterval::GetOffset(intervalPtr + hash);
  292. const ui32 length = TInterval::GetLength(intervalPtr + hash);
  293. for (ui32 i = 0; i < length; ++i, ++pair) {
  294. if (TKeyValuePair::GetFirst(pair) == key) {
  295. *res = TKeyValuePair::GetSecond(pair);
  296. return true;
  297. }
  298. }
  299. return false;
  300. }
  301. TValue Get(typename TTypeTraits<TKey>::TFuncParam key) const {
  302. TValue res;
  303. if (Find(key, &res))
  304. return res;
  305. else
  306. ythrow yexception() << "key not found";
  307. }
  308. TConstIterator Begin() const {
  309. return GetData();
  310. }
  311. TConstIterator End() const {
  312. return GetData() + GetSize();
  313. }
  314. const char* ByteEnd() const {
  315. return (const char*)(GetData() + GetSize());
  316. }
  317. size_t ByteSize() const {
  318. return 12 + sizeof(TInterval) * (size_t(1) << GetBits()) + sizeof(TKeyValuePair) * GetSize();
  319. }
  320. };
  321. template <typename Key, typename Value, bool>
  322. struct TPlainHashG;
  323. template <typename Key, typename Value>
  324. struct TPlainHashG<Key, Value, false> {
  325. typedef TPlainHash<Key, Value> T;
  326. };
  327. template <typename Key, typename Value>
  328. struct TPlainHashG<Key, Value, true> {
  329. typedef TPlainHashWriter<Key, Value> T;
  330. };
  331. template <typename T>
  332. class TSingleValue {
  333. private:
  334. const T* Value;
  335. public:
  336. TSingleValue(const TBlob& blob) {
  337. Y_ASSERT(blob.Length() >= sizeof(T));
  338. Y_ASSERT(blob.Length() <= sizeof(T) + 16);
  339. Value = reinterpret_cast<const T*>(blob.Begin());
  340. }
  341. const T& Get() const {
  342. return *Value;
  343. }
  344. };
  345. template <typename T>
  346. class TSingleValueWriter {
  347. private:
  348. T Value;
  349. public:
  350. TSingleValueWriter() = default;
  351. TSingleValueWriter(const T& value)
  352. : Value(value)
  353. {
  354. }
  355. void Set(const T& value) {
  356. Value = value;
  357. }
  358. void Save(IOutputStream& out) const {
  359. out.Write(&Value, sizeof(Value));
  360. }
  361. };
  362. TBlob GetBlock(const TBlob& data, size_t index);
  363. template <class T>
  364. void WriteBlock(TChunkedDataWriter& writer, const T& t) {
  365. writer.NewBlock();
  366. t.Save(writer);
  367. }
  368. template <class T>
  369. void WriteBlock(TChunkedDataWriter& writer, T& t) {
  370. writer.NewBlock();
  371. t.Save(writer);
  372. }
  373. // Extends TChunkedDataWriter, allowing user to name blocks with arbitrary strings.
  374. class TNamedChunkedDataWriter: public TChunkedDataWriter {
  375. public:
  376. TNamedChunkedDataWriter(IOutputStream& slave);
  377. ~TNamedChunkedDataWriter() override;
  378. // Start a new unnamed block, overrides TChunkedDataReader::NewBlock().
  379. void NewBlock();
  380. // Start a new block with given name (possibly empty, in which case block is unnamed).
  381. // Throws an exception if name is a duplicate.
  382. void NewBlock(const TString& name);
  383. void WriteFooter();
  384. private:
  385. TVector<TString> Names;
  386. THashMap<TString, size_t> NameToIndex;
  387. };
  388. class TNamedChunkedDataReader: public TChunkedDataReader {
  389. public:
  390. TNamedChunkedDataReader(const TBlob& blob);
  391. inline bool HasBlock(const char* name) const {
  392. return NameToIndex.find(name) != NameToIndex.end();
  393. }
  394. inline size_t GetIndexByName(const char* name) const {
  395. THashMap<TString, size_t>::const_iterator it = NameToIndex.find(name);
  396. if (it == NameToIndex.end())
  397. throw yexception() << "Block \"" << name << "\" is not found";
  398. else
  399. return it->second;
  400. }
  401. // Returns number of blocks written to the file by user of TNamedChunkedDataReader.
  402. inline size_t GetBlocksCount() const {
  403. // Last block is for internal usage
  404. return TChunkedDataReader::GetBlocksCount() - 1;
  405. }
  406. inline const char* GetBlockName(size_t index) const {
  407. Y_ASSERT(index < GetBlocksCount());
  408. return Names[index].data();
  409. }
  410. inline const void* GetBlockByName(const char* name) const {
  411. return GetBlock(GetIndexByName(name));
  412. }
  413. inline size_t GetBlockLenByName(const char* name) const {
  414. return GetBlockLen(GetIndexByName(name));
  415. }
  416. inline TBlob GetBlobByName(const char* name) const {
  417. size_t id = GetIndexByName(name);
  418. return TBlob::NoCopy(GetBlock(id), GetBlockLen(id));
  419. }
  420. inline bool GetBlobByName(const char* name, TBlob& blob) const {
  421. THashMap<TString, size_t>::const_iterator it = NameToIndex.find(name);
  422. if (it == NameToIndex.end())
  423. return false;
  424. blob = TBlob::NoCopy(GetBlock(it->second), GetBlockLen(it->second));
  425. return true;
  426. }
  427. private:
  428. TVector<TString> Names;
  429. THashMap<TString, size_t> NameToIndex;
  430. };
  431. template <class T>
  432. struct TSaveLoadVectorNonPodElement {
  433. static inline void Save(IOutputStream* out, const T& t) {
  434. TSerializer<T>::Save(out, t);
  435. }
  436. static inline void Load(IInputStream* in, T& t, size_t elementSize) {
  437. Y_ASSERT(elementSize > 0);
  438. TSerializer<T>::Load(in, t);
  439. }
  440. };
  441. template <class T, bool isPod>
  442. class TVectorTakingIntoAccountThePodType {
  443. private:
  444. ui64 SizeofOffsets;
  445. const ui64* Offsets;
  446. const char* Data;
  447. public:
  448. TVectorTakingIntoAccountThePodType(const TBlob& blob) {
  449. SizeofOffsets = ReadUnaligned<ui64>(blob.Begin());
  450. Y_ASSERT(SizeofOffsets > 0);
  451. Offsets = reinterpret_cast<const ui64*>(blob.Begin() + sizeof(ui64));
  452. Data = reinterpret_cast<const char*>(blob.Begin() + sizeof(ui64) + SizeofOffsets * sizeof(ui64));
  453. }
  454. size_t GetSize() const {
  455. return (size_t)(SizeofOffsets - 1);
  456. }
  457. size_t GetLength(ui64 index) const {
  458. if (index + 1 >= SizeofOffsets)
  459. ythrow yexception() << "bad offset";
  460. return IntegerCast<size_t>(ReadUnaligned<ui64>(Offsets + index + 1) - ReadUnaligned<ui64>(Offsets + index));
  461. }
  462. void Get(ui64 index, T& t) const {
  463. const size_t len = GetLength(index);
  464. TMemoryInput input(Data + ReadUnaligned<ui64>(Offsets + index), len);
  465. TSaveLoadVectorNonPodElement<T>::Load(&input, t, len);
  466. }
  467. T Get(ui64 index) const {
  468. T ret;
  469. Get(index, ret);
  470. return ret;
  471. }
  472. size_t RealSize() const {
  473. return sizeof(ui64) * (SizeofOffsets + 1) + ReadUnaligned<ui64>(Offsets + SizeofOffsets - 1);
  474. }
  475. };
  476. template <class T, bool isPod>
  477. class TVectorTakingIntoAccountThePodTypeWriter : TNonCopyable {
  478. private:
  479. typedef TVector<ui64> TOffsets;
  480. TOffsets Offsets;
  481. TBuffer Data;
  482. TBufferOutput DataStream;
  483. public:
  484. TVectorTakingIntoAccountThePodTypeWriter()
  485. : DataStream(Data)
  486. {
  487. }
  488. void PushBack(const T& t) {
  489. Offsets.push_back((ui64) Data.size());
  490. TSaveLoadVectorNonPodElement<T>::Save(&DataStream, t);
  491. }
  492. size_t Size() const {
  493. return Offsets.size();
  494. }
  495. void Save(IOutputStream& out) const {
  496. ui64 sizeofOffsets = Offsets.size() + 1;
  497. out.Write(&sizeofOffsets, sizeof(sizeofOffsets));
  498. out.Write(Offsets.data(), Offsets.size() * sizeof(Offsets[0]));
  499. ui64 lastOffset = (ui64) Data.size();
  500. out.Write(&lastOffset, sizeof(lastOffset));
  501. out.Write(Data.data(), Data.size());
  502. }
  503. };
  504. template <class T>
  505. class TVectorTakingIntoAccountThePodType<T, true>: public TYVector<T> {
  506. public:
  507. TVectorTakingIntoAccountThePodType(const TBlob& blob)
  508. : TYVector<T>(blob)
  509. {
  510. }
  511. };
  512. template <class T>
  513. class TVectorTakingIntoAccountThePodTypeWriter<T, true>: public TYVectorWriter<T> {
  514. };
  515. template <typename T>
  516. class TGeneralVector: public TVectorTakingIntoAccountThePodType<T, TTypeTraits<T>::IsPod> {
  517. typedef TVectorTakingIntoAccountThePodType<T, TTypeTraits<T>::IsPod> TBase;
  518. public:
  519. TGeneralVector(const TBlob& blob)
  520. : TBase(blob)
  521. {
  522. }
  523. };
  524. template <typename T>
  525. class TGeneralVectorWriter: public TVectorTakingIntoAccountThePodTypeWriter<T, TTypeTraits<T>::IsPod> {
  526. };
  527. template <typename TItem, bool>
  528. struct TGeneralVectorG;
  529. template <typename TItem>
  530. struct TGeneralVectorG<TItem, false> {
  531. typedef TGeneralVector<TItem> T;
  532. };
  533. template <typename TItem>
  534. struct TGeneralVectorG<TItem, true> {
  535. typedef TGeneralVectorWriter<TItem> T;
  536. };
  537. template <>
  538. struct TSaveLoadVectorNonPodElement<TString> {
  539. static inline void Save(IOutputStream* out, const TString& s) {
  540. out->Write(s.data(), s.size() + 1);
  541. }
  542. static inline void Load(TMemoryInput* in, TString& s, size_t elementSize) {
  543. Y_ASSERT(elementSize > 0 && in->Avail() >= elementSize);
  544. s.assign(in->Buf(), elementSize - 1); /// excluding 0 at the end
  545. }
  546. };
  547. template <bool G>
  548. struct TStringsVectorG: public TGeneralVectorG<TString, G> {
  549. };
  550. using TStringsVector = TGeneralVector<TString>;
  551. using TStringsVectorWriter = TGeneralVectorWriter<TString>;