#pragma once #include #include #include #include #include #include #include #include #include #include #include "reader.h" #include "writer.h" #include #include template class TYVector { private: ui32 Size; const T* Data; public: TYVector(const TBlob& blob) : Size(IntegerCast(ReadUnaligned(blob.Data()))) , Data((const T*)((const char*)blob.Data() + sizeof(ui64))) { } void Get(size_t idx, T& t) const { assert(idx < (size_t)Size); t = ReadUnaligned(Data + idx); } const T& At(size_t idx) const { assert(idx < (size_t)Size); return Data[idx]; } size_t GetSize() const { return Size; } size_t RealSize() const { return sizeof(ui64) + Size * sizeof(T); } ~TYVector() = default; }; template class TYVectorWriter { private: TVector Vector; public: TYVectorWriter() = default; void PushBack(const T& value) { Vector.push_back(value); } void Save(IOutputStream& out) const { ui64 uSize = (ui64)Vector.size(); out.Write(&uSize, sizeof(uSize)); out.Write(Vector.data(), Vector.size() * sizeof(T)); } const T& At(size_t idx) const { assert(idx < Size()); return Vector[idx]; } T& At(size_t idx) { assert(idx < Size()); return Vector[idx]; } void Clear() { Vector.clear(); } size_t Size() const { return Vector.size(); } void Resize(size_t size) { Vector.resize(size); } void Resize(size_t size, const T& value) { Vector.resize(size, value); } }; template struct TYVectorG; template struct TYVectorG { typedef TYVector T; }; template struct TYVectorG { typedef TYVectorWriter T; }; template struct TIsMemsetThisWithZeroesSupported { enum { Result = TTypeTraits::IsPod }; }; #define MEMSET_THIS_WITH_ZEROES_SUPPORTED(type) \ template <> \ struct TIsMemsetThisWithZeroesSupported { \ enum { \ Result = true \ }; \ }; class TPlainHashCommon { protected: #pragma pack(push, 8) template class TPackedPair { private: typedef TPackedPair TThis; TKey Key; TValue Value; private: static_assert(TIsMemsetThisWithZeroesSupported::Result, "expect TIsMemsetThisWithZeroesSupported::Result"); static_assert(TIsMemsetThisWithZeroesSupported::Result, "expect TIsMemsetThisWithZeroesSupported::Result"); /// to aviod uninitialized bytes void Init(const TKey& key, const TValue& value) { memset(static_cast(this), 0, sizeof(TThis)); Key = key; Value = value; } public: TPackedPair(typename TTypeTraits::TFuncParam key, typename TTypeTraits::TFuncParam value) { Init(key, value); } TPackedPair(const TThis& rhs) { Init(rhs.Key, rhs.Value); } TPackedPair& operator=(const TThis& rhs) { if (this != &rhs) { Init(rhs.Key, rhs.Value); } return *this; } TPackedPair() { Init(TKey(), TValue()); } typename TTypeTraits::TFuncParam First() const { return Key; } typename TTypeTraits::TFuncParam Second() const { return Value; } static TKey GetFirst(const void* self) { static constexpr size_t offset = offsetof(TThis, Key); return ReadUnaligned(reinterpret_cast(self) + offset); } static TValue GetSecond(const void* self) { static constexpr size_t offset = offsetof(TThis, Value); return ReadUnaligned(reinterpret_cast(self) + offset); } }; #pragma pack(pop) protected: static const ui16 VERSION_ID = 2; #pragma pack(push, 8) struct TInterval { static const ui32 INVALID = (ui32)-1; ui32 Offset; ui32 Length; TInterval() : Offset(INVALID) , Length(INVALID) { } TInterval(ui32 offset, ui32 length) : Offset(offset) , Length(length) { } static inline ui32 GetOffset(const TInterval* self) { static constexpr size_t offset = offsetof(TInterval, Offset); return ReadUnaligned(reinterpret_cast(self) + offset); } static inline ui32 GetLength(const TInterval* self) { static constexpr size_t offset = offsetof(TInterval, Length); return ReadUnaligned(reinterpret_cast(self) + offset); } }; #pragma pack(pop) static_assert(8 == sizeof(TInterval), "expect 8 == sizeof(TInterval)"); template static ui32 KeyHash(typename TTypeTraits::TFuncParam key, ui16 bits) { Y_ASSERT(bits < 32); const ui32 res = ui32(key) & ((ui32(1) << bits) - 1); Y_ASSERT(res < (ui32(1) << bits)); return res; } }; template class TPlainHashWriter : TPlainHashCommon { private: typedef TPackedPair TKeyValuePair; typedef TVector TData; TData Data; typedef TVector TData2; bool IsPlainEnought(ui16 bits) const { TVector counts(1LL << bits, 0); for (size_t i = 0; i < Data.size(); ++i) { size_t& count = counts[KeyHash(TKeyValuePair::GetFirst(&Data[i]), bits)]; ++count; if (count > 2) return false; } return true; } public: void Add(const TKey& key, const TValue& value) { Data.push_back(TKeyValuePair(key, value)); } void Save(IOutputStream& out) const { Y_ASSERT(Data.size() < Max()); WriteBin(&out, VERSION_ID); static const ui32 PAIR_SIZE = sizeof(TKeyValuePair); WriteBin(&out, PAIR_SIZE); ui16 bits; if (!Data.empty()) { bits = (ui16)(log((float)Data.size()) / log(2.f)); while ((bits < 22) && !IsPlainEnought(bits)) ++bits; } else { bits = 0; } WriteBin(&out, bits); WriteBin(&out, (ui32)Data.size()); const ui32 nBuckets = ui32(1) << bits; TData2 data2(nBuckets); for (size_t i = 0; i < Data.size(); ++i) data2[KeyHash(TKeyValuePair::GetFirst(&Data[i]), bits)].push_back(Data[i]); typedef TVector TIntervals; TIntervals intervals(nBuckets); ui32 offset = 0; for (ui32 i = 0; i < nBuckets; ++i) { intervals[i].Offset = offset; intervals[i].Length = (ui32)data2[i].size(); offset += (ui32)data2[i].size(); } #ifndef NDEBUG for (ui32 i = 0; i < nBuckets; ++i) { for (size_t j = 0; j < data2[i].size(); ++j) for (size_t k = j + 1; k < data2[i].size(); ++k) if (TKeyValuePair::GetFirst(&data2[i][j]) == TKeyValuePair::GetFirst(&data2[i][k])) ythrow yexception() << "key clash"; } #endif out.Write(intervals.data(), intervals.size() * sizeof(intervals[0])); for (ui32 i = 0; i < nBuckets; ++i) out.Write(data2[i].data(), data2[i].size() * sizeof(data2[i][0])); } }; template class TPlainHash : TPlainHashCommon { private: typedef TPackedPair TKeyValuePair; const char* P; ui16 GetBits() const { return ReadUnaligned(P + 6); } ui32 GetSize() const { return ReadUnaligned(P + 8); } const TInterval* GetIntervals() const { return (const TInterval*)(P + 12); } const TKeyValuePair* GetData() const { return (const TKeyValuePair*)(GetIntervals() + (1ULL << GetBits())); } template void Init(const T* p) { static_assert(sizeof(T) == 1, "expect sizeof(T) == 1"); P = reinterpret_cast(p); #ifndef NDEBUG ui16 version = ReadUnaligned(p); if (version != VERSION_ID) ythrow yexception() << "bad version: " << version; static const ui32 PAIR_SIZE = sizeof(TKeyValuePair); const ui32 size = ReadUnaligned(p + 2); if (size != PAIR_SIZE) ythrow yexception() << "bad size " << size << " instead of " << PAIR_SIZE; #endif } public: typedef const TKeyValuePair* TConstIterator; TPlainHash(const char* p) { Init(p); } TPlainHash(const TBlob& blob) { Init(blob.Begin()); } bool Find(typename TTypeTraits::TFuncParam key, TValue* res) const { // Cerr << GetBits() << "\t" << (1 << GetBits()) << "\t" << GetSize() << Endl; const ui32 hash = KeyHash(key, GetBits()); const TInterval* intervalPtr = GetIntervals(); const TKeyValuePair* pair = GetData() + TInterval::GetOffset(intervalPtr + hash); const ui32 length = TInterval::GetLength(intervalPtr + hash); for (ui32 i = 0; i < length; ++i, ++pair) { if (TKeyValuePair::GetFirst(pair) == key) { *res = TKeyValuePair::GetSecond(pair); return true; } } return false; } TValue Get(typename TTypeTraits::TFuncParam key) const { TValue res; if (Find(key, &res)) return res; else ythrow yexception() << "key not found"; } TConstIterator Begin() const { return GetData(); } TConstIterator End() const { return GetData() + GetSize(); } const char* ByteEnd() const { return (const char*)(GetData() + GetSize()); } size_t ByteSize() const { return 12 + sizeof(TInterval) * (size_t(1) << GetBits()) + sizeof(TKeyValuePair) * GetSize(); } }; template struct TPlainHashG; template struct TPlainHashG { typedef TPlainHash T; }; template struct TPlainHashG { typedef TPlainHashWriter T; }; template class TSingleValue { private: const T* Value; public: TSingleValue(const TBlob& blob) { Y_ASSERT(blob.Length() >= sizeof(T)); Y_ASSERT(blob.Length() <= sizeof(T) + 16); Value = reinterpret_cast(blob.Begin()); } const T& Get() const { return *Value; } }; template class TSingleValueWriter { private: T Value; public: TSingleValueWriter() = default; TSingleValueWriter(const T& value) : Value(value) { } void Set(const T& value) { Value = value; } void Save(IOutputStream& out) const { out.Write(&Value, sizeof(Value)); } }; TBlob GetBlock(const TBlob& data, size_t index); template void WriteBlock(TChunkedDataWriter& writer, const T& t) { writer.NewBlock(); t.Save(writer); } template void WriteBlock(TChunkedDataWriter& writer, T& t) { writer.NewBlock(); t.Save(writer); } // Extends TChunkedDataWriter, allowing user to name blocks with arbitrary strings. class TNamedChunkedDataWriter: public TChunkedDataWriter { public: TNamedChunkedDataWriter(IOutputStream& slave); ~TNamedChunkedDataWriter() override; // Start a new unnamed block, overrides TChunkedDataReader::NewBlock(). void NewBlock(); // Start a new block with given name (possibly empty, in which case block is unnamed). // Throws an exception if name is a duplicate. void NewBlock(const TString& name); void WriteFooter(); private: TVector Names; THashMap NameToIndex; }; class TNamedChunkedDataReader: public TChunkedDataReader { public: TNamedChunkedDataReader(const TBlob& blob); inline bool HasBlock(const char* name) const { return NameToIndex.find(name) != NameToIndex.end(); } inline size_t GetIndexByName(const char* name) const { THashMap::const_iterator it = NameToIndex.find(name); if (it == NameToIndex.end()) throw yexception() << "Block \"" << name << "\" is not found"; else return it->second; } // Returns number of blocks written to the file by user of TNamedChunkedDataReader. inline size_t GetBlocksCount() const { // Last block is for internal usage return TChunkedDataReader::GetBlocksCount() - 1; } inline const char* GetBlockName(size_t index) const { Y_ASSERT(index < GetBlocksCount()); return Names[index].data(); } inline const void* GetBlockByName(const char* name) const { return GetBlock(GetIndexByName(name)); } inline size_t GetBlockLenByName(const char* name) const { return GetBlockLen(GetIndexByName(name)); } inline TBlob GetBlobByName(const char* name) const { size_t id = GetIndexByName(name); return TBlob::NoCopy(GetBlock(id), GetBlockLen(id)); } inline bool GetBlobByName(const char* name, TBlob& blob) const { THashMap::const_iterator it = NameToIndex.find(name); if (it == NameToIndex.end()) return false; blob = TBlob::NoCopy(GetBlock(it->second), GetBlockLen(it->second)); return true; } private: TVector Names; THashMap NameToIndex; }; template struct TSaveLoadVectorNonPodElement { static inline void Save(IOutputStream* out, const T& t) { TSerializer::Save(out, t); } static inline void Load(IInputStream* in, T& t, size_t elementSize) { Y_ASSERT(elementSize > 0); TSerializer::Load(in, t); } }; template class TVectorTakingIntoAccountThePodType { private: ui64 SizeofOffsets; const ui64* Offsets; const char* Data; public: TVectorTakingIntoAccountThePodType(const TBlob& blob) { SizeofOffsets = ReadUnaligned(blob.Begin()); Y_ASSERT(SizeofOffsets > 0); Offsets = reinterpret_cast(blob.Begin() + sizeof(ui64)); Data = reinterpret_cast(blob.Begin() + sizeof(ui64) + SizeofOffsets * sizeof(ui64)); } size_t GetSize() const { return (size_t)(SizeofOffsets - 1); } size_t GetLength(ui64 index) const { if (index + 1 >= SizeofOffsets) ythrow yexception() << "bad offset"; return IntegerCast(ReadUnaligned(Offsets + index + 1) - ReadUnaligned(Offsets + index)); } void Get(ui64 index, T& t) const { const size_t len = GetLength(index); TMemoryInput input(Data + ReadUnaligned(Offsets + index), len); TSaveLoadVectorNonPodElement::Load(&input, t, len); } T Get(ui64 index) const { T ret; Get(index, ret); return ret; } size_t RealSize() const { return sizeof(ui64) * (SizeofOffsets + 1) + ReadUnaligned(Offsets + SizeofOffsets - 1); } }; template class TVectorTakingIntoAccountThePodTypeWriter : TNonCopyable { private: typedef TVector TOffsets; TOffsets Offsets; TBuffer Data; TBufferOutput DataStream; public: TVectorTakingIntoAccountThePodTypeWriter() : DataStream(Data) { } void PushBack(const T& t) { Offsets.push_back((ui64) Data.size()); TSaveLoadVectorNonPodElement::Save(&DataStream, t); } size_t Size() const { return Offsets.size(); } void Save(IOutputStream& out) const { ui64 sizeofOffsets = Offsets.size() + 1; out.Write(&sizeofOffsets, sizeof(sizeofOffsets)); out.Write(Offsets.data(), Offsets.size() * sizeof(Offsets[0])); ui64 lastOffset = (ui64) Data.size(); out.Write(&lastOffset, sizeof(lastOffset)); out.Write(Data.data(), Data.size()); } }; template class TVectorTakingIntoAccountThePodType: public TYVector { public: TVectorTakingIntoAccountThePodType(const TBlob& blob) : TYVector(blob) { } }; template class TVectorTakingIntoAccountThePodTypeWriter: public TYVectorWriter { }; template class TGeneralVector: public TVectorTakingIntoAccountThePodType::IsPod> { typedef TVectorTakingIntoAccountThePodType::IsPod> TBase; public: TGeneralVector(const TBlob& blob) : TBase(blob) { } }; template class TGeneralVectorWriter: public TVectorTakingIntoAccountThePodTypeWriter::IsPod> { }; template struct TGeneralVectorG; template struct TGeneralVectorG { typedef TGeneralVector T; }; template struct TGeneralVectorG { typedef TGeneralVectorWriter T; }; template <> struct TSaveLoadVectorNonPodElement { static inline void Save(IOutputStream* out, const TString& s) { out->Write(s.data(), s.size() + 1); } static inline void Load(TMemoryInput* in, TString& s, size_t elementSize) { Y_ASSERT(elementSize > 0 && in->Avail() >= elementSize); s.assign(in->Buf(), elementSize - 1); /// excluding 0 at the end } }; template struct TStringsVectorG: public TGeneralVectorG { }; using TStringsVector = TGeneralVector; using TStringsVectorWriter = TGeneralVectorWriter;