#pragma once #include #include #include #include #include #include #include namespace NCodecs { class ISequenceReader { public: virtual bool NextRegion(TStringBuf& s) = 0; virtual ~ISequenceReader() = default; }; template TStringBuf ValueToStringBuf(TValue&& t) { return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)}; } template TStringBuf IterToStringBuf(TIter iter) { return ValueToStringBuf(*iter); } template class TSimpleSequenceReader: public ISequenceReader { const TVector& Items; size_t Idx = 0; public: TSimpleSequenceReader(const TVector& items) : Items(items) { } bool NextRegion(TStringBuf& s) override { if (Idx >= Items.size()) { return false; } s = ValueToStringBuf(Items[Idx++]); return true; } }; template size_t GetInputSize(TIter begin, TIter end, TGetter getter) { size_t totalBytes = 0; for (TIter iter = begin; iter != end; ++iter) { totalBytes += getter(iter).size(); } return totalBytes; } template size_t GetInputSize(TIter begin, TIter end) { return GetInputSize(begin, end, IterToStringBuf); } template TVector GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) { TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce}; size_t totalBytes = GetInputSize(begin, end, getter); double sampleProb = (double)sampleSizeBytes / Max(1, totalBytes); TVector result; for (TIter iter = begin; iter != end; ++iter) { if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) { TStringBuf reg = getter(iter); result.emplace_back(reg.data(), reg.size()); } } Shuffle(result.begin(), result.end(), rng); return result; } template TVector GetSample(TIter begin, TIter end, size_t sampleSizeBytes) { return GetSample(begin, end, sampleSizeBytes, IterToStringBuf); } }