1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- #pragma once
- #include <library/cpp/deprecated/accessors/accessors.h>
- #include <util/generic/buffer.h>
- #include <util/generic/vector.h>
- #include <util/random/fast.h>
- #include <util/random/shuffle.h>
- #include <functional>
- #include <type_traits>
- namespace NCodecs {
- class ISequenceReader {
- public:
- virtual bool NextRegion(TStringBuf& s) = 0;
- virtual ~ISequenceReader() = default;
- };
- template <class TValue>
- TStringBuf ValueToStringBuf(TValue&& t) {
- return TStringBuf{NAccessors::Begin(t), NAccessors::End(t)};
- }
- template <class TIter>
- TStringBuf IterToStringBuf(TIter iter) {
- return ValueToStringBuf(*iter);
- }
- template <class TItem>
- class TSimpleSequenceReader: public ISequenceReader {
- const TVector<TItem>& Items;
- size_t Idx = 0;
- public:
- TSimpleSequenceReader(const TVector<TItem>& items)
- : Items(items)
- {
- }
- bool NextRegion(TStringBuf& s) override {
- if (Idx >= Items.size()) {
- return false;
- }
- s = ValueToStringBuf(Items[Idx++]);
- return true;
- }
- };
/// Sums the sizes of all regions in [begin, end).
///
/// @param begin   First iterator of the range.
/// @param end     Past-the-end iterator of the range.
/// @param getter  Callable invoked with an iterator (not the element); its result
///                must provide size().
/// @return        Total of getter(it).size() over the range; 0 for an empty range.
template <class TIter, class TGetter>
size_t GetInputSize(TIter begin, TIter end, TGetter getter) {
    size_t bytes = 0;
    TIter cur = begin;
    while (cur != end) {
        bytes += getter(cur).size();
        ++cur;
    }
    return bytes;
}
- template <class TIter>
- size_t GetInputSize(TIter begin, TIter end) {
- return GetInputSize(begin, end, IterToStringBuf<TIter>);
- }
- template <class TIter, class TGetter>
- TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes, TGetter getter) {
- TFastRng64 rng{0x1ce1f2e507541a05, 0x07d45659, 0x7b8771030dd9917e, 0x2d6636ce};
- size_t totalBytes = GetInputSize(begin, end, getter);
- double sampleProb = (double)sampleSizeBytes / Max<size_t>(1, totalBytes);
- TVector<TBuffer> result;
- for (TIter iter = begin; iter != end; ++iter) {
- if (sampleProb >= 1 || rng.GenRandReal1() < sampleProb) {
- TStringBuf reg = getter(iter);
- result.emplace_back(reg.data(), reg.size());
- }
- }
- Shuffle(result.begin(), result.end(), rng);
- return result;
- }
- template <class TIter>
- TVector<TBuffer> GetSample(TIter begin, TIter end, size_t sampleSizeBytes) {
- return GetSample(begin, end, sampleSizeBytes, IterToStringBuf<TIter>);
- }
- }
|