stream.cpp 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. #include "stream.h"
  2. #include "codecs.h"
  3. #include <util/digest/murmur.h>
  4. #include <util/generic/scope.h>
  5. #include <util/generic/cast.h>
  6. #include <util/generic/hash.h>
  7. #include <util/generic/singleton.h>
  8. #include <util/stream/mem.h>
  9. #include <util/ysaveload.h>
  10. using namespace NBlockCodecs;
  11. namespace {
  12. constexpr size_t MAX_BUF_LEN = 128 * 1024 * 1024;
  13. typedef ui16 TCodecID;
  14. typedef ui64 TBlockLen;
  15. struct TIds {
  16. inline TIds() {
  17. const TCodecList lst = ListAllCodecs();
  18. for (size_t i = 0; i < lst.size(); ++i) {
  19. const ICodec* c = Codec(lst[i]);
  20. ByID[CodecID(c)] = c;
  21. }
  22. }
  23. static inline TCodecID CodecID(const ICodec* c) {
  24. const TStringBuf name = c->Name();
  25. union {
  26. ui16 Parts[2];
  27. ui32 Data;
  28. } x;
  29. x.Data = MurmurHash<ui32>(name.data(), name.size());
  30. return x.Parts[1] ^ x.Parts[0];
  31. }
  32. inline const ICodec* Find(TCodecID id) const {
  33. TByID::const_iterator it = ByID.find(id);
  34. if (it != ByID.end()) {
  35. return it->second;
  36. }
  37. ythrow yexception() << "can not find codec by id " << id;
  38. }
  39. typedef THashMap<TCodecID, const ICodec*> TByID;
  40. TByID ByID;
  41. };
  42. TCodecID CodecID(const ICodec* c) {
  43. return TIds::CodecID(c);
  44. }
  45. const ICodec* CodecByID(TCodecID id) {
  46. return Singleton<TIds>()->Find(id);
  47. }
  48. }
  49. TCodedOutput::TCodedOutput(IOutputStream* out, const ICodec* c, size_t bufLen)
  50. : C_(c)
  51. , D_(bufLen)
  52. , S_(out)
  53. {
  54. if (bufLen > MAX_BUF_LEN) {
  55. ythrow yexception() << TStringBuf("too big buffer size: ") << bufLen;
  56. }
  57. }
  58. TCodedOutput::~TCodedOutput() {
  59. try {
  60. Finish();
  61. } catch (...) {
  62. }
  63. }
  64. void TCodedOutput::DoWrite(const void* buf, size_t len) {
  65. const char* in = (const char*)buf;
  66. while (len) {
  67. const size_t avail = D_.Avail();
  68. if (len < avail) {
  69. D_.Append(in, len);
  70. return;
  71. }
  72. D_.Append(in, avail);
  73. Y_ASSERT(!D_.Avail());
  74. in += avail;
  75. len -= avail;
  76. Y_ABORT_UNLESS(FlushImpl(), "flush on writing failed");
  77. }
  78. }
  79. bool TCodedOutput::FlushImpl() {
  80. const bool ret = !D_.Empty();
  81. const size_t payload = sizeof(TCodecID) + sizeof(TBlockLen);
  82. O_.Reserve(C_->MaxCompressedLength(D_) + payload);
  83. void* out = O_.Data() + payload;
  84. const size_t olen = C_->Compress(D_, out);
  85. {
  86. TMemoryOutput mo(O_.Data(), payload);
  87. ::Save(&mo, CodecID(C_));
  88. ::Save(&mo, SafeIntegerCast<TBlockLen>(olen));
  89. }
  90. S_->Write(O_.Data(), payload + olen);
  91. D_.Clear();
  92. O_.Clear();
  93. return ret;
  94. }
  95. void TCodedOutput::DoFlush() {
  96. if (S_ && !D_.Empty()) {
  97. FlushImpl();
  98. }
  99. }
  100. void TCodedOutput::DoFinish() {
  101. if (S_) {
  102. Y_DEFER {
  103. S_ = nullptr;
  104. };
  105. if (FlushImpl()) {
  106. //always write zero-length block as eos marker
  107. FlushImpl();
  108. }
  109. }
  110. }
  111. TDecodedInput::TDecodedInput(IInputStream* in)
  112. : S_(in)
  113. , C_(nullptr)
  114. {
  115. }
  116. TDecodedInput::TDecodedInput(IInputStream* in, const ICodec* codec)
  117. : S_(in)
  118. , C_(codec)
  119. {
  120. }
  121. TDecodedInput::~TDecodedInput() = default;
  122. size_t TDecodedInput::DoUnboundedNext(const void** ptr) {
  123. if (!S_) {
  124. return 0;
  125. }
  126. TCodecID codecId;
  127. TBlockLen blockLen;
  128. {
  129. const size_t payload = sizeof(TCodecID) + sizeof(TBlockLen);
  130. char buf[32];
  131. S_->LoadOrFail(buf, payload);
  132. TMemoryInput in(buf, payload);
  133. ::Load(&in, codecId);
  134. ::Load(&in, blockLen);
  135. }
  136. if (!blockLen) {
  137. S_ = nullptr;
  138. return 0;
  139. }
  140. if (Y_UNLIKELY(blockLen > 1024 * 1024 * 1024)) {
  141. ythrow yexception() << "block size exceeds 1 GiB";
  142. }
  143. TBuffer block;
  144. block.Resize(blockLen);
  145. S_->LoadOrFail(block.Data(), blockLen);
  146. auto codec = CodecByID(codecId);
  147. if (C_) {
  148. Y_ENSURE(C_->Name() == codec->Name(), TStringBuf("incorrect stream codec"));
  149. }
  150. if (codec->DecompressedLength(block) > MAX_BUF_LEN) {
  151. ythrow yexception() << "broken stream";
  152. }
  153. codec->Decode(block, D_);
  154. *ptr = D_.Data();
  155. return D_.Size();
  156. }