// tokenizer.h
  1. #pragma once
  2. #include "input.h"
  3. #include <util/generic/buffer.h>
  4. #include <util/generic/mem_copy.h>
  5. #include <util/generic/strbuf.h>
  6. #include <util/system/compiler.h>
  7. #include <util/system/yassert.h>
  8. /**
  9. * @addtogroup Streams
  10. * @{
  11. */
  12. /**
  13. * Simple stream tokenizer. Splits the stream into tokens that are available
  14. * via iterator interface.
  15. *
  16. * @tparam TEndOfToken Predicate for token delimiter characters.
  17. * @see TEol
  18. */
  19. template <typename TEndOfToken>
  20. class TStreamTokenizer {
  21. public:
  22. class TIterator {
  23. public:
  24. inline TIterator(TStreamTokenizer* const parent)
  25. : Parent_(parent)
  26. , AtEnd_(!Parent_->Next(Data_, Len_))
  27. {
  28. }
  29. inline TIterator() noexcept
  30. : Parent_(nullptr)
  31. , Data_(nullptr)
  32. , Len_(0)
  33. , AtEnd_(true)
  34. {
  35. }
  36. inline ~TIterator() = default;
  37. inline void operator++() {
  38. Next();
  39. }
  40. inline bool operator==(const TIterator& l) const noexcept {
  41. return AtEnd_ == l.AtEnd_;
  42. }
  43. inline bool operator!=(const TIterator& l) const noexcept {
  44. return !(*this == l);
  45. }
  46. /**
  47. * @return Return null-terminated character array with current token.
  48. * The pointer may be invalid after iterator increment.
  49. */
  50. inline const char* Data() const noexcept {
  51. Y_ASSERT(!AtEnd_);
  52. return Data_;
  53. }
  54. /**
  55. * @return Length of current token.
  56. */
  57. inline size_t Length() const noexcept {
  58. Y_ASSERT(!AtEnd_);
  59. return Len_;
  60. }
  61. inline TIterator* operator->() noexcept {
  62. return this;
  63. }
  64. inline TStringBuf operator*() noexcept {
  65. return TStringBuf{Data_, Len_};
  66. }
  67. private:
  68. inline void Next() {
  69. Y_ASSERT(Parent_);
  70. AtEnd_ = !Parent_->Next(Data_, Len_);
  71. }
  72. private:
  73. TStreamTokenizer* const Parent_;
  74. char* Data_;
  75. size_t Len_;
  76. bool AtEnd_;
  77. };
  78. inline TStreamTokenizer(IInputStream* const input, const TEndOfToken& eot = TEndOfToken(),
  79. const size_t initial = 1024)
  80. : Input_(input)
  81. , Buf_(initial)
  82. , Cur_(BufBegin())
  83. , End_(BufBegin())
  84. , Eot_(eot)
  85. {
  86. CheckBuf();
  87. }
  88. inline bool Next(char*& buf, size_t& len) {
  89. char* it = Cur_;
  90. while (true) {
  91. do {
  92. while (it != End_) {
  93. if (Eot_(*it)) {
  94. *it = '\0';
  95. buf = Cur_;
  96. len = it - Cur_;
  97. Cur_ = it + 1;
  98. return true;
  99. } else {
  100. ++it;
  101. }
  102. }
  103. if (Fill() == 0 && End_ != BufEnd()) {
  104. *it = '\0';
  105. buf = Cur_;
  106. len = it - Cur_;
  107. Cur_ = End_;
  108. return len;
  109. }
  110. } while (it != BufEnd());
  111. Y_ASSERT(it == BufEnd());
  112. Y_ASSERT(End_ == BufEnd());
  113. const size_t blen = End_ - Cur_;
  114. if (Cur_ == BufBegin()) {
  115. Y_ASSERT(blen == Buf_.Capacity());
  116. /*
  117. * do reallocate
  118. */
  119. Buf_.Reserve(Buf_.Capacity() * 4);
  120. CheckBuf();
  121. } else {
  122. /*
  123. * do move
  124. */
  125. MemMove(BufBegin(), Cur_, blen);
  126. }
  127. Cur_ = BufBegin();
  128. End_ = Cur_ + blen;
  129. it = End_;
  130. }
  131. }
  132. inline TIterator begin() {
  133. return TIterator{this};
  134. }
  135. inline TIterator end() noexcept {
  136. return {};
  137. }
  138. private:
  139. inline size_t Fill() {
  140. const size_t avail = BufEnd() - End_;
  141. const size_t bytesRead = Input_->Read(End_, avail);
  142. End_ += bytesRead;
  143. return bytesRead;
  144. }
  145. inline char* BufBegin() noexcept {
  146. return Buf_.Data();
  147. }
  148. inline char* BufEnd() noexcept {
  149. return Buf_.Data() + Buf_.Capacity();
  150. }
  151. inline void CheckBuf() const {
  152. if (!Buf_.Data()) {
  153. throw std::bad_alloc();
  154. }
  155. }
  156. private:
  157. IInputStream* const Input_;
  158. TBuffer Buf_;
  159. char* Cur_;
  160. char* End_;
  161. TEndOfToken Eot_;
  162. };
  163. /**
  164. * Predicate for `TStreamTokenizer` that uses '\\n' as a delimiter.
  165. */
  166. struct TEol {
  167. inline bool operator()(char ch) const noexcept {
  168. return ch == '\n';
  169. }
  170. };
  171. /** @} */