// split_iterator.h
  #pragma once

  #include <library/cpp/deprecated/kmp/kmp.h>

  #include <util/string/cast.h>
  #include <util/string/util.h>
  #include <util/string/builder.h>
  #include <util/system/yassert.h>
  #include <util/system/defaults.h>
  #include <util/generic/strbuf.h>
  #include <util/generic/string.h>
  #include <util/generic/vector.h>
  #include <util/generic/yexception.h>

  #include <cinttypes>
  #include <cstdio>
  // A pair of bounds (Begin, End) of one numeric type T.
  template <typename T>
  struct TNumPair {
      T Begin;
      T End;

      // Leaves both bounds default-initialized.
      TNumPair() = default;

      // Stores the bounds as given; Y_ASSERT checks that begin does not exceed end.
      TNumPair(T begin, T end)
          : Begin(begin)
          , End(end)
      {
          Y_ASSERT(begin <= end);
      }

      // End - Begin + 1, i.e. the element count when both bounds are counted inclusively.
      T Length() const {
          return (End - Begin) + 1;
      }

      // Two pairs are equal when both bounds match.
      bool operator==(const TNumPair& r) const {
          if (!(Begin == r.Begin)) {
              return false;
          }
          return End == r.End;
      }

      // Two pairs differ when at least one bound differs.
      bool operator!=(const TNumPair& r) const {
          if (Begin != r.Begin) {
              return true;
          }
          return End != r.End;
      }
  };
  34. using TSizeTRegion = TNumPair<size_t>;
  35. using TUi32Region = TNumPair<ui32>;
  36. template <>
  37. inline TString ToString(const TUi32Region& r) {
  38. return TStringBuilder() << "(" << r.Begin << ", " << r.End << ")";
  39. }
  40. template <>
  41. inline TUi32Region FromString(const TString& s) {
  42. TUi32Region result;
  43. sscanf(s.data(), "(%" PRIu32 ", %" PRIu32 ")", &result.Begin, &result.End);
  44. return result;
  45. }
  46. class TSplitDelimiters {
  47. private:
  48. bool Delims[256];
  49. public:
  50. explicit TSplitDelimiters(const char* s);
  51. Y_FORCE_INLINE bool IsDelimiter(ui8 ch) const {
  52. return Delims[ch];
  53. }
  54. };
  55. template <class Split>
  56. class TSplitIterator;
  57. class TSplitBase {
  58. protected:
  59. const char* Str;
  60. size_t Len;
  61. public:
  62. TSplitBase(const char* str, size_t length);
  63. TSplitBase(const TString& s);
  64. Y_FORCE_INLINE const char* GetString() const {
  65. return Str;
  66. }
  67. Y_FORCE_INLINE size_t GetLength() const {
  68. return Len;
  69. }
  70. private:
  71. // we don't own Str, make sure that no one calls us with temporary object
  72. TSplitBase(TString&&) = delete;
  73. };
  74. #ifdef _MSC_VER
  75. #pragma warning(push)
  76. #pragma warning(disable : 4512)
  77. #endif
  78. class TDelimitersSplit: public TSplitBase {
  79. private:
  80. const TSplitDelimiters& Delimiters;
  81. public:
  82. using TIterator = TSplitIterator<TDelimitersSplit>;
  83. friend class TSplitIterator<TDelimitersSplit>;
  84. TDelimitersSplit(const char* str, size_t length, const TSplitDelimiters& delimiters);
  85. TDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters);
  86. TIterator Iterator() const;
  87. TSizeTRegion Next(size_t& pos) const;
  88. size_t Begin() const;
  89. private:
  90. // we don't own Delimiters, make sure that no one calls us with temporary object
  91. TDelimitersSplit(const char*, size_t, TSplitDelimiters&&) = delete;
  92. TDelimitersSplit(const TString&, TSplitDelimiters&&) = delete;
  93. TDelimitersSplit(TString&&, const TSplitDelimiters&) = delete;
  94. };
  95. class TDelimitersStrictSplit: public TSplitBase {
  96. private:
  97. const TSplitDelimiters& Delimiters;
  98. public:
  99. using TIterator = TSplitIterator<TDelimitersStrictSplit>;
  100. friend class TSplitIterator<TDelimitersStrictSplit>;
  101. TDelimitersStrictSplit(const char* str, size_t length, const TSplitDelimiters& delimiters);
  102. TDelimitersStrictSplit(const TString& s, const TSplitDelimiters& delimiters);
  103. TIterator Iterator() const;
  104. TSizeTRegion Next(size_t& pos) const;
  105. size_t Begin() const;
  106. private:
  107. // we don't own Delimiters, make sure that no one calls us with temporary object
  108. TDelimitersStrictSplit(const char*, size_t, TSplitDelimiters&&) = delete;
  109. TDelimitersStrictSplit(const TString&, TSplitDelimiters&&) = delete;
  110. TDelimitersStrictSplit(TString&&, const TSplitDelimiters&) = delete;
  111. };
  112. class TScreenedDelimitersSplit: public TSplitBase {
  113. private:
  114. const TSplitDelimiters& Delimiters;
  115. const TSplitDelimiters& Screens;
  116. public:
  117. using TIterator = TSplitIterator<TScreenedDelimitersSplit>;
  118. friend class TSplitIterator<TScreenedDelimitersSplit>;
  119. TScreenedDelimitersSplit(const char*, size_t, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens);
  120. TScreenedDelimitersSplit(const TString& s, const TSplitDelimiters& delimiters, const TSplitDelimiters& screens);
  121. TIterator Iterator() const;
  122. TSizeTRegion Next(size_t& pos) const;
  123. size_t Begin() const;
  124. private:
  125. // we don't own Delimiters and Screens, make sure that no one calls us with temporary object
  126. TScreenedDelimitersSplit(TString&&, const TSplitDelimiters&, const TSplitDelimiters&) = delete;
  127. TScreenedDelimitersSplit(const TString&, TSplitDelimiters&&, const TSplitDelimiters&) = delete;
  128. TScreenedDelimitersSplit(const TString&, const TSplitDelimiters&, TSplitDelimiters&&) = delete;
  129. };
  130. class TDelimitersSplitWithoutTags: public TSplitBase {
  131. private:
  132. const TSplitDelimiters& Delimiters;
  133. size_t SkipTag(size_t pos) const;
  134. size_t SkipDelimiters(size_t pos) const;
  135. public:
  136. using TIterator = TSplitIterator<TDelimitersSplitWithoutTags>;
  137. friend class TSplitIterator<TDelimitersSplitWithoutTags>;
  138. TDelimitersSplitWithoutTags(const char* str, size_t length, const TSplitDelimiters& delimiters);
  139. TDelimitersSplitWithoutTags(const TString& s, const TSplitDelimiters& delimiters);
  140. TIterator Iterator() const;
  141. TSizeTRegion Next(size_t& pos) const;
  142. size_t Begin() const;
  143. private:
  144. // we don't own Delimiters, make sure that no one calls us with temporary object
  145. TDelimitersSplitWithoutTags(const char*, size_t, TSplitDelimiters&&) = delete;
  146. TDelimitersSplitWithoutTags(const TString&, TSplitDelimiters&&) = delete;
  147. TDelimitersSplitWithoutTags(TString&&, const TSplitDelimiters&) = delete;
  148. };
  149. class TCharSplit: public TSplitBase {
  150. public:
  151. using TIterator = TSplitIterator<TCharSplit>;
  152. friend class TSplitIterator<TCharSplit>;
  153. TCharSplit(const char* str, size_t length);
  154. TCharSplit(const TString& s);
  155. TIterator Iterator() const;
  156. TSizeTRegion Next(size_t& pos) const;
  157. size_t Begin() const;
  158. private:
  159. // we don't own Str, make sure that no one calls us with temporary object
  160. TCharSplit(TString&&) = delete;
  161. };
  162. #ifdef _MSC_VER
  163. #pragma warning(pop)
  164. #endif
  165. class TCharSplitWithoutTags: public TSplitBase {
  166. private:
  167. size_t SkipTag(size_t pos) const;
  168. size_t SkipDelimiters(size_t pos) const;
  169. public:
  170. using TIterator = TSplitIterator<TCharSplitWithoutTags>;
  171. friend class TSplitIterator<TCharSplitWithoutTags>;
  172. TCharSplitWithoutTags(const char* str, size_t length);
  173. TCharSplitWithoutTags(const TString& s);
  174. TIterator Iterator() const;
  175. TSizeTRegion Next(size_t& pos) const;
  176. size_t Begin() const;
  177. private:
  178. // we don't own Str, make sure that no one calls us with temporary object
  179. TCharSplitWithoutTags(TString&&) = delete;
  180. };
  181. class TSubstringSplitDelimiter {
  182. public:
  183. TKMPMatcher Matcher;
  184. size_t Len;
  185. TSubstringSplitDelimiter(const TString& s);
  186. };
  187. class TSubstringSplit: public TSplitBase {
  188. private:
  189. const TSubstringSplitDelimiter& Delimiter;
  190. public:
  191. using TIterator = TSplitIterator<TSubstringSplit>;
  192. friend class TSplitIterator<TSubstringSplit>;
  193. TSubstringSplit(const char* str, size_t length, const TSubstringSplitDelimiter& delimiter);
  194. TSubstringSplit(const TString& str, const TSubstringSplitDelimiter& delimiter);
  195. TIterator Iterator() const;
  196. TSizeTRegion Next(size_t& pos) const;
  197. size_t Begin() const;
  198. private:
  199. // we don't own Delimiters, make sure that no one calls us with temporary object
  200. TSubstringSplit(TString&&, const TSubstringSplitDelimiter&) = delete;
  201. TSubstringSplit(const TString&, TSubstringSplitDelimiter&&) = delete;
  202. };
  203. template <class TSplit>
  204. class TSplitIterator {
  205. protected:
  206. const TSplit& Split;
  207. size_t Pos;
  208. TString* CurrentStroka;
  209. public:
  210. TSplitIterator(const TSplit& split)
  211. : Split(split)
  212. , Pos(Split.Begin())
  213. , CurrentStroka(nullptr)
  214. {
  215. }
  216. virtual ~TSplitIterator() {
  217. delete CurrentStroka;
  218. }
  219. inline TSizeTRegion Next() {
  220. Y_ENSURE(!Eof(), TStringBuf("eof reached"));
  221. return Split.Next(Pos);
  222. }
  223. TStringBuf NextTok() {
  224. if (Eof())
  225. return TStringBuf();
  226. TSizeTRegion region = Next();
  227. return TStringBuf(Split.Str + region.Begin, region.End - region.Begin);
  228. }
  229. const TString& NextString() {
  230. if (!CurrentStroka)
  231. CurrentStroka = new TString();
  232. TSizeTRegion region = Next();
  233. CurrentStroka->assign(Split.Str, region.Begin, region.Length() - 1);
  234. return *CurrentStroka;
  235. }
  236. inline bool Eof() const {
  237. return Pos >= Split.Len;
  238. }
  239. TString GetTail() const {
  240. return TString(Split.Str + Pos);
  241. }
  242. void Skip(size_t count) {
  243. for (size_t i = 0; i < count; ++i)
  244. Next();
  245. }
  246. };
  247. using TSplitTokens = TVector<TString>;
  248. template <typename TSplit>
  249. void Split(const TSplit& split, TSplitTokens* words) {
  250. words->clear();
  251. TSplitIterator<TSplit> it(split);
  252. while (!it.Eof())
  253. words->push_back(it.NextString());
  254. }