// tokenizer_ut.cpp — unit tests for TStreamTokenizer.
  1. #include <library/cpp/testing/unittest/registar.h>
  2. #include <util/generic/array_size.h>
  3. #include <util/generic/strbuf.h>
  4. #include "mem.h"
  5. #include "null.h"
  6. #include "tokenizer.h"
  7. static inline void CheckIfNullTerminated(const TStringBuf str) {
  8. UNIT_ASSERT_VALUES_EQUAL('\0', *(str.data() + str.size()));
  9. }
  10. Y_UNIT_TEST_SUITE(TStreamTokenizerTests) {
  11. Y_UNIT_TEST(EmptyStreamTest) {
  12. auto&& input = TNullInput{};
  13. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  14. auto tokensCount = size_t{};
  15. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  16. CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
  17. ++tokensCount;
  18. }
  19. UNIT_ASSERT_VALUES_EQUAL(0, tokensCount);
  20. }
  21. Y_UNIT_TEST(EmptyTokensTest) {
  22. const char data[] = "\n\n";
  23. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  24. auto&& input = TMemoryInput{data, dataSize};
  25. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  26. auto tokensCount = size_t{};
  27. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  28. CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
  29. UNIT_ASSERT_VALUES_EQUAL(0, it->Length());
  30. ++tokensCount;
  31. }
  32. UNIT_ASSERT_VALUES_EQUAL(2, tokensCount);
  33. }
  34. Y_UNIT_TEST(LastTokenendDoesntSatisfyPredicateTest) {
  35. const char data[] = "abc\ndef\nxxxxxx";
  36. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  37. const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
  38. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  39. auto&& input = TMemoryInput{data, dataSize};
  40. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  41. auto tokensCount = size_t{};
  42. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  43. UNIT_ASSERT(tokensCount < tokensSize);
  44. const auto token = TStringBuf{it->Data(), it->Length()};
  45. CheckIfNullTerminated(token);
  46. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  47. ++tokensCount;
  48. }
  49. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  50. }
  51. Y_UNIT_TEST(FirstTokenIsEmptyTest) {
  52. const char data[] = "\ndef\nxxxxxx";
  53. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  54. const TStringBuf tokens[] = {TStringBuf(), TStringBuf("def"), TStringBuf("xxxxxx")};
  55. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  56. auto&& input = TMemoryInput{data, dataSize};
  57. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  58. auto tokensCount = size_t{};
  59. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  60. UNIT_ASSERT(tokensCount < tokensSize);
  61. const auto token = TStringBuf{it->Data(), it->Length()};
  62. CheckIfNullTerminated(token);
  63. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  64. ++tokensCount;
  65. }
  66. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  67. }
  68. Y_UNIT_TEST(PredicateDoesntMatch) {
  69. const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
  70. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  71. auto&& input = TMemoryInput{data, dataSize};
  72. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  73. auto tokensCount = size_t{};
  74. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  75. const auto token = TStringBuf{it->Data(), it->Length()};
  76. CheckIfNullTerminated(token);
  77. UNIT_ASSERT_VALUES_EQUAL(data, token);
  78. ++tokensCount;
  79. }
  80. UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
  81. }
  82. Y_UNIT_TEST(SimpleTest) {
  83. const char data[] = "qwerty\n1234567890\n";
  84. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  85. const TStringBuf tokens[] = {TStringBuf("qwerty"), TStringBuf("1234567890")};
  86. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  87. auto&& input = TMemoryInput{data, dataSize};
  88. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  89. auto tokensCount = size_t{};
  90. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  91. UNIT_ASSERT(tokensCount < tokensSize);
  92. const auto token = TStringBuf{it->Data(), it->Length()};
  93. CheckIfNullTerminated(token);
  94. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  95. ++tokensCount;
  96. }
  97. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  98. }
  99. Y_UNIT_TEST(CustomPredicateTest) {
  100. struct TIsVerticalBar {
  101. inline bool operator()(const char ch) const noexcept {
  102. return '|' == ch;
  103. }
  104. };
  105. const char data[] = "abc|def|xxxxxx";
  106. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  107. const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
  108. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  109. auto&& input = TMemoryInput{data, dataSize};
  110. auto&& tokenizer = TStreamTokenizer<TIsVerticalBar>{&input};
  111. auto tokensCount = size_t{};
  112. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  113. UNIT_ASSERT(tokensCount < tokensSize);
  114. const auto token = TStringBuf{it->Data(), it->Length()};
  115. CheckIfNullTerminated(token);
  116. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  117. ++tokensCount;
  118. }
  119. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  120. }
  121. Y_UNIT_TEST(CustomPredicateSecondTest) {
  122. struct TIsVerticalBar {
  123. inline bool operator()(const char ch) const noexcept {
  124. return '|' == ch || ',' == ch;
  125. }
  126. };
  127. const char data[] = "abc|def|xxxxxx,abc|def|xxxxxx";
  128. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  129. const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx"),
  130. TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
  131. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  132. auto&& input = TMemoryInput{data, dataSize};
  133. auto&& tokenizer = TStreamTokenizer<TIsVerticalBar>{&input};
  134. auto tokensCount = size_t{};
  135. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  136. UNIT_ASSERT(tokensCount < tokensSize);
  137. const auto token = TStringBuf{it->Data(), it->Length()};
  138. CheckIfNullTerminated(token);
  139. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  140. ++tokensCount;
  141. }
  142. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  143. }
  144. Y_UNIT_TEST(FalsePredicateTest) {
  145. struct TAlwaysFalse {
  146. inline bool operator()(const char) const noexcept {
  147. return false;
  148. }
  149. };
  150. const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
  151. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  152. auto&& input = TMemoryInput{data, dataSize};
  153. auto&& tokenizer = TStreamTokenizer<TAlwaysFalse>{&input};
  154. auto tokensCount = size_t{};
  155. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  156. const auto token = TStringBuf{it->Data(), it->Length()};
  157. CheckIfNullTerminated(token);
  158. UNIT_ASSERT_VALUES_EQUAL(data, token);
  159. ++tokensCount;
  160. }
  161. UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
  162. }
  163. Y_UNIT_TEST(TruePredicateTest) {
  164. struct TAlwaysTrue {
  165. inline bool operator()(const char) const noexcept {
  166. return true;
  167. }
  168. };
  169. const char data[] = "1234567890-=!@#$%^&*()_+QWERTYUIOP{}qwertyuiop[]ASDFGHJKL:";
  170. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  171. auto&& input = TMemoryInput{data, dataSize};
  172. auto&& tokenizer = TStreamTokenizer<TAlwaysTrue>{&input};
  173. auto tokensCount = size_t{};
  174. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  175. CheckIfNullTerminated(TStringBuf{it->Data(), it->Length()});
  176. UNIT_ASSERT_VALUES_EQUAL(0, it->Length());
  177. ++tokensCount;
  178. }
  179. UNIT_ASSERT_VALUES_EQUAL(dataSize, tokensCount);
  180. }
  181. Y_UNIT_TEST(FirstTokenHasSizeOfTheBufferTest) {
  182. const char data[] = "xxxxx\nxx";
  183. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  184. const TStringBuf tokens[] = {TStringBuf("xxxxx"), TStringBuf("xx")};
  185. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  186. auto&& input = TMemoryInput{data, dataSize};
  187. auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, tokens[0].size()};
  188. auto tokensCount = size_t{};
  189. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  190. const auto token = TStringBuf{it->Data(), it->Length()};
  191. CheckIfNullTerminated(token);
  192. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  193. ++tokensCount;
  194. }
  195. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  196. }
  197. Y_UNIT_TEST(OnlyTokenHasSizeOfTheBufferTest) {
  198. const char data[] = "xxxxx";
  199. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  200. auto&& input = TMemoryInput{data, dataSize};
  201. auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, dataSize};
  202. auto tokensCount = size_t{};
  203. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  204. const auto token = TStringBuf{it->Data(), it->Length()};
  205. CheckIfNullTerminated(token);
  206. UNIT_ASSERT_VALUES_EQUAL(data, token);
  207. ++tokensCount;
  208. }
  209. UNIT_ASSERT_VALUES_EQUAL(1, tokensCount);
  210. }
  211. Y_UNIT_TEST(BufferSizeInitialSizeSmallerThanTokenTest) {
  212. const char data[] = "xxxxx\nxx";
  213. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  214. const TStringBuf tokens[] = {TStringBuf("xxxxx"), TStringBuf("xx")};
  215. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  216. auto&& input = TMemoryInput{data, dataSize};
  217. auto&& tokenizer = TStreamTokenizer<TEol>{&input, TEol{}, 1};
  218. auto tokensCount = size_t{};
  219. for (auto it = tokenizer.begin(); tokenizer.end() != it; ++it) {
  220. const auto token = TStringBuf{it->Data(), it->Length()};
  221. CheckIfNullTerminated(token);
  222. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  223. ++tokensCount;
  224. }
  225. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  226. }
  227. Y_UNIT_TEST(RangeBasedForTest) {
  228. const char data[] = "abc\ndef\nxxxxxx";
  229. const auto dataSize = Y_ARRAY_SIZE(data) - 1;
  230. const TStringBuf tokens[] = {TStringBuf("abc"), TStringBuf("def"), TStringBuf("xxxxxx")};
  231. const auto tokensSize = Y_ARRAY_SIZE(tokens);
  232. auto&& input = TMemoryInput{data, dataSize};
  233. auto&& tokenizer = TStreamTokenizer<TEol>{&input};
  234. auto tokensCount = size_t{};
  235. for (const auto& token : tokenizer) {
  236. UNIT_ASSERT(tokensCount < tokensSize);
  237. CheckIfNullTerminated(token);
  238. UNIT_ASSERT_VALUES_EQUAL(tokens[tokensCount], token);
  239. ++tokensCount;
  240. }
  241. UNIT_ASSERT_VALUES_EQUAL(tokensSize, tokensCount);
  242. }
  243. } // Y_UNIT_TEST_SUITE(TStreamTokenizerTests)