count_ut.cpp 26 KB


  1. /*
  2. * count_ut.cpp --
  3. *
  4. * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
  5. * Alexander Gololobov <agololobov@gmail.com>
  6. *
  7. * This file is part of Pire, the Perl Incompatible
  8. * Regular Expressions library.
  9. *
  10. * Pire is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Lesser Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * Pire is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Lesser Public License for more details.
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Pire. If not, see <http://www.gnu.org/licenses>.
  21. */
  22. #include <stub/hacks.h>
  23. #include <stub/saveload.h>
  24. #include <stub/utf8.h>
  25. #include <stub/memstreams.h>
  26. #include "stub/cppunit.h"
  27. #include <pire.h>
  28. #include <extra.h>
  29. #include <string.h>
  30. Y_UNIT_TEST_SUITE(TestCount) {
  31. Pire::Fsm MkFsm(const char* regexp, const Pire::Encoding& encoding)
  32. {
  33. Pire::Lexer lex;
  34. lex.SetEncoding(encoding);
  35. TVector<wchar32> ucs4;
  36. encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
  37. lex.Assign(ucs4.begin(), ucs4.end());
  38. return lex.Parse();
  39. }
  // Returns a fresh Scanner::State that has been set up by scanner.Initialize().
  template<class Scanner>
  typename Scanner::State InitializedState(const Scanner& scanner)
  {
    typename Scanner::State initial;
    scanner.Initialize(initial);
    return initial;
  }
  47. template<class Scanner>
  48. typename Scanner::State Run(const Scanner& scanner, const char* text, size_t len =-1)
  49. {
  50. if (len == (size_t)-1) len = strlen(text);
  51. auto state = InitializedState(scanner);
  52. Pire::Step(scanner, state, Pire::BeginMark);
  53. Pire::Run(scanner, state, text, text + len);
  54. Pire::Step(scanner, state, Pire::EndMark);
  55. return state;
  56. }
  57. template<class Scanner>
  58. size_t CountOne(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
  59. {
  60. const auto regexpFsm = MkFsm(regexp, encoding);
  61. const auto separatorFsm = MkFsm(separator, encoding);
  62. return Run(Scanner{regexpFsm, separatorFsm}, text, len).Result(0);
  63. }
  64. size_t Count(const char* regexp, const char* separator, const char* text, size_t len = -1, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
  65. {
  66. const auto regexpFsm = MkFsm(regexp, encoding);
  67. const auto separatorFsm = MkFsm(separator, encoding);
  68. auto countingResult = Run(Pire::CountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
  69. auto newResult = Run(Pire::AdvancedCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
  70. if (strcmp(separator, ".*") == 0) {
  71. HalfFinalFsm fsm(regexpFsm);
  72. fsm.MakeGreedyCounter(true);
  73. auto halfFinalSimpleResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
  74. fsm = HalfFinalFsm(regexpFsm);
  75. fsm.MakeGreedyCounter(false);
  76. auto halfFinalCorrectResult = Run(Pire::HalfFinalScanner{fsm}, text, len).Result(0);
  77. UNIT_ASSERT_EQUAL(halfFinalSimpleResult, halfFinalCorrectResult);
  78. UNIT_ASSERT_EQUAL(halfFinalSimpleResult, countingResult);
  79. }
  80. UNIT_ASSERT_EQUAL(countingResult, newResult);
  81. auto noGlueLimitResult = Run(Pire::NoGlueLimitCountingScanner{regexpFsm, separatorFsm}, text, len).Result(0);
  82. UNIT_ASSERT_EQUAL(countingResult, noGlueLimitResult);
  83. return newResult;
  84. }
  85. Y_UNIT_TEST(Count)
  86. {
  87. UNIT_ASSERT_EQUAL(Count("[a-z]+", "\\s", "abc def, abc def ghi, abc"), size_t(3));
  88. char aaa[] = "abc def\0 abc\0 def ghi, abc";
  89. UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa), Pire::Encodings::Latin1()), size_t(6));
  90. UNIT_ASSERT_EQUAL(Count("[a-z]+", ".*", aaa, sizeof(aaa)), size_t(6));
  91. UNIT_ASSERT_EQUAL(Count("\\w", "", "abc abcdef abcd abcdefgh ac"), size_t(8));
  92. UNIT_ASSERT_EQUAL(Count("http", ".*", "http://aaa, http://bbb, something in the middle, http://ccc, end"), size_t(3));
  93. UNIT_ASSERT_EQUAL(Count("abc", ".*", "abcabcabcabc"), size_t(4));
  94. UNIT_ASSERT_EQUAL(Count("[\320\220-\320\257\320\260-\321\217]+", "\\s+", " \320\257\320\275\320\264\320\265\320\272\321\201 "
  95. "\320\237\320\276\320\262\320\265\321\200\320\275\321\203\321\202\321\214 \320\222\320\276\320\271\321\202\320\270\302\240"
  96. "\320\262\302\240\320\277\320\276\321\207\321\202\321\203 \302\251\302\240" "1997\342\200\224" "2008 "
  97. "\302\253\320\257\320\275\320\264\320\265\320\272\321\201\302\273 \320\224\320\270\320\267\320\260\320\271\320\275\302"
  98. "\240\342\200\224\302\240\320\241\321\202\321\203\320\264\320\270\321\217 \320\220\321\200\321\202\320\265\320\274\320\270"
  99. "\321\217\302\240\320\233\320\265\320\261\320\265\320\264\320\265\320\262\320\260\012\012"), size_t(5));
  100. UNIT_ASSERT_EQUAL(Count("\321\201\320\265\320\272\321\201", ".*",
  101. "\320\277\320\276\321\200\320\275\320\276, \320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 "
  102. "\320\264\320\265\321\202\320\270, \320\264\320\265\321\202\320\270 \320\277\320\276\321\200\320\275\320\276 "
  103. "\320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265. "
  104. "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265 \320\262\320\270\320\264\320\265\320\276 "
  105. "\320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270. \320\264\320\265\321\202\320\270 "
  106. "\320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265\320\276 "
  107. "\320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265!<br> "
  108. "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\264\320\273\321\217 \320\277\320\276\320\264 "
  109. "\321\201\320\265\320\272\321\201\320\260 \320\277\320\260\321\200\320\276\320\271 "
  110. "\321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201 \320\270\321\211\320\265\320\274 "
  111. "\320\272\320\260\320\271\321\204\320\276\320\274. \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 "
  112. "\320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
  113. "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277\320\260\321\200\320\276\320\271 "
  114. "\320\270\321\211\320\265\320\274 \321\201 \320\264\320\273\321\217 \321\201\320\265\320\272\321\201\320\260!<br> "
  115. "\321\202\320\270\321\202\321\214\320\272\320\270 \320\261\320\276\320\273\321\214\321\210\320\270\320\265. "
  116. "\320\273\320\265\321\202 10 \320\263\320\276\320\273\321\213\320\265 12 \320\264\320\265\321\202\320\270!<br> "
  117. "\320\270\321\211\320\265\320\274 \321\201 \320\277\320\276\320\264 \320\272\320\260\320\271\321\204\320\276\320\274 "
  118. "\321\201\320\265\320\272\321\201\320\260\320\277\320\260\321\200\320\276\320\271 \320\264\320\273\321\217 "
  119. "\320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271! "
  120. "\320\261\320\276\320\273\321\214\321\210\320\270\320\265 \321\202\320\270\321\202\321\214\320\272\320\270, "
  121. "\320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 \321\201\320\270\321\201\321\202\320\265\320\274\320\260 "
  122. "\320\264\320\273\321\217 \320\276\320\277\320\276\321\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202"
  123. "\320\265\320\273\321\214\320\275\320\260\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\264"
  124. "\320\273\321\217 \320\270\321\211\320\265\320\274 \321\201\320\265\320\272\321\201\320\260 \320\272\320\260\320\271\321\204"
  125. "\320\276\320\274 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \321\201\320\265\320\274\320\265\320\271\320\275"
  126. "\320\276\320\271 \320\277\320\276\320\264 \320\277\320\260\321\200\320\276\320\271 \321\201. \320\276\320\277\320\276\321"
  127. "\200\320\275\320\276-\320\264\320\262\320\270\320\263\320\260\321\202\320\265\320\273\321\214\320\275\320\260\321\217 \321"
  128. "\201\320\270\321\201\321\202\320\265\320\274\320\260 \320\273\320\265\320\272\320\260\321\200\321\201\321\202\320\262\320\260 "
  129. "\320\264\320\273\321\217 \320\266\320\270\320\262\320\276\321\202\320\275\321\213\321\205, \320\261\320\265\321\201\320\277"
  130. "\320\273\320\260\321\202\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\262\320\270\320\264\320\265"
  131. "\320\276 \320\264\320\265\321\202\320\270. \320\276\321\204\320\270\321\206\320\265\321\200\321\213 \320\277\320\276\321"
  132. "\200\320\275\320\276 \321\204\320\276\321\202\320\276 \320\263\320\265\320\270, \320\270\321\211\320\265\320\274 \321\201"
  133. "\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\277"
  134. "\320\276 \320\277\320\260\321\200\320\276\320\271 \321\201\320\265\320\272\321\201\320\260 \320\264\320\273\321\217 \321\201 "
  135. "\320\272\320\260\320\271\321\204\320\276\320\274. \320\277\320\276\320\264 \320\264\320\273\321\217 \320\272\320\260\320\271"
  136. "\321\204\320\276\320\274 \321\201\320\265\320\274\320\265\320\271\320\275\320\276\320\271 \321\201\320\265\320\272\321\201"
  137. "\320\260 \320\277\320\260\321\200\320\276\320\271 \321\201 \320\264\320\265\320\262\321\203\321\210\320\272\321\203 \320\270"
  138. "\321\211\320\265\320\274? \320\262\320\270\320\264\320\265\320\276 \320\261\320\265\321\201\320\277\320\273\320\260\321\202"
  139. "\320\275\320\276\320\265 \320\277\320\276\321\200\320\275\320\276 \320\264\320\265\321\202\320\270, \320\264\320\265\321\202"
  140. "\320\270 \320\261\320\265\321\201\320\277\320\273\320\260\321\202\320\275\320\276\320\265"),
  141. size_t(6));
  142. UNIT_ASSERT_EQUAL(Count("<a[^>]*>[^<]*</a>", "([^<]|<br\\s?/?>)*", "\321\200\320\275\320\276</a><br />"
  143. "<a href=\"http://wapspzk.1sweethost.com//22.html\">\320\264\320\265\321\210\320\265\320\262\321\213\320\265 \320\277\320\276"
  144. "\321\200\320\275\320\276 \321\204\320\270\320\273\321\214\320\274\321\213</a><br /><a href=\"http://wapspzk.1sweethost.com//23.html\">"
  145. "\321\201\320\265\320\272\321\201 \321\210\320\276\320\277 \321\200\320\276\321\201\320\270\321\202\320\260</a><br />"
  146. "<a href=\"http://wapspzk.1sweethost.com//24.html\">\320\263\320\276\320\273\321\213\320\265 \320\264\320\265\320\262\321\203"
  147. "\321\210\320\272\320\270 \321\203\320\273\320\270\321\206\320\260</a><br /><a href=\"http://wapspzk.1sweethost.com//25.html\">"
  148. "\321\202\321\200\320\260\321\205\320\275\321\203\321\202\321\214 \320\274\320\260\320\274\320\260\321\210\320\270</a><br />"
  149. "<a href=\"http://wapspzk.1sweethost.com//26.html\">\320\277\320\270\320\267\320\264\320\260 \321\204\321\200\320\270\321\201"
  150. "\320\272\320\265</a><br /><a href=\"http://wapspzk.1sweethost.com//27.html\">\320\261\320\265\321\201\320\277\320\273\320\260"
  151. "\321\202\320\275\320\276</a><br /><a href=\"http://wapspzk.1sweethost.com//33.html\">\321\201\320\276\321\206\320\270\320\276"
  152. "\320\273\320\276\320\263\320\270\321\207\320\265\321\201\320\272\320\270\320\271 \320\260\320\275\320\260\320\273\320\270\320"
  153. "\267 \320\274\320\276\320\264\320\265\320\273\320\265\320\271 \321\201\320\265\320\272\321\201\321\203\320\260\320\273\321\214"
  154. "\320\275\320\276\320\263\320\276 \320\277\320\276\320\262\320\265\320\264\320\265\320\275\320\270\321\217</a>\321\217"), size_t(7));
  155. UNIT_ASSERT(CountOne<Pire::CountingScanner>("a", "b", "aaa") != size_t(3));
  156. UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("a", "b", "aaa"), size_t(1));
  157. UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("[a-z\320\260-\321\217]+", " +",
  158. " \320\260\320\260\320\220 abc def \320\260 cd"),
  159. size_t(4)); // Pire::CountingScanner returns 1 here, since it enters a dead state
  160. }
  161. Y_UNIT_TEST(CountWithoutSeparator)
  162. {
  163. UNIT_ASSERT_EQUAL(Count("a", "", "aa aaa"), size_t(3));
  164. }
  165. Y_UNIT_TEST(CountGreedy)
  166. {
  167. const auto& enc = Pire::Encodings::Latin1();
  168. char text[] = "wwwsswwwsssswwws";
  169. UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
  170. UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www", ".{1,6}", text, sizeof(text), enc), size_t(3));
  171. UNIT_ASSERT_EQUAL(CountOne<Pire::AdvancedCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
  172. UNIT_ASSERT_EQUAL(CountOne<Pire::NoGlueLimitCountingScanner>("www.{1,6}", "", text, sizeof(text), enc), size_t(3));
  173. }
  174. Y_UNIT_TEST(CountRepeating)
  175. {
  176. char text[] = "abbabbabbabbat";
  177. UNIT_ASSERT_EQUAL(Count("abba", ".*", text, sizeof(text), Pire::Encodings::Latin1()), size_t(2));
  178. }
  179. template<class Scanner>
  180. void CountGlueOne()
  181. {
  182. const auto& enc = Pire::Encodings::Utf8();
  183. auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
  184. auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
  185. auto sc = Scanner::Glue(sc1, sc2);
  186. auto st = Run(sc, "abc defg 123 jklmn 4567 opqrst");
  187. UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
  188. UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
  189. }
  190. Y_UNIT_TEST(CountGlue)
  191. {
  192. CountGlueOne<Pire::CountingScanner>();
  193. CountGlueOne<Pire::AdvancedCountingScanner>();
  194. CountGlueOne<Pire::NoGlueLimitCountingScanner>();
  195. }
  196. template <class Scanner>
  197. void CountManyGluesOne(size_t maxRegexps) {
  198. const auto& encoding = Pire::Encodings::Utf8();
  199. auto text = "abcdbaa aa";
  200. TVector<ypair<std::string, std::string>> tasks = {
  201. {"a", ".*"},
  202. {"b", ".*"},
  203. {"c", ".*"},
  204. {"ba", ".*"},
  205. {"ab",".*"},
  206. };
  207. TVector<size_t> answers = {5, 2, 1, 1, 1};
  208. Scanner scanner;
  209. size_t regexpsCount = 0;
  210. for (; regexpsCount < maxRegexps; ++regexpsCount) {
  211. const auto& task = tasks[regexpsCount % tasks.size()];
  212. const auto regexpFsm = MkFsm(task.first.c_str(), encoding);
  213. const auto separatorFsm = MkFsm(task.second.c_str(), encoding);
  214. Scanner nextScanner(regexpFsm, separatorFsm);
  215. auto glue = Scanner::Glue(scanner, nextScanner);
  216. if (glue.Empty()) {
  217. break;
  218. }
  219. scanner = std::move(glue);
  220. }
  221. auto state = Run(scanner, text);
  222. for (size_t i = 0; i < regexpsCount; ++i) {
  223. UNIT_ASSERT_EQUAL(state.Result(i), answers[i % answers.size()]);
  224. }
  225. }
  226. Y_UNIT_TEST(CountManyGlues)
  227. {
  228. CountManyGluesOne<Pire::CountingScanner>(20);
  229. CountManyGluesOne<Pire::AdvancedCountingScanner>(20);
  230. CountManyGluesOne<Pire::NoGlueLimitCountingScanner>(50);
  231. }
  232. template<class Scanner>
  233. void CountBoundariesOne()
  234. {
  235. const char* strings[] = { "abcdef", "abc def", "defcba", "wxyz abc", "a", "123" };
  236. const auto& enc = Pire::Encodings::Utf8();
  237. Scanner sc(MkFsm("^[a-z]+$", enc), MkFsm("(.|^|$)*", enc));
  238. auto st = InitializedState(sc);
  239. for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
  240. Pire::Step(sc, st, Pire::BeginMark);
  241. Pire::Run(sc, st, strings[i], strings[i] + strlen(strings[i]));
  242. Pire::Step(sc, st, Pire::EndMark);
  243. }
  244. UNIT_ASSERT_EQUAL(st.Result(0), size_t(3));
  245. const auto& enc2 = Pire::Encodings::Latin1();
  246. Scanner sc2(MkFsm("[a-z]", enc2), MkFsm(".*", enc2));
  247. auto st2 = InitializedState(sc2);
  248. for (size_t i = 0; i < sizeof(strings) / sizeof(*strings); ++i) {
  249. Pire::Step(sc2, st2, Pire::BeginMark);
  250. Pire::Run(sc2, st2, strings[i], strings[i] + strlen(strings[i]));
  251. Pire::Step(sc2, st2, Pire::EndMark);
  252. }
  253. UNIT_ASSERT_EQUAL(st2.Result(0), size_t(7));
  254. }
  255. Y_UNIT_TEST(CountBoundaries)
  256. {
  257. CountBoundariesOne<Pire::CountingScanner>();
  258. CountBoundariesOne<Pire::AdvancedCountingScanner>();
  259. CountBoundariesOne<Pire::NoGlueLimitCountingScanner>();
  260. }
  261. template<class Scanner>
  262. void SerializationOne()
  263. {
  264. const auto& enc = Pire::Encodings::Latin1();
  265. auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
  266. auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
  267. auto sc = Scanner::Glue(sc1, sc2);
  268. BufferOutput wbuf;
  269. ::Save(&wbuf, sc);
  270. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  271. Scanner sc3;
  272. ::Load(&rbuf, sc3);
  273. auto st = Run(sc3, "abc defg 123 jklmn 4567 opqrst");
  274. UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
  275. UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
  276. const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
  277. TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
  278. // Test mmap-ing at various alignments
  279. for (size_t offset = 0; offset < MaxTestOffset; ++offset) {
  280. const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
  281. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  282. try {
  283. Scanner sc4;
  284. const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
  285. if (offset % sizeof(size_t) != 0) {
  286. UNIT_ASSERT(!"CountingScanner failed to check for misaligned mmaping");
  287. } else {
  288. UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
  289. st = Run(sc4, "abc defg 123 jklmn 4567 opqrst");
  290. UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
  291. UNIT_ASSERT_EQUAL(st.Result(1), size_t(2));
  292. }
  293. }
  294. catch (Pire::Error&) {}
  295. }
  296. }
  297. Y_UNIT_TEST(Serialization)
  298. {
  299. SerializationOne<Pire::CountingScanner>();
  300. SerializationOne<Pire::AdvancedCountingScanner>();
  301. SerializationOne<Pire::NoGlueLimitCountingScanner>();
  302. }
  303. template<class Scanner>
  304. void Serialization_v6_compatibilityOne()
  305. {
  306. const auto& enc = Pire::Encodings::Latin1();
  307. auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
  308. auto sc2 = Scanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
  309. auto sc = Scanner::Glue(sc1, sc2);
  310. BufferOutput wbuf;
  311. ::Save(&wbuf, sc);
  312. // Patched scanner is a scanner of RE_VERSION 6.
  313. // The patched scanner is concatenated with original scanner to
  314. // make sure all content of patched scanner is consumed.
  315. const size_t ALIGNMENT = sizeof(size_t);
  316. size_t actions_size =
  317. sc.Size() *
  318. sc.LettersCount() *
  319. sizeof(typename Scanner::Action);
  320. UNIT_ASSERT_EQUAL(actions_size % ALIGNMENT, 0);
  321. size_t tags_size = sc.Size() * sizeof(typename Scanner::Tag);
  322. const char* src = wbuf.Buffer().Data();
  323. size_t src_size = wbuf.Buffer().Size();
  324. size_t patched_size = src_size + actions_size;
  325. size_t bytes_before_actions = src_size - tags_size;
  326. const int fill_char = 0x42;
  327. TVector<char> buf2(patched_size + src_size + 2 * ALIGNMENT);
  328. char* dst = reinterpret_cast<char*>(Pire::Impl::AlignUp(&buf2[0], ALIGNMENT));
  329. char* patched = dst;
  330. // Insert dummy m_actions between m_jumps and m_tags.
  331. memcpy(patched, src, bytes_before_actions); // copy members before m_actions
  332. memset(patched + bytes_before_actions, fill_char, actions_size); // m_actions
  333. memcpy(patched + bytes_before_actions + actions_size,
  334. src + bytes_before_actions,
  335. tags_size); // m_tags
  336. // Set version to 6
  337. // order of fields in header: magic, version, ...
  338. ui32* version_in_patched = reinterpret_cast<ui32*>(patched) + 1;
  339. UNIT_ASSERT_EQUAL(*version_in_patched, Pire::Header::RE_VERSION);
  340. *version_in_patched = Pire::Header::RE_VERSION_WITH_MACTIONS;
  341. // write normal scanner after patched one
  342. char* normal = Pire::Impl::AlignUp(patched + patched_size, ALIGNMENT);
  343. memcpy(normal, src, src_size);
  344. char* dst_end = Pire::Impl::AlignUp(normal + src_size, ALIGNMENT);
  345. size_t dst_size = dst_end - dst;
  346. // test loading from stream
  347. {
  348. MemoryInput rbuf(dst, dst_size);
  349. Scanner sc_patched, sc_normal;
  350. ::Load(&rbuf, sc_patched);
  351. ::Load(&rbuf, sc_normal);
  352. auto st_patched = Run(sc_patched,
  353. "abc defg 123 jklmn 4567 opqrst");
  354. UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
  355. UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
  356. auto st_normal = Run(sc_normal,
  357. "abc defg 123 jklmn 4567 opqrst");
  358. UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
  359. UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
  360. }
  361. // test loading using Mmap
  362. {
  363. Scanner sc_patched, sc_normal;
  364. const void* tail = sc_patched.Mmap(patched, patched_size);
  365. UNIT_ASSERT_EQUAL(tail, normal);
  366. const void* tail2 = sc_normal.Mmap(tail, src_size);
  367. UNIT_ASSERT_EQUAL(tail2, dst_end);
  368. auto st_patched = Run(sc_patched,
  369. "abc defg 123 jklmn 4567 opqrst");
  370. UNIT_ASSERT_EQUAL(st_patched.Result(0), size_t(4));
  371. UNIT_ASSERT_EQUAL(st_patched.Result(1), size_t(2));
  372. auto st_normal = Run(sc_normal,
  373. "abc defg 123 jklmn 4567 opqrst");
  374. UNIT_ASSERT_EQUAL(st_normal.Result(0), size_t(4));
  375. UNIT_ASSERT_EQUAL(st_normal.Result(1), size_t(2));
  376. }
  377. }
  378. Y_UNIT_TEST(Serialization_v6_compatibility)
  379. {
  380. Serialization_v6_compatibilityOne<Pire::CountingScanner>();
  381. Serialization_v6_compatibilityOne<Pire::AdvancedCountingScanner>();
  382. // NoGlueLimitCountingScanner is not v6_compatible
  383. }
  384. Y_UNIT_TEST(NoGlueLimitScannerCompatibilityWithAdvancedScanner) {
  385. const auto& enc = Pire::Encodings::Latin1();
  386. auto sc1 = AdvancedCountingScanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
  387. auto sc2 = AdvancedCountingScanner(MkFsm("[0-9]+", enc), MkFsm(".*", enc));
  388. auto sc = AdvancedCountingScanner::Glue(sc1, sc2);
  389. BufferOutput wbuf;
  390. ::Save(&wbuf, sc);
  391. TVector<char> buf2(wbuf.Buffer().Size());
  392. memcpy(buf2.data(), wbuf.Buffer().Data(), wbuf.Buffer().Size());
  393. // test loading from stream
  394. {
  395. MemoryInput rbuf(buf2.data(), buf2.size());
  396. NoGlueLimitCountingScanner scanner;
  397. ::Load(&rbuf, scanner);
  398. auto state = Run(scanner,
  399. "abc defg 123 jklmn 4567 opqrst");
  400. UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
  401. UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
  402. }
  403. // test loading using Mmap
  404. {
  405. NoGlueLimitCountingScanner scanner;
  406. const void* tail = scanner.Mmap(buf2.data(), buf2.size());
  407. UNIT_ASSERT_EQUAL(tail, buf2.data() + buf2.size());
  408. auto state = Run(scanner,
  409. "abc defg 123 jklmn 4567 opqrst");
  410. UNIT_ASSERT_EQUAL(state.Result(0), size_t(4));
  411. UNIT_ASSERT_EQUAL(state.Result(1), size_t(2));
  412. }
  413. }
  414. template<class Scanner>
  415. void EmptyOne()
  416. {
  417. Scanner sc;
  418. UNIT_ASSERT(sc.Empty());
  419. UNIT_CHECKPOINT(); Run(sc, "a string"); // Just should not crash
  420. // Test glueing empty
  421. const auto& enc = Pire::Encodings::Latin1();
  422. auto sc1 = Scanner(MkFsm("[a-z]+", enc), MkFsm(".*", enc));
  423. auto sc2 = Scanner::Glue(sc, Scanner::Glue(sc, sc1));
  424. auto st = Run(sc2, "abc defg 123 jklmn 4567 opqrst");
  425. UNIT_ASSERT_EQUAL(st.Result(0), size_t(4));
  426. // Test Save/Load/Mmap
  427. BufferOutput wbuf;
  428. ::Save(&wbuf, sc);
  429. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  430. Pire::CountingScanner sc3;
  431. ::Load(&rbuf, sc3);
  432. UNIT_CHECKPOINT(); Run(sc3, "a string");
  433. const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
  434. TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
  435. const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
  436. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  437. Scanner sc4;
  438. const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
  439. UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
  440. UNIT_CHECKPOINT(); Run(sc4, "a string");
  441. }
  442. Y_UNIT_TEST(Empty)
  443. {
  444. EmptyOne<Pire::CountingScanner>();
  445. EmptyOne<Pire::AdvancedCountingScanner>();
  446. EmptyOne<Pire::NoGlueLimitCountingScanner>();
  447. }
  448. template<typename Scanner>
  449. TVector<Scanner> MakeHalfFinalCount(const char* regexp, const Pire::Encoding& encoding = Pire::Encodings::Utf8()) {
  450. TVector<Scanner> scanners(6);
  451. const auto regexpFsm = MkFsm(regexp, encoding);
  452. HalfFinalFsm fsm(regexpFsm);
  453. fsm.MakeGreedyCounter(true);
  454. scanners[0] = Scanner(fsm);
  455. fsm = HalfFinalFsm(regexpFsm);
  456. fsm.MakeGreedyCounter(false);
  457. scanners[1] = Scanner(fsm);
  458. fsm = HalfFinalFsm(regexpFsm);
  459. fsm.MakeNonGreedyCounter(true, true);
  460. scanners[2] = Scanner(fsm);
  461. fsm = HalfFinalFsm(regexpFsm);
  462. fsm.MakeNonGreedyCounter(true, false);
  463. scanners[3] = Scanner(fsm);
  464. fsm = HalfFinalFsm(regexpFsm);
  465. fsm.MakeNonGreedyCounter(false);
  466. scanners[4] = Scanner(fsm);
  467. scanners[5] = scanners[0];
  468. for (size_t i = 1; i < 5; i++) {
  469. scanners[5] = Scanner::Glue(scanners[5], scanners[i]);
  470. }
  471. return scanners;
  472. }
  473. template<typename Scanner>
  474. void HalfFinalCount(TVector<Scanner> scanners, const char* text, TVector<size_t> result) {
  475. for (size_t i = 0; i < 5; i++) {
  476. UNIT_ASSERT_EQUAL(Run(scanners[i], text, -1).Result(0), result[i]);
  477. }
  478. auto state = Run(scanners[5], text, -1);
  479. for (size_t i = 0; i < 5; i++) {
  480. UNIT_ASSERT_EQUAL(state.Result(i), result[i]);
  481. }
  482. }
  483. template<typename Scanner>
  484. void TestHalfFinalCount() {
  485. HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+"), "abbabbbabbbbbb", {3, 3, 3, 11, 3});
  486. HalfFinalCount(MakeHalfFinalCount<Scanner>("(ab)+"), "ababbababbab", {3, 3, 5, 5, 5});
  487. HalfFinalCount(MakeHalfFinalCount<Scanner>("(abab)+"), "ababababab", {1, 1, 4, 4, 2});
  488. HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbb", {1, 10, 10, 10, 10});
  489. HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbb", {1, 10, 11, 11, 11});
  490. HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbc", {1, 1, 10, 11, 10});
  491. HalfFinalCount(MakeHalfFinalCount<Scanner>("ab+c|b"), "abbbbbbbbbbbc", {1, 1, 11, 12, 11});
  492. HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbbc", {1, 1, 8, 9, 8});
  493. HalfFinalCount(MakeHalfFinalCount<Scanner>("a\\w+c|b"), "abbbdbbbdbb", {1, 8, 8, 8, 8});
  494. HalfFinalCount(MakeHalfFinalCount<Scanner>("a[a-z]+c|b"), "abeeeebeeeeeeeeeceeaeebeeeaeecceebeeaeebeeb", {2, 4, 7, 9, 7});
  495. }
  496. Y_UNIT_TEST(HalfFinal)
  497. {
  498. TestHalfFinalCount<Pire::HalfFinalScanner>();
  499. TestHalfFinalCount<Pire::NonrelocHalfFinalScanner>();
  500. TestHalfFinalCount<Pire::HalfFinalScannerNoMask>();
  501. TestHalfFinalCount<Pire::NonrelocHalfFinalScannerNoMask>();
  502. }
  503. template<typename Scanner>
  504. void TestHalfFinalSerialization() {
  505. auto oldScanners = MakeHalfFinalCount<Scanner>("(\\w\\w)+");
  506. BufferOutput wbuf;
  507. for (size_t i = 0; i < 6; i++) {
  508. ::Save(&wbuf, oldScanners[i]);
  509. }
  510. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  511. TVector<Scanner> scanners(6);
  512. for (size_t i = 0; i < 6; i++) {
  513. ::Load(&rbuf, scanners[i]);
  514. }
  515. HalfFinalCount(scanners, "ab abbb ababa a", {3, 3, 8, 8, 5});
  516. }
  517. Y_UNIT_TEST(HalfFinalSerialization)
  518. {
  519. TestHalfFinalSerialization<Pire::HalfFinalScanner>();
  520. TestHalfFinalSerialization<Pire::HalfFinalScannerNoMask>();
  521. }
  522. }