capture_ut.cpp 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. /*
  2. * capture_ut.cpp --
  3. *
  4. * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
  5. * Alexander Gololobov <agololobov@gmail.com>
  6. *
  7. * This file is part of Pire, the Perl Incompatible
  8. * Regular Expressions library.
  9. *
  10. * Pire is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Lesser Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * Pire is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Lesser Public License for more details.
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Pire. If not, see <http://www.gnu.org/licenses>.
  21. */
  22. #include <stub/hacks.h>
  23. #include <stub/saveload.h>
  24. #include <stub/utf8.h>
  25. #include <stub/memstreams.h>
  26. #include "stub/cppunit.h"
  27. #include <pire.h>
  28. #include <extra.h>
  29. #include <string.h>
  30. Y_UNIT_TEST_SUITE(TestPireCapture) {
  31. using Pire::CapturingScanner;
  32. using Pire::SlowCapturingScanner;
  33. typedef Pire::CapturingScanner::State State;
  34. CapturingScanner Compile(const char* regexp, int index)
  35. {
  36. Pire::Lexer lexer;
  37. lexer.Assign(regexp, regexp + strlen(regexp));
  38. lexer.AddFeature(Pire::Features::CaseInsensitive());
  39. lexer.AddFeature(Pire::Features::Capture((size_t) index));
  40. Pire::Fsm fsm = lexer.Parse();
  41. fsm.Surround();
  42. fsm.Determine();
  43. return fsm.Compile<Pire::CapturingScanner>();
  44. }
  45. SlowCapturingScanner SlowCompile(const char* regexp, int index, const Pire::Encoding& encoding = Pire::Encodings::Utf8())
  46. {
  47. Pire::Lexer lexer;
  48. lexer.AddFeature(Pire::Features::Capture(static_cast<size_t>(index)));
  49. lexer.SetEncoding(encoding);
  50. TVector<wchar32> ucs4;
  51. encoding.FromLocal(regexp, regexp + strlen(regexp), std::back_inserter(ucs4));
  52. lexer.Assign(ucs4.begin(), ucs4.end());
  53. Pire::Fsm fsm = lexer.Parse();
  54. fsm.Surround();
  55. return fsm.Compile<Pire::SlowCapturingScanner>();
  56. }
  57. State RunRegexp(const CapturingScanner& scanner, const char* str)
  58. {
  59. State state;
  60. scanner.Initialize(state);
  61. Step(scanner, state, Pire::BeginMark);
  62. Run(scanner, state, str, str + strlen(str));
  63. Step(scanner, state, Pire::EndMark);
  64. return state;
  65. }
  66. SlowCapturingScanner::State RunRegexp(const SlowCapturingScanner& scanner, const char* str)
  67. {
  68. SlowCapturingScanner::State state;
  69. scanner.Initialize(state);
  70. Run(scanner, state, str, str + strlen(str));
  71. return state;
  72. }
  73. ystring Captured(const State& state, const char* str)
  74. {
  75. if (state.Captured())
  76. return ystring(str + state.Begin() - 1, str + state.End() - 1);
  77. else
  78. return ystring();
  79. }
  80. Y_UNIT_TEST(Trivial)
  81. {
  82. CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
  83. State state;
  84. const char* str;
  85. str = "google_id = 'abcde';";
  86. state = RunRegexp(scanner, str);
  87. UNIT_ASSERT(state.Captured());
  88. UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
  89. str = "var google_id = 'abcde'; eval(google_id);";
  90. state = RunRegexp(scanner, str);
  91. UNIT_ASSERT(state.Captured());
  92. UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
  93. str = "google_id != 'abcde';";
  94. state = RunRegexp(scanner, str);
  95. UNIT_ASSERT(!state.Captured());
  96. }
  97. Y_UNIT_TEST(Sequential)
  98. {
  99. CapturingScanner scanner = Compile("google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;", 1);
  100. State state;
  101. const char* str;
  102. str = "google_id = 'abcde'; google_id = 'xyz';";
  103. state = RunRegexp(scanner, str);
  104. UNIT_ASSERT(state.Captured());
  105. UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("abcde"));
  106. str = "var google_id = 'abc de'; google_id = 'xyz';";
  107. state = RunRegexp(scanner, str);
  108. UNIT_ASSERT(state.Captured());
  109. UNIT_ASSERT_VALUES_EQUAL(Captured(state, str), ystring("xyz"));
  110. }
  111. Y_UNIT_TEST(NegatedTerminator)
  112. {
  113. CapturingScanner scanner = Compile("=(\\d+)[^\\d]", 1);
  114. State state;
  115. const char* str;
  116. str = "=12345;";
  117. state = RunRegexp(scanner, str);
  118. UNIT_ASSERT(state.Captured());
  119. UNIT_ASSERT_EQUAL(Captured(state, str), ystring("12345"));
  120. }
  121. Y_UNIT_TEST(Serialization)
  122. {
  123. const char* regex = "google_id\\s*=\\s*[\'\"]([a-z0-9]+)[\'\"]\\s*;";
  124. CapturingScanner scanner2 = Compile(regex, 1);
  125. SlowCapturingScanner slowScanner2 = SlowCompile(regex, 1);
  126. BufferOutput wbuf, wbuf2;
  127. ::Save(&wbuf, scanner2);
  128. ::Save(&wbuf2, slowScanner2);
  129. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  130. MemoryInput rbuf2(wbuf2.Buffer().Data(), wbuf2.Buffer().Size());
  131. CapturingScanner scanner;
  132. SlowCapturingScanner slowScanner;
  133. ::Load(&rbuf, scanner);
  134. ::Load(&rbuf2, slowScanner);
  135. State state;
  136. SlowCapturingScanner::State slowState;
  137. const char* str;
  138. str = "google_id = 'abcde';";
  139. state = RunRegexp(scanner, str);
  140. slowState = RunRegexp(slowScanner, str);
  141. UNIT_ASSERT(state.Captured());
  142. UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
  143. SlowCapturingScanner::SingleState final;
  144. UNIT_ASSERT(slowScanner.GetCapture(slowState, final));
  145. ystring ans(str, final.GetBegin(), final.GetEnd() - final.GetBegin());
  146. UNIT_ASSERT_EQUAL(ans, ystring("abcde"));
  147. str = "google_id != 'abcde';";
  148. state = RunRegexp(scanner, str);
  149. slowState = RunRegexp(slowScanner, str);
  150. UNIT_ASSERT(!state.Captured());
  151. UNIT_ASSERT(!slowScanner.GetCapture(slowState, final));
  152. CapturingScanner scanner3;
  153. const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
  154. TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
  155. const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
  156. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  157. const void* tail = scanner3.Mmap(ptr, wbuf.Buffer().Size());
  158. UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
  159. str = "google_id = 'abcde';";
  160. state = RunRegexp(scanner3, str);
  161. UNIT_ASSERT(state.Captured());
  162. UNIT_ASSERT_EQUAL(Captured(state, str), ystring("abcde"));
  163. str = "google_id != 'abcde';";
  164. state = RunRegexp(scanner3, str);
  165. UNIT_ASSERT(!state.Captured());
  166. ptr = (const void*) ((const char*) wbuf.Buffer().Data() + 1);
  167. try {
  168. scanner3.Mmap(ptr, wbuf.Buffer().Size());
  169. UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
  170. }
  171. catch (Pire::Error&) {}
  172. for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
  173. ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
  174. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  175. try {
  176. scanner3.Mmap(ptr, wbuf.Buffer().Size());
  177. if (offset % sizeof(size_t) != 0) {
  178. UNIT_ASSERT(!"CapturingScanner failed to check for misaligned mmaping");
  179. } else {
  180. str = "google_id = 'abcde';";
  181. state = RunRegexp(scanner3, str);
  182. UNIT_ASSERT(state.Captured());
  183. }
  184. }
  185. catch (Pire::Error&) {}
  186. }
  187. }
  188. Y_UNIT_TEST(Empty)
  189. {
  190. Pire::CapturingScanner sc;
  191. UNIT_ASSERT(sc.Empty());
  192. UNIT_CHECKPOINT(); RunRegexp(sc, "a string"); // Just should not crash
  193. // Test Save/Load/Mmap
  194. BufferOutput wbuf;
  195. ::Save(&wbuf, sc);
  196. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  197. Pire::CapturingScanner sc3;
  198. ::Load(&rbuf, sc3);
  199. UNIT_CHECKPOINT(); RunRegexp(sc3, "a string");
  200. const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
  201. TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
  202. const void* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
  203. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  204. Pire::CapturingScanner sc4;
  205. const void* tail = sc4.Mmap(ptr, wbuf.Buffer().Size());
  206. UNIT_ASSERT_EQUAL(tail, (const void*) ((size_t)ptr + wbuf.Buffer().Size()));
  207. UNIT_CHECKPOINT(); RunRegexp(sc4, "a string");
  208. }
  209. void MakeSlowCapturingTest(const char* regexp, const char* text, size_t position, bool ans, const ystring& captured = ystring(""), const Pire::Encoding& encoding = Pire::Encodings::Utf8())
  210. {
  211. Pire::SlowCapturingScanner sc = SlowCompile(regexp, position, encoding);
  212. SlowCapturingScanner::State st = RunRegexp(sc, text);
  213. SlowCapturingScanner::SingleState fin;
  214. bool ifCaptured = sc.GetCapture(st, fin);
  215. if (ans) {
  216. UNIT_ASSERT(ifCaptured);
  217. ystring answer(text, fin.GetBegin(), fin.GetEnd() - fin.GetBegin());
  218. UNIT_ASSERT_EQUAL(answer, captured);
  219. } else {
  220. UNIT_ASSERT(!ifCaptured);
  221. }
  222. }
  223. Y_UNIT_TEST(SlowCapturingNonGreedy)
  224. {
  225. const char* regexp = ".*?(pref.*suff)";
  226. const char* text = "pref ala bla pref cla suff dla";
  227. MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref ala bla pref cla suff"));
  228. }
  229. Y_UNIT_TEST(SlowCaptureGreedy)
  230. {
  231. const char* regexp = ".*(pref.*suff)";
  232. const char* text = "pref ala bla pref cla suff dla";
  233. MakeSlowCapturingTest(regexp, text, 1, true, ystring("pref cla suff"));
  234. }
  235. Y_UNIT_TEST(SlowCaptureInOr)
  236. {
  237. const char* regexp = "(A)|A";
  238. const char* text = "A";
  239. MakeSlowCapturingTest(regexp, text, 1, true, ystring("A"));
  240. const char* regexp2 = "A|(A)";
  241. MakeSlowCapturingTest(regexp2, text, 1, false);
  242. }
  243. Y_UNIT_TEST(SlowCapturing)
  244. {
  245. const char* regexp = "^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)";
  246. const char* text = "http://vkontakte.ru/id100500";
  247. MakeSlowCapturingTest(regexp, text, 2, true, ystring("100500"));
  248. }
  249. Y_UNIT_TEST(Utf_8)
  250. {
  251. const char* regexp = "\xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, ((\\s|\\w|[()]|-)+)!";
  252. const char* text =" \xd0\x97\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb2\xd1\x81\xd1\x82\xd0\xb2\xd1\x83\xd0\xb9\xd1\x82\xd0\xb5, \xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)! ";
  253. const char* ans = "\xd0\xa3\xd0\xb2\xd0\xb0\xd0\xb6\xd0\xb0\xd0\xb5\xd0\xbc\xd1\x8b\xd0\xb9 (-\xd0\xb0\xd1\x8f)";
  254. MakeSlowCapturingTest(regexp, text, 1, true, ystring(ans));
  255. }
  256. }