// pire_ut.cpp (23 KB)
  1. /*
  2. * pire_ut.cpp --
  3. *
  4. * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
  5. * Alexander Gololobov <agololobov@gmail.com>
  6. *
  7. * This file is part of Pire, the Perl Incompatible
  8. * Regular Expressions library.
  9. *
  10. * Pire is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU Lesser Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * Pire is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU Lesser Public License for more details.
  19. * You should have received a copy of the GNU Lesser Public License
  20. * along with Pire. If not, see <http://www.gnu.org/licenses>.
  21. */
  22. #include <stub/hacks.h>
  23. #include <stub/defaults.h>
  24. #include <stub/saveload.h>
  25. #include <stub/memstreams.h>
  26. #include "stub/cppunit.h"
  27. #include <stdexcept>
  28. #include "common.h"
  29. Y_UNIT_TEST_SUITE(TestPire) {
  30. /*****************************************************************************
  31. * Tests themselves
  32. *****************************************************************************/
  33. Y_UNIT_TEST(String)
  34. {
  35. REGEXP("abc") {
  36. ACCEPTS("def abc ghi");
  37. ACCEPTS("abc");
  38. DENIES ("def abd ghi");
  39. }
  40. }
  41. Y_UNIT_TEST(Boundaries)
  42. {
  43. REGEXP("^abc") {
  44. ACCEPTS("abc ghi");
  45. DENIES ("def abc");
  46. }
  47. REGEXP("abc$") {
  48. DENIES ("abc ghi");
  49. ACCEPTS("def abc");
  50. }
  51. }
  52. Y_UNIT_TEST(Primitives)
  53. {
  54. REGEXP("abc|def") {
  55. ACCEPTS("def");
  56. ACCEPTS("abc");
  57. DENIES ("deb");
  58. }
  59. REGEXP("ad*e") {
  60. ACCEPTS("xaez");
  61. ACCEPTS("xadez");
  62. ACCEPTS("xaddez");
  63. ACCEPTS("xadddddddddddddddddddddddez");
  64. DENIES ("xafez");
  65. }
  66. REGEXP("ad+e") {
  67. DENIES ("xaez");
  68. ACCEPTS("xadez");
  69. ACCEPTS("xaddez");
  70. ACCEPTS("xadddddddddddddddddddddddez");
  71. DENIES ("xafez");
  72. }
  73. REGEXP("ad?e") {
  74. ACCEPTS("xaez");
  75. ACCEPTS("xadez");
  76. DENIES ("xaddez");
  77. DENIES ("xafez");
  78. }
  79. REGEXP("a.{1}e") {
  80. ACCEPTS("axe");
  81. DENIES ("ae");
  82. DENIES ("axye");
  83. }
  84. }
  85. void TestMassAlternatives(const char* pattern) {
  86. REGEXP(pattern) {
  87. ACCEPTS("abc");
  88. ACCEPTS("def");
  89. ACCEPTS("ghi");
  90. ACCEPTS("klm");
  91. DENIES ("aei");
  92. DENIES ("klc");
  93. }
  94. }
  95. Y_UNIT_TEST(MassAlternatives)
  96. {
  97. TestMassAlternatives("((abc|def)|ghi)|klm");
  98. TestMassAlternatives("(abc|def)|(ghi|klm)");
  99. TestMassAlternatives("abc|(def|(ghi|klm))");
  100. TestMassAlternatives("abc|(def|ghi)|klm");
  101. }
  102. Y_UNIT_TEST(Composition)
  103. {
  104. REGEXP("^/([^\\\\/]|\\\\.)*/[a-z]*$") {
  105. ACCEPTS("/regexp/i");
  106. ACCEPTS("/regexp2/");
  107. DENIES ("regexp");
  108. ACCEPTS("/dir\\/file/");
  109. DENIES ("/dir/file/");
  110. ACCEPTS("/dir\\\\/");
  111. DENIES ("/dir\\\\/file/");
  112. }
  113. REGEXP("Head(Inner)*Tail") {
  114. ACCEPTS("HeadInnerTail");
  115. ACCEPTS("HeadInnerInnerTail");
  116. DENIES ("HeadInneInnerTail");
  117. ACCEPTS("HeadTail");
  118. }
  119. }
  120. Y_UNIT_TEST(Repetition)
  121. {
  122. REGEXP("^x{3,6}$") {
  123. DENIES ("xx");
  124. ACCEPTS("xxx");
  125. ACCEPTS("xxxx");
  126. ACCEPTS("xxxxx");
  127. ACCEPTS("xxxxxx");
  128. DENIES ("xxxxxxx");
  129. }
  130. REGEXP("^x{3,}$") {
  131. DENIES ("xx");
  132. ACCEPTS("xxx");
  133. ACCEPTS("xxxx");
  134. ACCEPTS("xxxxxxxxxxx");
  135. ACCEPTS("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
  136. }
  137. REGEXP("^x{3}$") {
  138. DENIES ("x");
  139. DENIES ("xx");
  140. ACCEPTS("xxx");
  141. DENIES ("xxxx");
  142. DENIES ("xxxxx");
  143. DENIES ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
  144. }
  145. REGEXP("x.{3,10}$") {
  146. for (size_t size = 0; size < 20; ++size) {
  147. ystring str = ystring(size*2, 'b') + "x" + ystring(size, 'e');
  148. if (size >= 3 && size <= 10)
  149. ACCEPTS(str.c_str());
  150. else
  151. DENIES(str.c_str());
  152. }
  153. }
  154. }
  155. Y_UNIT_TEST(UTF8)
  156. {
  157. REGEXP2("^.$", "u") {
  158. // A single-byte sequence 0xxx xxxx
  159. ACCEPTS("\x41");
  160. DENIES ("\x81");
  161. // A two-byte sequence: 110x xxxx | 10xx xxxx
  162. ACCEPTS("\xC1\x81");
  163. DENIES ("\xC1");
  164. DENIES ("\xC1\x41");
  165. DENIES ("\xC1\xC2");
  166. DENIES ("\xC1\x81\x82");
  167. // A three-byte sequence: 1110 xxxx | 10xx xxxx | 10xx xxxx
  168. ACCEPTS("\xE1\x81\x82");
  169. DENIES ("\xE1");
  170. DENIES ("\xE1\x42");
  171. DENIES ("\xE1\x42\x43");
  172. DENIES ("\xE1\xC2\xC3");
  173. DENIES ("\xE1\x82");
  174. DENIES ("\xE1\x82\x83\x84");
  175. // A four-byte sequence: 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx
  176. ACCEPTS("\xF1\x81\x82\x83");
  177. }
  178. REGEXP2("x\xD0\xA4y", "u") ACCEPTS("x\xD0\xA4y");
  179. }
  180. Y_UNIT_TEST(AndNot)
  181. {
  182. REGEXP2("<([0-9]+&~123&~456)>", "a") {
  183. ACCEPTS("<111>");
  184. ACCEPTS("<124>");
  185. DENIES ("<123>");
  186. DENIES ("<456>");
  187. DENIES ("<abc>");
  188. }
  189. REGEXP2("[0-9]+\\&1+", "a") {
  190. DENIES("111");
  191. ACCEPTS("123&111");
  192. }
  193. }
  194. Y_UNIT_TEST(Empty)
  195. {
  196. Scanners s("\\s*", "n");
  197. Pire::Scanner::State state;
  198. s.fast.Initialize(state);
  199. UNIT_ASSERT(s.fast.Final(state));
  200. Pire::SimpleScanner::State stateSF;
  201. s.simple.Initialize(stateSF);
  202. UNIT_ASSERT(s.simple.Final(stateSF));
  203. }
  204. Y_UNIT_TEST(Misc)
  205. {
  206. REGEXP2("^[^\\s=/>]*$", "n") ACCEPTS("a");
  207. REGEXP("\\t") ACCEPTS("\t");
  208. SCANNER(ParseRegexp(".*") & ~ParseRegexp(".*http.*")) {
  209. ACCEPTS("str");
  210. DENIES("str_http");
  211. }
  212. SCANNER(~Pire::Fsm()) ACCEPTS("str");
  213. }
  214. Y_UNIT_TEST(Ranges)
  215. {
  216. REGEXP("a\\W") {
  217. ACCEPTS("a,");
  218. DENIES("ab");
  219. }
  220. try {
  221. REGEXP("abc[def") {}
  222. UNIT_ASSERT(!"Should report syntax error");
  223. }
  224. catch (Pire::Error&) {}
  225. }
  226. Y_UNIT_TEST(Reverse)
  227. {
  228. SCANNER(ParseRegexp("abcdef").Reverse()) {
  229. ACCEPTS("fedcba");
  230. DENIES ("abcdef");
  231. }
  232. }
  233. #if defined(__GNUC__)
  234. #pragma GCC diagnostic push
  235. #pragma GCC diagnostic ignored "-Warray-bounds"
  236. #endif
  237. Y_UNIT_TEST(PrefixSuffix)
  238. {
  239. static const char* pattern = "-->";
  240. Pire::Fsm fsm = ParseRegexp(pattern, "n");
  241. Pire::Scanner ngsc = (~Pire::Fsm::MakeFalse() + fsm).Compile<Pire::Scanner>();
  242. Pire::Scanner gsc = (~fsm.Surrounded() + fsm).Compile<Pire::Scanner>();
  243. Pire::Scanner rsc = fsm.Reverse().Compile<Pire::Scanner>();
  244. static const char* text = "1234567890 --> middle --> end";
  245. const char* end = Pire::LongestPrefix(gsc, text, text + strlen(text));
  246. UNIT_ASSERT_EQUAL(end, text + 14);
  247. const char* begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
  248. UNIT_ASSERT_EQUAL(begin, text + 11);
  249. auto view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(gsc, text));
  250. UNIT_ASSERT_EQUAL(view.data(), text + 11);
  251. UNIT_ASSERT_EQUAL(view.size(), 3);
  252. end = Pire::LongestPrefix(ngsc, text, text + strlen(text));
  253. UNIT_ASSERT_EQUAL(end, text + 25);
  254. begin = Pire::LongestSuffix(rsc, end - 1, text - 1) + 1;
  255. UNIT_ASSERT_EQUAL(begin, text + 22);
  256. view = Pire::LongestSuffix(rsc, Pire::LongestPrefix(ngsc, text));
  257. UNIT_ASSERT_EQUAL(view.data(), text + 22);
  258. UNIT_ASSERT_EQUAL(view.size(), 3);
  259. end = Pire::ShortestPrefix(gsc, text, text + strlen(text));
  260. UNIT_ASSERT_EQUAL(end, text + 14);
  261. begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
  262. UNIT_ASSERT_EQUAL(begin, text + 11);
  263. view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(gsc, text));
  264. UNIT_ASSERT_EQUAL(view.data(), text + 11);
  265. UNIT_ASSERT_EQUAL(view.size(), 3);
  266. end = Pire::ShortestPrefix(ngsc, text, text + strlen(text));
  267. UNIT_ASSERT_EQUAL(end, text + 14);
  268. begin = Pire::ShortestSuffix(rsc, end - 1, text - 1) + 1;
  269. UNIT_ASSERT_EQUAL(begin, text + 11);
  270. view = Pire::ShortestSuffix(rsc, Pire::ShortestPrefix(ngsc, text));
  271. UNIT_ASSERT_EQUAL(view.data(), text + 11);
  272. UNIT_ASSERT_EQUAL(view.size(), 3);
  273. }
  274. #if defined(__GNUC__)
  275. #pragma GCC diagnostic pop
  276. #endif
  277. Y_UNIT_TEST(PrefixSuffixEmptyView) {
  278. const std::string_view empty{};
  279. auto checkAnswer = [](std::string_view answer) {
  280. return !answer.data() && answer.size() == 0;
  281. };
  282. TVector<ystring> patterns = {
  283. "",
  284. "a",
  285. ".*",
  286. "a.*",
  287. ".*a"
  288. };
  289. for (const auto& pattern: patterns) {
  290. Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
  291. UNIT_ASSERT_C(checkAnswer(Pire::ShortestPrefix(sc, empty)), pattern);
  292. UNIT_ASSERT_C(checkAnswer(Pire::LongestPrefix(sc, empty)), pattern);
  293. UNIT_ASSERT_C(checkAnswer(Pire::ShortestSuffix(sc, empty)), pattern);
  294. UNIT_ASSERT_C(checkAnswer(Pire::LongestSuffix(sc, empty)), pattern);
  295. }
  296. }
  297. namespace {
  298. ssize_t LongestPrefixLen(const char* pattern, const char* str)
  299. {
  300. Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
  301. const char* end = Pire::LongestPrefix(sc, str, str + strlen(str));
  302. return end ? end - str : -1;
  303. }
  304. ssize_t ShortestPrefixLen(const char* pattern, const char* str)
  305. {
  306. Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
  307. const char* end = Pire::ShortestPrefix(sc, str, str + strlen(str));
  308. return end ? end - str : -1;
  309. }
  310. ssize_t LongestSuffixLen(const char* pattern, const char* str)
  311. {
  312. Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
  313. const char* rbegin = str + strlen(str) - 1;
  314. const char* rend = Pire::LongestSuffix(sc, rbegin, str - 1);
  315. return rend ? rbegin - rend : -1;
  316. }
  317. ssize_t ShortestSuffixLen(const char* pattern, const char* str) {
  318. Pire::Scanner sc = Pire::Lexer(pattern).Parse().Compile<Pire::Scanner>();
  319. const char* rbegin = str + strlen(str) - 1;
  320. const char* rend = Pire::ShortestSuffix(sc, rbegin, str - 1);
  321. return rend ? rbegin - rend : -1;
  322. }
  323. }
  324. Y_UNIT_TEST(ScanBoundaries)
  325. {
  326. struct Case {
  327. ystring pattern;
  328. ystring text;
  329. ssize_t shortestPrefixLen;
  330. ssize_t longestPrefixLen;
  331. ystring ToString() const {
  332. return ystring("Pattern: ") + pattern + ", text: " + text;
  333. }
  334. };
  335. TVector <Case> cases = {
  336. {
  337. "a*",
  338. "",
  339. 0,
  340. 0,
  341. },
  342. {
  343. "a",
  344. "",
  345. -1,
  346. -1,
  347. },
  348. {
  349. "fixed",
  350. "fixed prefix",
  351. 5,
  352. 5,
  353. },
  354. {
  355. "fixed",
  356. "a fixed nonexistent prefix",
  357. -1,
  358. -1,
  359. },
  360. {
  361. "a*",
  362. "aaabbb",
  363. 0,
  364. 3,
  365. },
  366. {
  367. "a*",
  368. "bbbbbb",
  369. 0,
  370. 0,
  371. },
  372. {
  373. "a*",
  374. "aaaaaa",
  375. 0,
  376. 6,
  377. },
  378. {
  379. "aa*",
  380. "aaabbb",
  381. 1,
  382. 3,
  383. },
  384. {
  385. "a*a",
  386. "aaaaaa",
  387. 1,
  388. 6,
  389. },
  390. {
  391. ".*a",
  392. "bbbba",
  393. 5,
  394. 5,
  395. },
  396. {
  397. ".*",
  398. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
  399. 0,
  400. 80,
  401. },
  402. {
  403. ".*a",
  404. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
  405. 81,
  406. 81,
  407. },
  408. {
  409. ".*a",
  410. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
  411. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a",
  412. 81,
  413. 162,
  414. },
  415. {
  416. ".*b",
  417. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-",
  418. -1,
  419. -1,
  420. },
  421. {
  422. ".*a.*",
  423. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
  424. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
  425. 81,
  426. 162,
  427. },
  428. {
  429. ".*a.*b",
  430. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
  431. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
  432. 162,
  433. 162,
  434. },
  435. {
  436. "1.*a.*",
  437. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-a"
  438. "123456789-123456789-123456789-123456789-123456789-123456789-123456789-123456789-b",
  439. 81,
  440. 162,
  441. },
  442. {
  443. "a+",
  444. "bbbbbb",
  445. -1,
  446. -1,
  447. },
  448. };
  449. for (const auto& test: cases) {
  450. UNIT_ASSERT_EQUAL_C(ShortestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.shortestPrefixLen, test.ToString());
  451. UNIT_ASSERT_EQUAL_C(LongestPrefixLen(test.pattern.c_str(), test.text.c_str()), test.longestPrefixLen, test.ToString());
  452. auto reversed = test.text;
  453. ReverseInPlace(reversed);
  454. UNIT_ASSERT_EQUAL_C(ShortestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.shortestPrefixLen, test.ToString());
  455. UNIT_ASSERT_EQUAL_C(LongestSuffixLen(test.pattern.c_str(), reversed.c_str()), test.longestPrefixLen, test.ToString());
  456. }
  457. }
  458. Y_UNIT_TEST(ScanTermination)
  459. {
  460. Pire::Scanner sc = Pire::Lexer("aaa").Parse().Compile<Pire::Scanner>();
  461. // Scanning must terminate at first dead state. If it does not,
  462. // we will pass through the end of our string and end up with segfault.
  463. const char str[] = "aaab";
  464. const char* p = Pire::LongestPrefix(sc, &str[0], &str[0] + sizeof(str));
  465. UNIT_ASSERT(p == &str[0] + 3);
  466. }
  467. struct BasicMmapTest {
  468. template <class Scanner>
  469. static void Match(Scanner& sc, const void* ptr, size_t sz, const char* str)
  470. {
  471. try {
  472. sc.Mmap(ptr, sz);
  473. if (!Pire::Impl::IsAligned(ptr, sizeof(size_t))) {
  474. UNIT_ASSERT(!"Failed to check for misaligned mmaping");
  475. } else {
  476. UNIT_ASSERT(Matches(sc, str));
  477. }
  478. }
  479. catch (Pire::Error&) {}
  480. }
  481. };
  482. template <class Sc1, class Sc2>
  483. void TestCopyingHelper()
  484. {
  485. Pire::Fsm fsm = ParseRegexp("^r$", "");
  486. Sc1 sc1(Pire::Fsm(fsm).Compile<Sc1>());
  487. // Test copy ctor
  488. UNIT_ASSERT(Matches(Sc2(sc1), "r"));
  489. UNIT_ASSERT(!Matches(Sc2(sc1), "p"));
  490. // Test '=' operator
  491. Sc2 sc2;
  492. sc2 = sc1;
  493. UNIT_ASSERT(Matches(sc2, "r"));
  494. UNIT_ASSERT(!Matches(sc2, "p"));
  495. }
  496. template <class Sc1, class Sc2>
  497. void TestCopying()
  498. {
  499. TestCopyingHelper<Sc1, Sc2>();
  500. TestCopyingHelper<Sc2, Sc1>();
  501. }
  502. Y_UNIT_TEST(Copying)
  503. {
  504. TestCopying<Pire::Scanner, Pire::NonrelocScanner>();
  505. TestCopying<Pire::ScannerNoMask, Pire::NonrelocScannerNoMask>();
  506. TestCopying<Pire::HalfFinalScanner, Pire::NonrelocHalfFinalScanner>();
  507. TestCopying<Pire::HalfFinalScannerNoMask, Pire::NonrelocHalfFinalScannerNoMask>();
  508. }
  // Asserts that `scanner` behaves like "^regexp$": it matches exactly
  // "regexp" and rejects a string with a missing letter or a trailing tail.
  template<class Scanner>
  void MatchScanner(Scanner& scanner) {
      UNIT_ASSERT( Matches(scanner, "regexp"));
      UNIT_ASSERT(!Matches(scanner, "regxp"));
      UNIT_ASSERT(!Matches(scanner, "regexp t"));
  }
  515. template<class Scanner>
  516. void LoadAndMatchScanner(MemoryInput& rbuf, Scanner& scanner) {
  517. Load(&rbuf, scanner);
  518. MatchScanner(scanner);
  519. }
  // Maps `size` bytes starting at `ptr` into `scanner`, verifies the scanner
  // with MatchScanner(), and returns whatever Mmap() returned, cast to
  // const char*. Callers use the returned pointer as the start of the next
  // serialized scanner.
  template<class Scanner>
  const char* MmapAndMatchScanner(Scanner& scanner, const char* ptr, size_t size) {
      const void* const next = scanner.Mmap(ptr, size);
      MatchScanner(scanner);
      return (const char*)next;
  }
  526. Y_UNIT_TEST(Serialization)
  527. {
  528. Scanners s("^regexp$");
  529. BufferOutput wbuf;
  530. Save(&wbuf, s.fast);
  531. Save(&wbuf, s.simple);
  532. Save(&wbuf, s.slow);
  533. Save(&wbuf, s.fastNoMask);
  534. Save(&wbuf, s.nonreloc);
  535. Save(&wbuf, s.nonrelocNoMask);
  536. Save(&wbuf, s.halfFinal);
  537. Save(&wbuf, s.halfFinalNoMask);
  538. Save(&wbuf, s.nonrelocHalfFinal);
  539. Save(&wbuf, s.nonrelocHalfFinalNoMask);
  540. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  541. LoadAndMatchScanner(rbuf, s.fast);
  542. LoadAndMatchScanner(rbuf, s.simple);
  543. LoadAndMatchScanner(rbuf, s.slow);
  544. LoadAndMatchScanner(rbuf, s.fastNoMask);
  545. LoadAndMatchScanner(rbuf, s.nonreloc);
  546. LoadAndMatchScanner(rbuf, s.nonrelocNoMask);
  547. LoadAndMatchScanner(rbuf, s.halfFinal);
  548. LoadAndMatchScanner(rbuf, s.halfFinalNoMask);
  549. LoadAndMatchScanner(rbuf, s.nonrelocHalfFinal);
  550. LoadAndMatchScanner(rbuf, s.nonrelocHalfFinalNoMask);
  551. Pire::Scanner fast;
  552. Pire::SimpleScanner simple;
  553. Pire::SlowScanner slow;
  554. Pire::ScannerNoMask fastNoMask;
  555. Pire::HalfFinalScanner halfFinal;
  556. Pire::HalfFinalScannerNoMask halfFinalNoMask;
  557. Pire::Scanner fast1;
  558. Pire::ScannerNoMask fastNoMask1;
  559. Pire::HalfFinalScanner halfFinal1;
  560. Pire::HalfFinalScannerNoMask halfFinalNoMask1;
  561. const size_t MaxTestOffset = 2 * sizeof(Pire::Impl::MaxSizeWord);
  562. TVector<char> buf2(wbuf.Buffer().Size() + sizeof(size_t) + MaxTestOffset);
  563. const char* ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t));
  564. const char* end = ptr + wbuf.Buffer().Size();
  565. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  566. const char* ptr2 = 0;
  567. ptr2 = MmapAndMatchScanner(fast, ptr, end - ptr);
  568. size_t fastSize = ptr2 - ptr;
  569. ptr = ptr2;
  570. ptr2 = MmapAndMatchScanner(simple, ptr, end - ptr);
  571. size_t simpleSize = ptr2 - ptr;
  572. ptr = ptr2;
  573. ptr = MmapAndMatchScanner(slow, ptr, end - ptr);
  574. ptr = MmapAndMatchScanner(fastNoMask, ptr, end - ptr);
  575. // Nonreloc-s are saved as Scaner-s, so read them again
  576. ptr = MmapAndMatchScanner(fast1, ptr, end - ptr);
  577. ptr = MmapAndMatchScanner(fastNoMask1, ptr, end - ptr);
  578. ptr = MmapAndMatchScanner(halfFinal, ptr, end - ptr);
  579. ptr = MmapAndMatchScanner(halfFinalNoMask, ptr, end - ptr);
  580. ptr = MmapAndMatchScanner(halfFinal1, ptr, end - ptr);
  581. ptr = MmapAndMatchScanner(halfFinalNoMask1, ptr, end - ptr);
  582. UNIT_ASSERT_EQUAL(ptr, end);
  583. for (size_t offset = 1; offset < MaxTestOffset; ++offset) {
  584. ptr = Pire::Impl::AlignUp(&buf2[0], sizeof(size_t)) + offset;
  585. end = ptr + wbuf.Buffer().Size();
  586. memcpy((void*) ptr, wbuf.Buffer().Data(), wbuf.Buffer().Size());
  587. BasicMmapTest::Match(fast, ptr, end - ptr, "regexp");
  588. ptr = ptr + fastSize;
  589. BasicMmapTest::Match(simple, ptr, end - ptr, "regexp");
  590. ptr = ptr + simpleSize;
  591. BasicMmapTest::Match(slow, ptr, end - ptr, "regexp");
  592. }
  593. }
  594. Y_UNIT_TEST(TestShortcuts)
  595. {
  596. REGEXP("aaa") {
  597. ACCEPTS("......................................aaa.............");
  598. DENIES ("......................................aab.............");
  599. DENIES ("......................................................");
  600. }
  601. REGEXP("[ab]{3}") {
  602. ACCEPTS("......................................aaa.............");
  603. ACCEPTS("......................................aab.............");
  604. ACCEPTS("......................................bbb.............");
  605. DENIES ("......................................................");
  606. }
  607. REGEXP2("\xD0\xB0", "u") {
  608. ACCEPTS("......................................\xD0\xB0...............");
  609. ACCEPTS("...................................\xD0\xB0..................");
  610. ACCEPTS("................................\xD0\xB0.....................");
  611. }
  612. }
  613. template<class Scanner>
  614. void TestGlue()
  615. {
  616. Scanner sc1 = ParseRegexp("aaa").Compile<Scanner>();
  617. Scanner sc2 = ParseRegexp("bbb").Compile<Scanner>();
  618. Scanner glued = Scanner::Glue(sc1, sc2);
  619. UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(2));
  620. auto state = RunRegexp(glued, "aaa");
  621. auto res = glued.AcceptedRegexps(state);
  622. UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
  623. UNIT_ASSERT_EQUAL(*res.first, size_t(0));
  624. state = RunRegexp(glued, "bbb");
  625. res = glued.AcceptedRegexps(state);
  626. UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
  627. UNIT_ASSERT_EQUAL(*res.first, size_t(1));
  628. state = RunRegexp(glued, "aaabbb");
  629. res = glued.AcceptedRegexps(state);
  630. UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(2));
  631. UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
  632. UNIT_ASSERT_EQUAL(res.first[1], size_t(1));
  633. state = RunRegexp(glued, "ccc");
  634. res = glued.AcceptedRegexps(state);
  635. UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(0));
  636. Scanner sc3 = ParseRegexp("ccc").Compile<Scanner>();
  637. glued = Scanner::Glue(sc3, glued);
  638. UNIT_ASSERT_EQUAL(glued.RegexpsCount(), size_t(3));
  639. state = RunRegexp(glued, "ccc");
  640. res = glued.AcceptedRegexps(state);
  641. UNIT_ASSERT_EQUAL(res.second - res.first, ssize_t(1));
  642. UNIT_ASSERT_EQUAL(res.first[0], size_t(0));
  643. Scanner sc4 = Scanner::Glue(
  644. ParseRegexp("a", "n").Compile<Scanner>(),
  645. ParseRegexp("c", "n").Compile<Scanner>()
  646. );
  647. state = RunRegexp(sc4, "ac");
  648. res = sc4.AcceptedRegexps(state);
  649. UNIT_ASSERT(res.second == res.first);
  650. state = RunRegexp(sc4, "ac");
  651. UNIT_ASSERT(!sc4.Final(state));
  652. }
  653. Y_UNIT_TEST(Glue)
  654. {
  655. TestGlue<Pire::Scanner>();
  656. TestGlue<Pire::NonrelocScanner>();
  657. TestGlue<Pire::ScannerNoMask>();
  658. TestGlue<Pire::NonrelocScannerNoMask>();
  659. TestGlue<Pire::HalfFinalScanner>();
  660. TestGlue<Pire::NonrelocHalfFinalScanner>();
  661. TestGlue<Pire::HalfFinalScannerNoMask>();
  662. TestGlue<Pire::NonrelocHalfFinalScannerNoMask>();
  663. }
  664. Y_UNIT_TEST(Slow)
  665. {
  666. Pire::SlowScanner sc = ParseRegexp("a.{30}$", "").Compile<Pire::SlowScanner>();
  667. // 123456789012345678901234567890
  668. UNIT_ASSERT( Matches(sc, "....a.............................."));
  669. UNIT_ASSERT(!Matches(sc, "....a..............................."));
  670. UNIT_ASSERT(!Matches(sc, "....a............................."));
  671. }
  // Minimal NUL-terminated string for the Aligned test, stored in a
  // std::vector<char>. The vector always holds the characters followed by
  // exactly one terminating NUL.
  // NOTE(review): presumably a vector is used instead of std::string so that
  // c_str() points at separately allocated storage rather than an in-object
  // small-string buffer -- confirm against the Aligned test's alignment
  // assertion.
  struct astring: private std::vector<char> {
      // Accepts any argument list std::string can be constructed from and
      // copies the resulting characters plus a terminating NUL.
      template <typename... A>
      inline astring(A&&... a) {
          std::string s(std::forward<A>(a)...);
          insert(end(), s.begin(), s.end());
          push_back(0);
      }

      // Pointer to the NUL-terminated characters (mutable, unlike
      // std::string::c_str()).
      inline char* c_str() noexcept {
          return data();
      }

      // Concatenation. The characters of `r` are inserted in front of `l`'s
      // terminator. `r`'s own terminator is deliberately left out: copying it
      // too would embed a NUL in the result, and a further concatenation would
      // then append after that NUL, where c_str() readers never look.
      friend astring operator+(astring l, const astring& r) {
          l.insert(l.end() - 1, r.begin(), r.end() - 1);
          return l;
      }
  };
  687. Y_UNIT_TEST(Aligned)
  688. {
  689. using ystring = astring;
  690. UNIT_ASSERT(Pire::Impl::IsAligned(ystring("x").c_str(), sizeof(void*)));
  691. REGEXP("xy") {
  692. // Short string with aligned head
  693. ACCEPTS(ystring("xy").c_str());
  694. DENIES (ystring("yz").c_str());
  695. // Short string, unaligned
  696. ACCEPTS(ystring(".xy").c_str() + 1);
  697. DENIES (ystring(".yz").c_str() + 1);
  698. // Short string with aligned tail
  699. ACCEPTS((ystring(sizeof(void*) - 2, '.') + "xy").c_str() + sizeof(void*) - 2);
  700. DENIES ((ystring(sizeof(void*) - 2, '.') + "yz").c_str() + sizeof(void*) - 2);
  701. }
  702. REGEXP("abcde") {
  703. // Everything aligned, match occurs in the middle
  704. ACCEPTS(ystring("ZZZZZabcdeZZZZZZ").c_str());
  705. DENIES (ystring("ZZZZZabcdfZZZZZZ").c_str());
  706. // Unaligned head
  707. ACCEPTS(ystring(".ZabcdeZZZ").c_str() + 1);
  708. DENIES (ystring(".ZxbcdeZZZ").c_str() + 1);
  709. // Unaligned tail
  710. ACCEPTS(ystring("ZZZZZZZZZZZZZabcde").c_str());
  711. DENIES (ystring("ZZZZZZZZZZZZZabcdf").c_str());
  712. }
  713. }
  714. #undef Run
  715. template <class Scanner>
  716. void BasicTestEmptySaveLoadMmap()
  717. {
  718. Scanner sc;
  719. UNIT_ASSERT(sc.Empty());
  720. UNIT_ASSERT_EQUAL(sc.RegexpsCount(), size_t(0));
  721. UNIT_CHECKPOINT(); Pire::Runner(sc).Begin().Run("a string", 7).End(); // should not crash
  722. BufferOutput wbuf;
  723. UNIT_CHECKPOINT(); Save(&wbuf, sc);
  724. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  725. Scanner sc3;
  726. /*UNIT_CHECKPOINT();*/ Load(&rbuf, sc3);
  727. UNIT_ASSERT(sc3.Empty());
  728. UNIT_CHECKPOINT(); Pire::Runner(sc3).Begin().Run("a string", 7).End();
  729. Scanner sc4;
  730. /*UNIT_CHECKPOINT();*/ const char* ptr = (const char*) sc4.Mmap(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  731. UNIT_ASSERT(ptr == wbuf.Buffer().Data() + wbuf.Buffer().Size());
  732. UNIT_ASSERT(sc4.Empty());
  733. UNIT_CHECKPOINT(); Pire::Runner(sc4).Begin().Run("a string", 7).End();
  734. }
  735. Y_UNIT_TEST(EmptyScanner)
  736. {
  737. // Tests for Scanner
  738. BasicTestEmptySaveLoadMmap<Pire::Scanner>();
  739. BasicTestEmptySaveLoadMmap<Pire::ScannerNoMask>();
  740. BasicTestEmptySaveLoadMmap<Pire::HalfFinalScanner>();
  741. BasicTestEmptySaveLoadMmap<Pire::HalfFinalScannerNoMask>();
  742. Pire::Scanner sc;
  743. Pire::Scanner scsc = Pire::Scanner::Glue(sc, sc);
  744. UNIT_ASSERT(scsc.Empty());
  745. UNIT_ASSERT_EQUAL(scsc.RegexpsCount(), size_t(0));
  746. UNIT_CHECKPOINT(); Pire::Runner(scsc).Begin().Run("a string", 7).End();
  747. Pire::Scanner sc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
  748. UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1));
  749. UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
  750. UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(scsc, sc2).RegexpsCount(), size_t(1));
  751. UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(scsc, sc2)).Begin().Run("a string", 7).End();
  752. UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc).RegexpsCount(), size_t(1));
  753. UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(Pire::Scanner::Glue(scsc, sc2), sc)).Begin().Run("a string", 7).End();
  754. // Tests for NonrelocScanner
  755. Pire::NonrelocScanner nsc;
  756. UNIT_ASSERT(nsc.Empty());
  757. UNIT_ASSERT_EQUAL(nsc.RegexpsCount(), size_t(0));
  758. UNIT_CHECKPOINT(); Pire::Runner(nsc).Begin().Run("a string", 7).End();
  759. Pire::NonrelocScanner nsc2 = Pire::Lexer("regex").Parse().Compile<Pire::Scanner>();
  760. UNIT_ASSERT_EQUAL(Pire::Scanner::Glue(sc, sc2).RegexpsCount(), size_t(1));
  761. UNIT_CHECKPOINT(); Pire::Runner(Pire::Scanner::Glue(sc, sc2)).Begin().Run("a string", 7).End();
  762. {
  763. BufferOutput wbuf;
  764. UNIT_CHECKPOINT(); Save(&wbuf, nsc);
  765. MemoryInput rbuf(wbuf.Buffer().Data(), wbuf.Buffer().Size());
  766. Pire::NonrelocScanner nsc3;
  767. /*UNIT_CHECKPOINT();*/ Load(&rbuf, nsc3);
  768. UNIT_ASSERT(nsc3.Empty());
  769. UNIT_CHECKPOINT(); Pire::Runner(nsc3).Begin().Run("a string", 7).End();
  770. }
  771. BasicTestEmptySaveLoadMmap<Pire::SimpleScanner>();
  772. BasicTestEmptySaveLoadMmap<Pire::SlowScanner>();
  773. }
  774. Y_UNIT_TEST(NullPointer)
  775. {
  776. const char* null = 0;
  777. Pire::Scanner sc = Pire::Fsm().Compile<Pire::Scanner>();
  778. Pire::Runner(sc).Begin().Run(null, null).End();
  779. }
  780. }