123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318 |
- #include <library/cpp/testing/unittest/registar.h>
- #include <library/cpp/regex/pire/regexp.h>
- #include <library/cpp/regex/pire/pcre2pire.h>
- Y_UNIT_TEST_SUITE(TRegExp) {
- using namespace NRegExp;
- Y_UNIT_TEST(False) {
- UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final());
- UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final());
- }
- Y_UNIT_TEST(Surround) {
- UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
- UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final());
- }
- Y_UNIT_TEST(Boundaries) {
- UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
- UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
- UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
- UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
- UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
- UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
- UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true)))
- .Match(TStringBuf("a"), true, false)
- .Match(TStringBuf("q"), false, false)
- .Match(TStringBuf("w"), false, false)
- .Match(TStringBuf("b"), false, true)
- .Final());
- }
- Y_UNIT_TEST(Case) {
- UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final());
- UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final());
- }
- Y_UNIT_TEST(UnicodeCase) {
- UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final());
- UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final());
- }
- Y_UNIT_TEST(Utf) {
- NRegExp::TFsmBase::TOptions opts;
- opts.Charset = CODES_UTF8;
- opts.Surround = true;
- UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final());
- UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final());
- UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final());
- UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final());
- }
- Y_UNIT_TEST(AndNot) {
- NRegExp::TFsmBase::TOptions opts;
- opts.AndNotSupport = true;
- {
- NRegExp::TFsm fsm(".*&~([0-9]*)", opts);
- UNIT_ASSERT(TMatcher(fsm).Match("a2").Final());
- UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
- UNIT_ASSERT(TMatcher(fsm).Match("1a").Final());
- UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
- }
- {
- NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts);
- UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
- UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final());
- UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final());
- UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
- }
- {
- NRegExp::TFsm fsm(
- "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)"
- "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?",
- TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true)
- );
- UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final());
- UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final());
- UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final());
- UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final());
- UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final());
- }
- }
- Y_UNIT_TEST(Glue) {
- TFsm glued =
- TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) |
- TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) |
- TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false));
- UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
- UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
- UNIT_ASSERT(TMatcher(glued).Match("abc").Final());
- UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final());
- }
- Y_UNIT_TEST(Capture1) {
- TCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
- TSearcher searcher(fsm);
- searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
- }
- Y_UNIT_TEST(Capture2) {
- TCapturingFsm fsm("w([abcdez]+)f");
- TSearcher searcher(fsm);
- searcher.Search("wabcdef");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde"));
- }
- Y_UNIT_TEST(Capture3) {
- TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
- TFsm::TOptions().SetCapture(2));
- TSearcher searcher(fsm);
- searcher.Search("http://vkontakte.ru/id100500");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
- }
- Y_UNIT_TEST(Capture4) {
- TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
- TFsm::TOptions().SetCharset(CODES_UTF8));
- TSearcher searcher(fsm);
- searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
- }
- Y_UNIT_TEST(Capture5) {
- TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\"");
- TSearcher searcher(fsm);
- searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
- UNIT_ASSERT(searcher.Captured());
- //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
- }
- Y_UNIT_TEST(Capture6) {
- TCapturingFsm fsm("(/to-match-with)");
- TSearcher searcher(fsm);
- searcher.Search("/some/table/path/to-match-with");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with"));
- }
- Y_UNIT_TEST(Capture7) {
- TCapturingFsm fsm("(pref.*suff)");
- TSearcher searcher(fsm);
- searcher.Search("ala pref bla suff cla");
- UNIT_ASSERT(searcher.Captured());
- //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff"));
- }
- Y_UNIT_TEST(CaptureXA) {
- TCapturingFsm fsm(".*(xa).*");
- TSearcher searcher(fsm);
- searcher.Search("xa");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa"));
- }
- Y_UNIT_TEST(CaptureWrongXX) {
- TCapturingFsm fsm(".*(xx).*");
- TSearcher searcher(fsm);
- searcher.Search("xx");
- UNIT_ASSERT(searcher.Captured());
- // Surprise!
- // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm.
- // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong.
- // So it returns not the expected "xx" but just the second "x" instead.
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x"));
- }
- Y_UNIT_TEST(CaptureRight1XX) {
- TCapturingFsm fsm("[^x]+(xx).*");
- TSearcher searcher(fsm);
- searcher.Search("xxx");
- UNIT_ASSERT(!searcher.Captured());
- }
- Y_UNIT_TEST(CaptureRight2XX) {
- TCapturingFsm fsm("[^x]+(xx).*");
- TSearcher searcher(fsm);
- searcher.Search("axx");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
- }
- Y_UNIT_TEST(CaptureRight3XX) {
- TCapturingFsm fsm("[^x]+(xx).*");
- TSearcher searcher(fsm);
- searcher.Search("axxb");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
- }
- Y_UNIT_TEST(SlowCaptureXX) {
- TSlowCapturingFsm fsm(".*(xx).*");
- TSlowSearcher searcher(fsm);
- searcher.Search("xx");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
- }
- Y_UNIT_TEST(SlowCapture) {
- TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
- TFsm::TOptions().SetCapture(2));
- TSlowSearcher searcher(fsm);
- searcher.Search("http://vkontakte.ru/id100500");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
- }
- Y_UNIT_TEST(SlowCaptureGreedy) {
- TSlowCapturingFsm fsm(".*(pref.*suff)");
- TSlowSearcher searcher(fsm);
- searcher.Search("pref ala bla pref cla suff dla");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff"));
- }
- Y_UNIT_TEST(SlowCaptureNonGreedy) {
- TSlowCapturingFsm fsm(".*?(pref.*suff)");
- TSlowSearcher searcher(fsm);
- searcher.Search("pref ala bla pref cla suff dla");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff"));
- }
- Y_UNIT_TEST(SlowCapture2) {
- TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
- TFsm::TOptions().SetCharset(CODES_UTF8));
- TSlowSearcher searcher(fsm);
- searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
- }
- Y_UNIT_TEST(SlowCapture3) {
- TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
- TSlowSearcher searcher(fsm);
- searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
- }
- Y_UNIT_TEST(SlowCapture4) {
- TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\"");
- TSlowSearcher searcher(fsm);
- searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
- }
- Y_UNIT_TEST(CapturedEmptySlow) {
- TSlowCapturingFsm fsm("Comments=(.*)$");
- TSlowSearcher searcher(fsm);
- searcher.Search("And Comments=");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
- }
- Y_UNIT_TEST(CaptureInOrFirst) {
- TSlowCapturingFsm fsm("(A)|A");
- TSlowSearcher searcher(fsm);
- searcher.Search("A");
- UNIT_ASSERT(searcher.Captured());
- }
- Y_UNIT_TEST(CaptureInOrSecond) {
- TSlowCapturingFsm fsm("A|(A)");
- TSlowSearcher searcher(fsm);
- searcher.Search("A");
- UNIT_ASSERT(!searcher.Captured());
- }
- Y_UNIT_TEST(CaptureOutside) {
- TSlowCapturingFsm fsm("((ID=([0-9]+))?)");
- TSlowSearcher searcher(fsm);
- searcher.Search("ID=");
- UNIT_ASSERT(searcher.Captured());
- UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
- }
- Y_UNIT_TEST(CaptureInside) {
- TSlowCapturingFsm fsm("((ID=([0-9]+))?)",
- TFsm::TOptions().SetCapture(2));
- TSlowSearcher searcher(fsm);
- searcher.Search("ID=");
- UNIT_ASSERT(!searcher.Captured());
- }
- Y_UNIT_TEST(Pcre2PireTest) {
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), "");
- UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))");
- }
- }
|