// regexp_ut.cpp (12 KB)
  1. #include <library/cpp/testing/unittest/registar.h>
  2. #include <library/cpp/regex/pire/regexp.h>
  3. #include <library/cpp/regex/pire/pcre2pire.h>
  4. Y_UNIT_TEST_SUITE(TRegExp) {
  5. using namespace NRegExp;
  6. Y_UNIT_TEST(False) {
  7. UNIT_ASSERT(!TMatcher(TFsm::False()).Match("").Final());
  8. UNIT_ASSERT(!TMatcher(TFsm::False()).Match(TStringBuf{}).Final());
  9. }
  10. Y_UNIT_TEST(Surround) {
  11. UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
  12. UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetSurround(false))).Match("aqwb").Final());
  13. }
  14. Y_UNIT_TEST(Boundaries) {
  15. UNIT_ASSERT(!TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
  16. UNIT_ASSERT(!TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match("aqwb").Final());
  17. UNIT_ASSERT(TMatcher(TFsm("qwb$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
  18. UNIT_ASSERT(TMatcher(TFsm("^aqw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
  19. UNIT_ASSERT(!TMatcher(TFsm("qw$", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
  20. UNIT_ASSERT(!TMatcher(TFsm("^qw", TFsm::TOptions().SetSurround(true))).Match(TStringBuf("aqwb"), true, true).Final());
  21. UNIT_ASSERT(TMatcher(TFsm("^aqwb$", TFsm::TOptions().SetSurround(true)))
  22. .Match(TStringBuf("a"), true, false)
  23. .Match(TStringBuf("q"), false, false)
  24. .Match(TStringBuf("w"), false, false)
  25. .Match(TStringBuf("b"), false, true)
  26. .Final());
  27. }
  28. Y_UNIT_TEST(Case) {
  29. UNIT_ASSERT(TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true))).Match("Qw").Final());
  30. UNIT_ASSERT(!TMatcher(TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false))).Match("Qw").Final());
  31. }
  32. Y_UNIT_TEST(UnicodeCase) {
  33. UNIT_ASSERT(TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(true))).Match("Ab").Final());
  34. UNIT_ASSERT(!TMatcher(TFsm("\\x{61}\\x{62}", TFsm::TOptions().SetCaseInsensitive(false))).Match("Ab").Final());
  35. }
  36. Y_UNIT_TEST(Utf) {
  37. NRegExp::TFsmBase::TOptions opts;
  38. opts.Charset = CODES_UTF8;
  39. opts.Surround = true;
  40. UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("wtf").Final());
  41. UNIT_ASSERT(TMatcher(TFsm(".*", opts)).Match("чзн").Final());
  42. UNIT_ASSERT(TMatcher(TFsm("ч.*", opts)).Match("чзн").Final());
  43. UNIT_ASSERT(!TMatcher(TFsm("чзн", opts)).Match("чзх").Final());
  44. }
  45. Y_UNIT_TEST(AndNot) {
  46. NRegExp::TFsmBase::TOptions opts;
  47. opts.AndNotSupport = true;
  48. {
  49. NRegExp::TFsm fsm(".*&~([0-9]*)", opts);
  50. UNIT_ASSERT(TMatcher(fsm).Match("a2").Final());
  51. UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
  52. UNIT_ASSERT(TMatcher(fsm).Match("1a").Final());
  53. UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
  54. }
  55. {
  56. NRegExp::TFsm fsm(".*&~(.*[0-9].*)", opts);
  57. UNIT_ASSERT(TMatcher(fsm).Match("ab").Final());
  58. UNIT_ASSERT(!TMatcher(fsm).Match("a2").Final());
  59. UNIT_ASSERT(!TMatcher(fsm).Match("1a").Final());
  60. UNIT_ASSERT(!TMatcher(fsm).Match("12").Final());
  61. }
  62. {
  63. NRegExp::TFsm fsm(
  64. "((([a-z0-9_\\-]+[.])*[a-z0-9_\\-]+)"
  65. "&~(\\d+[.]\\d+[.]\\d+[.]\\d+))(:\\d+)?",
  66. TFsm::TOptions().SetCaseInsensitive(true).SetAndNotSupport(true)
  67. );
  68. UNIT_ASSERT(TMatcher(fsm).Match("yandex.ru").Final());
  69. UNIT_ASSERT(TMatcher(fsm).Match("yandex").Final());
  70. UNIT_ASSERT(TMatcher(fsm).Match("yandex:80").Final());
  71. UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1").Final());
  72. UNIT_ASSERT(!TMatcher(fsm).Match("127.0.0.1:8080").Final());
  73. }
  74. }
  75. Y_UNIT_TEST(Glue) {
  76. TFsm glued =
  77. TFsm("qw", TFsm::TOptions().SetCaseInsensitive(true)) |
  78. TFsm("qw", TFsm::TOptions().SetCaseInsensitive(false)) |
  79. TFsm("abc", TFsm::TOptions().SetCaseInsensitive(false));
  80. UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
  81. UNIT_ASSERT(TMatcher(glued).Match("Qw").Final());
  82. UNIT_ASSERT(TMatcher(glued).Match("abc").Final());
  83. UNIT_ASSERT(!TMatcher(glued).Match("Abc").Final());
  84. }
  85. Y_UNIT_TEST(Capture1) {
  86. TCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
  87. TSearcher searcher(fsm);
  88. searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
  89. UNIT_ASSERT(searcher.Captured());
  90. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
  91. }
  92. Y_UNIT_TEST(Capture2) {
  93. TCapturingFsm fsm("w([abcdez]+)f");
  94. TSearcher searcher(fsm);
  95. searcher.Search("wabcdef");
  96. UNIT_ASSERT(searcher.Captured());
  97. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("abcde"));
  98. }
  99. Y_UNIT_TEST(Capture3) {
  100. TCapturingFsm fsm("http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
  101. TFsm::TOptions().SetCapture(2));
  102. TSearcher searcher(fsm);
  103. searcher.Search("http://vkontakte.ru/id100500");
  104. UNIT_ASSERT(searcher.Captured());
  105. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
  106. }
  107. Y_UNIT_TEST(Capture4) {
  108. TCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
  109. TFsm::TOptions().SetCharset(CODES_UTF8));
  110. TSearcher searcher(fsm);
  111. searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
  112. UNIT_ASSERT(searcher.Captured());
  113. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
  114. }
  115. Y_UNIT_TEST(Capture5) {
  116. TCapturingFsm fsm("away\\.php\\?to=http:([^\"])+\"");
  117. TSearcher searcher(fsm);
  118. searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
  119. UNIT_ASSERT(searcher.Captured());
  120. //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
  121. }
  122. Y_UNIT_TEST(Capture6) {
  123. TCapturingFsm fsm("(/to-match-with)");
  124. TSearcher searcher(fsm);
  125. searcher.Search("/some/table/path/to-match-with");
  126. UNIT_ASSERT(searcher.Captured());
  127. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("/to-match-with"));
  128. }
  129. Y_UNIT_TEST(Capture7) {
  130. TCapturingFsm fsm("(pref.*suff)");
  131. TSearcher searcher(fsm);
  132. searcher.Search("ala pref bla suff cla");
  133. UNIT_ASSERT(searcher.Captured());
  134. //UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref bla suff"));
  135. }
  136. Y_UNIT_TEST(CaptureXA) {
  137. TCapturingFsm fsm(".*(xa).*");
  138. TSearcher searcher(fsm);
  139. searcher.Search("xa");
  140. UNIT_ASSERT(searcher.Captured());
  141. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xa"));
  142. }
  143. Y_UNIT_TEST(CaptureWrongXX) {
  144. TCapturingFsm fsm(".*(xx).*");
  145. TSearcher searcher(fsm);
  146. searcher.Search("xx");
  147. UNIT_ASSERT(searcher.Captured());
  148. // Surprise!
  149. // TCapturingFsm uses a fast - O(|text|) - but incorrect algorithm.
  150. // It works more or less for a particular class of regexps to which ".*(xx).*" does not belong.
  151. // So it returns not the expected "xx" but just the second "x" instead.
  152. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("x"));
  153. }
  154. Y_UNIT_TEST(CaptureRight1XX) {
  155. TCapturingFsm fsm("[^x]+(xx).*");
  156. TSearcher searcher(fsm);
  157. searcher.Search("xxx");
  158. UNIT_ASSERT(!searcher.Captured());
  159. }
  160. Y_UNIT_TEST(CaptureRight2XX) {
  161. TCapturingFsm fsm("[^x]+(xx).*");
  162. TSearcher searcher(fsm);
  163. searcher.Search("axx");
  164. UNIT_ASSERT(searcher.Captured());
  165. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
  166. }
  167. Y_UNIT_TEST(CaptureRight3XX) {
  168. TCapturingFsm fsm("[^x]+(xx).*");
  169. TSearcher searcher(fsm);
  170. searcher.Search("axxb");
  171. UNIT_ASSERT(searcher.Captured());
  172. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
  173. }
  174. Y_UNIT_TEST(SlowCaptureXX) {
  175. TSlowCapturingFsm fsm(".*(xx).*");
  176. TSlowSearcher searcher(fsm);
  177. searcher.Search("xx");
  178. UNIT_ASSERT(searcher.Captured());
  179. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("xx"));
  180. }
  181. Y_UNIT_TEST(SlowCapture) {
  182. TSlowCapturingFsm fsm("^http://vk(ontakte[.]ru|[.]com)/id(\\d+)([^0-9]|$)",
  183. TFsm::TOptions().SetCapture(2));
  184. TSlowSearcher searcher(fsm);
  185. searcher.Search("http://vkontakte.ru/id100500");
  186. UNIT_ASSERT(searcher.Captured());
  187. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("100500"));
  188. }
  189. Y_UNIT_TEST(SlowCaptureGreedy) {
  190. TSlowCapturingFsm fsm(".*(pref.*suff)");
  191. TSlowSearcher searcher(fsm);
  192. searcher.Search("pref ala bla pref cla suff dla");
  193. UNIT_ASSERT(searcher.Captured());
  194. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref cla suff"));
  195. }
  196. Y_UNIT_TEST(SlowCaptureNonGreedy) {
  197. TSlowCapturingFsm fsm(".*?(pref.*suff)");
  198. TSlowSearcher searcher(fsm);
  199. searcher.Search("pref ala bla pref cla suff dla");
  200. UNIT_ASSERT(searcher.Captured());
  201. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("pref ala bla pref cla suff"));
  202. }
  203. Y_UNIT_TEST(SlowCapture2) {
  204. TSlowCapturingFsm fsm("Здравствуйте, ((\\s|\\w|[()]|-)+)!",
  205. TFsm::TOptions().SetCharset(CODES_UTF8));
  206. TSlowSearcher searcher(fsm);
  207. searcher.Search(" Здравствуйте, Уважаемый (-ая)! ");
  208. UNIT_ASSERT(searcher.Captured());
  209. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("Уважаемый (-ая)"));
  210. }
  211. Y_UNIT_TEST(SlowCapture3) {
  212. TSlowCapturingFsm fsm("here we have user_id=([a-z0-9]+);");
  213. TSlowSearcher searcher(fsm);
  214. searcher.Search("in db and here we have user_id=0x0d0a; same as CRLF");
  215. UNIT_ASSERT(searcher.Captured());
  216. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("0x0d0a"));
  217. }
  218. Y_UNIT_TEST(SlowCapture4) {
  219. TSlowCapturingFsm fsm("away\\.php\\?to=http:([^\"]+)\"");
  220. TSlowSearcher searcher(fsm);
  221. searcher.Search("\"/away.php?to=http:some.addr\"&id=1");
  222. UNIT_ASSERT(searcher.Captured());
  223. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf("some.addr"));
  224. }
  225. Y_UNIT_TEST(CapturedEmptySlow) {
  226. TSlowCapturingFsm fsm("Comments=(.*)$");
  227. TSlowSearcher searcher(fsm);
  228. searcher.Search("And Comments=");
  229. UNIT_ASSERT(searcher.Captured());
  230. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
  231. }
  232. Y_UNIT_TEST(CaptureInOrFirst) {
  233. TSlowCapturingFsm fsm("(A)|A");
  234. TSlowSearcher searcher(fsm);
  235. searcher.Search("A");
  236. UNIT_ASSERT(searcher.Captured());
  237. }
  238. Y_UNIT_TEST(CaptureInOrSecond) {
  239. TSlowCapturingFsm fsm("A|(A)");
  240. TSlowSearcher searcher(fsm);
  241. searcher.Search("A");
  242. UNIT_ASSERT(!searcher.Captured());
  243. }
  244. Y_UNIT_TEST(CaptureOutside) {
  245. TSlowCapturingFsm fsm("((ID=([0-9]+))?)");
  246. TSlowSearcher searcher(fsm);
  247. searcher.Search("ID=");
  248. UNIT_ASSERT(searcher.Captured());
  249. UNIT_ASSERT_VALUES_EQUAL(searcher.GetCaptured(), TStringBuf(""));
  250. }
  251. Y_UNIT_TEST(CaptureInside) {
  252. TSlowCapturingFsm fsm("((ID=([0-9]+))?)",
  253. TFsm::TOptions().SetCapture(2));
  254. TSlowSearcher searcher(fsm);
  255. searcher.Search("ID=");
  256. UNIT_ASSERT(!searcher.Captured());
  257. }
  258. Y_UNIT_TEST(Pcre2PireTest) {
  259. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)"), "(fake)");
  260. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)??"), "(fake)?");
  261. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:fake)*?fake"), "(fake)*fake");
  262. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>fake)"), "(fake)");
  263. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("fake\\#"), "fake#");
  264. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?P<field>)fake"), "fake");
  265. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?P<field1>)(?P<field2>))"), "");
  266. UNIT_ASSERT_VALUES_EQUAL(Pcre2Pire("(?:(?:fake))"), "((fake))");
  267. }
  268. }