approx_matching_ut.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. /*
  2. * approx_matching_ut.cpp --
  3. *
  4. * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <usmanova.karin@yandex.ru>
  5. *
  6. * This file is part of Pire, the Perl Incompatible
  7. * Regular Expressions library.
  8. *
  9. * Pire is free software: you can redistribute it and/or modify
  10. * it under the terms of the GNU Lesser Public License as published by
  11. * the Free Software Foundation, either version 3 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * Pire is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU Lesser Public License for more details.
  18. * You should have received a copy of the GNU Lesser Public License
  19. * along with Pire. If not, see <http://www.gnu.org/licenses>.
  20. */
  21. #include <contrib/libs/pire/pire/pire.h>
  22. #include "common.h"
  23. Y_UNIT_TEST_SUITE(ApproxMatchingTest) {
  24. Pire::Fsm BuildFsm(const char *str)
  25. {
  26. Pire::Lexer lexer;
  27. TVector<wchar32> ucs4;
  28. lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
  29. lexer.Assign(ucs4.begin(), ucs4.end());
  30. return lexer.Parse();
  31. }
  32. Y_UNIT_TEST(Simple) {
  33. auto fsm = BuildFsm("^ab$");
  34. APPROXIMATE_SCANNER(fsm, 1) {
  35. ACCEPTS("ab");
  36. ACCEPTS("ax");
  37. ACCEPTS("xb");
  38. ACCEPTS("a");
  39. ACCEPTS("b");
  40. ACCEPTS("xab");
  41. ACCEPTS("axb");
  42. ACCEPTS("abx");
  43. ACCEPTS("aab");
  44. DENIES("xy");
  45. DENIES("abcd");
  46. DENIES("xabx");
  47. DENIES("");
  48. }
  49. fsm = BuildFsm("^ab$");
  50. APPROXIMATE_SCANNER(fsm, 2) {
  51. ACCEPTS("ab");
  52. ACCEPTS("xy");
  53. ACCEPTS("");
  54. ACCEPTS("axbx");
  55. DENIES("xxabx");
  56. DENIES("xbxxx");
  57. }
  58. }
  59. Y_UNIT_TEST(SpecialSymbols) {
  60. auto fsm = BuildFsm("^.*ab$");
  61. APPROXIMATE_SCANNER(fsm, 1) {
  62. ACCEPTS("a");
  63. ACCEPTS("b");
  64. ACCEPTS("ab");
  65. ACCEPTS("xxxxab");
  66. ACCEPTS("xxxxabab");
  67. DENIES("xxxx");
  68. DENIES("abxxxx");
  69. }
  70. fsm = BuildFsm("^[a-c]$");
  71. APPROXIMATE_SCANNER(fsm, 1) {
  72. ACCEPTS("a");
  73. ACCEPTS("b");
  74. ACCEPTS("c");
  75. ACCEPTS("/");
  76. ACCEPTS("");
  77. ACCEPTS("ax");
  78. DENIES("xx");
  79. DENIES("abc");
  80. }
  81. fsm = BuildFsm("^x{4}$");
  82. APPROXIMATE_SCANNER(fsm, 2) {
  83. DENIES ("x");
  84. ACCEPTS("xx");
  85. ACCEPTS("xxx");
  86. ACCEPTS("xxxx");
  87. ACCEPTS("xxxxx");
  88. ACCEPTS("xxxxxx");
  89. DENIES ("xxxxxxx");
  90. ACCEPTS("xxyy");
  91. ACCEPTS("xxyyx");
  92. ACCEPTS("xxxxyz");
  93. DENIES("xyyy");
  94. }
  95. fsm = BuildFsm("^(a|b)$");
  96. APPROXIMATE_SCANNER(fsm, 1) {
  97. ACCEPTS("a");
  98. ACCEPTS("b");
  99. ACCEPTS("x");
  100. ACCEPTS("");
  101. ACCEPTS("ax");
  102. DENIES("abc");
  103. DENIES("xx");
  104. }
  105. fsm = BuildFsm("^(ab|cd)$");
  106. APPROXIMATE_SCANNER(fsm, 1) {
  107. ACCEPTS("ab");
  108. ACCEPTS("cd");
  109. ACCEPTS("ax");
  110. ACCEPTS("xd");
  111. ACCEPTS("abx");
  112. ACCEPTS("a");
  113. DENIES("abcd");
  114. DENIES("xx");
  115. DENIES("");
  116. }
  117. fsm = BuildFsm("^[a-c]{3}$");
  118. APPROXIMATE_SCANNER(fsm, 2) {
  119. ACCEPTS("abc");
  120. ACCEPTS("aaa");
  121. ACCEPTS("a");
  122. ACCEPTS("ax");
  123. ACCEPTS("abxcx");
  124. DENIES("x");
  125. DENIES("");
  126. DENIES("xaxx");
  127. }
  128. fsm = BuildFsm("^\\x{61}$");
  129. APPROXIMATE_SCANNER(fsm, 1) {
  130. ACCEPTS("a");
  131. ACCEPTS("x");
  132. ACCEPTS("");
  133. ACCEPTS("ax");
  134. DENIES("axx");
  135. DENIES("xx");
  136. }
  137. fsm = BuildFsm("^a.bc$");
  138. APPROXIMATE_SCANNER(fsm, 1) {
  139. ACCEPTS("axxbc");
  140. ACCEPTS("abc");
  141. ACCEPTS("xabc");
  142. ACCEPTS("xaxbc");
  143. DENIES("bc");
  144. DENIES("abcx");
  145. }
  146. }
  147. Y_UNIT_TEST(TestSurrounded) {
  148. auto fsm = BuildFsm("abc").Surround();
  149. APPROXIMATE_SCANNER(fsm, 1) {
  150. ACCEPTS("abc");
  151. ACCEPTS("xabcx");
  152. ACCEPTS("xabx");
  153. ACCEPTS("axc");
  154. ACCEPTS("bac");
  155. DENIES("a");
  156. DENIES("xaxxxx");
  157. }
  158. fsm = BuildFsm("^abc$").Surround();
  159. APPROXIMATE_SCANNER(fsm, 1) {
  160. ACCEPTS("abc");
  161. ACCEPTS("abcx");
  162. ACCEPTS("xabc");
  163. ACCEPTS("axc");
  164. ACCEPTS("bac");
  165. DENIES("xabx");
  166. DENIES("axx");
  167. }
  168. }
  169. Y_UNIT_TEST(GlueFsm) {
  170. auto fsm = BuildFsm("^a$") | BuildFsm("^b$");
  171. APPROXIMATE_SCANNER(fsm, 1) {
  172. ACCEPTS("");
  173. ACCEPTS("a");
  174. ACCEPTS("b");
  175. ACCEPTS("x");
  176. ACCEPTS("ab");
  177. DENIES("abb");
  178. }
  179. fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$");
  180. APPROXIMATE_SCANNER(fsm, 1) {
  181. ACCEPTS("a");
  182. ACCEPTS("b");
  183. ACCEPTS("cc");
  184. ACCEPTS("x");
  185. ACCEPTS("xa");
  186. ACCEPTS("c");
  187. ACCEPTS("xc");
  188. ACCEPTS("cxc");
  189. ACCEPTS("");
  190. }
  191. }
  192. enum MutateOperation {
  193. Begin,
  194. Substitute = Begin,
  195. Delete,
  196. Insert,
  197. End
  198. };
  199. ystring ChangeText(const ystring& text, int operation, int pos)
  200. {
  201. auto changedText = text;
  202. switch (operation) {
  203. case MutateOperation::Substitute:
  204. changedText[pos] = 'x';
  205. break;
  206. case MutateOperation::Delete:
  207. changedText.erase(pos, 1);
  208. break;
  209. case MutateOperation::Insert:
  210. changedText.insert(pos, 1, 'x');
  211. break;
  212. }
  213. return changedText;
  214. }
  215. Y_UNIT_TEST(StressTest) {
  216. ystring text;
  217. for (size_t letter = 0; letter < 10; ++letter) {
  218. text += ystring(3, letter + 'a');
  219. }
  220. const ystring regexp = "^" + text + "$";
  221. auto fsm = BuildFsm(regexp.data());
  222. APPROXIMATE_SCANNER(fsm, 1) {
  223. ACCEPTS(text);
  224. for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
  225. for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
  226. auto changedText = ChangeText(text, operation, pos);
  227. ACCEPTS(changedText);
  228. }
  229. }
  230. }
  231. APPROXIMATE_SCANNER(fsm, 0) {
  232. ACCEPTS(text);
  233. for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
  234. for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
  235. auto changedText = ChangeText(text, operation, pos);
  236. DENIES(changedText);
  237. }
  238. }
  239. }
  240. APPROXIMATE_SCANNER(fsm, 2) {
  241. ACCEPTS(text);
  242. for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
  243. size_t posRight = text.size() - posLeft - 1;
  244. for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
  245. for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
  246. auto changedText = ChangeText(text, operationRight, posRight);
  247. changedText = ChangeText(changedText, operationLeft, posLeft);
  248. ACCEPTS(changedText);
  249. }
  250. }
  251. }
  252. }
  253. APPROXIMATE_SCANNER(fsm, 1) {
  254. ACCEPTS(text);
  255. for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
  256. size_t posRight = text.size() - posLeft - 1;
  257. for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
  258. for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
  259. auto changedText = ChangeText(text, operationRight, posRight);
  260. changedText = ChangeText(changedText, operationLeft, posLeft);
  261. DENIES(changedText);
  262. }
  263. }
  264. }
  265. }
  266. }
  267. Y_UNIT_TEST(SwapLetters) {
  268. auto fsm = BuildFsm("^abc$");
  269. APPROXIMATE_SCANNER(fsm, 1) {
  270. ACCEPTS("bac");
  271. ACCEPTS("acb");
  272. DENIES("cba");
  273. DENIES("bax");
  274. }
  275. fsm = BuildFsm("^abcd$");
  276. APPROXIMATE_SCANNER(fsm, 2) {
  277. ACCEPTS("bacd");
  278. ACCEPTS("acbd");
  279. ACCEPTS("baxd");
  280. ACCEPTS("badc");
  281. ACCEPTS("bcad");
  282. ACCEPTS("bcda");
  283. DENIES("xcbx");
  284. DENIES("baxx");
  285. DENIES("ba");
  286. DENIES("cdab");
  287. }
  288. fsm = BuildFsm("^abc$");
  289. APPROXIMATE_SCANNER(fsm, 0) {
  290. ACCEPTS("abc");
  291. DENIES("bac");
  292. }
  293. fsm = BuildFsm("^[a-c][1-3]$");
  294. APPROXIMATE_SCANNER(fsm, 1) {
  295. ACCEPTS("a3");
  296. ACCEPTS("c");
  297. ACCEPTS("1");
  298. ACCEPTS("1a");
  299. ACCEPTS("3b");
  300. DENIES("4a");
  301. }
  302. fsm = BuildFsm("^.*abc$");
  303. APPROXIMATE_SCANNER(fsm, 1) {
  304. ACCEPTS("ab");
  305. ACCEPTS("xxxxbac");
  306. DENIES("xxxxa");
  307. DENIES("xxxxcb");
  308. }
  309. }
  310. Y_UNIT_TEST(SwapStressTest){
  311. ystring text;
  312. for (size_t letter = 0; letter < 30; ++letter) {
  313. text += ystring(1, (letter % 26) + 'a');
  314. }
  315. const ystring regexp = "^" + text + "$";
  316. auto fsm = BuildFsm(regexp.data());
  317. auto changedText = text;
  318. APPROXIMATE_SCANNER(fsm, 1) {
  319. ACCEPTS(text);
  320. for (size_t pos = 0; pos < text.size() - 1; ++pos) {
  321. changedText[pos] = text[pos + 1];
  322. changedText[pos + 1] = text[pos];
  323. ACCEPTS(changedText);
  324. changedText[pos] = text[pos];
  325. changedText[pos + 1] = text[pos + 1];
  326. }
  327. }
  328. APPROXIMATE_SCANNER(fsm, 0) {
  329. ACCEPTS(text);
  330. for (size_t pos = 0; pos < text.size() - 1; ++pos) {
  331. changedText[pos] = text[pos + 1];
  332. changedText[pos + 1] = text[pos];
  333. DENIES(changedText);
  334. changedText[pos] = text[pos];
  335. changedText[pos + 1] = text[pos + 1];
  336. }
  337. }
  338. }
  339. }