123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379 |
- /*
- * approx_matching_ut.cpp --
- *
- * Copyright (c) 2019 YANDEX LLC, Karina Usmanova <usmanova.karin@yandex.ru>
- *
- * This file is part of Pire, the Perl Incompatible
- * Regular Expressions library.
- *
- * Pire is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * Pire is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser Public License for more details.
- * You should have received a copy of the GNU Lesser Public License
- * along with Pire. If not, see <http://www.gnu.org/licenses>.
- */
- #include <contrib/libs/pire/pire/pire.h>
- #include "common.h"
- Y_UNIT_TEST_SUITE(ApproxMatchingTest) {
- Pire::Fsm BuildFsm(const char *str)
- {
- Pire::Lexer lexer;
- TVector<wchar32> ucs4;
- lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4));
- lexer.Assign(ucs4.begin(), ucs4.end());
- return lexer.Parse();
- }
- Y_UNIT_TEST(Simple) {
- auto fsm = BuildFsm("^ab$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("ab");
- ACCEPTS("ax");
- ACCEPTS("xb");
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("xab");
- ACCEPTS("axb");
- ACCEPTS("abx");
- ACCEPTS("aab");
- DENIES("xy");
- DENIES("abcd");
- DENIES("xabx");
- DENIES("");
- }
- fsm = BuildFsm("^ab$");
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS("ab");
- ACCEPTS("xy");
- ACCEPTS("");
- ACCEPTS("axbx");
- DENIES("xxabx");
- DENIES("xbxxx");
- }
- }
- Y_UNIT_TEST(SpecialSymbols) {
- auto fsm = BuildFsm("^.*ab$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("ab");
- ACCEPTS("xxxxab");
- ACCEPTS("xxxxabab");
- DENIES("xxxx");
- DENIES("abxxxx");
- }
- fsm = BuildFsm("^[a-c]$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("c");
- ACCEPTS("/");
- ACCEPTS("");
- ACCEPTS("ax");
- DENIES("xx");
- DENIES("abc");
- }
- fsm = BuildFsm("^x{4}$");
- APPROXIMATE_SCANNER(fsm, 2) {
- DENIES ("x");
- ACCEPTS("xx");
- ACCEPTS("xxx");
- ACCEPTS("xxxx");
- ACCEPTS("xxxxx");
- ACCEPTS("xxxxxx");
- DENIES ("xxxxxxx");
- ACCEPTS("xxyy");
- ACCEPTS("xxyyx");
- ACCEPTS("xxxxyz");
- DENIES("xyyy");
- }
- fsm = BuildFsm("^(a|b)$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("x");
- ACCEPTS("");
- ACCEPTS("ax");
- DENIES("abc");
- DENIES("xx");
- }
- fsm = BuildFsm("^(ab|cd)$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("ab");
- ACCEPTS("cd");
- ACCEPTS("ax");
- ACCEPTS("xd");
- ACCEPTS("abx");
- ACCEPTS("a");
- DENIES("abcd");
- DENIES("xx");
- DENIES("");
- }
- fsm = BuildFsm("^[a-c]{3}$");
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS("abc");
- ACCEPTS("aaa");
- ACCEPTS("a");
- ACCEPTS("ax");
- ACCEPTS("abxcx");
- DENIES("x");
- DENIES("");
- DENIES("xaxx");
- }
- fsm = BuildFsm("^\\x{61}$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("x");
- ACCEPTS("");
- ACCEPTS("ax");
- DENIES("axx");
- DENIES("xx");
- }
- fsm = BuildFsm("^a.bc$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("axxbc");
- ACCEPTS("abc");
- ACCEPTS("xabc");
- ACCEPTS("xaxbc");
- DENIES("bc");
- DENIES("abcx");
- }
- }
- Y_UNIT_TEST(TestSurrounded) {
- auto fsm = BuildFsm("abc").Surround();
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("abc");
- ACCEPTS("xabcx");
- ACCEPTS("xabx");
- ACCEPTS("axc");
- ACCEPTS("bac");
- DENIES("a");
- DENIES("xaxxxx");
- }
- fsm = BuildFsm("^abc$").Surround();
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("abc");
- ACCEPTS("abcx");
- ACCEPTS("xabc");
- ACCEPTS("axc");
- ACCEPTS("bac");
- DENIES("xabx");
- DENIES("axx");
- }
- }
- Y_UNIT_TEST(GlueFsm) {
- auto fsm = BuildFsm("^a$") | BuildFsm("^b$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("");
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("x");
- ACCEPTS("ab");
- DENIES("abb");
- }
- fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a");
- ACCEPTS("b");
- ACCEPTS("cc");
- ACCEPTS("x");
- ACCEPTS("xa");
- ACCEPTS("c");
- ACCEPTS("xc");
- ACCEPTS("cxc");
- ACCEPTS("");
- }
- }
- enum MutateOperation {
- Begin,
- Substitute = Begin,
- Delete,
- Insert,
- End
- };
- ystring ChangeText(const ystring& text, int operation, int pos)
- {
- auto changedText = text;
- switch (operation) {
- case MutateOperation::Substitute:
- changedText[pos] = 'x';
- break;
- case MutateOperation::Delete:
- changedText.erase(pos, 1);
- break;
- case MutateOperation::Insert:
- changedText.insert(pos, 1, 'x');
- break;
- }
- return changedText;
- }
- Y_UNIT_TEST(StressTest) {
- ystring text;
- for (size_t letter = 0; letter < 10; ++letter) {
- text += ystring(3, letter + 'a');
- }
- const ystring regexp = "^" + text + "$";
- auto fsm = BuildFsm(regexp.data());
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS(text);
- for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
- for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
- auto changedText = ChangeText(text, operation, pos);
- ACCEPTS(changedText);
- }
- }
- }
- APPROXIMATE_SCANNER(fsm, 0) {
- ACCEPTS(text);
- for (size_t pos = 0; pos < regexp.size() - 2; ++pos) {
- for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) {
- auto changedText = ChangeText(text, operation, pos);
- DENIES(changedText);
- }
- }
- }
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS(text);
- for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
- size_t posRight = text.size() - posLeft - 1;
- for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
- for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
- auto changedText = ChangeText(text, operationRight, posRight);
- changedText = ChangeText(changedText, operationLeft, posLeft);
- ACCEPTS(changedText);
- }
- }
- }
- }
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS(text);
- for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight
- size_t posRight = text.size() - posLeft - 1;
- for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) {
- for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) {
- auto changedText = ChangeText(text, operationRight, posRight);
- changedText = ChangeText(changedText, operationLeft, posLeft);
- DENIES(changedText);
- }
- }
- }
- }
- }
- Y_UNIT_TEST(SwapLetters) {
- auto fsm = BuildFsm("^abc$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("bac");
- ACCEPTS("acb");
- DENIES("cba");
- DENIES("bax");
- }
- fsm = BuildFsm("^abcd$");
- APPROXIMATE_SCANNER(fsm, 2) {
- ACCEPTS("bacd");
- ACCEPTS("acbd");
- ACCEPTS("baxd");
- ACCEPTS("badc");
- ACCEPTS("bcad");
- ACCEPTS("bcda");
- DENIES("xcbx");
- DENIES("baxx");
- DENIES("ba");
- DENIES("cdab");
- }
- fsm = BuildFsm("^abc$");
- APPROXIMATE_SCANNER(fsm, 0) {
- ACCEPTS("abc");
- DENIES("bac");
- }
- fsm = BuildFsm("^[a-c][1-3]$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("a3");
- ACCEPTS("c");
- ACCEPTS("1");
- ACCEPTS("1a");
- ACCEPTS("3b");
- DENIES("4a");
- }
- fsm = BuildFsm("^.*abc$");
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS("ab");
- ACCEPTS("xxxxbac");
- DENIES("xxxxa");
- DENIES("xxxxcb");
- }
- }
- Y_UNIT_TEST(SwapStressTest){
- ystring text;
- for (size_t letter = 0; letter < 30; ++letter) {
- text += ystring(1, (letter % 26) + 'a');
- }
- const ystring regexp = "^" + text + "$";
- auto fsm = BuildFsm(regexp.data());
- auto changedText = text;
- APPROXIMATE_SCANNER(fsm, 1) {
- ACCEPTS(text);
- for (size_t pos = 0; pos < text.size() - 1; ++pos) {
- changedText[pos] = text[pos + 1];
- changedText[pos + 1] = text[pos];
- ACCEPTS(changedText);
- changedText[pos] = text[pos];
- changedText[pos + 1] = text[pos + 1];
- }
- }
- APPROXIMATE_SCANNER(fsm, 0) {
- ACCEPTS(text);
- for (size_t pos = 0; pos < text.size() - 1; ++pos) {
- changedText[pos] = text[pos + 1];
- changedText[pos + 1] = text[pos];
- DENIES(changedText);
- changedText[pos] = text[pos];
- changedText[pos + 1] = text[pos + 1];
- }
- }
- }
- }
|