/* * approx_matching_ut.cpp -- * * Copyright (c) 2019 YANDEX LLC, Karina Usmanova * * This file is part of Pire, the Perl Incompatible * Regular Expressions library. * * Pire is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Pire is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * You should have received a copy of the GNU Lesser Public License * along with Pire. If not, see . */ #include #include "common.h" Y_UNIT_TEST_SUITE(ApproxMatchingTest) { Pire::Fsm BuildFsm(const char *str) { Pire::Lexer lexer; TVector ucs4; lexer.Encoding().FromLocal(str, str + strlen(str), std::back_inserter(ucs4)); lexer.Assign(ucs4.begin(), ucs4.end()); return lexer.Parse(); } Y_UNIT_TEST(Simple) { auto fsm = BuildFsm("^ab$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("ab"); ACCEPTS("ax"); ACCEPTS("xb"); ACCEPTS("a"); ACCEPTS("b"); ACCEPTS("xab"); ACCEPTS("axb"); ACCEPTS("abx"); ACCEPTS("aab"); DENIES("xy"); DENIES("abcd"); DENIES("xabx"); DENIES(""); } fsm = BuildFsm("^ab$"); APPROXIMATE_SCANNER(fsm, 2) { ACCEPTS("ab"); ACCEPTS("xy"); ACCEPTS(""); ACCEPTS("axbx"); DENIES("xxabx"); DENIES("xbxxx"); } } Y_UNIT_TEST(SpecialSymbols) { auto fsm = BuildFsm("^.*ab$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("a"); ACCEPTS("b"); ACCEPTS("ab"); ACCEPTS("xxxxab"); ACCEPTS("xxxxabab"); DENIES("xxxx"); DENIES("abxxxx"); } fsm = BuildFsm("^[a-c]$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("a"); ACCEPTS("b"); ACCEPTS("c"); ACCEPTS("/"); ACCEPTS(""); ACCEPTS("ax"); DENIES("xx"); DENIES("abc"); } fsm = BuildFsm("^x{4}$"); APPROXIMATE_SCANNER(fsm, 2) { DENIES ("x"); ACCEPTS("xx"); ACCEPTS("xxx"); ACCEPTS("xxxx"); ACCEPTS("xxxxx"); ACCEPTS("xxxxxx"); DENIES ("xxxxxxx"); ACCEPTS("xxyy"); ACCEPTS("xxyyx"); ACCEPTS("xxxxyz"); DENIES("xyyy"); } fsm = BuildFsm("^(a|b)$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("a"); ACCEPTS("b"); ACCEPTS("x"); ACCEPTS(""); ACCEPTS("ax"); DENIES("abc"); DENIES("xx"); } fsm = BuildFsm("^(ab|cd)$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("ab"); ACCEPTS("cd"); ACCEPTS("ax"); ACCEPTS("xd"); ACCEPTS("abx"); ACCEPTS("a"); DENIES("abcd"); DENIES("xx"); DENIES(""); } fsm = BuildFsm("^[a-c]{3}$"); APPROXIMATE_SCANNER(fsm, 2) { ACCEPTS("abc"); ACCEPTS("aaa"); ACCEPTS("a"); ACCEPTS("ax"); ACCEPTS("abxcx"); DENIES("x"); DENIES(""); DENIES("xaxx"); } fsm = BuildFsm("^\\x{61}$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("a"); ACCEPTS("x"); ACCEPTS(""); ACCEPTS("ax"); DENIES("axx"); DENIES("xx"); } fsm = BuildFsm("^a.bc$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("axxbc"); ACCEPTS("abc"); ACCEPTS("xabc"); ACCEPTS("xaxbc"); DENIES("bc"); DENIES("abcx"); } } Y_UNIT_TEST(TestSurrounded) { auto fsm = BuildFsm("abc").Surround(); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("abc"); ACCEPTS("xabcx"); ACCEPTS("xabx"); ACCEPTS("axc"); ACCEPTS("bac"); DENIES("a"); DENIES("xaxxxx"); } fsm = BuildFsm("^abc$").Surround(); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("abc"); ACCEPTS("abcx"); ACCEPTS("xabc"); ACCEPTS("axc"); ACCEPTS("bac"); DENIES("xabx"); DENIES("axx"); } } Y_UNIT_TEST(GlueFsm) { auto fsm = BuildFsm("^a$") | BuildFsm("^b$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS(""); ACCEPTS("a"); ACCEPTS("b"); ACCEPTS("x"); ACCEPTS("ab"); DENIES("abb"); } fsm = BuildFsm("^[a-b]$") | BuildFsm("^c{2}$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("a"); ACCEPTS("b"); ACCEPTS("cc"); ACCEPTS("x"); ACCEPTS("xa"); ACCEPTS("c"); ACCEPTS("xc"); ACCEPTS("cxc"); ACCEPTS(""); } } enum MutateOperation { Begin, Substitute = Begin, Delete, Insert, End }; ystring ChangeText(const ystring& text, int operation, int pos) { auto changedText = text; switch (operation) { case MutateOperation::Substitute: changedText[pos] = 'x'; break; case MutateOperation::Delete: changedText.erase(pos, 1); break; case MutateOperation::Insert: changedText.insert(pos, 1, 'x'); break; } return changedText; } Y_UNIT_TEST(StressTest) { ystring text; for (size_t letter = 0; letter < 10; ++letter) { text += ystring(3, letter + 'a'); } const ystring regexp = "^" + text + "$"; auto fsm = BuildFsm(regexp.data()); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS(text); for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { auto changedText = ChangeText(text, operation, pos); ACCEPTS(changedText); } } } APPROXIMATE_SCANNER(fsm, 0) { ACCEPTS(text); for (size_t pos = 0; pos < regexp.size() - 2; ++pos) { for (int operation = MutateOperation::Begin; operation < MutateOperation::End; ++operation) { auto changedText = ChangeText(text, operation, pos); DENIES(changedText); } } } APPROXIMATE_SCANNER(fsm, 2) { ACCEPTS(text); for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight size_t posRight = text.size() - posLeft - 1; for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { auto changedText = ChangeText(text, operationRight, posRight); changedText = ChangeText(changedText, operationLeft, posLeft); ACCEPTS(changedText); } } } } APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS(text); for (size_t posLeft = 0; posLeft < text.size() / 2 - 1; ++posLeft) { // Subtract 1 to avoid interaction of operationLeft and operationRight size_t posRight = text.size() - posLeft - 1; for (int operationLeft = MutateOperation::Begin; operationLeft < MutateOperation::End; ++operationLeft) { for (int operationRight = MutateOperation::Begin; operationRight < MutateOperation::End; ++operationRight) { auto changedText = ChangeText(text, operationRight, posRight); changedText = ChangeText(changedText, operationLeft, posLeft); DENIES(changedText); } } } } } Y_UNIT_TEST(SwapLetters) { auto fsm = BuildFsm("^abc$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("bac"); ACCEPTS("acb"); DENIES("cba"); DENIES("bax"); } fsm = BuildFsm("^abcd$"); APPROXIMATE_SCANNER(fsm, 2) { ACCEPTS("bacd"); ACCEPTS("acbd"); ACCEPTS("baxd"); ACCEPTS("badc"); ACCEPTS("bcad"); ACCEPTS("bcda"); DENIES("xcbx"); DENIES("baxx"); DENIES("ba"); DENIES("cdab"); } fsm = BuildFsm("^abc$"); APPROXIMATE_SCANNER(fsm, 0) { ACCEPTS("abc"); DENIES("bac"); } fsm = BuildFsm("^[a-c][1-3]$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("a3"); ACCEPTS("c"); ACCEPTS("1"); ACCEPTS("1a"); ACCEPTS("3b"); DENIES("4a"); } fsm = BuildFsm("^.*abc$"); APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS("ab"); ACCEPTS("xxxxbac"); DENIES("xxxxa"); DENIES("xxxxcb"); } } Y_UNIT_TEST(SwapStressTest){ ystring text; for (size_t letter = 0; letter < 30; ++letter) { text += ystring(1, (letter % 26) + 'a'); } const ystring regexp = "^" + text + "$"; auto fsm = BuildFsm(regexp.data()); auto changedText = text; APPROXIMATE_SCANNER(fsm, 1) { ACCEPTS(text); for (size_t pos = 0; pos < text.size() - 1; ++pos) { changedText[pos] = text[pos + 1]; changedText[pos + 1] = text[pos]; ACCEPTS(changedText); changedText[pos] = text[pos]; changedText[pos + 1] = text[pos + 1]; } } APPROXIMATE_SCANNER(fsm, 0) { ACCEPTS(text); for (size_t pos = 0; pos < text.size() - 1; ++pos) { changedText[pos] = text[pos + 1]; changedText[pos + 1] = text[pos]; DENIES(changedText); changedText[pos] = text[pos]; changedText[pos + 1] = text[pos + 1]; } } } }