123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221 |
- #include "levenshtein_diff.h"
- #include <library/cpp/testing/unittest/registar.h>
- #include <util/generic/string.h>
- namespace {
- float unaryZeroWeigher(const char&) {
- return 0.0f;
- }
- float unaryMaxWeigher(const char&) {
- return 1.0f;
- }
- float binaryZeroWeigher(const char&, const char&) {
- return 0.0f;
- }
- float binaryMaxWeigher(const char&, const char&) {
- return 1.0f;
- }
- }
- Y_UNIT_TEST_SUITE(Levenstein) {
- Y_UNIT_TEST(Distance) {
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("knight"), TStringBuf("king")), 4);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("spot trail"), TStringBuf("stop trial")), 4);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2);
- }
- Y_UNIT_TEST(DamerauDistance) {
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("knight"), TStringBuf("king")), 3);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("spot trail"), TStringBuf("stop trial")), 3);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("status"), TStringBuf("tsatus")), 1);
- UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("hello"), TStringBuf("heh lol")), 3);
- }
- }
- Y_UNIT_TEST_SUITE(WeightedLevenstein) {
- Y_UNIT_TEST(EqualStrings) {
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- }
- Y_UNIT_TEST(EmptyStrings) {
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0);
- }
- Y_UNIT_TEST(InsertsOnly) {
- auto unaryWeigher = [](const char&) {
- return 2.0f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- }
- Y_UNIT_TEST(DeletionsOnly) {
- auto unaryWeigher = [](const char&) {
- return 3.0f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- }
- Y_UNIT_TEST(SymmetryCheck) {
- const TString str1 = "123x5";
- const TString str2 = "x2345";
- const float trgDistance = 2.0f;
- const size_t trgChainLen = 5;
- NLevenshtein::TEditChain chainLeftRight;
- float distanceLeftRight = 0.0f;
- NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance);
- UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- NLevenshtein::TEditChain chainRightLeft;
- float distanceRightLeft = 0.0f;
- NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance);
- UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen);
- UNIT_ASSERT(chainRightLeft == chainLeftRight);
- }
- Y_UNIT_TEST(PreferReplacements) {
- auto binaryWeigher = [](const char&, const char&) {
- return 0.0625f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- }
- Y_UNIT_TEST(PreferInsertDeletions) {
- auto unaryWeigher = [](const char&) {
- return 0.0625f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher);
- UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9);
- }
- Y_UNIT_TEST(NoXDeletions) {
- auto unaryWeigher = [](const char& c) {
- return c == 'x' ? 100.0f : 1.0f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
- UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
- }
- Y_UNIT_TEST(NoXInsertions) {
- auto unaryWeigher = [](const char& c) {
- return c == 'x' ? 100.0f : 1.0f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT));
- UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
- }
- Y_UNIT_TEST(NoReplacementsOfX) {
- auto binaryWeigher = [](const char& l, const char&) {
- return l == 'x' ? 100.0f : 1.0f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT));
- UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
- }
- Y_UNIT_TEST(NoReplacementsForX) {
- auto binaryWeigher = [](const char&, const char& r) {
- return r == 'x' ? 100.0f : 1.0f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT));
- UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
- }
- Y_UNIT_TEST(SimilarOperationPriorities) {
- auto replaceWeigher = [](const char&, const char&) {
- return 0.5f;
- };
- auto deleteWeigher = [](const char&) {
- return 0.2f;
- };
- auto insertWeigher = [](const char&) {
- return 0.9f;
- };
- NLevenshtein::TEditChain chain;
- float distance = 0.0f;
- NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE));
- UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f);
- }
- Y_UNIT_TEST(DamerauLevenshtein) {
- int distance;
- NLevenshtein::TEditChain chain;
- NLevenshtein::GetEditChainGeneric<TString, true>(TString("status"), TString("tsatus"), chain, &distance);
- UNIT_ASSERT_VALUES_EQUAL(distance, 1);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- NLevenshtein::GetEditChainGeneric<TString, true>(TString("hello"), TString("heh lol"), chain, &distance);
- UNIT_ASSERT_VALUES_EQUAL(distance, 3);
- UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[2]), static_cast<int>(NLevenshtein::EMT_INSERT));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_INSERT));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
- UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE));
- }
- }
|