levenshtein_diff_ut.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. #include "levenshtein_diff.h"
  2. #include <library/cpp/testing/unittest/registar.h>
  3. #include <util/generic/string.h>
  4. namespace {
  5. float unaryZeroWeigher(const char&) {
  6. return 0.0f;
  7. }
  8. float unaryMaxWeigher(const char&) {
  9. return 1.0f;
  10. }
  11. float binaryZeroWeigher(const char&, const char&) {
  12. return 0.0f;
  13. }
  14. float binaryMaxWeigher(const char&, const char&) {
  15. return 1.0f;
  16. }
  17. }
  18. Y_UNIT_TEST_SUITE(Levenstein) {
  19. Y_UNIT_TEST(Distance) {
  20. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("knight"), TStringBuf("king")), 4);
  21. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
  22. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("spot trail"), TStringBuf("stop trial")), 4);
  23. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3);
  24. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2);
  25. }
  26. Y_UNIT_TEST(DamerauDistance) {
  27. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("knight"), TStringBuf("king")), 3);
  28. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
  29. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("spot trail"), TStringBuf("stop trial")), 3);
  30. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("status"), TStringBuf("tsatus")), 1);
  31. UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("hello"), TStringBuf("heh lol")), 3);
  32. }
  33. }
  34. Y_UNIT_TEST_SUITE(WeightedLevenstein) {
  35. Y_UNIT_TEST(EqualStrings) {
  36. NLevenshtein::TEditChain chain;
  37. float distance = 0.0f;
  38. NLevenshtein::GetEditChain(TString("12345"), TString("12345"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
  39. UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
  40. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  41. }
  42. Y_UNIT_TEST(EmptyStrings) {
  43. NLevenshtein::TEditChain chain;
  44. float distance = 0.0f;
  45. NLevenshtein::GetEditChain(TString(""), TString(""), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
  46. UNIT_ASSERT_VALUES_EQUAL(distance, 0.0f);
  47. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 0);
  48. }
  49. Y_UNIT_TEST(InsertsOnly) {
  50. auto unaryWeigher = [](const char&) {
  51. return 2.0f;
  52. };
  53. NLevenshtein::TEditChain chain;
  54. float distance = 0.0f;
  55. NLevenshtein::GetEditChain(TString(""), TString("12345"), chain, &distance, binaryZeroWeigher, unaryZeroWeigher, unaryWeigher);
  56. UNIT_ASSERT_VALUES_EQUAL(distance, 10.0f);
  57. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  58. }
  59. Y_UNIT_TEST(DeletionsOnly) {
  60. auto unaryWeigher = [](const char&) {
  61. return 3.0f;
  62. };
  63. NLevenshtein::TEditChain chain;
  64. float distance = 0.0f;
  65. NLevenshtein::GetEditChain(TString("54321"), TString(""), chain, &distance, binaryZeroWeigher, unaryWeigher, unaryZeroWeigher);
  66. UNIT_ASSERT_VALUES_EQUAL(distance, 15.0f);
  67. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  68. }
  69. Y_UNIT_TEST(SymmetryCheck) {
  70. const TString str1 = "123x5";
  71. const TString str2 = "x2345";
  72. const float trgDistance = 2.0f;
  73. const size_t trgChainLen = 5;
  74. NLevenshtein::TEditChain chainLeftRight;
  75. float distanceLeftRight = 0.0f;
  76. NLevenshtein::GetEditChain(str1, str2, chainLeftRight, &distanceLeftRight, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
  77. UNIT_ASSERT_VALUES_EQUAL(distanceLeftRight, trgDistance);
  78. UNIT_ASSERT_VALUES_EQUAL(chainLeftRight.size(), trgChainLen);
  79. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
  80. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  81. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[2]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  82. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
  83. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chainLeftRight[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  84. NLevenshtein::TEditChain chainRightLeft;
  85. float distanceRightLeft = 0.0f;
  86. NLevenshtein::GetEditChain(str2, str1, chainRightLeft, &distanceRightLeft, binaryMaxWeigher, unaryMaxWeigher, unaryMaxWeigher);
  87. UNIT_ASSERT_VALUES_EQUAL(distanceRightLeft, trgDistance);
  88. UNIT_ASSERT_VALUES_EQUAL(chainRightLeft.size(), trgChainLen);
  89. UNIT_ASSERT(chainRightLeft == chainLeftRight);
  90. }
  91. Y_UNIT_TEST(PreferReplacements) {
  92. auto binaryWeigher = [](const char&, const char&) {
  93. return 0.0625f;
  94. };
  95. NLevenshtein::TEditChain chain;
  96. float distance = 0.0f;
  97. NLevenshtein::GetEditChain(TString("54321"), TString("43210"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
  98. UNIT_ASSERT_VALUES_EQUAL(distance, 0.3125f);
  99. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  100. }
  101. Y_UNIT_TEST(PreferInsertDeletions) {
  102. auto unaryWeigher = [](const char&) {
  103. return 0.0625f;
  104. };
  105. NLevenshtein::TEditChain chain;
  106. float distance = 0.0f;
  107. NLevenshtein::GetEditChain(TString("54321"), TString("98765"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryWeigher);
  108. UNIT_ASSERT_VALUES_EQUAL(distance, 0.5f);
  109. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 9);
  110. }
  111. Y_UNIT_TEST(NoXDeletions) {
  112. auto unaryWeigher = [](const char& c) {
  113. return c == 'x' ? 100.0f : 1.0f;
  114. };
  115. NLevenshtein::TEditChain chain;
  116. float distance = 0.0f;
  117. NLevenshtein::GetEditChain(TString("543x1"), TString("5431"), chain, &distance, binaryMaxWeigher, unaryWeigher, unaryMaxWeigher);
  118. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  119. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
  120. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
  121. UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
  122. }
  123. Y_UNIT_TEST(NoXInsertions) {
  124. auto unaryWeigher = [](const char& c) {
  125. return c == 'x' ? 100.0f : 1.0f;
  126. };
  127. NLevenshtein::TEditChain chain;
  128. float distance = 0.0f;
  129. NLevenshtein::GetEditChain(TString("5431"), TString("543x1"), chain, &distance, binaryMaxWeigher, unaryMaxWeigher, unaryWeigher);
  130. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  131. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_REPLACE));
  132. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_INSERT));
  133. UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
  134. }
  135. Y_UNIT_TEST(NoReplacementsOfX) {
  136. auto binaryWeigher = [](const char& l, const char&) {
  137. return l == 'x' ? 100.0f : 1.0f;
  138. };
  139. NLevenshtein::TEditChain chain;
  140. float distance = 0.0f;
  141. NLevenshtein::GetEditChain(TString("5432x"), TString("5432y"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
  142. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
  143. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_DELETE));
  144. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_INSERT));
  145. UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
  146. }
  147. Y_UNIT_TEST(NoReplacementsForX) {
  148. auto binaryWeigher = [](const char&, const char& r) {
  149. return r == 'x' ? 100.0f : 1.0f;
  150. };
  151. NLevenshtein::TEditChain chain;
  152. float distance = 0.0f;
  153. NLevenshtein::GetEditChain(TString("y4321"), TString("x4321"), chain, &distance, binaryWeigher, unaryMaxWeigher, unaryMaxWeigher);
  154. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
  155. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_DELETE));
  156. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_INSERT));
  157. UNIT_ASSERT_VALUES_EQUAL(distance, 2.0f);
  158. }
  159. Y_UNIT_TEST(SimilarOperationPriorities) {
  160. auto replaceWeigher = [](const char&, const char&) {
  161. return 0.5f;
  162. };
  163. auto deleteWeigher = [](const char&) {
  164. return 0.2f;
  165. };
  166. auto insertWeigher = [](const char&) {
  167. return 0.9f;
  168. };
  169. NLevenshtein::TEditChain chain;
  170. float distance = 0.0f;
  171. NLevenshtein::GetEditChain(TString("y0"), TString("0x"), chain, &distance, replaceWeigher, deleteWeigher, insertWeigher);
  172. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 2);
  173. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_REPLACE));
  174. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_REPLACE));
  175. UNIT_ASSERT_VALUES_EQUAL(distance, 1.0f);
  176. }
  177. Y_UNIT_TEST(DamerauLevenshtein) {
  178. int distance;
  179. NLevenshtein::TEditChain chain;
  180. NLevenshtein::GetEditChainGeneric<TString, true>(TString("status"), TString("tsatus"), chain, &distance);
  181. UNIT_ASSERT_VALUES_EQUAL(distance, 1);
  182. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 5);
  183. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE));
  184. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  185. NLevenshtein::GetEditChainGeneric<TString, true>(TString("hello"), TString("heh lol"), chain, &distance);
  186. UNIT_ASSERT_VALUES_EQUAL(distance, 3);
  187. UNIT_ASSERT_VALUES_EQUAL(chain.size(), 6);
  188. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[0]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  189. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[1]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  190. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[2]), static_cast<int>(NLevenshtein::EMT_INSERT));
  191. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[3]), static_cast<int>(NLevenshtein::EMT_INSERT));
  192. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[4]), static_cast<int>(NLevenshtein::EMT_PRESERVE));
  193. UNIT_ASSERT_VALUES_EQUAL(static_cast<int>(chain[5]), static_cast<int>(NLevenshtein::EMT_TRANSPOSE));
  194. }
  195. }