Browse Source

Replace the functions for the `Levenshtein` and the `Damerau–Levenshtein` distances with the memory effective version.

YQL UDFs `String::LevensteinDistance` and `Unicode::LevensteinDistance` use `NLevenshtein::Distance` inside.
The previous version of `NLevenshtein::Distance` uses O(nm) space instead of O(min(n, m)).
akinshchikov 1 year ago
parent
commit
72c25e282d

+ 36 - 8
library/cpp/string_utils/levenshtein_diff/levenshtein_diff.h

@@ -95,7 +95,7 @@ namespace NLevenshtein {
             ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
         }
         const TWeightType maxWeight = std::numeric_limits<TWeightType>::max();
-        // Here goes basic Damerau-Levestein's algorithm
+        // Here goes basic Damerau-Levenshtein's algorithm
         for (int i = 1; i <= l1; i++) {
             for (int j = 1; j <= l2; j++) {
                 if (str1[i - 1] == str2[j - 1]) {
@@ -195,14 +195,42 @@ namespace NLevenshtein {
 
     template <class TStringType, bool Damerau = false>
     size_t DistanceImpl(const TStringType& str1, const TStringType& str2) {
-        TEditChain editChain;
-        GetEditChainGeneric<TStringType, Damerau>(str1, str2, editChain);
-        size_t result = 0;
-        for (auto edit : editChain) {
-            if (IsImportantEditMove(edit))
-                result++;
+        if (str1.size() > str2.size()) {
+            return DistanceImpl<TStringType, Damerau>(str2, str1);
+        }
+
+        size_t size1 = str1.size();
+        size_t size2 = str2.size();
+
+        TVector<size_t> currentRow(size1 + 1);
+        TVector<size_t> previousRow(size1 + 1);
+        TVector<size_t> transpositionRow(size1 + 1);
+
+        for (size_t i = 0; i <= size1; ++i) {
+            previousRow[i] = i;
+        }
+
+        for (size_t i = 1; i <= size2; ++i) {
+            currentRow[0] = i;
+
+            for (size_t j = 1; j <= size1; ++j) {
+                size_t cost = str1[j - 1] == str2[i - 1] ? 0 : 1;
+
+                currentRow[j] = std::min(previousRow[j - 1] + cost, std::min(currentRow[j - 1], previousRow[j]) + 1);
+
+                if (Damerau && i > 1 && j > 1 && str1[j - 2] == str2[i - 1] && str1[j - 1] == str2[i - 2]) {
+                    currentRow[j] = std::min(currentRow[j], transpositionRow[j - 2] + cost);
+                }
+            }
+
+            if (Damerau) {
+                std::swap(transpositionRow, previousRow);
+            }
+
+            std::swap(previousRow, currentRow);
         }
-        return result;
+
+        return previousRow[size1];
     }
 
     template <class TStringType>

+ 11 - 0
library/cpp/string_utils/levenshtein_diff/levenshtein_diff_ut.cpp

@@ -26,9 +26,20 @@ namespace {
 
 Y_UNIT_TEST_SUITE(Levenstein) {
     Y_UNIT_TEST(Distance) {
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("knight"), TStringBuf("king")), 4);
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("spot trail"), TStringBuf("stop trial")), 4);
         UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("hello"), TStringBuf("hulloah")), 3);
         UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::Distance(TStringBuf("yeoman"), TStringBuf("yo man")), 2);
     }
+
+    Y_UNIT_TEST(DamerauDistance) {
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("knight"), TStringBuf("king")), 3);
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("life flies"), TStringBuf("fly lives")), 6);
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("spot trail"), TStringBuf("stop trial")), 3);
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("status"), TStringBuf("tsatus")), 1);
+        UNIT_ASSERT_VALUES_EQUAL(NLevenshtein::DamerauDistance(TStringBuf("hello"), TStringBuf("heh lol")), 3);
+    }
 }
 
 Y_UNIT_TEST_SUITE(WeightedLevenstein) {