damerau_levenshtein_distance.cc 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. // Copyright 2022 The Abseil Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // https://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "absl/strings/internal/damerau_levenshtein_distance.h"
  15. #include <algorithm>
  16. #include <array>
  17. #include <numeric>
  18. #include "absl/strings/string_view.h"
  19. namespace absl {
  20. ABSL_NAMESPACE_BEGIN
  21. namespace strings_internal {
  22. // Calculate DamerauLevenshtein (adjacent transpositions) distance
  23. // between two strings,
  24. // https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance. The
  25. // algorithm follows the condition that no substring is edited more than once.
  26. // While this can reduce is larger distance, it's a) a much simpler algorithm
  27. // and b) more realistic for the case that typographic mistakes should be
  28. // detected.
  29. // When the distance is larger than cutoff, or one of the strings has more
  30. // than MAX_SIZE=100 characters, the code returns min(MAX_SIZE, cutoff) + 1.
  31. uint8_t CappedDamerauLevenshteinDistance(absl::string_view s1,
  32. absl::string_view s2, uint8_t cutoff) {
  33. const uint8_t MAX_SIZE = 100;
  34. const uint8_t _cutoff = std::min(MAX_SIZE, cutoff);
  35. const uint8_t cutoff_plus_1 = static_cast<uint8_t>(_cutoff + 1);
  36. if (s1.size() > s2.size()) std::swap(s1, s2);
  37. if (s1.size() + _cutoff < s2.size() || s2.size() > MAX_SIZE)
  38. return cutoff_plus_1;
  39. if (s1.empty())
  40. return static_cast<uint8_t>(s2.size());
  41. // Lower diagonal bound: y = x - lower_diag
  42. const uint8_t lower_diag =
  43. _cutoff - static_cast<uint8_t>(s2.size() - s1.size());
  44. // Upper diagonal bound: y = x + upper_diag
  45. const uint8_t upper_diag = _cutoff;
  46. // d[i][j] is the number of edits required to convert s1[0, i] to s2[0, j]
  47. std::array<std::array<uint8_t, MAX_SIZE + 2>, MAX_SIZE + 2> d;
  48. std::iota(d[0].begin(), d[0].begin() + upper_diag + 1, 0);
  49. d[0][cutoff_plus_1] = cutoff_plus_1;
  50. for (size_t i = 1; i <= s1.size(); ++i) {
  51. // Deduce begin of relevant window.
  52. size_t j_begin = 1;
  53. if (i > lower_diag) {
  54. j_begin = i - lower_diag;
  55. d[i][j_begin - 1] = cutoff_plus_1;
  56. } else {
  57. d[i][0] = static_cast<uint8_t>(i);
  58. }
  59. // Deduce end of relevant window.
  60. size_t j_end = i + upper_diag;
  61. if (j_end > s2.size()) {
  62. j_end = s2.size();
  63. } else {
  64. d[i][j_end + 1] = cutoff_plus_1;
  65. }
  66. for (size_t j = j_begin; j <= j_end; ++j) {
  67. const uint8_t deletion_distance = d[i - 1][j] + 1;
  68. const uint8_t insertion_distance = d[i][j - 1] + 1;
  69. const uint8_t mismatched_tail_cost = s1[i - 1] == s2[j - 1] ? 0 : 1;
  70. const uint8_t mismatch_distance = d[i - 1][j - 1] + mismatched_tail_cost;
  71. uint8_t transposition_distance = _cutoff + 1;
  72. if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1])
  73. transposition_distance = d[i - 2][j - 2] + 1;
  74. d[i][j] = std::min({cutoff_plus_1, deletion_distance, insertion_distance,
  75. mismatch_distance, transposition_distance});
  76. }
  77. }
  78. return d[s1.size()][s2.size()];
  79. }
  80. } // namespace strings_internal
  81. ABSL_NAMESPACE_END
  82. } // namespace absl