123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- #pragma once
- #include <util/draft/matrix.h>
- #include <util/generic/algorithm.h>
- #include <util/generic/vector.h>
- #include <util/system/yassert.h>
- #include <type_traits>
- #include <utility>
- namespace NLevenshtein {
- enum EEditMoveType {
- EMT_SPECIAL,
- EMT_PRESERVE,
- EMT_REPLACE,
- EMT_DELETE,
- EMT_INSERT
- };
- inline bool IsImportantEditMove(EEditMoveType p) {
- return (p != EMT_SPECIAL && p != EMT_PRESERVE);
- }
- inline void MakeMove(EEditMoveType t, int& p1, int& p2) {
- switch (t) {
- case EMT_PRESERVE:
- case EMT_REPLACE:
- p1++;
- p2++;
- break;
- case EMT_DELETE:
- p1++;
- break;
- case EMT_INSERT:
- p2++;
- break;
- default:
- break;
- }
- }
- using TEditChain = TVector<EEditMoveType>;
- template <typename TArgType>
- struct TWeightOneUnaryGetter {
- int operator()(const TArgType&) const {
- return 1;
- }
- };
- template <typename TArgType>
- struct TWeightOneBinaryGetter {
- int operator()(const TArgType&, const TArgType&) const {
- return 1;
- }
- };
- template <typename TStringType>
- using TCharType = typename std::decay_t<decltype(std::add_const_t<TStringType>()[0])>;
- /// Finds sequence of "edit moves" for two strings
- template <class TStringType, class TWeightType = int,
- class TReplaceWeigher = TWeightOneBinaryGetter<TCharType<TStringType>>,
- class TDeleteWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>,
- class TInsertWeigher = TWeightOneUnaryGetter<TCharType<TStringType>>
- >
- void GetEditChain(const TStringType& str1, const TStringType& str2, TEditChain& res, TWeightType* weight = nullptr,
- const TReplaceWeigher& replaceWeigher = TReplaceWeigher(),
- const TDeleteWeigher& deleteWeigher = TDeleteWeigher(),
- const TInsertWeigher& insertWeigher = TInsertWeigher())
- {
- int l1 = (int)str1.size();
- int l2 = (int)str2.size();
- TMatrix<std::pair<TWeightType, EEditMoveType>> ma(l1 + 1, l2 + 1); /// ma[i][j].first = diff(str1[0..i-1], str2[0..j-1])
- ma[0][0] = std::make_pair(0, EMT_SPECIAL); // starting point
- for (int i = 1; i <= l1; i++) {
- ma[i][0] = std::make_pair(ma[i - 1][0].first + deleteWeigher(str1[i - 1]), EMT_DELETE);
- }
- for (int i = 1; i <= l2; i++) {
- ma[0][i] = std::make_pair(ma[0][i - 1].first + insertWeigher(str2[i - 1]), EMT_INSERT);
- }
- // Here goes basic Levestein's algorithm
- for (int i = 1; i <= l1; i++) {
- for (int j = 1; j <= l2; j++) {
- if (str1[i - 1] == str2[j - 1]) {
- ma[i][j] = std::make_pair(ma[i - 1][j - 1].first, EMT_PRESERVE);
- } else {
- const TWeightType replaceWeight = replaceWeigher(str1[i - 1], str2[j - 1]);
- Y_ASSERT(replaceWeight >= 0);
- ma[i][j] = std::make_pair(ma[i - 1][j - 1].first + replaceWeight, EMT_REPLACE);
- }
- if (ma[i][j].first > ma[i - 1][j].first) {
- const TWeightType deleteWeight = deleteWeigher(str1[i - 1]);
- Y_ASSERT(deleteWeight >= 0);
- const TWeightType deletePathWeight = ma[i - 1][j].first + deleteWeight;
- if (deletePathWeight <= ma[i][j].first) {
- ma[i][j] = std::make_pair(deletePathWeight, EMT_DELETE);
- }
- }
- if (ma[i][j].first > ma[i][j - 1].first) {
- const TWeightType insertWeight = insertWeigher(str2[j - 1]);
- Y_ASSERT(insertWeight >= 0);
- const TWeightType insertPathWeight = ma[i][j - 1].first + insertWeight;
- if (insertPathWeight <= ma[i][j].first) {
- ma[i][j] = std::make_pair(insertPathWeight, EMT_INSERT);
- }
- }
- }
- }
- // Tracing the path from final point
- res.clear();
- res.reserve(Max<size_t>(l1, l2));
- for (int i = l1, j = l2; ma[i][j].second != EMT_SPECIAL;) {
- res.push_back(ma[i][j].second);
- switch (ma[i][j].second) {
- case EMT_PRESERVE:
- case EMT_REPLACE:
- --i;
- --j;
- break;
- case EMT_DELETE:
- --i;
- break;
- case EMT_INSERT:
- --j;
- break;
- default:
- // TODO: throw exception
- break;
- }
- }
- std::reverse(res.begin(), res.end());
- if (weight != nullptr) {
- *weight = ma[l1][l2].first;
- }
- }
- template <class TStringType>
- size_t Distance(const TStringType& str1, const TStringType& str2) {
- TEditChain editChain;
- GetEditChain(str1, str2, editChain);
- size_t result = 0;
- for (auto edit : editChain) {
- if (IsImportantEditMove(edit))
- result++;
- }
- return result;
- }
- /// Calculates substrings to be replaced for str1->str2 transformation
- struct TReplacement {
- int CorrectOffset, CorrectLength, MisspelledOffset, MisspelledLength;
- TReplacement()
- : CorrectOffset(0)
- , CorrectLength(0)
- , MisspelledOffset(0)
- , MisspelledLength(0)
- {
- }
- TReplacement(int correctOffset, int correctLength, int misspelledOffset, int misspelledLength)
- : CorrectOffset(correctOffset)
- , CorrectLength(correctLength)
- , MisspelledOffset(misspelledOffset)
- , MisspelledLength(misspelledLength)
- {
- }
- };
- template <class TStringType>
- void GetStringReplacements(const TStringType& str1, const TStringType& str2, TVector<TReplacement>& res) {
- TEditChain editChain;
- GetEditChain(str1, str2, editChain);
- editChain.push_back(EMT_SPECIAL);
- int c1 = 0, c2 = 0;
- res.clear();
- for (TEditChain::const_iterator it = editChain.begin(); it != editChain.end(); it++) {
- if (IsImportantEditMove(*it)) {
- int sc1 = c1, sc2 = c2;
- do {
- MakeMove(*it, c1, c2);
- ++it;
- } while (IsImportantEditMove(*it));
- res.push_back(TReplacement(sc1, c1 - sc1, sc2, c2 - sc2));
- }
- MakeMove(*it, c1, c2);
- }
- }
- }
|