Просмотр исходного кода

Refactoring: Clean up SignatureDetection class

Ryan Lue 6 лет назад
Родитель
Коммит
00934ef4f4
1 изменённый файл: 34 добавлено и 96 удалено
  1. 34 96
      lib/signature_detection.rb

+ 34 - 96
lib/signature_detection.rb

@@ -20,91 +20,34 @@ returns
 =end
 
   def self.find_signature(messages)
-
-    string_list = messages.map { |m| m[:content] }
-                          .map do |c|
-                            c.match?(%r{text/html}i) ? c.html2text(true) : c
-                          end
-
-    # hash with possible signature and count of matches in string list
-    possible_signatures = {}
-
-    # loop all strings in array
-    string_list.each_with_index do |_main_string, main_string_index|
-      break if main_string_index + 1 > string_list.length - 1
-
-      # loop all all strings in array except of the previous index
-      ( main_string_index + 1..string_list.length - 1 ).each do |second_string_index|
-
-        # get content of string 1
-        string1_content = string_list[main_string_index]
-
-        # get content of string 2
-        string2_content = string_list[second_string_index]
-
-        # diff strings
-        diff_result = Diffy::Diff.new(string1_content, string2_content)
-
-        # split diff result by new line
-        diff_result_array = diff_result.to_s.split("\n")
-
-        # define start index for blocks with no difference
-        match_block = nil
-
-        # loop of lines of the diff result
-        ( 0..diff_result_array.length - 1 ).each do |diff_string_index|
-
-          # if no block with difference is defined then we try to find a string block without a difference
-          if !match_block
-            match_block = diff_string_index
-          end
-
-          # get line of diff result with current loop inde
-          line = diff_result_array[diff_string_index]
-
-          # check if the line starts with
-          # + = new content incoming
-          # - = removed content
-          # \ = end of file
-          # or if the current line is the last line of the diff result
-          next if line !~ /^(\\|\+|\-)/i && diff_string_index != diff_result_array.length - 1
-
-          # if the count of the lines without any difference is higher than 4 lines
-          if diff_string_index - match_block > 4
-
-            # define the block size without any difference
-            # except "-" because in this case 1 line is removed to much
-            match_block_total = diff_string_index + (line.match?(/^(\\|\+)/i) ? -1 : 0)
-
-            # get string of possible signature, use only the first 10 lines
-            match_max_content = 0
-            match_content = ''
-            ( match_block..match_block_total ).each do |match_block_index|
-              break if match_max_content == 10
-
-              match_max_content += 1
-              match_content += "#{diff_result_array[match_block_index][1..-1]}\n"
-            end
-
-            # count the match of the signature in string list to rank
-            # the signature
-            possible_signatures[match_content] ||= 0
-            possible_signatures[match_content] += 1
-
-            break
-          end
-
-          match_block = nil
-        end
-      end
+    signature_candidates = Hash.new(0)  # <potential_signature>: <score>
+    messages             = messages.map { |m| m[:content_type].match?(%r{text/html}i) ? m[:content].html2text(true) : m[:content] }
+    message_pairs        = messages.each_cons(2).to_a
+    diffs                = message_pairs.map { |msg_pair| Diffy::Diff.new(*msg_pair).to_s }
+
+    # Find the first 5- to 10-line common substring in each diff
+    diffs.map { |d| d.split("\n") }.each do |diff_lines|
+      # Get line numbers in diff representing changes (those starting with +, -, \)
+      delta_indices = diff_lines.map.with_index { |l, i| l.start_with?(' ') ? nil : i }.compact
+
+      # Add boundaries at start and end
+      delta_indices.unshift(-1).push(diff_lines.length)
+
+      # Find first gap of 5+ lines between deltas (i.e., the common substring's location)
+      sig_range = delta_indices.each_cons(2)
+                               .map { |head, tail| [head + 1, tail - 1] }
+                               .find { |head, tail| tail > head + 4 }
+
+      # Take up to 10 lines from this "gap" (i.e., the common substring)
+      match_content = diff_lines[sig_range.first..sig_range.last]
+                        .map { |l| l.sub(/^./, '') }
+                        .first(10).join("\n")
+
+      # Add this substring to the signature_candidates hash and increment its match score
+      signature_candidates[match_content] += 1
     end
 
-    # loop all possible signature by rating and return highest rating
-    possible_signatures.sort { |a1, a2| a2[1].to_i <=> a1[1].to_i }.map do |content, _score|
-      return content.chomp
-    end
-
-    nil
+    signature_candidates.max_by { |_, score| score }&.first
   end
 
 =begin
@@ -124,18 +67,13 @@ returns
 =end
 
   def self.find_signature_line(signature, string, content_type)
-
-    if content_type.match?(%r{text/html}i)
-      string = string.html2text(true)
-    end
+    string = string.html2text(true) if content_type.match?(%r{text/html}i)
 
     # try to find the char position of the signature
     search_position = string.index(signature)
 
-    return if search_position.nil?
-
     # count new lines up to signature
-    string[0..search_position].split("\n").length + 1
+    string[0..search_position].split("\n").length + 1 if search_position.present?
   end
 
 =begin
@@ -256,11 +194,12 @@ returns
 =end
 
   def self.rebuild_all_articles
-
     article_type = Ticket::Article::Type.lookup(name: 'email')
-    Ticket::Article.select('id').where(type_id: article_type.id).order(id: :desc).each do |local_article|
-      article = Ticket::Article.find(local_article.id)
-      user = User.find(article.created_by_id)
+
+    Ticket::Article.where(type_id: article_type.id)
+                   .order(id: :desc)
+                   .find_each(batch_size: 10) do |article|
+      user = User.lookup(id: article.created_by_id)
       next if !user.preferences[:signature_detection]
 
       signature_line = find_signature_line(
@@ -269,10 +208,9 @@ returns
         article.content_type,
       )
       next if !signature_line
-      next if article.preferences[:signature_detection] == signature_line
 
       article.preferences[:signature_detection] = signature_line
-      article.save
+      article.save if article.changed?
     end
     true
   end