Browse Source

Added proving backend to detect signatures by diff module 'diffy' to reduce redundancy in article views.

Rolf Schmidt 9 years ago
parent
commit
7a7d9471c9

+ 2 - 0
Gemfile

@@ -62,6 +62,8 @@ gem 'browser'
 gem 'eventmachine'
 gem 'em-websocket'
 
+gem 'diffy'
+
 # Gems used only for develop/test and not required
 # in production environments by default.
 group :development, :test do

+ 2 - 0
Gemfile.lock

@@ -76,6 +76,7 @@ GEM
     delayed_job_active_record (4.1.0)
       activerecord (>= 3.0, < 5)
       delayed_job (>= 3.0, < 5)
+    diffy (3.0.7)
     docile (1.1.5)
     eco (1.0.0)
       coffee-script
@@ -302,6 +303,7 @@ DEPENDENCIES
   coffee-script-source
   daemons
   delayed_job_active_record
+  diffy
   eco
   em-websocket
   eventmachine

+ 122 - 0
lib/signature_detection.rb

@@ -0,0 +1,122 @@
+module SignatureDetection
+
+=begin
+
+try to detect the signature in list of articles for example
+
+  signature = SignatureDetection.find_signature(string_list)
+
+returns
+
+  signature = '...signature possible match...'
+
+=end
+
+  def self.find_signature(string_list)
+
+    # maps each candidate signature text to how often it was found while diffing pairs of strings
+    possible_signatures = {}
+
+    # outer loop: index of the first string of each pair
+    # (earlier form, kept for reference: for main_string_index in 0 .. string_list.length - 1)
+    ( 0..string_list.length - 1 ).each {|main_string_index|
+      break if main_string_index + 1 > string_list.length - 1
+
+      # inner loop: every string after the current one, so each pair is diffed exactly once
+      ( main_string_index + 1..string_list.length - 1 ).each {|second_string_index|
+
+        # first string of the pair
+        string1_content = string_list[main_string_index]
+
+        # second string of the pair
+        string2_content = string_list[second_string_index]
+
+        # diff the two strings with Diffy
+        diff_result = Diffy::Diff.new(string1_content, string2_content)
+
+        # split the textual diff result into lines
+        diff_result_array = diff_result.to_s.split("\n")
+
+        # start index of the currently open run of diff lines (nil = no run open)
+        match_block = nil
+
+        # walk over the lines of the diff result by index
+        ( 0..diff_result_array.length - 1 ).each {|diff_string_index|
+
+          # open a new run at the current index if none is open yet
+          if !match_block
+            match_block = diff_string_index
+          end
+
+          # get the line of the diff result at the current loop index
+          line = diff_result_array[diff_string_index]
+
+          # keep scanning (next) unless the line starts with one of
+          # + = new content incoming
+          # - = removed content
+          # \ = end of file
+          # or the current line is the last line of the diff result; in those cases the run ends here
+          next if line !~ /^(\\|\+|\-)/i && diff_string_index != diff_result_array.length - 1
+
+          # only runs whose terminating index lies more than 5 lines after their start become candidates
+          if diff_string_index - match_block > 5
+
+            # last index of the block: the terminating line is excluded when it starts with "\" or "+",
+            # otherwise it is kept (original note: for "-" one line would be removed too much)
+            match_block_total = diff_string_index + (line =~ /^(\\|\+)/i ? -1 : 0)
+
+            # build the candidate text: every line of the block minus its first (diff marker) character, newline-terminated
+            match_content = ''
+            ( match_block..match_block_total ).each {|match_block_index|
+              match_content += "#{diff_result_array[match_block_index][1..-1]}\n"
+            }
+
+            # tally this candidate; the count is used below to rank
+            # the candidates against each other
+            possible_signatures[match_content] ||= 0
+            possible_signatures[match_content] += 1
+
+          end
+
+          match_block = nil
+        }
+      }
+    }
+
+    # sort candidates by descending count and return the first one with its trailing newline removed; nil if there are none
+    possible_signatures.sort { |a1, a2| a2[1].to_i <=> a1[1].to_i }.map do |content, _score|
+      return content.chomp
+    end
+
+    nil
+  end
+
+=begin
+
+this function will search for a signature string in a string (e.g. article) and return the line number of the signature start
+
+  signature_line = SignatureDetection.find_signature_line(signature, string)
+
+returns
+
+  signature_line = 123
+
+  or
+
+  signature_line = nil
+
+=end
+
+  def self.find_signature_line(signature, string)
+
+    # character offset of the first occurrence of the signature in the string (nil if it does not occur)
+    search_position = string.index(signature)
+
+    return if search_position.nil?
+
+    # lines in the text up to and including the signature's first character, plus one; NOTE(review): split("\n") drops trailing empty strings, so blank lines directly before that character are not counted - confirm this is intended
+    search_newlines  = string[0..search_position].split("\n").length + 1
+
+    search_newlines
+  end
+end

+ 2 - 0
test/fixtures/email_signature_detection/client_a_1.txt

@@ -1,5 +1,7 @@
 Hi,
 
+123
+
 uns liegt die fachliche Anforderung vor, dass eine Agent-AddNote-Benachrichtigung für die beiden o. g. TicketHistory-Typen versendet werden soll.
 
 Das Modul Custom/Kernel/System/Ticket/Article.pm sieht diese Benachrichtigungen nach meinem Verständnis bisher nicht vor. Dafür wäre doch eine Codeerweiterung erforderlich, oder?

+ 14 - 0
test/fixtures/email_signature_detection/client_a_2.txt

@@ -1,7 +1,21 @@
 Hi Martin,
 
+123
+
 ich benötige von Dir eine Aufwandschätzung für ein Upgrade von x.1 auf x.5 (wir hatten schon mal diesbezüglich informiert, jetzt wollen die Entscheider Zahlen sehen).
 
+asd
+fa
+sdf
+a
+sdf
+asd
+f
+as
+df
+asd
+f
+
 Vielen Dank!
 
 Mit freundlichen Grüßen

+ 56 - 45
test/unit/email_signatur_detection_test.rb

@@ -6,61 +6,72 @@ class EmailSignaturDetectionTest < ActiveSupport::TestCase
   test 'test case I - sender a' do
 
     # fixtures of sender a
-    fixture_files = [
-      'email_signature_detection/client_a_1.txt',
-      'email_signature_detection/client_a_2.txt',
-      'email_signature_detection/client_a_3.txt',
-    ]
-
-    # detect signature
-    match_structure = ''
-
-    # tests
-    # 'email_signature_detection/client_a_1.txt'
-    result_should = {
-      line: 9
+    fixture_files = {
+      'email_signature_detection/client_a_1.txt' => { line: 10 },
+      'email_signature_detection/client_a_2.txt' => { line: 20 },
+      'email_signature_detection/client_a_3.txt' => { line: 6 },
     }
 
-    # 'email_signature_detection/client_a_2.txt'
-    result_should = {
-      line: 7
-    }
+    fixture_files_string_list = []
 
-    # 'email_signature_detection/client_a_3.txt'
-    result_should = {
-      line: 7
-    }
-    assert(true)
+    fixture_files.keys.each do |filepath|
+
+      file_content = ''
+
+      file = File.new("#{Rails.root}/test/fixtures/#{filepath}", 'r')
+      while (line = file.gets)
+        file_content += line
+      end
+      file.close
+
+      fixture_files[filepath][:content] = file_content
+      fixture_files_string_list.push(file_content)
+    end
+
+    signature = SignatureDetection.find_signature(fixture_files_string_list)
+    expected_signature = "\nMit freundlichen Grüßen\n\nBob Smith\nBerechtigungen und dez. Department\n________________________________\n\nMusik AG\nBerechtigungen und dez. Department (ITPBM)\nKastanien 2\n12345 Hornhausen\nTel.: +49 911 6760\nFax: +49 911 85 6760\nMobil: +49 173 911\nE-Mail: Bob.Smith@music.com\nhttp://www.music.com\n\nMusik AG | Kastanien 2 | 12345 Hornhausen\nSitz der AG: Hornhausen, HRB xxxxx | USt.-ID: DE 111222333444\nVorstand: Marc Smith, Weber Huber\nAufsichtsrat: Max Mix (Vors.)"
+    assert_equal(expected_signature, signature)
+
+    fixture_files.keys.each do |filepath|
+      expected_signature_position = fixture_files[filepath][:line]
+
+      assert_equal(expected_signature_position, SignatureDetection.find_signature_line(signature, fixture_files[filepath][:content]))
+    end
   end
 
   test 'test case II - sender b' do
 
-    # fixtures of sender a
-    fixture_files = [
-      'email_signature_detection/client_b_1.txt',
-      'email_signature_detection/client_b_2.txt',
-      'email_signature_detection/client_b_3.txt',
-    ]
-
-    # detect signature
-    match_structure = ''
-
-    # tests
-    # 'email_signature_detection/client_b_1.txt'
-    result_should = {
-      line: 27
+    fixture_files = {
+      'email_signature_detection/client_b_1.txt' => { line: 26 },
+      'email_signature_detection/client_b_2.txt' => { line: 4 },
+      'email_signature_detection/client_b_3.txt' => { line: 6 },
     }
 
-    # 'email_signature_detection/client_b_2.txt'
-    result_should = {
-      line: 5
-    }
+    fixture_files_string_list = []
 
-    # 'email_signature_detection/client_b_3.txt'
-    result_should = {
-      line: 7
-    }
-    assert(true)
+    fixture_files.keys.each do |filepath|
+
+      file_content = ''
+
+      file = File.new("#{Rails.root}/test/fixtures/#{filepath}", 'r')
+      while (line = file.gets)
+        file_content += line
+      end
+      file.close
+
+      fixture_files[filepath][:content] = file_content
+      fixture_files_string_list.push(file_content)
+    end
+
+    signature = SignatureDetection.find_signature(fixture_files_string_list)
+    expected_signature = "\nFreundliche Grüße\n\nGünter Lässig\nLokale Daten\n\nMusic GmbH\nBaustraße 123, 12345 Max City\nTelefon 0123 5432114\nTelefax 0123 5432139\nE-Mail Günter.Lässig@example.com<mailto:Günter.Lässig@example.com>\n\nExample. Zusammen für eine bessere Welt.\n[cid:image001.png@01CE92A6.EC495B60]<http://www.example.com/>\n\n[cid:image002.png@01CE92A6.EC495B60]<http://www.facebook.com/example.com>\n\n[cid:image003.png@01CE92A6.EC495B60]<http://twitter.com/example>\n\n[cid:image004.png@01CE92A6.EC495B60]<https://www.xing.com/companies/example/neu-example>\n\n[cid:image005.jpg@01CE92A6.EC495B60]<http://www.youtube.com/example>\n\n[cid:image006.png@01CE92A6.EC495B60]<http://www.example.com/no_cache/privatkunden/aktuelles/news-presse/newsletter.html>\n\nSitz der Gesellschaft: Max City, Amtsgericht Max City HRB Nr. 1234\nGeschäftsführer: Bob Smith\nVorsitzender des Aufsichtsrats: Alex Marx"
+    assert_equal(expected_signature, signature)
+
+    fixture_files.keys.each do |filepath|
+      expected_signature_position = fixture_files[filepath][:line]
+
+      assert_equal(expected_signature_position, SignatureDetection.find_signature_line(signature, fixture_files[filepath][:content]))
+    end
   end
 
 end