# signature_detection.rb
  # Heuristics for detecting e-mail signatures: guesses a sender's signature by
  # diffing consecutive messages and locates a known signature inside a message.
  module SignatureDetection
    # Matches HTML content types (case-insensitive, anywhere in the value).
    HTML_CONTENT_TYPE = %r{text/html}i.freeze

    # Minimum number of consecutive unchanged diff lines that qualifies as a
    # signature candidate. The original check (`tail > head + 4` on inclusive
    # indices) requires six lines, so six is kept here to preserve behavior.
    MIN_COMMON_LINES = 6

    # Maximum number of lines taken from a common block as the candidate.
    MAX_SIGNATURE_LINES = 10

    private_constant :HTML_CONTENT_TYPE, :MIN_COMMON_LINES, :MAX_SIGNATURE_LINES

    # Tries to detect the signature in a list of messages.
    #
    #   messages = [
    #     {
    #       content:      'some content',
    #       content_type: 'text/plain',
    #     },
    #   ]
    #   signature = SignatureDetection.find_signature(messages)
    #
    # Each pair of consecutive messages is diffed; the first sufficiently long
    # run of unchanged lines in a diff is a candidate, and the candidate seen
    # in the most diffs wins (the earliest one on a tie).
    #
    # @param messages [Array<Hash>] hashes with :content and :content_type;
    #   a missing :content_type is treated as non-HTML
    # @return [String, nil] the best candidate, or nil when no pair of
    #   messages shares a long enough common block
    def self.find_signature(messages)
      texts = messages.map { |message| plain_text(message[:content], message[:content_type]) }

      scores = Hash.new(0) # <candidate signature> => <number of diffs containing it>
      texts.each_cons(2) do |previous, current|
        candidate = common_block(Diffy::Diff.new(previous, current).to_s.split("\n"))
        scores[candidate] += 1 if candidate
      end

      scores.max_by { |_, score| score }&.first
    end

    # Searches for a signature inside a string (e.g. an article body).
    #
    #   signature_line = SignatureDetection.find_signature_line(signature, message, content_type)
    #
    # @param signature [String, nil] signature text to look for
    # @param string [String, nil] message body
    # @param content_type [String, nil] HTML bodies are converted to text first
    # @return [Integer, nil] number of lines up to and including the line on
    #   which the signature starts, plus one; nil when the signature is not
    #   found or when signature/string is missing or the signature is empty
    def self.find_signature_line(signature, string, content_type)
      # An empty signature would match at position 0 of any text, so it is
      # treated like a missing one.
      return if signature.nil? || signature.empty? || string.nil?

      text     = plain_text(string, content_type)
      position = text.index(signature)
      return if position.nil?

      # The slice includes the first character of the signature, so the line
      # on which the signature starts is part of the count.
      text[0..position].split("\n").length + 1
    end

    # Finds the signature line of an article using the signature stored in the
    # user's preferences.
    #
    #   signature_line = SignatureDetection.find_signature_line_by_article(user, article)
    #
    # @param user [#preferences] reads preferences[:signature_detection]
    # @param article [#body, #content_type]
    # @return [Integer, nil] see find_signature_line; nil when the user has no
    #   stored signature
    def self.find_signature_line_by_article(user, article)
      signature = user.preferences[:signature_detection]
      return unless signature

      find_signature_line(signature, article.body, article.content_type)
    end

    # Detects a signature from the first articles of the five most recent
    # tickets that the given user created as e-mail with sender 'Customer'.
    #
    #   signature = SignatureDetection.by_user_id(user_id)
    #
    # @param user_id [Integer]
    # @return [String, nil] see find_signature
    def self.by_user_id(user_id)
      type   = Ticket::Article::Type.lookup(name: 'email')
      sender = Ticket::Article::Sender.lookup(name: 'Customer')

      tickets = Ticket.where(
        created_by_id: user_id,
        create_article_type_id: type.id,
        create_article_sender_id: sender.id
      ).limit(5).order(id: :desc)

      messages = tickets.map do |ticket|
        article = ticket.articles.first
        next unless article

        { content: article.body, content_type: article.content_type }
      end

      # Tickets without any article produced nil entries above.
      find_signature(messages.compact)
    end

    # Rebuilds the stored signature for every active user.
    #
    #   SignatureDetection.rebuild_all_user
    #
    # @return [true]
    def self.rebuild_all_user
      User.select('id').where(active: true).order(id: :desc).each do |local_user|
        rebuild_user(local_user.id)
      end

      true
    end

    # Re-detects the signature of a single user and stores it in the user's
    # preferences when it differs from the stored one.
    #
    #   SignatureDetection.rebuild_user(user_id)
    #
    # @param user_id [Integer]
    # @return [true, nil] true after the updated user was saved (the result of
    #   the save itself is not checked); nil when no signature was detected or
    #   the stored signature is already up to date
    def self.rebuild_user(user_id)
      signature_detection = by_user_id(user_id)
      return unless signature_detection

      user = User.find(user_id)
      return if user.preferences[:signature_detection] == signature_detection

      user.preferences[:signature_detection] = signature_detection
      user.save

      true
    end

    # Recomputes the signature line for all e-mail articles and stores it in
    # each article's preferences.
    #
    #   SignatureDetection.rebuild_all_articles
    #
    # @return [true]
    def self.rebuild_all_articles
      article_type = Ticket::Article::Type.lookup(name: 'email')

      # NOTE(review): batch iteration may impose its own primary-key ordering;
      # confirm whether the descending order actually takes effect here.
      Ticket::Article.where(type_id: article_type.id)
                     .order(id: :desc)
                     .find_each(batch_size: 10) do |article|
        user = User.lookup(id: article.created_by_id)
        next if user.nil?

        signature_line = find_signature_line_by_article(user, article)
        next unless signature_line

        article.preferences[:signature_detection] = signature_line
        article.save if article.changed?
      end

      true
    end

    # Returns the content as text, converting HTML content via html2text.
    # A nil content becomes an empty string; a nil content type is non-HTML.
    def self.plain_text(content, content_type)
      text = content.to_s
      content_type.to_s.match?(HTML_CONTENT_TYPE) ? text.html2text(true) : text
    end

    # Returns the first run of at least MIN_COMMON_LINES unchanged lines of a
    # diff (marker column removed, capped at MAX_SIGNATURE_LINES lines), or nil.
    def self.common_block(diff_lines)
      # Every line that does not start with a space is a change (+, -, \).
      changed = diff_lines.each_index.reject { |index| diff_lines[index].start_with?(' ') }

      # Virtual changes before the first and after the last line close the
      # leading and trailing runs of unchanged lines.
      boundaries = [-1, *changed, diff_lines.length]

      first, last = boundaries.each_cons(2)
                              .map { |before, after| [before + 1, after - 1] }
                              .find { |from, to| to - from + 1 >= MIN_COMMON_LINES }
      return if first.nil?

      diff_lines[first..last]
        .first(MAX_SIGNATURE_LINES)
        .map { |line| line[1..-1] } # drop the leading diff marker column
        .join("\n")
    end

    private_class_method :plain_text, :common_block
  end