Просмотр исходного кода

Improved html2html_strict (allow hyperlinks, but sanitize them first).

Martin Edenhofer 8 лет назад
Родитель
Commit
249fba71c0

+ 48 - 15
lib/core_ext/string.rb

@@ -95,41 +95,55 @@ class String
     link_list = ''
     counter   = 0
     if !string_only
-      string.gsub!(/<a\s.*?href=("|')(.+?)("|').*?>/ix) {
+      string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) {
         link = $2
         counter = counter + 1
         link_list += "[#{counter}] #{link}\n"
         "[#{counter}] "
       }
     else
-      string.gsub!(%r{<a\s+href=("|')(.+?)("|')(\s*|\s+[^>]*)>(.+?)<\s*/a\s*>}mxi) {|_placeholder|
-        link = $2
-        if !link.empty?
+      string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) {|_placeholder|
+        link = $3
+        text = $6
+        text.gsub!(/\<.+?\>/, '')
+
+        link_compare = link.dup
+        if !link_compare.empty?
           link.strip!
+          link_compare.strip!
+          link_compare.downcase!
+          link_compare.sub!(%r{/$}, '')
         end
-        text = $5
-        if !text.empty?
+        text_compare = text.dup
+        if !text_compare.empty?
           text.strip!
+          text_compare.strip!
+          text_compare.downcase!
+          text_compare.sub!(%r{/$}, '')
         end
-        placeholder = if !link.empty? && text.empty?
+        placeholder = if !link_compare.empty? && text_compare.empty?
                         link
-                      elsif link.empty? && !text.empty?
+                      elsif link_compare.empty? && !text_compare.empty?
                         text
-                      elsif !link.empty? && !text.empty? && (link.downcase == text.downcase || link.downcase == "mailto:#{text}".downcase || link.downcase == "http://#{text}".downcase)
+                      elsif link_compare && link_compare =~ /^mailto/i
                         text
+                      elsif !link_compare.empty? && !text_compare.empty? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase)
+                        "######LINKEXT:#{link}/TEXT:#{text}######"
+                      elsif text !~ /^http/
+                        "#{text} (######LINKRAW:#{link}######)"
                       else
-                        "#{text} (#{link})"
+                        "#{link} (######LINKRAW:#{text}######)"
                       end
       }
     end
 
     # remove style tags with content
-    string.gsub!(%r{<style(|\s.+?)>(.+?)</style>}im, '')
+    string.gsub!(%r{<style(|[[:space:]].+?)>(.+?)</style>}im, '')
 
     # remove empty lines
     string.gsub!(/^[[:space:]]*/m, '')
     if strict
-      string.gsub!(%r{< \s* (/*) \s* (b|i|ul|ol|li|u|h1|h2|h3|hr) (\s*|\s+[^>]*) >}mxi, '######\1\2######')
+      string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######')
     end
 
     # pre/code handling 1/2
@@ -164,10 +178,10 @@ class String
     string.gsub!(%r{</h\d>}i, "\n")
 
     # add new lines
-    string.gsub!(%r{</div><div(|\s.+?)>}im, "\n")
-    string.gsub!(%r{</p><p(|\s.+?)>}im, "\n")
+    string.gsub!(%r{</div><div(|[[:space:]].+?)>}im, "\n")
+    string.gsub!(%r{</p><p(|[[:space:]].+?)>}im, "\n")
     string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
-    string.gsub!(%r{</(p|br|div)(|\s.+?)>}i, "\n")
+    string.gsub!(%r{</(p|br|div)(|[[:space:]].+?)>}i, "\n")
     string.gsub!(%r{</td>}i, ' ')
 
     # strip all other tags
@@ -176,6 +190,23 @@ class String
     # replace multiple spaces with one
     string.gsub!(/  /, ' ')
 
+    # add hyperlinks
+    if strict
+      string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) {|_placeholder|
+        pre = $1
+        content = $2
+        post = $5
+        if content =~ /^www/i
+          content = "http://#{content}"
+        end
+        placeholder = if content =~ /^(http|https|ftp|tel)/i
+                        "#{pre}######LINKRAW:#{content}#######{post}"
+                      else
+                        "#{pre}#{content}#{post}"
+                      end
+      }
+    end
+
     # try HTMLEntities; if it fails on invalid characters, fall back to the manual way
     begin
       coder = HTMLEntities.new
@@ -259,6 +290,8 @@ class String
   def html2html_strict
     string = html2text(true, true)
     string = string.text2html
+    string.gsub!(%r{######LINKEXT:(.+?)/TEXT:(.+?)######}, '<a href="\1" target="_blank">\2</a>')
+    string.gsub!(/######LINKRAW:(.+?)######/, '<a href="\1" target="_blank">\1</a>')
     string.gsub!(/######(.+?)######/, '<\1>')
     string.chomp
   end

+ 77 - 3
test/unit/aaa_string_test.rb

@@ -450,8 +450,81 @@ Men-----------------------'
     result = '<h3>test</h3>'
     assert_equal(result, html.html2html_strict)
 
-    html   = "<b\n>test</b>"
-    result = '<b>test</b>'
+    html   = '<a href="http://example.com">http://example.com</a>'
+    result = '<a href="http://example.com" target="_blank">http://example.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<A href="http://example.com?a=1;">http://example.com?a=1;</A>'
+    result = '<a href="http://example.com?a=1;" target="_blank">http://example.com?a=1;</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<a href="http://web.de">web.de</a>'
+    result = '<a href="http://web.de" target="_blank">web.de</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<a id="123" href="http://web.de">web.de</a>'
+    result = '<a href="http://web.de" target="_blank">web.de</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<br>https://www.facebook.com/test<br>'
+    result = '<a href="https://www.facebook.com/test" target="_blank">https://www.facebook.com/test</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'some text http://example.com some other text'
+    result = 'some text <a href="http://example.com" target="_blank">http://example.com</a> some other text'
+    assert_equal(result, html.html2html_strict)
+
+    html   = 'some text www.example.com some other text'
+    result = 'some text <a href="http://www.example.com" target="_blank">http://www.example.com</a> some other text'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<a href="http://example.com">http://what-different.example.com</a>'
+    result = 'http://example.com (<a href="http://what-different.example.com" target="_blank">http://what-different.example.com</a>)'
+    result = 'http://example.com (<a href="http://what-different.example.com" target="_blank">http://what-different.example.com</a>)'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<a href="http://example.com">http://EXAMPLE.com</a>'
+    result = '<a href="http://example.com" target="_blank">http://EXAMPLE.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<a href="http://example.com" class="abc">http://example.com</a>'
+    result = '<a href="http://example.com" target="_blank">http://example.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = '<a href="http://example.com/" class="abc">http://example.com</a>'
+    result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<a href=\"http://example.com/\n\" class=\"abc\">http://example.com</a>"
+    result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<a href=\"http://example.com/\n \" class=\"abc\n\"\n>http://example.com</a>"
+    result = '<a href="http://example.com/" target="_blank">http://example.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<div>http://example.com</div>"
+    result = '<a href="http://example.com" target="_blank">http://example.com</a>'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<div>http://example.com.</div>"
+    result = '<a href="http://example.com" target="_blank">http://example.com</a>.'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<div>http://example.com, and so on</div>"
+    result = '<a href="http://example.com" target="_blank">http://example.com</a>, and so on'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<div>http://example.com?lala=me, and so on</div>"
+    result = '<a href="http://example.com?lala=me" target="_blank">http://example.com?lala=me</a>, and so on'
+    assert_equal(result, html.html2html_strict)
+
+    html   = "<a href=\"http://facebook.de/examplesrbog\"><span lang=\"EN-US\" style='color:blue'>http://facebook.de/examplesrbog</span></a>"
+    result = "<a href=\"http://facebook.de/examplesrbog\" target=\"_blank\">http://facebook.de/examplesrbog</a>"
+    assert_equal(result, html.html2html_strict)
+
+    html   = "Damit Sie keinen Tag versäumen, empfehlen wir Ihnen den <a href=\"http://newsletters.cylex.de/\" class=\"\">Link des Adventkalenders</a> in<br class=\"\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Ihrer Lesezeichen-Symbolleiste zu ergänzen.</p><div class=\"\">&nbsp;"
+    result = "Damit Sie keinen Tag versäumen, empfehlen wir Ihnen den Link des Adventkalenders (<a href=\"http://newsletters.cylex.de/\" target=\"_blank\">http://newsletters.cylex.de/</a>) in<br>      Ihrer Lesezeichen-Symbolleiste zu ergänzen."
     assert_equal(result, html.html2html_strict)
 
     html   = '<b >test</b>'
@@ -495,7 +568,8 @@ Men-----------------------'
     assert_equal(result, html.html2html_strict)
 
     html   = '<a href="mailto:john.smith2@example.com" style="color: blue; text-decoration: underline; ">john.smith@example.com</a>'
-    result = 'john.smith@example.com (mailto:john.smith2@example.com)'
+    #result = 'john.smith@example.com (mailto:john.smith2@example.com)'
+    result = 'john.smith@example.com'
     assert_equal(result, html.html2html_strict)
 
   end

Разница между файлами не показана из-за своего большого размера
+ 3 - 3
test/unit/email_parser_test.rb


Разница между файлами не показана из-за своего большого размера
+ 0 - 0
test/unit/email_process_test.rb


Некоторые файлы не были показаны из-за большого количества измененных файлов