Просмотр исходного кода

Maintenance: HtmlSanitizer refactoring

Mantas Masalskis 2 года назад
Родитель
Commit
455e6b79b3

+ 5 - 441
lib/html_sanitizer.rb

@@ -1,8 +1,7 @@
 # Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
 
 class HtmlSanitizer
-  LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
-  PROCESSING_TIMEOUT = 20
+  PROCESSING_TIMEOUT     = 20.seconds
   UNPROCESSABLE_HTML_MSG = __('This message cannot be displayed due to HTML processing issues. Download the raw message below and open it via an Email client if you still wish to view it.').freeze
 
 =begin
@@ -14,201 +13,7 @@ sanitize html string based on whiltelist
 =end
 
   def self.strict(string, external = false, timeout: true)
-    Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
-      @fqdn              = Setting.get('fqdn')
-      http_type          = Setting.get('http_type')
-      web_app_url_prefix = "#{http_type}://#{@fqdn}/\#".downcase
-
-      # config
-      tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
-      tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
-      tags_allowlist = Rails.configuration.html_sanitizer_tags_allowlist
-      attributes_allowlist = Rails.configuration.html_sanitizer_attributes_allowlist
-      css_properties_allowlist = Rails.configuration.html_sanitizer_css_properties_allowlist
-      css_values_blocklist = Rails.application.config.html_sanitizer_css_values_blocklist
-
-      # We allowlist yahoo_quoted because Yahoo Mail marks quoted email content using
-      # <div class='yahoo_quoted'> and we rely on this class to identify quoted messages
-      classes_allowlist = %w[js-signatureMarker yahoo_quoted]
-      attributes_2_css = %w[width height]
-
-      # remove tags with subtree
-      scrubber_tag_remove = Loofah::Scrubber.new do |node|
-        next if tags_remove_content.exclude?(node.name)
-
-        node.remove
-        Loofah::Scrubber::STOP
-      end
-      string = Loofah.fragment(string).scrub!(scrubber_tag_remove).to_s
-
-      # remove tag, insert quoted content
-      scrubber_wipe_quote_content = Loofah::Scrubber.new do |node|
-        next if tags_quote_content.exclude?(node.name)
-
-        string = html_decode(node.content)
-        text = Nokogiri::XML::Text.new(string, node.document)
-        node.add_next_sibling(text)
-        node.remove
-        Loofah::Scrubber::STOP
-      end
-      string = Loofah.fragment(string).scrub!(scrubber_wipe_quote_content).to_s
-
-      scrubber_wipe = Loofah::Scrubber.new do |node|
-
-        # replace tags, keep subtree
-        if tags_allowlist.exclude?(node.name)
-          node.replace node.children.to_s
-          Loofah::Scrubber::STOP
-        end
-
-        # prepare src attribute
-        if node['src']
-          src = cleanup_target(CGI.unescape(node['src']))
-          if src =~ %r{(javascript|livescript|vbscript):}i || src.downcase.start_with?('http', 'ftp', '//')
-            node.remove
-            Loofah::Scrubber::STOP
-          end
-        end
-
-        # clean class / only use allowed classes
-        if node['class']
-          classes = node['class'].gsub(%r{\t|\n|\r}, '').split
-          class_new = ''
-          classes.each do |local_class|
-            next if classes_allowlist.exclude?(local_class.to_s.strip)
-
-            if class_new != ''
-              class_new += ' '
-            end
-            class_new += local_class
-          end
-          if class_new == ''
-            node.delete('class')
-          else
-            node['class'] = class_new
-          end
-        end
-
-        # move style attributes to css attributes
-        attributes_2_css.each do |key|
-          next if !node[key]
-
-          if node['style'].blank?
-            node['style'] = ''
-          else
-            node['style'] += ';'
-          end
-          value = node[key]
-          node.delete(key)
-          next if value.blank?
-
-          value += 'px' if !value.match?(%r{%|px|em}i)
-          node['style'] += "#{key}:#{value}"
-        end
-
-        # clean style / only use allowed style properties
-        if node['style']
-          pears = node['style'].downcase.gsub(%r{\t|\n|\r}, '').split(';')
-          style = ''
-          pears.each do |local_pear|
-            prop = local_pear.split(':')
-            next if !prop[0]
-
-            key = prop[0].strip
-            next if css_properties_allowlist.exclude?(node.name)
-            next if css_properties_allowlist[node.name].exclude?(key)
-            next if css_values_blocklist[node.name]&.include?(local_pear.gsub(%r{[[:space:]]}, '').strip)
-
-            style += "#{local_pear};"
-          end
-          node['style'] = style
-          if style == ''
-            node.delete('style')
-          end
-        end
-
-        # scan for invalid link content
-        %w[href style].each do |attribute_name|
-          next if !node[attribute_name]
-
-          href = cleanup_target(node[attribute_name])
-          next if !href.match?(%r{(javascript|livescript|vbscript):}i)
-
-          node.delete(attribute_name)
-        end
-
-        # remove attributes if not allowlisted
-        node.each do |attribute, _value|
-          attribute_name = attribute.downcase
-          next if attributes_allowlist[:all].include?(attribute_name) || attributes_allowlist[node.name]&.include?(attribute_name)
-
-          node.delete(attribute)
-        end
-
-      end
-
-      done = true
-      while done
-        new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
-        if string == new_string
-          done = false
-        end
-        string = new_string
-      end
-
-      scrubber_link = Loofah::Scrubber.new do |node|
-
-        # wrap plain-text URLs in <a> tags
-        if node.is_a?(Nokogiri::XML::Text) && node.content.present? && node.content.include?(':')
-          node_ancestor_names = node.ancestors.map(&:name)
-          if node_ancestor_names.exclude?('a') && node_ancestor_names.exclude?('pre')
-            urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
-                    .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
-                    .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'
-
-            next if urls.blank?
-
-            add_link(node.content, urls, node)
-          end
-        end
-
-        # prepare links
-        if node['href']
-          href                = cleanup_target(node['href'], keep_spaces: true)
-          href_without_spaces = href.gsub(%r{[[:space:]]}, '')
-          if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('mailto:') && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
-            node['href']        = "http://#{node['href']}"
-            href                = node['href']
-            href_without_spaces = href.gsub(%r{[[:space:]]}, '')
-          end
-
-          next if !CGI.unescape(href_without_spaces).utf8_encode(fallback: :read_as_sanitized_binary).gsub(%r{[[:space:]]}, '').downcase.start_with?('http', 'ftp', '//')
-
-          node.set_attribute('href', href)
-          node.set_attribute('rel', 'nofollow noreferrer noopener')
-
-          # do not "target=_blank" WebApp URLs (e.g. mentions)
-          if !href.downcase.start_with?(web_app_url_prefix)
-            node.set_attribute('target', '_blank')
-          end
-        end
-
-        if node.name == 'a' && node['href'].blank?
-          node.replace node.children.to_s
-          Loofah::Scrubber::STOP
-        end
-
-        # check if href is different to text
-        if node.name == 'a' && !url_same?(node['href'], node.text) && node['title'].blank?
-          node['title'] = node['href']
-        end
-      end
-
-      Loofah.fragment(string).scrub!(scrubber_link).to_s
-    end
-  rescue Timeout::Error
-    Rails.logger.error "Could not process string via HtmlSanitizer.strict in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
-    UNPROCESSABLE_HTML_MSG
+    HtmlSanitizer::Strict.new.sanitize(string, external: external, timeout: timeout)
   end
 
 =begin
@@ -223,191 +28,7 @@ cleanup html string:
 =end
 
   def self.cleanup(string, timeout: true)
-    Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
-      string.gsub!(%r{<[A-z]:[A-z]>}, '')
-      string.gsub!(%r{</[A-z]:[A-z]>}, '')
-      string.delete!("\t")
-
-      # remove all new lines
-      string.gsub!(%r{(\n\r|\r\r\n|\r\n|\n)}, "\n")
-
-      # remove double multiple empty lines
-      string.gsub!(%r{\n\n\n+}, "\n\n")
-
-      string = cleanup_structure(string, 'pre')
-      string = cleanup_structure(string)
-      string
-    end
-  rescue Timeout::Error
-    Rails.logger.error "Could not process string via HtmlSanitizer.cleanup in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
-    UNPROCESSABLE_HTML_MSG
-  end
-
-  def self.remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
-    if node.children.present?
-      if node.children.size == 1
-        local_name = node.name
-        child = node.children.first
-
-        # replace not needed node (parent <- child)
-        if local_name == child.name && node.attributes.present? && node.children.first.attributes.blank?
-          local_node_child = node.children.first
-          node.attributes.each do |k|
-            local_node_child.set_attribute(k[0], k[1])
-          end
-          node.replace local_node_child.to_s
-          Loofah::Scrubber::STOP
-
-        # replace not needed node (parent replace with child node)
-        elsif (local_name == 'span' || local_name == child.name) && node.attributes.blank?
-          node.replace node.children.to_s
-          Loofah::Scrubber::STOP
-        end
-      else
-
-        # loop through nodes
-        node.children.each do |local_node|
-          remove_last_empty_node(local_node, remove_empty_nodes, remove_empty_last_nodes)
-        end
-      end
-    # remove empty nodes
-    elsif (remove_empty_nodes.include?(node.name) || remove_empty_last_nodes.include?(node.name)) && node.content.blank? && node.attributes.blank?
-      node.remove
-      Loofah::Scrubber::STOP
-    end
-  end
-
-  def self.cleanup_structure(string, type = 'all')
-    remove_empty_nodes = if type == 'pre'
-                           %w[span]
-                         else
-                           %w[p div span small table]
-                         end
-    remove_empty_last_nodes = %w[b i u small table]
-
-    # remove last empty nodes and empty -not needed- parrent nodes
-    scrubber_structure = Loofah::Scrubber.new do |node|
-      remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
-    end
-
-    done = true
-    while done
-      new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
-      if string == new_string
-        done = false
-      end
-      string = new_string
-    end
-
-    scrubber_cleanup = Loofah::Scrubber.new do |node|
-
-      # remove not needed new lines
-      if node.instance_of?(Nokogiri::XML::Text)
-        if !node.parent || (node.parent.name != 'pre' && node.parent.name != 'code') # rubocop:disable Style/SoleNestedConditional
-          content = node.content
-          if content
-            if content != ' ' && content != "\n"
-              content.gsub!(%r{[[:space:]]+}, ' ')
-            end
-            if node.previous
-              if node.previous.name == 'div' || node.previous.name == 'p'
-                content.strip!
-              end
-            elsif node.parent && !node.previous && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
-              if (node.parent.name == 'div' || node.parent.name == 'p') && content != ' ' && content != "\n"
-                content.strip!
-              end
-            end
-            node.content = content
-          end
-        end
-      end
-    end
-    Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
-  end
-
-  def self.add_link(content, urls, node)
-    if urls.blank?
-      text = Nokogiri::XML::Text.new(content, node.document)
-      node.add_next_sibling(text)
-      return
-    end
-    url = urls.shift
-
-    if content =~ %r{^(.*)#{Regexp.quote(url)}(.*)$}mx
-      pre = $1
-      post = $2
-
-      if url.match?(%r{^www}i)
-        url = "http://#{url}"
-      end
-
-      a = Nokogiri::XML::Node.new 'a', node.document
-      a['href'] = url
-      a['rel'] = 'nofollow noreferrer noopener'
-      a['target'] = '_blank'
-      a.content = url
-
-      if node.class != Nokogiri::XML::Text
-        text = Nokogiri::XML::Text.new(pre, node.document)
-        node.add_next_sibling(text).add_next_sibling(a)
-        return if post.blank?
-
-        add_link(post, urls, a)
-        return
-      end
-      node.content = pre
-      node.add_next_sibling(a)
-      return if post.blank?
-
-      add_link(post, urls, a)
-    end
-
-    true
-  end
-
-  def self.html_decode(string)
-    string.gsub('&amp;', '&').gsub('&lt;', '<').gsub('&gt;', '>').gsub('&quot;', '"').gsub('&nbsp;', ' ')
-  end
-
-  def self.cleanup_target(string, **options)
-    cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
-    cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
-    cleaned_string = cleaned_string.strip
-                                   .delete("\t\n\r\u0000")
-                                   .gsub(%r{/\*.*?\*/}, '')
-                                   .gsub(%r{<!--.*?-->}, '')
-
-    sanitize_attachment_disposition(cleaned_string)
-  end
-
-  def self.sanitize_attachment_disposition(url)
-    @fqdn ||= Setting.get('fqdn')
-    uri = URI(url)
-
-    if uri.host == @fqdn && uri.query.present?
-      params = CGI.parse(uri.query || '')
-                  .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
-      uri.query = URI.encode_www_form(params)
-    end
-
-    uri.to_s
-  rescue
-    url
-  end
-
-  def self.url_same?(url_new, url_old)
-    url_new = CGI.unescape(url_new.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(%r{[[:space:]]|\t|\n|\r}, '').strip
-    url_old = CGI.unescape(url_old.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(%r{[[:space:]]|\t|\n|\r}, '').strip
-    url_new = html_decode(url_new).sub('/?', '?')
-    url_old = html_decode(url_old).sub('/?', '?')
-    return true if url_new == url_old
-    return true if url_old == "http://#{url_new}"
-    return true if url_new == "http://#{url_old}"
-    return true if url_old == "https://#{url_new}"
-    return true if url_new == "https://#{url_old}"
-
-    false
+    HtmlSanitizer::Cleanup.new.sanitize(string, timeout: timeout)
   end
 
 =begin
@@ -419,36 +40,7 @@ replace inline images with cid images
 =end
 
   def self.replace_inline_images(string, prefix = SecureRandom.uuid)
-    fqdn = Setting.get('fqdn')
-    attachments_inline = []
-    filename_counter = 0
-    scrubber = Loofah::Scrubber.new do |node|
-      if node.name == 'img'
-        if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
-          filename_counter += 1
-          file_attributes = StaticAssets.data_url_attributes($1)
-          cid = "#{prefix}.#{SecureRandom.uuid}@#{fqdn}"
-          filename = cid
-          if file_attributes[:file_extention].present?
-            filename = "image#{filename_counter}.#{file_attributes[:file_extention]}"
-          end
-          attachment = {
-            data:        file_attributes[:content],
-            filename:    filename,
-            preferences: {
-              'Content-Type'        => file_attributes[:mime_type],
-              'Mime-Type'           => file_attributes[:mime_type],
-              'Content-ID'          => cid,
-              'Content-Disposition' => 'inline',
-            },
-          }
-          attachments_inline.push attachment
-          node['src'] = "cid:#{cid}"
-        end
-        Loofah::Scrubber::STOP
-      end
-    end
-    [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
+    HtmlSanitizer::ReplaceInlineImages.new.sanitize(string, prefix)
   end
 
 =begin
@@ -460,35 +52,7 @@ sanitize style of img tags
 =end
 
   def self.dynamic_image_size(string)
-    scrubber = Loofah::Scrubber.new do |node|
-      if node.name == 'img'
-        if node['src']
-          style = 'max-width:100%;'
-          if node['style']
-            pears = node['style'].downcase.gsub(%r{\t|\n|\r}, '').split(';')
-            pears.each do |local_pear|
-              prop = local_pear.split(':')
-              next if !prop[0]
-
-              key = prop[0].strip
-              if key == 'height'
-                key = 'max-height'
-              end
-              style += "#{key}:#{prop[1]};"
-            end
-          end
-          node['style'] = style
-        end
-        Loofah::Scrubber::STOP
-      end
-    end
-    Loofah.fragment(string).scrub!(scrubber).to_s
+    HtmlSanitizer::DynamicImageSize.new.sanitize(string)
   end
 
-  private_class_method :cleanup_target
-  private_class_method :sanitize_attachment_disposition
-  private_class_method :add_link
-  private_class_method :url_same?
-  private_class_method :html_decode
-
 end

+ 24 - 0
lib/html_sanitizer/base.rb

@@ -0,0 +1,24 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  # Shared behavior for the sanitizer strategy classes (e.g. Cleanup).
+  class Base
+    # Runs the given block under HtmlSanitizer::PROCESSING_TIMEOUT.
+    # On timeout, logs the failure together with the current state of the
+    # string and returns the UNPROCESSABLE_HTML_MSG placeholder instead of
+    # the (partially) processed markup.
+    def with_timeout(string, &block)
+      Timeout.timeout(PROCESSING_TIMEOUT, &block)
+    rescue Timeout::Error
+      Rails.logger.error "Could not process string via #{self.class.name} in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
+      UNPROCESSABLE_HTML_MSG
+    end
+
+    # Re-applies +scrubber+ to +string+ until a fixed point is reached
+    # (scrubbing produces no further change).
+    # NOTE(review): this instance method shadows Kernel#loop for this class
+    # hierarchy — consider a less overloaded name (e.g. scrub_until_stable).
+    def loop(string, scrubber)
+      old_string = nil
+
+      while string != old_string
+        old_string = string
+
+        string = Loofah.fragment(string).scrub!(scrubber).to_html
+      end
+
+      string
+    end
+  end
+end

+ 41 - 0
lib/html_sanitizer/cleanup.rb

@@ -0,0 +1,41 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  # Normalizes HTML: strips office-namespace tags, collapses whitespace/blank
+  # lines and removes empty or redundant wrapper nodes.
+  class Cleanup < Base
+    # @param string  [String]  HTML fragment to clean up
+    # @param timeout [Boolean] when true, abort via Base#with_timeout after
+    #                          PROCESSING_TIMEOUT and return the placeholder
+    # @return [String] cleaned HTML
+    def sanitize(string, timeout: true)
+      return run_sanitization(string) if !timeout
+
+      with_timeout(string) do
+        run_sanitization(string)
+      end
+    end
+
+    private
+
+    # Pipeline: plain-string cleanup, then a 'pre' structure pass, then the
+    # general structure pass.
+    def run_sanitization(string)
+      string = clean_string(string)
+
+      string = cleanup_structure(string, 'pre')
+
+      cleanup_structure(string)
+    end
+
+    # Removes Word/Office-style namespaced tags (e.g. <o:p>), tabs, and
+    # collapses newline variants and runs of blank lines.
+    # NOTE(review): [A-z] also matches the ASCII characters between 'Z' and
+    # 'a' ([ \ ] ^ _ `) — carried over from the previous implementation.
+    def clean_string(input)
+      output = input.gsub(%r{<(|/)[A-z]:[A-z]>}, '')
+
+      output = output.delete("\t")
+      # remove all new lines
+      output
+        .gsub(%r{(\n\r|\r\r\n|\r\n|\n)}, "\n")
+        .gsub(%r{\n\n\n+}, "\n\n")
+    end
+
+    # Repeats the RemoveLastEmptyNode scrubber until stable (Base#loop),
+    # then does a single whitespace-cleanup pass.
+    # +type+ selects the scrubber's node set ('pre' or 'all').
+    def cleanup_structure(string, type = 'all')
+      empty_node_scrubber = HtmlSanitizer::Scrubber::RemoveLastEmptyNode.new(type)
+
+      string = loop(string, empty_node_scrubber)
+
+      Loofah.fragment(string).scrub!(HtmlSanitizer::Scrubber::Cleanup.new).to_html
+    end
+  end
+end

+ 12 - 0
lib/html_sanitizer/dynamic_image_size.rb

@@ -0,0 +1,12 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  # Rewrites inline styles of <img> tags so images scale responsively
+  # (see Scrubber::ImageSize for the actual style rewriting).
+  class DynamicImageSize
+    # @param string [String] HTML fragment
+    # @return [String] HTML with img styles adjusted
+    def sanitize(string)
+      Loofah
+        .fragment(string)
+        .scrub!(HtmlSanitizer::Scrubber::ImageSize.new)
+        .to_html
+    end
+  end
+end

+ 15 - 0
lib/html_sanitizer/replace_inline_images.rb

@@ -0,0 +1,15 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  # Replaces base64 data-URL images with cid: references and collects the
+  # extracted attachments (see Scrubber::InlineImages).
+  class ReplaceInlineImages
+    # @param string [String] HTML fragment
+    # @param prefix [String] Content-ID prefix for generated cids
+    # @return [Array(String, Array<Hash>)] rewritten HTML and the inline
+    #   attachments gathered by the scrubber
+    def sanitize(string, prefix)
+      scrubber = HtmlSanitizer::Scrubber::InlineImages.new(prefix)
+
+      sanitized = Loofah
+        .fragment(string)
+        .scrub!(scrubber)
+
+      [sanitized.to_html, scrubber.attachments_inline]
+    end
+  end
+end

+ 23 - 0
lib/html_sanitizer/scrubber/base.rb

@@ -0,0 +1,23 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  module Scrubber
+    # Common base for the sanitizer scrubbers; provides minimal HTML-entity
+    # decoding shared by subclasses.
+    class Base < Loofah::Scrubber
+      # Entities decoded by #html_decode (intentionally only this small set).
+      HTML_DECODABLE = {
+        '&amp;'  => '&',
+        '&lt;'   => '<',
+        '&gt;'   => '>',
+        '&quot;' => '"',
+        '&nbsp;' => ' '
+      }.freeze
+
+      # One alternation regexp built from the entity keys, for a single-pass gsub.
+      HTML_DECODABLE_REGEXP = Regexp.union(HTML_DECODABLE.keys).freeze
+
+      protected
+
+      # Replaces the entities above with their literal characters.
+      def html_decode(string)
+        string.gsub HTML_DECODABLE_REGEXP, HTML_DECODABLE
+      end
+    end
+  end
+end

+ 60 - 0
lib/html_sanitizer/scrubber/cleanup.rb

@@ -0,0 +1,60 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  module Scrubber
+    # Normalizes whitespace in text nodes (outside of <pre>/<code>, where
+    # whitespace is significant).
+    class Cleanup < Base
+      def scrub(node)
+        return if !node.instance_of?(Nokogiri::XML::Text)
+        return if %w[pre code].include? node.parent&.name
+
+        update_node_content(node)
+      end
+
+      private
+
+      # Applies the three normalizations below and writes the result back.
+      def update_node_content(node)
+        content = node.content
+
+        return if !content
+
+        content = remove_space_if_needed(content)
+        content = strip_if_needed_previous(node, content)
+        content = strip_if_needed_next(node, content)
+
+        node.content = content
+      end
+
+      # Collapses runs of whitespace to a single space, but leaves a lone
+      # " " or "\n" node untouched.
+      def remove_space_if_needed(content)
+        return content if space_or_nl?(content)
+
+        content.gsub(%r{[[:space:]]+}, ' ')
+      end
+
+      # Strips the text when it directly follows a block element (div/p).
+      def strip_if_needed_previous(node, content)
+        return content if !node.previous
+        return content if !div_or_p?(node.previous)
+
+        content.strip
+      end
+
+      # Strips the first text child of a div/p when it is followed by
+      # nothing or by a div/p/br sibling.
+      def strip_if_needed_next(node, content)
+        return content if !node.parent
+        return content if node.previous
+        return content if node.next && %w[div p br].exclude?(node.next.name)
+
+        return content if !div_or_p?(node.parent)
+        return content if space_or_nl?(content)
+
+        content.strip
+      end
+
+      # True when the string is exactly a single space or a single newline.
+      def space_or_nl?(string)
+        [' ', "\n"].include?(string)
+      end
+
+      def div_or_p?(node)
+        %w[div p].include? node.name
+      end
+    end
+  end
+end

+ 45 - 0
lib/html_sanitizer/scrubber/image_size.rb

@@ -0,0 +1,45 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  module Scrubber
+    # Rewrites <img> styles: forces max-width:100% and converts any fixed
+    # height into max-height so images shrink with the viewport.
+    class ImageSize < Base
+      def scrub(node)
+        return CONTINUE if node.name != 'img'
+
+        if node['src']
+          update_style(node)
+        end
+
+        # img children are not descended into
+        STOP
+      end
+
+      private
+
+      def update_style(node)
+        node['style'] = build_style(node['style'])
+      end
+
+      # Rebuilds the style attribute, prepending max-width:100% and
+      # renaming height -> max-height; other properties pass through.
+      def build_style(input)
+        style = 'max-width:100%;'
+
+        return style if input.blank?
+
+        input
+          .downcase
+          .gsub(%r{\t|\n|\r}, '')
+          .split(';')
+          .each_with_object(style) do |elem, memo|
+            key, value = elem.split(':')
+
+            # NOTE(review): a blank segment (e.g. "width:1;;height:2")
+            # makes elem.split(':') return [], so key is nil and
+            # key.strip! raises NoMethodError; the removed implementation
+            # guarded this with `next if !prop[0]`.
+            key.strip!
+
+            next if key.blank?
+
+            key = 'max-height' if key == 'height'
+
+            # value may be nil (e.g. "display;"), producing "display:;"
+            memo << "#{key}:#{value};"
+          end
+      end
+    end
+  end
+end

+ 70 - 0
lib/html_sanitizer/scrubber/inline_images.rb

@@ -0,0 +1,70 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  module Scrubber
+    # Replaces base64 data-URL <img> sources (jpeg/png) with cid: references
+    # and accumulates the decoded images as attachment hashes in
+    # #attachments_inline for the caller to persist.
+    class InlineImages < Base
+      attr_reader :attachments_inline, :prefix
+
+      def initialize(prefix = SecureRandom.uuid) # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+
+        @attachments_inline = []
+        @prefix             = prefix
+      end
+
+      def scrub(node)
+        return CONTINUE if node.name != 'img'
+
+        # $1 is the full data URL captured by the regexp below
+        if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
+          process_inline_image(node, $1)
+        end
+
+        STOP
+      end
+
+      private
+
+      # NOTE(review): this method is not called anywhere in this class —
+      # #scrub matches the same regexp inline. Dead code; remove or use it.
+      def inline_image_data(src)
+        return if src.blank?
+
+        matchdata = src.match %r{^(data:image/(jpeg|png);base64,.+?)$}i
+
+        return if !matchdata
+
+        matchdata[0]
+      end
+
+      # Registers the attachment and points the img src at its new cid.
+      def process_inline_image(node, data)
+        cid        = generate_cid
+        attachment = parse_inline_image(data, cid)
+
+        @attachments_inline.push attachment
+        node['src'] = "cid:#{cid}"
+      end
+
+      # Builds the attachment hash from the decoded data URL.
+      # NOTE(review): unlike the removed implementation there is no fallback
+      # to the cid when file_attributes[:file_extention] is blank, so the
+      # filename can end with a trailing dot — confirm this is intended.
+      def parse_inline_image(data, cid)
+        file_attributes = StaticAssets.data_url_attributes(data)
+        filename        = "image#{@attachments_inline.length + 1}.#{file_attributes[:file_extention]}"
+
+        {
+          data:        file_attributes[:content],
+          filename:    filename,
+          preferences: {
+            'Content-Type'        => file_attributes[:mime_type],
+            'Mime-Type'           => file_attributes[:mime_type],
+            'Content-ID'          => cid,
+            'Content-Disposition' => 'inline',
+          }
+        }
+      end
+
+      # Content-ID: "<prefix>.<uuid>@<fqdn>"
+      def generate_cid
+        "#{prefix}.#{SecureRandom.uuid}@#{fqdn}"
+      end
+
+      def fqdn
+        @fqdn ||= Setting.get('fqdn')
+      end
+    end
+  end
+end

+ 218 - 0
lib/html_sanitizer/scrubber/link.rb

@@ -0,0 +1,218 @@
+# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
+
+class HtmlSanitizer
+  module Scrubber
+    # Link handling: wraps plain-text URLs in <a> tags, normalizes href
+    # targets (protocol, rel, target=_blank) and drops <a> tags without a
+    # usable href.
+    class Link < Base
+      # URL schemes that get auto-linked; mailto is excluded, tel added.
+      LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
+
+      attr_reader :external, :web_app_url_prefix
+
+      # @param web_app_url_prefix [String] lowercase prefix of own WebApp
+      #   URLs; those do not get target=_blank
+      # @param external [Boolean] when true, scheme-less hrefs are retried
+      #   with an http:// prefix
+      def initialize(web_app_url_prefix:, external: false) # rubocop:disable Lint/MissingSuper
+        @direction = :top_down
+
+        @external = external
+        @web_app_url_prefix = web_app_url_prefix
+      end
+
+      def scrub(node)
+        # text nodes: auto-link plain-text URLs (node_urls is nil for
+        # non-text nodes and for text inside <a>/<pre>)
+        if (urls = node_urls(node))
+          return if urls.blank?
+
+          add_link(node.content, urls, node)
+        end
+
+        # prepare links
+        return if href_cleanup(node)
+
+        return STOP if ensure_href_present(node)
+
+        update_node_title(node)
+      end
+
+      private
+
+      # Normalizes the href attribute. Returns true (caller aborts) when the
+      # cleaned href does not start with an accepted protocol.
+      def href_cleanup(node)
+        return if !node['href']
+
+        href                = cleanup_target(node['href'], keep_spaces: true)
+        href_without_spaces = href.gsub(%r{[[:space:]]}, '')
+
+        if href_retry_protocol?(href_without_spaces)
+          node['href']        = "http://#{node['href']}"
+          href                = node['href']
+          href_without_spaces = href.gsub(%r{[[:space:]]}, '')
+        end
+
+        return true if !href_starts_with_protocol?(href_without_spaces)
+
+        href_set_values(node, href)
+
+        false
+      end
+
+      # True when an external, scheme-less href should be retried with a
+      # prepended "http://".
+      def href_retry_protocol?(href_without_spaces)
+        return if !external
+        return if href_without_spaces.blank?
+        return if href_without_spaces.downcase.start_with?('mailto:')
+        return if href_without_spaces.downcase.start_with?('//')
+        return if href_without_spaces.downcase.match? %r{^.{1,6}://.+?}
+
+        true
+      end
+
+      # Accepts http(s), ftp and protocol-relative URLs only.
+      def href_starts_with_protocol?(href_without_spaces)
+        CGI
+          .unescape(href_without_spaces)
+          .utf8_encode(fallback: :read_as_sanitized_binary)
+          .gsub(%r{[[:space:]]}, '')
+          .downcase
+          .start_with?('http', 'ftp', '//')
+      end
+
+      # Writes the cleaned href plus security attributes.
+      def href_set_values(node, value)
+        node.set_attribute('href', value)
+        node.set_attribute('rel', 'nofollow noreferrer noopener')
+
+        # do not "target=_blank" WebApp URLs (e.g. mentions)
+        return if value.downcase.start_with?(web_app_url_prefix)
+
+        node.set_attribute('target', '_blank')
+      end
+
+      # Extracts linkable URLs from a text node; nil for non-text nodes or
+      # text inside <a>/<pre> ancestors.
+      def node_urls(node)
+        return if !node.is_a?(Nokogiri::XML::Text)
+        return if node.content.blank?
+        return if node.content.exclude?(':')
+        return if node.ancestors.map(&:name).intersection(%w[a pre]).any?
+
+        URI.extract(node.content, LINKABLE_URL_SCHEMES)
+          .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
+          .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'
+      end
+
+      # Unwraps <a> tags without an href into their children.
+      # Returns true when the node was replaced.
+      def ensure_href_present(node)
+        return if node.name != 'a'
+        return if node['href'].present?
+
+        node.replace node.children.to_s
+
+        true
+      end
+
+      # When the link text differs from its target, expose the target via
+      # the title attribute.
+      def update_node_title(node)
+        return if node.name != 'a'
+        return if url_same?(node['href'], node.text)
+        return if node['title'].present?
+
+        node['title'] = node['href']
+      end
+
+      # Recursively splits +content+ around each URL in +urls+, inserting
+      # generated <a> nodes as siblings of +node+.
+      def add_link(content, urls, node)
+        return if add_link_blank_text(content, urls, node)
+
+        url = urls.shift
+
+        return if content !~ %r{^(.*)#{Regexp.quote(url)}(.*)$}mx
+
+        pre  = $1
+        post = $2
+
+        a_elem = add_link_build_node(node, url)
+
+        if node.class != Nokogiri::XML::Text
+          text = Nokogiri::XML::Text.new(pre, node.document)
+          node.add_next_sibling(text).add_next_sibling(a_elem)
+          return if post.blank?
+
+          add_link(post, urls, a_elem)
+          return
+        end
+
+        add_link_apply_to_node(node, pre, a_elem)
+        return if post.blank?
+
+        add_link(post, urls, a_elem)
+      end
+
+      # Truncates the text node to the prefix and appends the <a> node.
+      def add_link_apply_to_node(node, pre, a_elem)
+        node.content = pre
+        node.add_next_sibling(a_elem)
+      end
+
+      # Recursion terminator: with no URLs left, emit the remaining text
+      # as a plain sibling. Returns true when it did so.
+      def add_link_blank_text(content, urls, node)
+        return false if urls.present?
+
+        text = Nokogiri::XML::Text.new(content, node.document)
+        node.add_next_sibling(text)
+
+        true
+      end
+
+      # Builds an <a> node for +url+; www.* gets an http:// prefix.
+      def add_link_build_node(node, url)
+        if url.match?(%r{^www}i)
+          url = "http://#{url}"
+        end
+
+        a = Nokogiri::XML::Node.new 'a', node.document
+        a['href'] = url
+        a['rel'] = 'nofollow noreferrer noopener'
+        a['target'] = '_blank'
+        a.content = url
+
+        a
+      end
+
+      # Strips control chars, CSS/HTML comment payloads and (optionally)
+      # all whitespace from a link target before further checks.
+      def cleanup_target(string, **options)
+        cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
+        cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
+        cleaned_string = cleaned_string.strip
+                                       .delete("\t\n\r\u0000")
+                                       .gsub(%r{/\*.*?\*/}, '')
+                                       .gsub(%r{<!--.*?-->}, '')
+
+        sanitize_attachment_disposition(cleaned_string)
+      end
+
+      # For URLs on our own FQDN, forces disposition=attachment (if a
+      # disposition param is present) to prevent inline rendering.
+      # NOTE(review): the bare rescue swallows any StandardError (e.g.
+      # URI::InvalidURIError) and returns the raw url unchanged.
+      def sanitize_attachment_disposition(url)
+        @fqdn ||= Setting.get('fqdn')
+        uri = URI(url)
+
+        if uri.host == @fqdn && uri.query.present?
+          params = CGI.parse(uri.query || '')
+                      .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
+          uri.query = URI.encode_www_form(params)
+        end
+
+        uri.to_s
+      rescue
+        url
+      end
+
+      # Compares two URLs after normalization, treating http/https-prefixed
+      # variants of the same location as equal.
+      def url_same?(url_new, url_old)
+        url_new = url_same_build(url_new)
+        url_old = url_same_build(url_old)
+
+        return true if url_new == url_old
+        return true if url_old == "http://#{url_new}"
+        return true if url_new == "http://#{url_old}"
+        return true if url_old == "https://#{url_new}"
+        return true if url_new == "https://#{url_old}"
+
+        false
+      end
+
+      # Normalization for url_same?: unescape, downcase, drop trailing
+      # slash and whitespace, decode entities, collapse "/?" into "?".
+      def url_same_build(input)
+        url = CGI
+          .unescape(input.to_s)
+          .utf8_encode(fallback: :read_as_sanitized_binary)
+          .downcase
+          .delete_suffix('/')
+          .gsub(%r{[[:space:]]|\t|\n|\r}, '')
+          .strip
+
+        html_decode(url)
+          .sub('/?', '?')
+      end
+    end
+  end
+end

Некоторые файлы не были показаны из-за большого количества измененных файлов