|
@@ -1,8 +1,7 @@
|
|
|
# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
|
|
|
|
|
|
class HtmlSanitizer
|
|
|
- LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
|
|
|
- PROCESSING_TIMEOUT = 20
|
|
|
+ PROCESSING_TIMEOUT = 20.seconds
|
|
|
UNPROCESSABLE_HTML_MSG = __('This message cannot be displayed due to HTML processing issues. Download the raw message below and open it via an Email client if you still wish to view it.').freeze
|
|
|
|
|
|
=begin
|
|
@@ -14,201 +13,7 @@ sanitize html string based on whiltelist
|
|
|
=end
|
|
|
|
|
|
def self.strict(string, external = false, timeout: true)
|
|
|
- Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
|
|
|
- @fqdn = Setting.get('fqdn')
|
|
|
- http_type = Setting.get('http_type')
|
|
|
- web_app_url_prefix = "#{http_type}://#{@fqdn}/\#".downcase
|
|
|
-
|
|
|
- # config
|
|
|
- tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
|
|
|
- tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
|
|
|
- tags_allowlist = Rails.configuration.html_sanitizer_tags_allowlist
|
|
|
- attributes_allowlist = Rails.configuration.html_sanitizer_attributes_allowlist
|
|
|
- css_properties_allowlist = Rails.configuration.html_sanitizer_css_properties_allowlist
|
|
|
- css_values_blocklist = Rails.application.config.html_sanitizer_css_values_blocklist
|
|
|
-
|
|
|
- # We allowlist yahoo_quoted because Yahoo Mail marks quoted email content using
|
|
|
- # <div class='yahoo_quoted'> and we rely on this class to identify quoted messages
|
|
|
- classes_allowlist = %w[js-signatureMarker yahoo_quoted]
|
|
|
- attributes_2_css = %w[width height]
|
|
|
-
|
|
|
- # remove tags with subtree
|
|
|
- scrubber_tag_remove = Loofah::Scrubber.new do |node|
|
|
|
- next if tags_remove_content.exclude?(node.name)
|
|
|
-
|
|
|
- node.remove
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- string = Loofah.fragment(string).scrub!(scrubber_tag_remove).to_s
|
|
|
-
|
|
|
- # remove tag, insert quoted content
|
|
|
- scrubber_wipe_quote_content = Loofah::Scrubber.new do |node|
|
|
|
- next if tags_quote_content.exclude?(node.name)
|
|
|
-
|
|
|
- string = html_decode(node.content)
|
|
|
- text = Nokogiri::XML::Text.new(string, node.document)
|
|
|
- node.add_next_sibling(text)
|
|
|
- node.remove
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- string = Loofah.fragment(string).scrub!(scrubber_wipe_quote_content).to_s
|
|
|
-
|
|
|
- scrubber_wipe = Loofah::Scrubber.new do |node|
|
|
|
-
|
|
|
- # replace tags, keep subtree
|
|
|
- if tags_allowlist.exclude?(node.name)
|
|
|
- node.replace node.children.to_s
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
-
|
|
|
- # prepare src attribute
|
|
|
- if node['src']
|
|
|
- src = cleanup_target(CGI.unescape(node['src']))
|
|
|
- if src =~ %r{(javascript|livescript|vbscript):}i || src.downcase.start_with?('http', 'ftp', '//')
|
|
|
- node.remove
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- # clean class / only use allowed classes
|
|
|
- if node['class']
|
|
|
- classes = node['class'].gsub(%r{\t|\n|\r}, '').split
|
|
|
- class_new = ''
|
|
|
- classes.each do |local_class|
|
|
|
- next if classes_allowlist.exclude?(local_class.to_s.strip)
|
|
|
-
|
|
|
- if class_new != ''
|
|
|
- class_new += ' '
|
|
|
- end
|
|
|
- class_new += local_class
|
|
|
- end
|
|
|
- if class_new == ''
|
|
|
- node.delete('class')
|
|
|
- else
|
|
|
- node['class'] = class_new
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- # move style attributes to css attributes
|
|
|
- attributes_2_css.each do |key|
|
|
|
- next if !node[key]
|
|
|
-
|
|
|
- if node['style'].blank?
|
|
|
- node['style'] = ''
|
|
|
- else
|
|
|
- node['style'] += ';'
|
|
|
- end
|
|
|
- value = node[key]
|
|
|
- node.delete(key)
|
|
|
- next if value.blank?
|
|
|
-
|
|
|
- value += 'px' if !value.match?(%r{%|px|em}i)
|
|
|
- node['style'] += "#{key}:#{value}"
|
|
|
- end
|
|
|
-
|
|
|
- # clean style / only use allowed style properties
|
|
|
- if node['style']
|
|
|
- pears = node['style'].downcase.gsub(%r{\t|\n|\r}, '').split(';')
|
|
|
- style = ''
|
|
|
- pears.each do |local_pear|
|
|
|
- prop = local_pear.split(':')
|
|
|
- next if !prop[0]
|
|
|
-
|
|
|
- key = prop[0].strip
|
|
|
- next if css_properties_allowlist.exclude?(node.name)
|
|
|
- next if css_properties_allowlist[node.name].exclude?(key)
|
|
|
- next if css_values_blocklist[node.name]&.include?(local_pear.gsub(%r{[[:space:]]}, '').strip)
|
|
|
-
|
|
|
- style += "#{local_pear};"
|
|
|
- end
|
|
|
- node['style'] = style
|
|
|
- if style == ''
|
|
|
- node.delete('style')
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- # scan for invalid link content
|
|
|
- %w[href style].each do |attribute_name|
|
|
|
- next if !node[attribute_name]
|
|
|
-
|
|
|
- href = cleanup_target(node[attribute_name])
|
|
|
- next if !href.match?(%r{(javascript|livescript|vbscript):}i)
|
|
|
-
|
|
|
- node.delete(attribute_name)
|
|
|
- end
|
|
|
-
|
|
|
- # remove attributes if not allowlisted
|
|
|
- node.each do |attribute, _value|
|
|
|
- attribute_name = attribute.downcase
|
|
|
- next if attributes_allowlist[:all].include?(attribute_name) || attributes_allowlist[node.name]&.include?(attribute_name)
|
|
|
-
|
|
|
- node.delete(attribute)
|
|
|
- end
|
|
|
-
|
|
|
- end
|
|
|
-
|
|
|
- done = true
|
|
|
- while done
|
|
|
- new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
|
|
|
- if string == new_string
|
|
|
- done = false
|
|
|
- end
|
|
|
- string = new_string
|
|
|
- end
|
|
|
-
|
|
|
- scrubber_link = Loofah::Scrubber.new do |node|
|
|
|
-
|
|
|
- # wrap plain-text URLs in <a> tags
|
|
|
- if node.is_a?(Nokogiri::XML::Text) && node.content.present? && node.content.include?(':')
|
|
|
- node_ancestor_names = node.ancestors.map(&:name)
|
|
|
- if node_ancestor_names.exclude?('a') && node_ancestor_names.exclude?('pre')
|
|
|
- urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
|
|
|
- .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
|
|
|
- .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'
|
|
|
-
|
|
|
- next if urls.blank?
|
|
|
-
|
|
|
- add_link(node.content, urls, node)
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- # prepare links
|
|
|
- if node['href']
|
|
|
- href = cleanup_target(node['href'], keep_spaces: true)
|
|
|
- href_without_spaces = href.gsub(%r{[[:space:]]}, '')
|
|
|
- if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('mailto:') && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
|
|
|
- node['href'] = "http://#{node['href']}"
|
|
|
- href = node['href']
|
|
|
- href_without_spaces = href.gsub(%r{[[:space:]]}, '')
|
|
|
- end
|
|
|
-
|
|
|
- next if !CGI.unescape(href_without_spaces).utf8_encode(fallback: :read_as_sanitized_binary).gsub(%r{[[:space:]]}, '').downcase.start_with?('http', 'ftp', '//')
|
|
|
-
|
|
|
- node.set_attribute('href', href)
|
|
|
- node.set_attribute('rel', 'nofollow noreferrer noopener')
|
|
|
-
|
|
|
- # do not "target=_blank" WebApp URLs (e.g. mentions)
|
|
|
- if !href.downcase.start_with?(web_app_url_prefix)
|
|
|
- node.set_attribute('target', '_blank')
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- if node.name == 'a' && node['href'].blank?
|
|
|
- node.replace node.children.to_s
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
-
|
|
|
- # check if href is different to text
|
|
|
- if node.name == 'a' && !url_same?(node['href'], node.text) && node['title'].blank?
|
|
|
- node['title'] = node['href']
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- Loofah.fragment(string).scrub!(scrubber_link).to_s
|
|
|
- end
|
|
|
- rescue Timeout::Error
|
|
|
- Rails.logger.error "Could not process string via HtmlSanitizer.strict in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
|
|
|
- UNPROCESSABLE_HTML_MSG
|
|
|
+ HtmlSanitizer::Strict.new.sanitize(string, external: external, timeout: timeout)
|
|
|
end
|
|
|
|
|
|
=begin
|
|
@@ -223,191 +28,7 @@ cleanup html string:
|
|
|
=end
|
|
|
|
|
|
def self.cleanup(string, timeout: true)
|
|
|
- Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
|
|
|
- string.gsub!(%r{<[A-z]:[A-z]>}, '')
|
|
|
- string.gsub!(%r{</[A-z]:[A-z]>}, '')
|
|
|
- string.delete!("\t")
|
|
|
-
|
|
|
- # remove all new lines
|
|
|
- string.gsub!(%r{(\n\r|\r\r\n|\r\n|\n)}, "\n")
|
|
|
-
|
|
|
- # remove double multiple empty lines
|
|
|
- string.gsub!(%r{\n\n\n+}, "\n\n")
|
|
|
-
|
|
|
- string = cleanup_structure(string, 'pre')
|
|
|
- string = cleanup_structure(string)
|
|
|
- string
|
|
|
- end
|
|
|
- rescue Timeout::Error
|
|
|
- Rails.logger.error "Could not process string via HtmlSanitizer.cleanup in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
|
|
|
- UNPROCESSABLE_HTML_MSG
|
|
|
- end
|
|
|
-
|
|
|
- def self.remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
|
|
|
- if node.children.present?
|
|
|
- if node.children.size == 1
|
|
|
- local_name = node.name
|
|
|
- child = node.children.first
|
|
|
-
|
|
|
- # replace not needed node (parent <- child)
|
|
|
- if local_name == child.name && node.attributes.present? && node.children.first.attributes.blank?
|
|
|
- local_node_child = node.children.first
|
|
|
- node.attributes.each do |k|
|
|
|
- local_node_child.set_attribute(k[0], k[1])
|
|
|
- end
|
|
|
- node.replace local_node_child.to_s
|
|
|
- Loofah::Scrubber::STOP
|
|
|
-
|
|
|
- # replace not needed node (parent replace with child node)
|
|
|
- elsif (local_name == 'span' || local_name == child.name) && node.attributes.blank?
|
|
|
- node.replace node.children.to_s
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- else
|
|
|
-
|
|
|
- # loop through nodes
|
|
|
- node.children.each do |local_node|
|
|
|
- remove_last_empty_node(local_node, remove_empty_nodes, remove_empty_last_nodes)
|
|
|
- end
|
|
|
- end
|
|
|
- # remove empty nodes
|
|
|
- elsif (remove_empty_nodes.include?(node.name) || remove_empty_last_nodes.include?(node.name)) && node.content.blank? && node.attributes.blank?
|
|
|
- node.remove
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- end
|
|
|
-
|
|
|
- def self.cleanup_structure(string, type = 'all')
|
|
|
- remove_empty_nodes = if type == 'pre'
|
|
|
- %w[span]
|
|
|
- else
|
|
|
- %w[p div span small table]
|
|
|
- end
|
|
|
- remove_empty_last_nodes = %w[b i u small table]
|
|
|
-
|
|
|
- # remove last empty nodes and empty -not needed- parrent nodes
|
|
|
- scrubber_structure = Loofah::Scrubber.new do |node|
|
|
|
- remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
|
|
|
- end
|
|
|
-
|
|
|
- done = true
|
|
|
- while done
|
|
|
- new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
|
|
|
- if string == new_string
|
|
|
- done = false
|
|
|
- end
|
|
|
- string = new_string
|
|
|
- end
|
|
|
-
|
|
|
- scrubber_cleanup = Loofah::Scrubber.new do |node|
|
|
|
-
|
|
|
- # remove not needed new lines
|
|
|
- if node.instance_of?(Nokogiri::XML::Text)
|
|
|
- if !node.parent || (node.parent.name != 'pre' && node.parent.name != 'code') # rubocop:disable Style/SoleNestedConditional
|
|
|
- content = node.content
|
|
|
- if content
|
|
|
- if content != ' ' && content != "\n"
|
|
|
- content.gsub!(%r{[[:space:]]+}, ' ')
|
|
|
- end
|
|
|
- if node.previous
|
|
|
- if node.previous.name == 'div' || node.previous.name == 'p'
|
|
|
- content.strip!
|
|
|
- end
|
|
|
- elsif node.parent && !node.previous && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
|
|
|
- if (node.parent.name == 'div' || node.parent.name == 'p') && content != ' ' && content != "\n"
|
|
|
- content.strip!
|
|
|
- end
|
|
|
- end
|
|
|
- node.content = content
|
|
|
- end
|
|
|
- end
|
|
|
- end
|
|
|
- end
|
|
|
- Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
|
|
|
- end
|
|
|
-
|
|
|
- def self.add_link(content, urls, node)
|
|
|
- if urls.blank?
|
|
|
- text = Nokogiri::XML::Text.new(content, node.document)
|
|
|
- node.add_next_sibling(text)
|
|
|
- return
|
|
|
- end
|
|
|
- url = urls.shift
|
|
|
-
|
|
|
- if content =~ %r{^(.*)#{Regexp.quote(url)}(.*)$}mx
|
|
|
- pre = $1
|
|
|
- post = $2
|
|
|
-
|
|
|
- if url.match?(%r{^www}i)
|
|
|
- url = "http://#{url}"
|
|
|
- end
|
|
|
-
|
|
|
- a = Nokogiri::XML::Node.new 'a', node.document
|
|
|
- a['href'] = url
|
|
|
- a['rel'] = 'nofollow noreferrer noopener'
|
|
|
- a['target'] = '_blank'
|
|
|
- a.content = url
|
|
|
-
|
|
|
- if node.class != Nokogiri::XML::Text
|
|
|
- text = Nokogiri::XML::Text.new(pre, node.document)
|
|
|
- node.add_next_sibling(text).add_next_sibling(a)
|
|
|
- return if post.blank?
|
|
|
-
|
|
|
- add_link(post, urls, a)
|
|
|
- return
|
|
|
- end
|
|
|
- node.content = pre
|
|
|
- node.add_next_sibling(a)
|
|
|
- return if post.blank?
|
|
|
-
|
|
|
- add_link(post, urls, a)
|
|
|
- end
|
|
|
-
|
|
|
- true
|
|
|
- end
|
|
|
-
|
|
|
- def self.html_decode(string)
|
|
|
- string.gsub('&', '&').gsub('<', '<').gsub('>', '>').gsub('"', '"').gsub(' ', ' ')
|
|
|
- end
|
|
|
-
|
|
|
- def self.cleanup_target(string, **options)
|
|
|
- cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
|
|
|
- cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
|
|
|
- cleaned_string = cleaned_string.strip
|
|
|
- .delete("\t\n\r\u0000")
|
|
|
- .gsub(%r{/\*.*?\*/}, '')
|
|
|
- .gsub(%r{<!--.*?-->}, '')
|
|
|
-
|
|
|
- sanitize_attachment_disposition(cleaned_string)
|
|
|
- end
|
|
|
-
|
|
|
- def self.sanitize_attachment_disposition(url)
|
|
|
- @fqdn ||= Setting.get('fqdn')
|
|
|
- uri = URI(url)
|
|
|
-
|
|
|
- if uri.host == @fqdn && uri.query.present?
|
|
|
- params = CGI.parse(uri.query || '')
|
|
|
- .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
|
|
|
- uri.query = URI.encode_www_form(params)
|
|
|
- end
|
|
|
-
|
|
|
- uri.to_s
|
|
|
- rescue
|
|
|
- url
|
|
|
- end
|
|
|
-
|
|
|
- def self.url_same?(url_new, url_old)
|
|
|
- url_new = CGI.unescape(url_new.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(%r{[[:space:]]|\t|\n|\r}, '').strip
|
|
|
- url_old = CGI.unescape(url_old.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(%r{[[:space:]]|\t|\n|\r}, '').strip
|
|
|
- url_new = html_decode(url_new).sub('/?', '?')
|
|
|
- url_old = html_decode(url_old).sub('/?', '?')
|
|
|
- return true if url_new == url_old
|
|
|
- return true if url_old == "http://#{url_new}"
|
|
|
- return true if url_new == "http://#{url_old}"
|
|
|
- return true if url_old == "https://#{url_new}"
|
|
|
- return true if url_new == "https://#{url_old}"
|
|
|
-
|
|
|
- false
|
|
|
+ HtmlSanitizer::Cleanup.new.sanitize(string, timeout: timeout)
|
|
|
end
|
|
|
|
|
|
=begin
|
|
@@ -419,36 +40,7 @@ replace inline images with cid images
|
|
|
=end
|
|
|
|
|
|
def self.replace_inline_images(string, prefix = SecureRandom.uuid)
|
|
|
- fqdn = Setting.get('fqdn')
|
|
|
- attachments_inline = []
|
|
|
- filename_counter = 0
|
|
|
- scrubber = Loofah::Scrubber.new do |node|
|
|
|
- if node.name == 'img'
|
|
|
- if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
|
|
|
- filename_counter += 1
|
|
|
- file_attributes = StaticAssets.data_url_attributes($1)
|
|
|
- cid = "#{prefix}.#{SecureRandom.uuid}@#{fqdn}"
|
|
|
- filename = cid
|
|
|
- if file_attributes[:file_extention].present?
|
|
|
- filename = "image#{filename_counter}.#{file_attributes[:file_extention]}"
|
|
|
- end
|
|
|
- attachment = {
|
|
|
- data: file_attributes[:content],
|
|
|
- filename: filename,
|
|
|
- preferences: {
|
|
|
- 'Content-Type' => file_attributes[:mime_type],
|
|
|
- 'Mime-Type' => file_attributes[:mime_type],
|
|
|
- 'Content-ID' => cid,
|
|
|
- 'Content-Disposition' => 'inline',
|
|
|
- },
|
|
|
- }
|
|
|
- attachments_inline.push attachment
|
|
|
- node['src'] = "cid:#{cid}"
|
|
|
- end
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- end
|
|
|
- [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
|
|
|
+ HtmlSanitizer::ReplaceInlineImages.new.sanitize(string, prefix)
|
|
|
end
|
|
|
|
|
|
=begin
|
|
@@ -460,35 +52,7 @@ sanitize style of img tags
|
|
|
=end
|
|
|
|
|
|
def self.dynamic_image_size(string)
|
|
|
- scrubber = Loofah::Scrubber.new do |node|
|
|
|
- if node.name == 'img'
|
|
|
- if node['src']
|
|
|
- style = 'max-width:100%;'
|
|
|
- if node['style']
|
|
|
- pears = node['style'].downcase.gsub(%r{\t|\n|\r}, '').split(';')
|
|
|
- pears.each do |local_pear|
|
|
|
- prop = local_pear.split(':')
|
|
|
- next if !prop[0]
|
|
|
-
|
|
|
- key = prop[0].strip
|
|
|
- if key == 'height'
|
|
|
- key = 'max-height'
|
|
|
- end
|
|
|
- style += "#{key}:#{prop[1]};"
|
|
|
- end
|
|
|
- end
|
|
|
- node['style'] = style
|
|
|
- end
|
|
|
- Loofah::Scrubber::STOP
|
|
|
- end
|
|
|
- end
|
|
|
- Loofah.fragment(string).scrub!(scrubber).to_s
|
|
|
+ HtmlSanitizer::DynamicImageSize.new.sanitize(string)
|
|
|
end
|
|
|
|
|
|
- private_class_method :cleanup_target
|
|
|
- private_class_method :sanitize_attachment_disposition
|
|
|
- private_class_method :add_link
|
|
|
- private_class_method :url_same?
|
|
|
- private_class_method :html_decode
|
|
|
-
|
|
|
end
|