123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
# Copyright (C) 2012-2024 Zammad Foundation, https://zammad-foundation.org/
class HtmlSanitizer
  module Scrubber
    # Scrubber that converts plain-text URLs found in text nodes into <a>
    # elements and normalizes existing <a> elements (href cleanup, rel/target
    # attributes, title attribute).
    class Link < Base
      # URL schemes that are auto-linked inside text nodes: every scheme known
      # to URI, without "mailto", plus "tel".
      LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']

      attr_reader :external, :web_app_url_prefix

      # @param web_app_url_prefix [String] links whose downcased href starts
      #   with this prefix do not get target="_blank"
      # @param external [Boolean] when true, hrefs without a protocol get
      #   "http://" prepended (see #href_retry_protocol?)
      def initialize(web_app_url_prefix:, external: false) # rubocop:disable Lint/MissingSuper
        @direction = :top_down
        @external = external
        @web_app_url_prefix = web_app_url_prefix
      end

      # Entry point called for each node.
      #
      # Text nodes containing linkable URLs are split into text and generated
      # <a> elements. Afterwards the node's href (if any) is cleaned up, <a>
      # elements without href are replaced by their children, and a title is
      # added where link text and href differ.
      #
      # @return [Object, nil] STOP when an <a> without href was replaced by its
      #   children (STOP is not defined in this file; presumably inherited),
      #   otherwise the result of the last step (callers should not rely on it)
      def scrub(node)
        if (urls = node_urls(node))
          return if urls.blank?

          add_link(node.content, urls, node)
        end

        # prepare links
        return if href_cleanup(node)

        return STOP if ensure_href_present(node)

        update_node_title(node)
      end

      private

      # Cleans the node's href and sets link attributes.
      #
      # @return [nil] when the node has no href
      # @return [true] when the href does not start with an accepted protocol
      #   (the node is left as it is and scrub stops processing it)
      # @return [false] when href/rel/target have been written
      def href_cleanup(node)
        return if !node['href']

        href                = cleanup_target(node['href'], keep_spaces: true)
        href_without_spaces = href.gsub(%r{[[:space:]]}, '')

        if href_retry_protocol?(href_without_spaces)
          node['href']        = "http://#{node['href']}"
          href                = node['href']
          href_without_spaces = href.gsub(%r{[[:space:]]}, '')
        end

        return true if !href_starts_with_protocol?(href_without_spaces)

        href_set_values(node, href)

        false
      end

      # Decides whether "http://" should be prepended to an href.
      # Only applies in external mode and only for non-blank hrefs that are
      # neither mailto:/tel: links, protocol-relative ("//"), nor already
      # carrying a short "<scheme>://" prefix.
      def href_retry_protocol?(href_without_spaces)
        return if !external
        return if href_without_spaces.blank?
        return if href_without_spaces.downcase.start_with?('mailto:')
        return if href_without_spaces.downcase.start_with?('tel:')
        return if href_without_spaces.downcase.start_with?('//')
        return if href_without_spaces.downcase.match? %r{^.{1,6}://.+?}

        true
      end

      # Checks whether the href, after URL-unescaping, whitespace removal and
      # downcasing, starts with "http", "ftp" or "//".
      def href_starts_with_protocol?(href_without_spaces)
        CGI
          .unescape(href_without_spaces)
          .utf8_encode(fallback: :read_as_sanitized_binary)
          .gsub(%r{[[:space:]]}, '')
          .downcase
          .start_with?('http', 'ftp', '//')
      end

      # Writes href and rel, and adds target="_blank" unless the link points
      # into the web app.
      def href_set_values(node, value)
        node.set_attribute('href', value)
        node.set_attribute('rel', 'nofollow noreferrer noopener')

        # do not "target=_blank" WebApp URLs (e.g. mentions)
        return if value.downcase.start_with?(web_app_url_prefix)

        node.set_attribute('target', '_blank')
      end

      # Extracts linkable URLs from a text node.
      #
      # @return [nil] when the node is not a text node, is blank, contains no
      #   ":" or sits inside an <a> or <pre> element
      # @return [Array<String>] extracted URLs in order of appearance
      def node_urls(node)
        return if !node.is_a?(Nokogiri::XML::Text)
        return if node.content.blank?
        return if node.content.exclude?(':')
        return if node.ancestors.map(&:name).intersection(%w[a pre]).any?

        URI.extract(node.content, LINKABLE_URL_SCHEMES)
          .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
          .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'
      end

      # Replaces an <a> element without href by its children.
      #
      # @return [true] when the node was replaced, nil otherwise
      def ensure_href_present(node)
        return if node.name != 'a'
        return if node['href'].present?

        node.replace node.children.to_s

        true
      end

      # Sets the title of an <a> element to its href when the link text differs
      # from the href and no title is present yet.
      def update_node_title(node)
        return if node.name != 'a'
        return if url_same?(node['href'], node.text)
        return if node['title'].present?

        node['title'] = node['href']
      end

      # Splits +content+ around the next URL of +urls+ and inserts a generated
      # <a> element after +node+, then continues with the remaining text.
      #
      # The FIRST occurrence of the URL is linked. Linking the last occurrence
      # would leave earlier occurrences (or longer URLs sharing the same
      # prefix) inside the already consumed text, so the following URL could
      # not be found anymore and the trailing text would be dropped.
      #
      # @param content [String] text that still has to be processed
      # @param urls [Array<String>] remaining URLs; consumed via #shift
      # @param node [Nokogiri::XML::Node] the original text node on the first
      #   call, the previously generated <a> element on follow-up calls
      def add_link(content, urls, node)
        return if add_link_blank_text(content, urls, node)

        url = urls.shift
        pre, matched, post = content.partition(url)

        if matched.empty?
          # URL is not part of the text. When continuing after a generated
          # link, keep the remaining text instead of losing it; the original
          # text node still holds its content and needs nothing.
          add_link_blank_text(content, [], node) if node.element?
          return
        end

        a_elem = add_link_build_node(node, url)

        if node.instance_of?(Nokogiri::XML::Text)
          add_link_apply_to_node(node, pre, a_elem)
        else
          text = Nokogiri::XML::Text.new(pre, node.document)
          node.add_next_sibling(text).add_next_sibling(a_elem)
        end

        return if post.blank?

        add_link(post, urls, a_elem)
      end

      # Shrinks the text node to the text before the URL and places the link
      # right after it.
      def add_link_apply_to_node(node, pre, a_elem)
        node.content = pre
        node.add_next_sibling(a_elem)
      end

      # Appends +content+ as a plain text node after +node+ once no URLs are
      # left.
      #
      # @return [Boolean] true when the text was appended, false when there are
      #   still URLs to process
      def add_link_blank_text(content, urls, node)
        return false if urls.present?

        text = Nokogiri::XML::Text.new(content, node.document)
        node.add_next_sibling(text)

        true
      end

      # Builds the <a> element for an auto-detected URL. URLs starting with
      # "www" get "http://" prepended; the (possibly prefixed) URL is used as
      # both href and link text.
      def add_link_build_node(node, url)
        if url.match?(%r{^www}i)
          url = "http://#{url}"
        end

        a = Nokogiri::XML::Node.new 'a', node.document
        a['href'] = url
        a['rel'] = 'nofollow noreferrer noopener'
        a['target'] = '_blank'
        a.content = url

        a
      end

      # Normalizes a link target: optionally removes all whitespace, strips
      # surrounding whitespace, deletes tab/newline/carriage return/NUL
      # characters and removes "/* ... */" and "<!-- ... -->" sequences.
      #
      # @param string [String] raw href
      # @param options [Hash] :keep_spaces - do not remove inner whitespace
      # @return [String] cleaned target
      def cleanup_target(string, **options)
        cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
        cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
        cleaned_string = cleaned_string.strip
                                       .delete("\t\n\r\u0000")
                                       .gsub(%r{/\*.*?\*/}, '')
                                       .gsub(%r{<!--.*?-->}, '')

        sanitize_attachment_disposition(cleaned_string)
      end

      # Forces "disposition=attachment" on URLs that point to the configured
      # fqdn and already carry a "disposition" query parameter.
      #
      # Best effort: if the URL cannot be processed (any StandardError), the
      # input is returned unchanged.
      def sanitize_attachment_disposition(url)
        @fqdn ||= Setting.get('fqdn')
        uri = URI(url)

        if uri.host == @fqdn && uri.query.present?
          params = CGI.parse(uri.query || '')
                      .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
          uri.query = URI.encode_www_form(params)
        end

        uri.to_s
      rescue
        url
      end

      # Compares two URLs after normalization, treating a missing "http://" or
      # "https://" prefix on either side as equal.
      def url_same?(url_new, url_old)
        url_new = url_same_build(url_new)
        url_old = url_same_build(url_old)

        return true if url_new == url_old
        return true if url_old == "http://#{url_new}"
        return true if url_new == "http://#{url_old}"
        return true if url_old == "https://#{url_new}"
        return true if url_new == "https://#{url_old}"

        false
      end

      # Normalizes a URL for comparison: URL-unescape, downcase, drop one
      # trailing "/", remove whitespace, decode HTML entities via html_decode
      # (not defined in this file; presumably inherited) and replace the first
      # "/?" with "?".
      def url_same_build(input)
        url = CGI
          .unescape(input.to_s)
          .utf8_encode(fallback: :read_as_sanitized_binary)
          .downcase
          .delete_suffix('/')
          .gsub(%r{[[:space:]]|\t|\n|\r}, '')
          .strip

        html_decode(url)
          .sub('/?', '?')
      end
    end
  end
end
|