# link.rb (6.1 KB)

# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/
  2. class HtmlSanitizer
  3. module Scrubber
  4. class Link < Base
  5. LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
  6. attr_reader :external, :web_app_url_prefix
  7. def initialize(web_app_url_prefix:, external: false) # rubocop:disable Lint/MissingSuper
  8. @direction = :top_down
  9. @external = external
  10. @web_app_url_prefix = web_app_url_prefix
  11. end
  12. def scrub(node)
  13. if (urls = node_urls(node))
  14. return if urls.blank?
  15. add_link(node.content, urls, node)
  16. end
  17. # prepare links
  18. return if href_cleanup(node)
  19. return STOP if ensure_href_present(node)
  20. update_node_title(node)
  21. end
  22. private
  23. def href_cleanup(node)
  24. return if !node['href']
  25. href = cleanup_target(node['href'], keep_spaces: true)
  26. href_without_spaces = href.gsub(%r{[[:space:]]}, '')
  27. if href_retry_protocol?(href_without_spaces)
  28. node['href'] = "http://#{node['href']}"
  29. href = node['href']
  30. href_without_spaces = href.gsub(%r{[[:space:]]}, '')
  31. end
  32. return true if !href_starts_with_protocol?(href_without_spaces)
  33. href_set_values(node, href)
  34. false
  35. end
  36. def href_retry_protocol?(href_without_spaces)
  37. return if !external
  38. return if href_without_spaces.blank?
  39. return if href_without_spaces.downcase.start_with?('mailto:')
  40. return if href_without_spaces.downcase.start_with?('//')
  41. return if href_without_spaces.downcase.match? %r{^.{1,6}://.+?}
  42. true
  43. end
  44. def href_starts_with_protocol?(href_without_spaces)
  45. CGI
  46. .unescape(href_without_spaces)
  47. .utf8_encode(fallback: :read_as_sanitized_binary)
  48. .gsub(%r{[[:space:]]}, '')
  49. .downcase
  50. .start_with?('http', 'ftp', '//')
  51. end
  52. def href_set_values(node, value)
  53. node.set_attribute('href', value)
  54. node.set_attribute('rel', 'nofollow noreferrer noopener')
  55. # do not "target=_blank" WebApp URLs (e.g. mentions)
  56. return if value.downcase.start_with?(web_app_url_prefix)
  57. node.set_attribute('target', '_blank')
  58. end
  59. def node_urls(node)
  60. return if !node.is_a?(Nokogiri::XML::Text)
  61. return if node.content.blank?
  62. return if node.content.exclude?(':')
  63. return if node.ancestors.map(&:name).intersection(%w[a pre]).any?
  64. URI.extract(node.content, LINKABLE_URL_SCHEMES)
  65. .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
  66. .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'
  67. end
  68. def ensure_href_present(node)
  69. return if node.name != 'a'
  70. return if node['href'].present?
  71. node.replace node.children.to_s
  72. true
  73. end
  74. def update_node_title(node)
  75. return if node.name != 'a'
  76. return if url_same?(node['href'], node.text)
  77. return if node['title'].present?
  78. node['title'] = node['href']
  79. end
  80. def add_link(content, urls, node)
  81. return if add_link_blank_text(content, urls, node)
  82. url = urls.shift
  83. return if content !~ %r{^(.*)#{Regexp.quote(url)}(.*)$}mx
  84. pre = $1
  85. post = $2
  86. a_elem = add_link_build_node(node, url)
  87. if node.class != Nokogiri::XML::Text
  88. text = Nokogiri::XML::Text.new(pre, node.document)
  89. node.add_next_sibling(text).add_next_sibling(a_elem)
  90. return if post.blank?
  91. add_link(post, urls, a_elem)
  92. return
  93. end
  94. add_link_apply_to_node(node, pre, a_elem)
  95. return if post.blank?
  96. add_link(post, urls, a_elem)
  97. end
  98. def add_link_apply_to_node(node, pre, a_elem)
  99. node.content = pre
  100. node.add_next_sibling(a_elem)
  101. end
  102. def add_link_blank_text(content, urls, node)
  103. return false if urls.present?
  104. text = Nokogiri::XML::Text.new(content, node.document)
  105. node.add_next_sibling(text)
  106. true
  107. end
  108. def add_link_build_node(node, url)
  109. if url.match?(%r{^www}i)
  110. url = "http://#{url}"
  111. end
  112. a = Nokogiri::XML::Node.new 'a', node.document
  113. a['href'] = url
  114. a['rel'] = 'nofollow noreferrer noopener'
  115. a['target'] = '_blank'
  116. a.content = url
  117. a
  118. end
  119. def cleanup_target(string, **options)
  120. cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
  121. cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
  122. cleaned_string = cleaned_string.strip
  123. .delete("\t\n\r\u0000")
  124. .gsub(%r{/\*.*?\*/}, '')
  125. .gsub(%r{<!--.*?-->}, '')
  126. sanitize_attachment_disposition(cleaned_string)
  127. end
  128. def sanitize_attachment_disposition(url)
  129. @fqdn ||= Setting.get('fqdn')
  130. uri = URI(url)
  131. if uri.host == @fqdn && uri.query.present?
  132. params = CGI.parse(uri.query || '')
  133. .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
  134. uri.query = URI.encode_www_form(params)
  135. end
  136. uri.to_s
  137. rescue
  138. url
  139. end
  140. def url_same?(url_new, url_old)
  141. url_new = url_same_build(url_new)
  142. url_old = url_same_build(url_old)
  143. return true if url_new == url_old
  144. return true if url_old == "http://#{url_new}"
  145. return true if url_new == "http://#{url_old}"
  146. return true if url_old == "https://#{url_new}"
  147. return true if url_new == "https://#{url_old}"
  148. false
  149. end
  150. def url_same_build(input)
  151. url = CGI
  152. .unescape(input.to_s)
  153. .utf8_encode(fallback: :read_as_sanitized_binary)
  154. .downcase
  155. .delete_suffix('/')
  156. .gsub(%r{[[:space:]]|\t|\n|\r}, '')
  157. .strip
  158. html_decode(url)
  159. .sub('/?', '?')
  160. end
  161. end
  162. end
  163. end