# Copyright (C) 2012-2023 Zammad Foundation, https://zammad-foundation.org/
  class HtmlSanitizer
    module Scrubber
      # Scrubber that turns URLs found in plain text nodes into anchors and
      # normalizes the `href`, `rel`, `target` and `title` attributes of anchor
      # elements.
      class Link < Base
        # Schemes that are auto-linked when found in plain text: every scheme
        # registered with URI, minus `mailto`, plus `tel`.
        LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']

        attr_reader :external, :web_app_url_prefix

        # @param web_app_url_prefix [String] hrefs starting with this prefix do
        #   not get `target="_blank"` (see #href_set_values). The href is
        #   downcased before the comparison, the prefix is not.
        # @param external [Boolean] when truthy, hrefs without a protocol are
        #   retried with an `http://` prefix (see #href_retry_protocol?).
        def initialize(web_app_url_prefix:, external: false) # rubocop:disable Lint/MissingSuper
          # NOTE(review): presumably read by the scrubber base class to pick the
          # traversal order - confirm against Base.
          @direction = :top_down

          @external           = external
          @web_app_url_prefix = web_app_url_prefix
        end

        # Processes a single node.
        #
        # Text nodes containing linkable URLs get those URLs wrapped in anchors.
        # Nodes carrying an `href` get it cleaned up; anchors left without an
        # `href` are replaced by their children; remaining anchors get a `title`.
        #
        # @param node [Nokogiri::XML::Node]
        # @return [Object] STOP when the anchor was replaced by its children,
        #   otherwise a value without further meaning to this class.
        def scrub(node)
          if (urls = node_urls(node))
            return if urls.blank?

            add_link(node.content, urls, node)
          end

          # prepare links
          return if href_cleanup(node)
          return STOP if ensure_href_present(node)

          update_node_title(node)
        end

        private

        # Cleans the node's `href` and, if it points to an http/ftp/protocol
        # relative target, rewrites the link attributes.
        #
        # @return [nil] when the node has no `href`
        # @return [true] when the href does not start with a supported protocol
        #   (the node is left untouched apart from a possible `http://` retry)
        # @return [false] when the link attributes were rewritten
        def href_cleanup(node)
          return if !node['href']

          href                = cleanup_target(node['href'], keep_spaces: true)
          href_without_spaces = href.gsub(%r{[[:space:]]}, '')

          if href_retry_protocol?(href_without_spaces)
            # Prefix the *cleaned* value. Prefixing the raw attribute would throw
            # away the result of cleanup_target (e.g. " example.com" would end up
            # as "http:// example.com").
            href                = "http://#{href}"
            node['href']        = href
            href_without_spaces = href.gsub(%r{[[:space:]]}, '')
          end

          return true if !href_starts_with_protocol?(href_without_spaces)

          href_set_values(node, href)
          false
        end

        # Decides whether a protocol-less href should be retried with `http://`.
        #
        # @param href_without_spaces [String] href with all whitespace removed
        # @return [true, nil] true only for external content whose href is not
        #   blank, not `mailto:`/`tel:`, not protocol relative and does not
        #   already look like `<1-6 chars>://...`
        def href_retry_protocol?(href_without_spaces)
          return if !external
          return if href_without_spaces.blank?

          downcased = href_without_spaces.downcase

          return if downcased.start_with?('mailto:', 'tel:', '//')
          return if downcased.match? %r{^.{1,6}://.+?}

          true
        end

        # Checks - after URL-unescaping, whitespace removal and downcasing -
        # whether the href starts with `http`, `ftp` or `//`.
        #
        # @return [Boolean]
        def href_starts_with_protocol?(href_without_spaces)
          CGI
            .unescape(href_without_spaces)
            .utf8_encode(fallback: :read_as_sanitized_binary)
            .gsub(%r{[[:space:]]}, '')
            .downcase
            .start_with?('http', 'ftp', '//')
        end

        # Writes the cleaned href and the hardening attributes onto the node.
        def href_set_values(node, value)
          node.set_attribute('href', value)
          node.set_attribute('rel', 'nofollow noreferrer noopener')

          # do not "target=_blank" WebApp URLs (e.g. mentions)
          return if value.downcase.start_with?(web_app_url_prefix)

          node.set_attribute('target', '_blank')
        end

        # Extracts the linkable URLs of a text node.
        #
        # @return [nil] when the node is not a text node, is blank, contains no
        #   colon or lives inside an `a` or `pre` element
        # @return [Array<String>] the URLs in order of extraction (may be empty)
        def node_urls(node)
          return if !node.is_a?(Nokogiri::XML::Text)
          return if node.content.blank?
          return if node.content.exclude?(':')
          return if node.ancestors.map(&:name).intersection(%w[a pre]).any?

          URI.extract(node.content, LINKABLE_URL_SCHEMES)
            .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
            .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'
        end

        # Replaces an anchor that has no usable `href` by its children.
        #
        # @return [true, nil] true when the anchor was replaced
        def ensure_href_present(node)
          return if node.name != 'a'
          return if node['href'].present?

          node.replace node.children.to_s

          true
        end

        # Sets the anchor's `title` to its `href` unless the visible text already
        # equals the href (see #url_same?) or a title is present.
        def update_node_title(node)
          return if node.name != 'a'
          return if url_same?(node['href'], node.text)
          return if node['title'].present?

          node['title'] = node['href']
        end

        # Recursively replaces the URLs inside `content` by anchor elements.
        #
        # On each step the text in front of the next URL stays in (or is added
        # after) `node`, the anchor is inserted after it and the remaining text
        # is handled relative to the new anchor. A whitespace-only remainder is
        # discarded.
        #
        # @param content [String] text still to be processed
        # @param urls [Array<String>] URLs in order of appearance; consumed
        #   (mutated) from the front
        # @param node [Nokogiri::XML::Node] node after which new nodes are added
        def add_link(content, urls, node)
          return if add_link_blank_text(content, urls, node)

          url = urls.shift

          # Split at the FIRST occurrence of the URL (non-greedy prefix). A greedy
          # prefix splits at the last occurrence, which skips earlier occurrences
          # and loses the trailing text whenever a URL string appears again later
          # in the content (e.g. as the beginning of a longer URL).
          match = content.match(%r{\A(.*?)#{Regexp.quote(url)}(.*)\z}m)
          return if !match

          pre, post = match.captures
          a_elem    = add_link_build_node(node, url)

          if node.instance_of?(Nokogiri::XML::Text)
            add_link_apply_to_node(node, pre, a_elem)
          else
            text = Nokogiri::XML::Text.new(pre, node.document)
            node.add_next_sibling(text).add_next_sibling(a_elem)
          end

          return if post.blank?

          add_link(post, urls, a_elem)
        end

        # Shrinks the text node to the text preceding the URL and places the
        # anchor right after it.
        def add_link_apply_to_node(node, pre, a_elem)
          node.content = pre
          node.add_next_sibling(a_elem)
        end

        # Once no URLs are left, appends the remaining content as a text node
        # after `node`.
        #
        # @return [Boolean] true when the content was appended, false when there
        #   are still URLs to process
        def add_link_blank_text(content, urls, node)
          return false if urls.present?

          text = Nokogiri::XML::Text.new(content, node.document)
          node.add_next_sibling(text)

          true
        end

        # Builds the anchor element for an auto-detected URL. URLs starting with
        # "www" get an `http://` prefix.
        #
        # @return [Nokogiri::XML::Node]
        def add_link_build_node(node, url)
          if url.match?(%r{^www}i)
            url = "http://#{url}"
          end

          a           = Nokogiri::XML::Node.new 'a', node.document
          a['href']   = url
          a['rel']    = 'nofollow noreferrer noopener'
          a['target'] = '_blank'
          a.content   = url

          a
        end

        # Normalizes a link target: fixes the encoding, optionally removes all
        # whitespace, strips the ends, removes tabs/newlines/NUL bytes as well as
        # `/* */` and `<!-- -->` comments and finally enforces the attachment
        # disposition for local URLs.
        #
        # @param string [String]
        # @param options [Hash] `keep_spaces: true` keeps inner whitespace
        # @return [String]
        def cleanup_target(string, **options)
          cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
          cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
          cleaned_string = cleaned_string.strip
                                         .delete("\t\n\r\u0000")
                                         .gsub(%r{/\*.*?\*/}, '')
                                         .gsub(%r{<!--.*?-->}, '')

          sanitize_attachment_disposition(cleaned_string)
        end

        # For URLs on the configured FQDN that carry a query string, forces an
        # existing `disposition` parameter to `attachment` and re-encodes the
        # query.
        #
        # Best effort: if anything raises (e.g. the URL cannot be parsed), the
        # input is returned unchanged.
        #
        # @param url [String]
        # @return [String]
        def sanitize_attachment_disposition(url)
          @fqdn ||= Setting.get('fqdn')
          uri = URI(url)

          if uri.host == @fqdn && uri.query.present?
            params = CGI.parse(uri.query)
            params['disposition'] = 'attachment' if params.key?('disposition')
            uri.query = URI.encode_www_form(params)
          end

          uri.to_s
        rescue
          url
        end

        # Compares two URLs after normalization, treating a missing `http://` or
        # `https://` prefix on either side as equal.
        #
        # @return [Boolean]
        def url_same?(url_new, url_old)
          url_new = url_same_build(url_new)
          url_old = url_same_build(url_old)

          return true if url_new == url_old

          %w[http https].any? do |scheme|
            url_old == "#{scheme}://#{url_new}" || url_new == "#{scheme}://#{url_old}"
          end
        end

        # Normalizes a URL for comparison: URL-unescapes, fixes the encoding,
        # downcases, drops one trailing slash, removes whitespace, runs the
        # result through html_decode and turns the first "/?" into "?".
        #
        # @param input [#to_s]
        # @return [String]
        def url_same_build(input)
          url = CGI
            .unescape(input.to_s)
            .utf8_encode(fallback: :read_as_sanitized_binary)
            .downcase
            .delete_suffix('/')
            .gsub(%r{[[:space:]]|\t|\n|\r}, '')
            .strip

          html_decode(url)
            .sub('/?', '?')
        end
      end
    end
  end