html_sanitizer.rb

# Copyright (C) 2012-2022 Zammad Foundation, https://zammad-foundation.org/

class HtmlSanitizer
  LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
  PROCESSING_TIMEOUT = 20
  UNPROCESSABLE_HTML_MSG = __('This message cannot be displayed due to HTML processing issues. Download the raw message below and open it via an Email client if you still wish to view it.').freeze

=begin

sanitize html string based on allowlist

  string = HtmlSanitizer.strict(string, external)

=end

  def self.strict(string, external = false, timeout: true)
    Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
      @fqdn = Setting.get('fqdn')
      http_type = Setting.get('http_type')
      web_app_url_prefix = "#{http_type}://#{@fqdn}/\#".downcase

      # config
      tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
      tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
      tags_allowlist = Rails.configuration.html_sanitizer_tags_allowlist
      attributes_allowlist = Rails.configuration.html_sanitizer_attributes_allowlist
      css_properties_allowlist = Rails.configuration.html_sanitizer_css_properties_allowlist
      css_values_blocklist = Rails.application.config.html_sanitizer_css_values_blocklist

      # We allowlist yahoo_quoted because Yahoo Mail marks quoted email content using
      # <div class='yahoo_quoted'> and we rely on this class to identify quoted messages
      classes_allowlist = %w[js-signatureMarker yahoo_quoted]
      attributes_2_css = %w[width height]

      # remove tags with subtree
      scrubber_tag_remove = Loofah::Scrubber.new do |node|
        next if tags_remove_content.exclude?(node.name)

        node.remove
        Loofah::Scrubber::STOP
      end
      string = Loofah.fragment(string).scrub!(scrubber_tag_remove).to_s

      # remove tag, insert quoted content
      scrubber_wipe_quote_content = Loofah::Scrubber.new do |node|
        next if tags_quote_content.exclude?(node.name)

        string = html_decode(node.content)
        text = Nokogiri::XML::Text.new(string, node.document)
        node.add_next_sibling(text)
        node.remove
        Loofah::Scrubber::STOP
      end
      string = Loofah.fragment(string).scrub!(scrubber_wipe_quote_content).to_s

      scrubber_wipe = Loofah::Scrubber.new do |node|
        # replace tags, keep subtree
        if tags_allowlist.exclude?(node.name)
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        end

        # prepare src attribute
        if node['src']
          src = cleanup_target(CGI.unescape(node['src']))
          if src =~ %r{(javascript|livescript|vbscript):}i || src.downcase.start_with?('http', 'ftp', '//')
            node.remove
            Loofah::Scrubber::STOP
          end
        end

        # clean class / only use allowed classes
        if node['class']
          classes = node['class'].gsub(%r{\t|\n|\r}, '').split
          class_new = ''
          classes.each do |local_class|
            next if classes_allowlist.exclude?(local_class.to_s.strip)

            if class_new != ''
              class_new += ' '
            end
            class_new += local_class
          end
          if class_new == ''
            node.delete('class')
          else
            node['class'] = class_new
          end
        end

        # move width/height attributes into style properties
        attributes_2_css.each do |key|
          next if !node[key]

          if node['style'].blank?
            node['style'] = ''
          else
            node['style'] += ';'
          end
          value = node[key]
          node.delete(key)
          next if value.blank?

          value += 'px' if !value.match?(%r{%|px|em}i)
          node['style'] += "#{key}:#{value}"
        end

        # clean style / only use allowed style properties
        if node['style']
          pears = node['style'].downcase.gsub(%r{\t|\n|\r}, '').split(';')
          style = ''
          pears.each do |local_pear|
            prop = local_pear.split(':')
            next if !prop[0]

            key = prop[0].strip
            next if css_properties_allowlist.exclude?(node.name)
            next if css_properties_allowlist[node.name].exclude?(key)
            next if css_values_blocklist[node.name]&.include?(local_pear.gsub(%r{[[:space:]]}, '').strip)

            style += "#{local_pear};"
          end
          node['style'] = style
          if style == ''
            node.delete('style')
          end
        end

        # scan for invalid link content
        %w[href style].each do |attribute_name|
          next if !node[attribute_name]

          href = cleanup_target(node[attribute_name])
          next if !href.match?(%r{(javascript|livescript|vbscript):}i)

          node.delete(attribute_name)
        end

        # remove attributes if not allowlisted
        node.each do |attribute, _value|
          attribute_name = attribute.downcase
          next if attributes_allowlist[:all].include?(attribute_name) || attributes_allowlist[node.name]&.include?(attribute_name)

          node.delete(attribute)
        end
      end

      done = true
      while done
        new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
        if string == new_string
          done = false
        end
        string = new_string
      end

      scrubber_link = Loofah::Scrubber.new do |node|
        # wrap plain-text URLs in <a> tags
        if node.is_a?(Nokogiri::XML::Text) && node.content.present? && node.content.include?(':')
          node_ancestor_names = node.ancestors.map(&:name)
          if node_ancestor_names.exclude?('a') && node_ancestor_names.exclude?('pre')
            urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
                      .map { |u| u.sub(%r{[,.]$}, '') } # URI::extract captures trailing dots/commas
                      .grep_v(%r{^[^:]+:$}) # URI::extract will match, e.g., 'tel:'

            next if urls.blank?

            add_link(node.content, urls, node)
          end
        end

        # prepare links
        if node['href']
          href = cleanup_target(node['href'], keep_spaces: true)
          href_without_spaces = href.gsub(%r{[[:space:]]}, '')
          if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('mailto:') && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
            node['href'] = "http://#{node['href']}"
            href = node['href']
            href_without_spaces = href.gsub(%r{[[:space:]]}, '')
          end

          next if !CGI.unescape(href_without_spaces).utf8_encode(fallback: :read_as_sanitized_binary).gsub(%r{[[:space:]]}, '').downcase.start_with?('http', 'ftp', '//')

          node.set_attribute('href', href)
          node.set_attribute('rel', 'nofollow noreferrer noopener')

          # do not add "target=_blank" to WebApp URLs (e.g. mentions)
          if !href.downcase.start_with?(web_app_url_prefix)
            node.set_attribute('target', '_blank')
          end
        end

        if node.name == 'a' && node['href'].blank?
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        end

        # add a title attribute if the link text differs from the href
        if node.name == 'a' && !url_same?(node['href'], node.text) && node['title'].blank?
          node['title'] = node['href']
        end
      end

      Loofah.fragment(string).scrub!(scrubber_link).to_s
    end
  rescue Timeout::Error
    Rails.logger.error "Could not process string via HtmlSanitizer.strict in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
    UNPROCESSABLE_HTML_MSG
  end

=begin

cleanup html string:

 * remove empty nodes (p, div, span, table)
 * remove nodes in general (keep content - span)

  string = HtmlSanitizer.cleanup(string)

=end

  def self.cleanup(string, timeout: true)
    Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
      string.gsub!(%r{<[A-z]:[A-z]>}, '')
      string.gsub!(%r{</[A-z]:[A-z]>}, '')
      string.delete!("\t")

      # normalize line endings to \n
      string.gsub!(%r{(\n\r|\r\r\n|\r\n|\n)}, "\n")

      # collapse runs of empty lines into a single empty line
      string.gsub!(%r{\n\n\n+}, "\n\n")

      string = cleanup_structure(string, 'pre')
      string = cleanup_structure(string)
      string
    end
  rescue Timeout::Error
    Rails.logger.error "Could not process string via HtmlSanitizer.cleanup in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
    UNPROCESSABLE_HTML_MSG
  end
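
  # Recursively collapses redundant nesting: a parent is merged into its only
  # child when both share the same tag name, and nodes from the given
  # empty-node lists are dropped once they carry neither content nor attributes.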

  def self.remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
    if node.children.present?
      if node.children.size == 1
        local_name = node.name
        child = node.children.first

        # replace not needed node (parent <- child)
        if local_name == child.name && node.attributes.present? && node.children.first.attributes.blank?
          local_node_child = node.children.first
          node.attributes.each do |k|
            local_node_child.set_attribute(k[0], k[1])
          end
          node.replace local_node_child.to_s
          Loofah::Scrubber::STOP

        # replace not needed node (parent replaced with child node)
        elsif (local_name == 'span' || local_name == child.name) && node.attributes.blank?
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        end
      else
        # loop through nodes
        node.children.each do |local_node|
          remove_last_empty_node(local_node, remove_empty_nodes, remove_empty_last_nodes)
        end
      end

    # remove empty nodes
    elsif (remove_empty_nodes.include?(node.name) || remove_empty_last_nodes.include?(node.name)) && node.content.blank? && node.attributes.blank?
      node.remove
      Loofah::Scrubber::STOP
    end
  end
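
  # Applies remove_last_empty_node until the markup stops changing, then
  # collapses surplus whitespace in text nodes outside of pre/code blocks.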

  def self.cleanup_structure(string, type = 'all')
    remove_empty_nodes = if type == 'pre'
                           %w[span]
                         else
                           %w[p div span small table]
                         end
    remove_empty_last_nodes = %w[b i u small table]

    # remove last empty nodes and empty, not needed parent nodes
    scrubber_structure = Loofah::Scrubber.new do |node|
      remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
    end

    done = true
    while done
      new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
      if string == new_string
        done = false
      end
      string = new_string
    end

    scrubber_cleanup = Loofah::Scrubber.new do |node|
      # remove not needed whitespace
      if node.instance_of?(Nokogiri::XML::Text)
        if !node.parent || (node.parent.name != 'pre' && node.parent.name != 'code') # rubocop:disable Style/SoleNestedConditional
          content = node.content
          if content
            if content != ' ' && content != "\n"
              content.gsub!(%r{[[:space:]]+}, ' ')
            end
            if node.previous
              if node.previous.name == 'div' || node.previous.name == 'p'
                content.strip!
              end
            elsif node.parent && !node.previous && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
              if (node.parent.name == 'div' || node.parent.name == 'p') && content != ' ' && content != "\n"
                content.strip!
              end
            end
            node.content = content
          end
        end
      end
    end
    Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
  end
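
  # Wraps the given plain-text URLs in <a> tags: splits the content around the
  # first URL, splices an anchor node next to the original node and recurses
  # over the remaining text and URLs.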

  def self.add_link(content, urls, node)
    if urls.blank?
      text = Nokogiri::XML::Text.new(content, node.document)
      node.add_next_sibling(text)
      return
    end

    url = urls.shift
    if content =~ %r{^(.*)#{Regexp.quote(url)}(.*)$}mx
      pre = $1
      post = $2

      if url.match?(%r{^www}i)
        url = "http://#{url}"
      end

      a = Nokogiri::XML::Node.new 'a', node.document
      a['href'] = url
      a['rel'] = 'nofollow noreferrer noopener'
      a['target'] = '_blank'
      a.content = url

      if node.class != Nokogiri::XML::Text
        text = Nokogiri::XML::Text.new(pre, node.document)
        node.add_next_sibling(text).add_next_sibling(a)
        return if post.blank?

        add_link(post, urls, a)
        return
      end

      node.content = pre
      node.add_next_sibling(a)
      return if post.blank?

      add_link(post, urls, a)
    end

    true
  end
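
  # Decodes the small set of HTML entities (&amp;, &lt;, &gt;, &quot;, &nbsp;)
  # needed when quoted tag content is turned back into plain text.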

  def self.html_decode(string)
    string.gsub('&amp;', '&').gsub('&lt;', '<').gsub('&gt;', '>').gsub('&quot;', '"').gsub('&nbsp;', ' ')
  end
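
  # Normalizes a link or src target before it is inspected: forces UTF-8,
  # removes whitespace (unless keep_spaces is set) and control characters, and
  # strips embedded CSS/HTML comments.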

  def self.cleanup_target(string, **options)
    cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
    cleaned_string = cleaned_string.gsub(%r{[[:space:]]}, '') if !options[:keep_spaces]
    cleaned_string = cleaned_string.strip
                                   .delete("\t\n\r\u0000")
                                   .gsub(%r{/\*.*?\*/}, '')
                                   .gsub(%r{<!--.*?-->}, '')

    sanitize_attachment_disposition(cleaned_string)
  end
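
  # For URLs pointing at the local FQDN that already carry a "disposition"
  # query parameter, rewrites that parameter to "attachment" so the linked file
  # is offered as a download instead of being rendered inline.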

  def self.sanitize_attachment_disposition(url)
    @fqdn ||= Setting.get('fqdn')
    uri = URI(url)

    if uri.host == @fqdn && uri.query.present?
      params = CGI.parse(uri.query || '')
                  .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
      uri.query = URI.encode_www_form(params)
    end

    uri.to_s
  rescue
    url
  end
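
  # Compares two URLs after unescaping, entity decoding and whitespace removal;
  # "example.com" and "http(s)://example.com" count as the same target.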

  def self.url_same?(url_new, url_old)
    url_new = CGI.unescape(url_new.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(%r{[[:space:]]|\t|\n|\r}, '').strip
    url_old = CGI.unescape(url_old.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(%r{[[:space:]]|\t|\n|\r}, '').strip
    url_new = html_decode(url_new).sub('/?', '?')
    url_old = html_decode(url_old).sub('/?', '?')
    return true if url_new == url_old
    return true if url_old == "http://#{url_new}"
    return true if url_new == "http://#{url_old}"
    return true if url_old == "https://#{url_new}"
    return true if url_new == "https://#{url_old}"

    false
  end

=begin

replace inline images with cid images

  string, attachments_inline = HtmlSanitizer.replace_inline_images(article.body)

=end

  def self.replace_inline_images(string, prefix = SecureRandom.uuid)
    fqdn = Setting.get('fqdn')
    attachments_inline = []
    filename_counter = 0
    scrubber = Loofah::Scrubber.new do |node|
      if node.name == 'img'
        if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
          filename_counter += 1
          file_attributes = StaticAssets.data_url_attributes($1)
          cid = "#{prefix}.#{SecureRandom.uuid}@#{fqdn}"
          filename = cid
          if file_attributes[:file_extention].present?
            filename = "image#{filename_counter}.#{file_attributes[:file_extention]}"
          end
          attachment = {
            data: file_attributes[:content],
            filename: filename,
            preferences: {
              'Content-Type' => file_attributes[:mime_type],
              'Mime-Type' => file_attributes[:mime_type],
              'Content-ID' => cid,
              'Content-Disposition' => 'inline',
            },
          }
          attachments_inline.push attachment
          node['src'] = "cid:#{cid}"
        end
        Loofah::Scrubber::STOP
      end
    end
    [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
  end
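
  # Illustrative usage sketch (article is an assumed record with an HTML body):
  #
  #   body, attachments = HtmlSanitizer.replace_inline_images(article.body)
  #   # body now references the images via cid: URLs; attachments holds the
  #   # decoded image data plus Content-ID/Content-Disposition preferences.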

=begin

sanitize style of img tags

  string = HtmlSanitizer.dynamic_image_size(article.body)

=end

  def self.dynamic_image_size(string)
    scrubber = Loofah::Scrubber.new do |node|
      if node.name == 'img'
        if node['src']
          style = 'max-width:100%;'
          if node['style']
            pears = node['style'].downcase.gsub(%r{\t|\n|\r}, '').split(';')
            pears.each do |local_pear|
              prop = local_pear.split(':')
              next if !prop[0]

              key = prop[0].strip
              if key == 'height'
                key = 'max-height'
              end
              style += "#{key}:#{prop[1]};"
            end
          end
          node['style'] = style
        end
        Loofah::Scrubber::STOP
      end
    end
    Loofah.fragment(string).scrub!(scrubber).to_s
  end
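
  # Illustrative usage sketch (the input markup is an assumed example):
  #
  #   HtmlSanitizer.dynamic_image_size('<img src="cid:1" style="height:200px;">')
  #   # => roughly '<img src="cid:1" style="max-width:100%;max-height:200px;">'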

  private_class_method :cleanup_target
  private_class_method :sanitize_attachment_disposition
  private_class_method :add_link
  private_class_method :url_same?
  private_class_method :html_decode
end