html_sanitizer.rb

class HtmlSanitizer
  LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']

  PROCESSING_TIMEOUT = 20
  UNPROCESSABLE_HTML_MSG = 'This message cannot be displayed due to HTML processing issues. Download the raw message below and open it via an Email client if you still wish to view it.'.freeze

=begin

sanitize html string based on whitelist

  string = HtmlSanitizer.strict(string, external)

=end

  def self.strict(string, external = false, timeout: true)
    Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
      @fqdn = Setting.get('fqdn')

      # config
      tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
      tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
      tags_whitelist = Rails.configuration.html_sanitizer_tags_whitelist
      attributes_whitelist = Rails.configuration.html_sanitizer_attributes_whitelist
      css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
      css_values_blacklist = Rails.application.config.html_sanitizer_css_values_backlist

      # We whitelist yahoo_quoted because Yahoo Mail marks quoted email content using
      # <div class='yahoo_quoted'> and we rely on this class to identify quoted messages
      classes_whitelist = %w[js-signatureMarker yahoo_quoted]
      attributes_2_css = %w[width height]

      # remove tags with subtree
      scrubber_tag_remove = Loofah::Scrubber.new do |node|
        next if tags_remove_content.exclude?(node.name)

        node.remove
        Loofah::Scrubber::STOP
      end
      string = Loofah.fragment(string).scrub!(scrubber_tag_remove).to_s

      # remove tag, insert quoted content
      scrubber_wipe_quote_content = Loofah::Scrubber.new do |node|
        next if tags_quote_content.exclude?(node.name)

        string = html_decode(node.content)
        text = Nokogiri::XML::Text.new(string, node.document)
        node.add_next_sibling(text)
        node.remove
        Loofah::Scrubber::STOP
      end
      string = Loofah.fragment(string).scrub!(scrubber_wipe_quote_content).to_s

      scrubber_wipe = Loofah::Scrubber.new do |node|
        # replace tags, keep subtree
        if tags_whitelist.exclude?(node.name)
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        end

        # prepare src attribute
        if node['src']
          src = cleanup_target(CGI.unescape(node['src']))
          if src =~ /(javascript|livescript|vbscript):/i || src.downcase.start_with?('http', 'ftp', '//')
            node.remove
            Loofah::Scrubber::STOP
          end
        end

        # clean class / only use allowed classes
        if node['class']
          classes = node['class'].gsub(/\t|\n|\r/, '').split
          class_new = ''
          classes.each do |local_class|
            next if classes_whitelist.exclude?(local_class.to_s.strip)

            if class_new != ''
              class_new += ' '
            end
            class_new += local_class
          end
          if class_new == ''
            node.delete('class')
          else
            node['class'] = class_new
          end
        end

        # move style attributes to css attributes
        attributes_2_css.each do |key|
          next if !node[key]

          if node['style'].blank?
            node['style'] = ''
          else
            node['style'] += ';'
          end
          value = node[key]
          node.delete(key)
          next if value.blank?

          value += 'px' if !value.match?(/%|px|em/i)
          node['style'] += "#{key}:#{value}"
        end

        # clean style / only use allowed style properties
        if node['style']
          pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
          style = ''
          pears.each do |local_pear|
            prop = local_pear.split(':')
            next if !prop[0]

            key = prop[0].strip
            next if css_properties_whitelist.exclude?(node.name)
            next if css_properties_whitelist[node.name].exclude?(key)
            next if css_values_blacklist[node.name]&.include?(local_pear.gsub(/[[:space:]]/, '').strip)

            style += "#{local_pear};"
          end
          node['style'] = style
          if style == ''
            node.delete('style')
          end
        end

        # scan for invalid link content
        %w[href style].each do |attribute_name|
          next if !node[attribute_name]

          href = cleanup_target(node[attribute_name])
          next if !href.match?(/(javascript|livescript|vbscript):/i)

          node.delete(attribute_name)
        end

        # remove attributes if not whitelisted
        node.each do |attribute, _value|
          attribute_name = attribute.downcase
          next if attributes_whitelist[:all].include?(attribute_name) || attributes_whitelist[node.name]&.include?(attribute_name)

          node.delete(attribute)
        end

        # remove mailto links
        if node['href']
          href = cleanup_target(node['href'])
          if href =~ /mailto:(.*)$/i
            text = Nokogiri::XML::Text.new(CGI.unescape($1), node.document)
            node.add_next_sibling(text)
            node.remove
            Loofah::Scrubber::STOP
          end
        end
      end

      done = true
      while done
        new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
        if string == new_string
          done = false
        end
        string = new_string
      end

      scrubber_link = Loofah::Scrubber.new do |node|
        # wrap plain-text URLs in <a> tags
        if node.is_a?(Nokogiri::XML::Text) && node.content.present? && node.content.include?(':') && node.ancestors.map(&:name).exclude?('a')
          urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
                    .map { |u| u.sub(/[,.]$/, '') }      # URI::extract captures trailing dots/commas
                    .reject { |u| u.match?(/^[^:]+:$/) } # URI::extract will match, e.g., 'tel:'

          next if urls.blank?

          add_link(node.content, urls, node)
        end

        # prepare links
        if node['href']
          href = cleanup_target(node['href'], keep_spaces: true)
          href_without_spaces = href.gsub(/[[:space:]]/, '')
          if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
            node['href'] = "http://#{node['href']}"
            href = node['href']
            href_without_spaces = href.gsub(/[[:space:]]/, '')
          end

          next if !CGI.unescape(href_without_spaces).utf8_encode(fallback: :read_as_sanitized_binary).gsub(/[[:space:]]/, '').downcase.start_with?('http', 'ftp', '//')

          node.set_attribute('href', href)
          node.set_attribute('rel', 'nofollow noreferrer noopener')
          node.set_attribute('target', '_blank')
        end

        if node.name == 'a' && node['href'].blank?
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        end

        # check if href is different to text
        if node.name == 'a' && !url_same?(node['href'], node.text) && node['title'].blank?
          node['title'] = node['href']
        end
      end

      Loofah.fragment(string).scrub!(scrubber_link).to_s
    end
  rescue Timeout::Error
    Rails.logger.error "Could not process string via HtmlSanitizer.strict in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
    UNPROCESSABLE_HTML_MSG
  end
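
  # Illustrative behaviour sketch for self.strict (hypothetical inputs; exact output
  # depends on the whitelists configured via Rails.configuration.html_sanitizer_*):
  #
  #   HtmlSanitizer.strict('<a href="javascript:alert(1)">click</a>')
  #   # the javascript: href is dropped and the now link-less <a> is unwrapped => "click"
  #
  #   HtmlSanitizer.strict('Check http://example.com please')
  #   # the plain-text URL is wrapped in an <a href="http://example.com"
  #   # rel="nofollow noreferrer noopener" target="_blank"> element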

=begin

cleanup html string:

 * remove empty nodes (p, div, span, table)
 * remove nodes in general (keep content - span)

  string = HtmlSanitizer.cleanup(string)

=end

  def self.cleanup(string, timeout: true)
    Timeout.timeout(timeout ? PROCESSING_TIMEOUT : nil) do
      string.gsub!(/<[A-z]:[A-z]>/, '')
      string.gsub!(%r{</[A-z]:[A-z]>}, '')
      string.delete!("\t")

      # normalize line endings to \n
      string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, "\n")

      # reduce runs of empty lines to a single empty line
      string.gsub!(/\n\n\n+/, "\n\n")

      string = cleanup_structure(string, 'pre')
      string = cleanup_replace_tags(string)
      string = cleanup_structure(string)
      string
    end
  rescue Timeout::Error
    Rails.logger.error "Could not process string via HtmlSanitizer.cleanup in #{PROCESSING_TIMEOUT} seconds. Current state: #{string}"
    UNPROCESSABLE_HTML_MSG
  end
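
  # Illustrative behaviour sketch for self.cleanup (hypothetical input):
  #
  #   HtmlSanitizer.cleanup("<div>\n\n\n\n<span></span>some  text</div>")
  #   # surplus blank lines are collapsed, the empty <span> is removed and
  #   # runs of whitespace inside text nodes are reduced to a single space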

  # Recursively unwrap redundant wrapper nodes and remove empty ones.
  # Used by cleanup_structure below.
  def self.remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
    if node.children.present?
      if node.children.size == 1
        local_name = node.name
        child = node.children.first

        # replace not needed node (parent <- child)
        replaceable_node_names = ['span', child.name]
        if local_name == child.name && node.attributes.present? && node.children.first.attributes.blank?
          local_node_child = node.children.first
          node.attributes.each do |k|
            local_node_child.set_attribute(k[0], k[1])
          end
          node.replace local_node_child.to_s
          Loofah::Scrubber::STOP

        # replace not needed node (parent replace with child node)
        elsif replaceable_node_names.include?(local_name) && node.attributes.blank?
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        end
      else
        # loop through nodes
        node.children.each do |local_node|
          remove_last_empty_node(local_node, remove_empty_nodes, remove_empty_last_nodes)
        end
      end

    # remove empty nodes
    elsif (remove_empty_nodes.include?(node.name) || remove_empty_last_nodes.include?(node.name)) && node.content.blank? && node.attributes.blank?
      node.remove
      Loofah::Scrubber::STOP
    end
  end

  # Unwrap <span> and <center> wrappers while keeping their content. Wrappers that
  # carry attributes and sit inside a <td> (within five ancestor levels) are kept.
  def self.cleanup_replace_tags(string)
    tags_backlist = %w[span center]
    scrubber = Loofah::Scrubber.new do |node|
      next if tags_backlist.exclude?(node.name)

      hit = false
      local_node = nil
      (1..5).each do |_count|
        local_node = if local_node
                       local_node.parent
                     else
                       node.parent
                     end
        break if !local_node
        next if local_node.name != 'td'

        hit = true
      end
      next if hit && node.keys.count.positive?

      node.replace cleanup_replace_tags(node.children.to_s)
      Loofah::Scrubber::STOP
    end
    Loofah.fragment(string).scrub!(scrubber).to_s
  end
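
  # Illustrative behaviour sketch for self.cleanup_replace_tags (hypothetical input):
  #
  #   HtmlSanitizer.cleanup_replace_tags('<div><span>text</span></div>')
  #   # => '<div>text</div>'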

  # Repeatedly strip empty and redundant wrapper nodes, convert mailto links back to
  # plain text and normalize whitespace in text nodes.
  def self.cleanup_structure(string, type = 'all')
    remove_empty_nodes = if type == 'pre'
                           %w[span]
                         else
                           %w[p div span small table]
                         end
    remove_empty_last_nodes = %w[b i u small table]

    # remove last empty nodes and empty -not needed- parent nodes
    scrubber_structure = Loofah::Scrubber.new do |node|
      remove_last_empty_node(node, remove_empty_nodes, remove_empty_last_nodes)
    end

    done = true
    while done
      new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
      if string == new_string
        done = false
      end
      string = new_string
    end

    scrubber_cleanup = Loofah::Scrubber.new do |node|
      # remove mailto links
      if node['href']
        href = cleanup_target(node['href'])
        if href =~ /mailto:(.*)$/i
          text = Nokogiri::XML::Text.new($1, node.document)
          node.add_next_sibling(text)
          node.remove
          Loofah::Scrubber::STOP
        end
      end

      # remove not needed new lines
      if node.instance_of?(Nokogiri::XML::Text)
        if !node.parent || (node.parent.name != 'pre' && node.parent.name != 'code') # rubocop:disable Style/SoleNestedConditional
          content = node.content
          if content
            if content != ' ' && content != "\n"
              content.gsub!(/[[:space:]]+/, ' ')
            end
            if node.previous
              if node.previous.name == 'div' || node.previous.name == 'p'
                content.strip!
              end
            elsif node.parent && !node.previous && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
              if (node.parent.name == 'div' || node.parent.name == 'p') && content != ' ' && content != "\n"
                content.strip!
              end
            end
            node.content = content
          end
        end
      end
    end
    Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
  end
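
  # Illustrative behaviour sketch for self.cleanup_structure (hypothetical input):
  #
  #   HtmlSanitizer.cleanup_structure('<div><p></p>text <a href="mailto:user@example.com">mail</a></div>')
  #   # the empty <p> is dropped and the mailto link is replaced by its plain address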

  # Split the given text content around each URL in +urls+ and insert <a> nodes for them.
  def self.add_link(content, urls, node)
    if urls.blank?
      text = Nokogiri::XML::Text.new(content, node.document)
      node.add_next_sibling(text)
      return
    end

    url = urls.shift
    if content =~ /^(.*)#{Regexp.quote(url)}(.*)$/mx
      pre = $1
      post = $2

      if url.match?(/^www/i)
        url = "http://#{url}"
      end

      a = Nokogiri::XML::Node.new 'a', node.document
      a['href'] = url
      a['rel'] = 'nofollow noreferrer noopener'
      a['target'] = '_blank'
      a.content = url

      if node.class != Nokogiri::XML::Text
        text = Nokogiri::XML::Text.new(pre, node.document)
        node.add_next_sibling(text).add_next_sibling(a)
        return if post.blank?

        add_link(post, urls, a)
        return
      end

      node.content = pre
      node.add_next_sibling(a)
      return if post.blank?

      add_link(post, urls, a)
    end

    true
  end
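
  # Illustrative behaviour sketch for self.add_link (private helper): for content
  # "see www.example.com", urls ["www.example.com"] and a text node, the text is split
  # around the URL and an
  # <a href="http://www.example.com" rel="nofollow noreferrer noopener" target="_blank">
  # node is inserted in between (URLs starting with "www" get an http:// prefix).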

  # Decode a small set of HTML entities back into plain characters.
  def self.html_decode(string)
    string.gsub('&amp;', '&').gsub('&lt;', '<').gsub('&gt;', '>').gsub('&quot;', '"').gsub('&nbsp;', ' ')
  end
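
  # Illustrative sketch for html_decode (private helper, hypothetical input):
  #
  #   HtmlSanitizer.send(:html_decode, 'a &amp; b &lt;tag&gt;')
  #   # => 'a & b <tag>'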

  # Normalize a link/style target before scheme checks: re-encode to UTF-8, strip
  # whitespace, control characters and comment tricks, then fix the attachment disposition.
  def self.cleanup_target(string, **options)
    cleaned_string = string.utf8_encode(fallback: :read_as_sanitized_binary)
    cleaned_string = cleaned_string.gsub(/[[:space:]]/, '') if !options[:keep_spaces]
    cleaned_string = cleaned_string.strip
                                   .delete("\t\n\r\u0000")
                                   .gsub(%r{/\*.*?\*/}, '')
                                   .gsub(/<!--.*?-->/, '')

    sanitize_attachment_disposition(cleaned_string)
  end
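
  # Illustrative sketch for cleanup_target (private helper): obfuscated schemes are
  # normalized so the javascript/livescript/vbscript checks in strict can match them, e.g.
  #
  #   HtmlSanitizer.send(:cleanup_target, "java\tscript:alert(1)")
  #   # => 'javascript:alert(1)'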

  # Rewrite an existing disposition parameter to 'attachment' for URLs that point
  # at this Zammad instance (matching FQDN).
  def self.sanitize_attachment_disposition(url)
    @fqdn ||= Setting.get('fqdn')
    uri = URI(url)

    if uri.host == @fqdn && uri.query.present?
      params = CGI.parse(uri.query || '')
                  .tap { |p| p.merge!('disposition' => 'attachment') if p.include?('disposition') }
      uri.query = URI.encode_www_form(params)
    end

    uri.to_s
  rescue
    url
  end
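
  # Illustrative sketch for sanitize_attachment_disposition (private helper), assuming
  # Setting.get('fqdn') returns 'zammad.example.com' and a hypothetical attachment URL:
  #
  #   HtmlSanitizer.send(:sanitize_attachment_disposition,
  #                      'https://zammad.example.com/attachment?id=1&disposition=inline')
  #   # => 'https://zammad.example.com/attachment?id=1&disposition=attachment'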

  # Compare two URLs, ignoring percent-encoding, case, whitespace, a trailing slash
  # and an http(s):// prefix on either side.
  def self.url_same?(url_new, url_old)
    url_new = CGI.unescape(url_new.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(/[[:space:]]|\t|\n|\r/, '').strip
    url_old = CGI.unescape(url_old.to_s).utf8_encode(fallback: :read_as_sanitized_binary).downcase.delete_suffix('/').gsub(/[[:space:]]|\t|\n|\r/, '').strip
    url_new = html_decode(url_new).sub('/?', '?')
    url_old = html_decode(url_old).sub('/?', '?')
    return true if url_new == url_old
    return true if url_old == "http://#{url_new}"
    return true if url_new == "http://#{url_old}"
    return true if url_old == "https://#{url_new}"
    return true if url_new == "https://#{url_old}"

    false
  end
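
  # Illustrative sketch for url_same? (private helper, hypothetical inputs):
  #
  #   HtmlSanitizer.send(:url_same?, 'http://example.com/', 'example.com')       # => true
  #   HtmlSanitizer.send(:url_same?, 'http://example.com', 'http://other.test')  # => false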

=begin

replace inline images with cid images

  body, attachments_inline = HtmlSanitizer.replace_inline_images(article.body)

=end

  def self.replace_inline_images(string, prefix = rand(999_999_999))
    fqdn = Setting.get('fqdn')
    attachments_inline = []
    filename_counter = 0
    scrubber = Loofah::Scrubber.new do |node|
      if node.name == 'img'
        if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
          filename_counter += 1
          file_attributes = StaticAssets.data_url_attributes($1)
          cid = "#{prefix}.#{rand(999_999_999)}@#{fqdn}"
          filename = cid
          if file_attributes[:file_extention].present?
            filename = "image#{filename_counter}.#{file_attributes[:file_extention]}"
          end
          attachment = {
            data:        file_attributes[:content],
            filename:    filename,
            preferences: {
              'Content-Type'        => file_attributes[:mime_type],
              'Mime-Type'           => file_attributes[:mime_type],
              'Content-ID'          => cid,
              'Content-Disposition' => 'inline',
            },
          }
          attachments_inline.push attachment
          node['src'] = "cid:#{cid}"
        end
        Loofah::Scrubber::STOP
      end
    end
    [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
  end
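
  # Illustrative sketch for self.replace_inline_images: given a body with an inline
  # base64 PNG, the data URL is swapped for a cid: reference and the decoded image is
  # returned as an attachment hash (assuming StaticAssets.data_url_attributes reports
  # a png extension):
  #
  #   body, attachments = HtmlSanitizer.replace_inline_images('<img src="data:image/png;base64,iVBOR...">')
  #   # body        ~ '<img src="cid:<prefix>.<random>@<fqdn>">'
  #   # attachments ~ [{ data: ..., filename: 'image1.png', preferences: { 'Content-ID' => ..., ... } }]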

=begin

sanitize style of img tags

  string = HtmlSanitizer.dynamic_image_size(article.body)

=end

  def self.dynamic_image_size(string)
    scrubber = Loofah::Scrubber.new do |node|
      if node.name == 'img'
        if node['src']
          style = 'max-width:100%;'
          if node['style']
            pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
            pears.each do |local_pear|
              prop = local_pear.split(':')
              next if !prop[0]

              key = prop[0].strip
              if key == 'height'
                key = 'max-height'
              end
              style += "#{key}:#{prop[1]};"
            end
          end
          node['style'] = style
        end
        Loofah::Scrubber::STOP
      end
    end
    Loofah.fragment(string).scrub!(scrubber).to_s
  end
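
  # Illustrative behaviour sketch for self.dynamic_image_size (hypothetical input):
  #
  #   HtmlSanitizer.dynamic_image_size('<img src="cid:1" style="width:2000px;height:800px">')
  #   # the style becomes 'max-width:100%;width:2000px;max-height:800px;'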

  private_class_method :cleanup_target
  private_class_method :sanitize_attachment_disposition
  private_class_method :add_link
  private_class_method :url_same?
  private_class_method :html_decode
end