html_sanitizer.rb 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. class HtmlSanitizer
  2. LINKABLE_URL_SCHEMES = URI.scheme_list.keys.map(&:downcase) - ['mailto'] + ['tel']
  # Sanitize an HTML string based on the configured whitelists.
  #
  #   string = HtmlSanitizer.strict(string, external)
  #
  # Works in two stages: a "wipe" scrubber is applied repeatedly until the markup
  # no longer changes, then a "link" scrubber is applied once.
  #
  # @param string [String] HTML fragment to sanitize (the caller's object is not modified)
  # @param external [Boolean] when true, hrefs without a scheme get "http://" prepended
  # @return [String] the sanitized HTML
  def self.strict(string, external = false)
    # config
    tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
    tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
    tags_whitelist = Rails.configuration.html_sanitizer_tags_whitelist
    attributes_whitelist = Rails.configuration.html_sanitizer_attributes_whitelist
    css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
    css_values_blacklist = Rails.application.config.html_sanitizer_css_values_backlist
    classes_whitelist = ['js-signatureMarker']
    attributes_2_css = %w[width height]
    script_scheme = /(javascript|livescript|vbscript):/i

    # remove html comments; non-destructive so frozen or shared input strings are safe
    string = string.gsub(/<!--.+?-->/m, '')

    scrubber_link = Loofah::Scrubber.new do |node|
      # wrap plain-text URLs in <a> tags
      if node.is_a?(Nokogiri::XML::Text) && node.ancestors.map(&:name).exclude?('a')
        urls = URI.extract(node.content, LINKABLE_URL_SCHEMES)
                  .map { |u| u.sub(/[,.]$/, '') } # URI::extract captures trailing dots/commas
                  .reject { |u| u.match?(/^[^:]+:$/) } # URI::extract will match, e.g., 'tel:'
        next if urls.blank?

        add_link(node.content, urls, node)
      end

      # prepare links
      if node['href']
        href = cleanup_target(node['href'], keep_spaces: true)
        href_without_spaces = href.gsub(/[[:space:]]/, '')
        if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
          node['href'] = "http://#{node['href']}"
          href = node['href']
          href_without_spaces = href.gsub(/[[:space:]]/, '')
        end

        # anything that is not http/ftp/protocol-relative is left as it is
        next if !href_without_spaces.downcase.start_with?('http', 'ftp', '//')

        node.set_attribute('href', href)
        node.set_attribute('rel', 'nofollow noreferrer noopener')
        node.set_attribute('target', '_blank')
      end

      # an anchor without href is replaced by its children
      if node.name == 'a' && node['href'].blank?
        node.replace node.children.to_s
        # `next` makes STOP the block's return value; a bare constant here would be a no-op
        next Loofah::Scrubber::STOP
      end

      # check if href is different to text, then expose the target via title
      if node.name == 'a' && !url_same?(node['href'], node.text) && node['title'].blank?
        node['title'] = node['href']
      end
    end

    scrubber_wipe = Loofah::Scrubber.new do |node|
      # remove tags with subtree
      if tags_remove_content.include?(node.name)
        node.remove
        next Loofah::Scrubber::STOP
      end

      # remove tag, insert quoted content
      if tags_quote_content.include?(node.name)
        # separate local: assigning to `string` here would overwrite the method's loop variable
        quoted = html_decode(node.content)
        node.add_next_sibling(Nokogiri::XML::Text.new(quoted, node.document))
        node.remove
        next Loofah::Scrubber::STOP
      end

      # replace tags, keep subtree
      if !tags_whitelist.include?(node.name)
        node.replace node.children.to_s
        next Loofah::Scrubber::STOP
      end

      # prepare src attribute: script schemes and remote sources remove the whole node
      if node['src']
        src = cleanup_target(node['src'])
        if src.match?(script_scheme) || src.downcase.start_with?('http', 'ftp', '//')
          node.remove
          next Loofah::Scrubber::STOP
        end
      end

      # clean class / only use allowed classes
      if node['class']
        allowed_classes = node['class'].gsub(/\t|\n|\r/, '').split(' ').select do |local_class|
          classes_whitelist.include?(local_class.to_s.strip)
        end
        if allowed_classes.empty?
          node.delete('class')
        else
          node['class'] = allowed_classes.join(' ')
        end
      end

      # move style attributes to css attributes
      attributes_2_css.each do |key|
        next if !node[key]

        if node['style'].blank?
          node['style'] = ''
        else
          node['style'] += ';'
        end
        value = node[key]
        node.delete(key)
        next if value.blank?

        value += 'px' if !value.match?(/%|px|em/i)
        node['style'] += "#{key}:#{value}"
      end

      # clean style / only use allowed style properties
      if node['style']
        declarations = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
        kept = declarations.select do |declaration|
          property = declaration.split(':')[0]
          next false if !property
          next false if !css_properties_whitelist.include?(node.name)
          next false if !css_properties_whitelist[node.name].include?(property.strip)
          next false if css_values_blacklist[node.name]&.include?(declaration.gsub(/[[:space:]]/, '').strip)

          true
        end
        style = kept.map { |declaration| "#{declaration};" }.join
        node['style'] = style
        node.delete('style') if style == ''
      end

      # scan for invalid link content
      %w[href style].each do |attribute_name|
        next if !node[attribute_name]
        next if !cleanup_target(node[attribute_name]).match?(script_scheme)

        node.delete(attribute_name)
      end

      # remove attributes if not whitelisted
      node.each do |attribute, _value|
        attribute_name = attribute.downcase
        next if attributes_whitelist[:all].include?(attribute_name) || attributes_whitelist[node.name]&.include?(attribute_name)

        node.delete(attribute)
      end

      # remove mailto links, keep the address as plain text
      if node['href']
        mailto = cleanup_target(node['href']).match(/mailto:(.*)$/i)
        if mailto
          node.add_next_sibling(Nokogiri::XML::Text.new(mailto[1], node.document))
          node.remove
          next Loofah::Scrubber::STOP
        end
      end
    end

    # re-run the wipe scrubber until the output is stable
    loop do
      new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
      break if string == new_string

      string = new_string
    end

    Loofah.fragment(string).scrub!(scrubber_link).to_s
  end
  # Clean up an HTML string:
  #
  # * strip single-letter namespaced tags such as <o:p> / </o:p>
  # * drop tabs, normalise line endings to "\n" and collapse 3+ newlines into 2
  # * remove empty nodes (p, div, span, table)
  # * remove nodes in general (keep content - span)
  #
  #   string = HtmlSanitizer.cleanup(string)
  #
  # @param string [String] HTML fragment (not modified, may be frozen)
  # @return [String] the cleaned HTML
  def self.cleanup(string)
    # [A-Za-z] instead of [A-z]: the latter also matches [ \ ] ^ _ and the backtick
    string = string.gsub(/<[A-Za-z]:[A-Za-z]>/, '')
    string = string.gsub(%r{</[A-Za-z]:[A-Za-z]>}, '')
    string = string.delete("\t")

    # normalise line endings
    string = string.gsub(/(\n\r|\r\r\n|\r\n|\n)/, "\n")

    # collapse runs of empty lines
    string = string.gsub(/\n\n\n+/, "\n\n")

    string = cleanup_structure(string, 'pre')
    string = cleanup_replace_tags(string)
    cleanup_structure(string)
  end
  # Replace <span> and <center> elements by their (recursively cleaned) children.
  # An element that has attributes and a <td> among its nearest five ancestors is kept.
  #
  # @param string [String] HTML fragment
  # @return [String] HTML with the matching wrappers unwrapped
  def self.cleanup_replace_tags(string)
    unwrap_tags = %w[span center]

    scrubber = Loofah::Scrubber.new do |node|
      next if !unwrap_tags.include?(node.name)

      # look at up to five ancestors for a table cell
      inside_td = false
      ancestor = node
      5.times do
        ancestor = ancestor.parent
        break if !ancestor

        inside_td = true if ancestor.name == 'td'
      end
      next if inside_td && node.keys.count.positive?

      node.replace cleanup_replace_tags(node.children.to_s)
      Loofah::Scrubber::STOP
    end

    Loofah.fragment(string).scrub!(scrubber).to_s
  end
  # Remove empty or redundant wrapper nodes, strip mailto links and normalise
  # whitespace in text nodes outside of <pre>/<code>.
  #
  # @param string [String] HTML fragment
  # @param type [String] 'pre' limits empty-node handling to <span>; any other value
  #   uses p, div, span, small and table
  # @return [String] the cleaned HTML
  def self.cleanup_structure(string, type = 'all')
    remove_empty_nodes = type == 'pre' ? %w[span] : %w[p div span small table]
    remove_empty_last_nodes = %w[b i u small table]

    # remove last empty nodes and empty -not needed- parent nodes
    scrubber_structure = Loofah::Scrubber.new do |node|
      if remove_empty_last_nodes.include?(node.name) && node.children.size.zero?
        node.remove
        # `next` makes STOP the block's return value, so the detached node is not processed further
        next Loofah::Scrubber::STOP
      end

      # unwrap a node whose only child is itself a removable node and which is either
      # blank or has exactly the same content as that child
      removable = remove_empty_nodes.include?(node.name)
      only_child = node.children.size == 1 ? node.children.first : nil
      if removable && only_child && remove_empty_nodes.include?(only_child.name) &&
         (node.content.blank? || only_child.content == node.content)
        node.replace node.children.to_s
        next Loofah::Scrubber::STOP
      end

      # remove node if empty and parent was already a remove node
      if node.content.blank? && removable && node.parent && node.children.size.zero? && remove_empty_nodes.include?(node.parent.name)
        node.remove
        next Loofah::Scrubber::STOP
      end
    end

    # repeat until the markup is stable
    loop do
      new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
      break if string == new_string

      string = new_string
    end

    scrubber_cleanup = Loofah::Scrubber.new do |node|
      # remove mailto links, keep the address as plain text
      if node['href']
        mailto = cleanup_target(node['href']).match(/mailto:(.*)$/i)
        if mailto
          node.add_next_sibling(Nokogiri::XML::Text.new(mailto[1], node.document))
          node.remove
          next Loofah::Scrubber::STOP
        end
      end

      # remove not needed new lines (only plain text nodes outside pre/code)
      next if !node.instance_of?(Nokogiri::XML::Text)
      next if node.parent && %w[pre code].include?(node.parent.name)

      content = node.content
      next if !content

      if content != ' ' && content != "\n"
        content.gsub!(/[[:space:]]+/, ' ')
      end

      if node.previous
        # text directly after a block element
        content.strip! if %w[div p].include?(node.previous.name)
      elsif node.parent && (!node.next || %w[div p br].include?(node.next.name))
        # first text inside a div/p; content is re-checked because gsub! may have changed it
        if %w[div p].include?(node.parent.name) && content != ' ' && content != "\n"
          content.strip!
        end
      end
      node.content = content
    end

    Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
  end
  # Turn the given URLs inside a piece of text into <a> elements placed after +node+.
  #
  # URLs are consumed from +urls+ front to back (the array is modified) and each one is
  # matched at its first occurrence in the remaining text, so repeated URLs are all linked
  # and no trailing text is lost.
  #
  # @param content [String] text still to be emitted
  # @param urls [Array<String>] URLs to link, in order of appearance
  # @param node [Nokogiri::XML::Node] a text node still holding +content+ (first call) or
  #   the node after which new siblings are appended (recursive calls)
  # @return [true, nil] callers in this file ignore the result
  def self.add_link(content, urls, node)
    if urls.blank?
      text = Nokogiri::XML::Text.new(content, node.document)
      node.add_next_sibling(text)
      return
    end

    url = urls.shift
    pre, match, post = content.partition(url)

    if match.empty?
      # URL not found in this chunk: a text node still holds its content, nothing to emit
      return true if urls.blank? && node.instance_of?(Nokogiri::XML::Text)

      # otherwise keep going so the pending text is emitted instead of dropped
      return add_link(content, urls, node)
    end

    if url.match?(/^www/i)
      url = "http://#{url}"
    end

    a = Nokogiri::XML::Node.new 'a', node.document
    a['href'] = url
    a['rel'] = 'nofollow noreferrer noopener'
    a['target'] = '_blank'
    a.content = url

    if node.instance_of?(Nokogiri::XML::Text)
      # first call: shrink the original text node to the part before the URL
      node.content = pre
      node.add_next_sibling(a)
    else
      text = Nokogiri::XML::Text.new(pre, node.document)
      node.add_next_sibling(text).add_next_sibling(a)
    end

    return if post.blank?

    add_link(post, urls, a)
    true
  end
  # Decode the entities &amp; &lt; &gt; &quot; and &nbsp; in a single pass.
  #
  # A single pass avoids double decoding: "&amp;lt;" becomes "&lt;", not "<".
  #
  # @param string [String] text containing HTML entities
  # @return [String] a new string with the five entities replaced
  def self.html_decode(string)
    entities = { '&amp;' => '&', '&lt;' => '<', '&gt;' => '>', '&quot;' => '"', '&nbsp;' => ' ' }
    string.gsub(/&(?:amp|lt|gt|quot|nbsp);/, entities)
  end
  # Normalise a link/src target before it is inspected: URL-decode it, re-encode it to
  # UTF-8 from binary (unconvertible bytes become "?"), then strip whitespace,
  # /* */ and <!-- --> comments, [...] groups and NUL characters. The result is passed
  # through sanitize_attachment_disposition.
  #
  # @param string [String] raw attribute value
  # @param keep_spaces [Boolean] when true only tabs, newlines and carriage returns are removed
  # @return [String] the cleaned target
  def self.cleanup_target(string, keep_spaces: false)
    decoded = CGI.unescape(string).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?')
    blank_regex = keep_spaces ? /\t|\n|\r/ : /[[:space:]]|\t|\n|\r/

    without_blanks = decoded.strip.gsub(blank_regex, '')
    without_comments = without_blanks.gsub(%r{/\*.*?\*/}, '').gsub(/<!--.*?-->/, '')
    cleaned_string = without_comments.gsub(/\[.+?\]/, '').delete("\u0000")

    sanitize_attachment_disposition(cleaned_string)
  end
  # For URLs whose host equals the configured fqdn, force an existing "disposition"
  # query parameter to "attachment" and return the rebuilt URL. URLs on other hosts
  # and strings that cannot be parsed as a URI are returned unchanged.
  #
  # @param url [String] candidate URL
  # @return [String] the (possibly rewritten) URL
  def self.sanitize_attachment_disposition(url)
    parsed = URI(url)
    return url unless parsed.host == Setting.get('fqdn')

    query_params = CGI.parse(parsed.query || '')
    query_params['disposition'] = 'attachment' if query_params.key?('disposition')

    parsed.query = query_params.blank? ? nil : URI.encode_www_form(query_params)
    parsed.to_s
  rescue URI::Error
    url
  end
  # Compare two URLs loosely: both are URL-decoded, lower-cased, stripped of a trailing
  # slash and of all whitespace, entity-decoded, and "/?" is shortened to "?". They count
  # as the same when equal, or when they differ only by a leading http:// or https://.
  #
  # @param url_new [#to_s] first URL
  # @param url_old [#to_s] second URL
  # @return [Boolean]
  def self.url_same?(url_new, url_old)
    normalize = lambda do |url|
      decoded = CGI.unescape(url.to_s).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?')
      compact = decoded.downcase.gsub(%r{/$}, '').gsub(/[[:space:]]|\t|\n|\r/, '').strip
      html_decode(compact).sub('/?', '?')
    end

    left = normalize.call(url_new)
    right = normalize.call(url_old)
    return true if left == right

    %w[http:// https://].any? do |scheme|
      right == "#{scheme}#{left}" || left == "#{scheme}#{right}"
    end
  end
  # Replace inline (data URL) images with cid images.
  #
  #   string = HtmlSanitizer.replace_inline_images(article.body)
  #
  # Every <img> whose src is a base64 jpeg/png data URL is pointed at "cid:<content id>"
  # and an attachment hash for it is collected.
  #
  # @param string [String] HTML fragment
  # @param prefix [Object] first part of the generated content ids (random number by default)
  # @return [Array] two elements: the rewritten HTML and the array of attachment hashes
  def self.replace_inline_images(string, prefix = rand(999_999_999))
    attachments_inline = []
    filename_counter = 0

    scrubber = Loofah::Scrubber.new do |node|
      next if node.name != 'img'

      data_url = node['src'] && node['src'].match(%r{^(data:image/(jpeg|png);base64,.+?)$}i)
      if data_url
        filename_counter += 1
        file_attributes = StaticAssets.data_url_attributes(data_url[1])
        cid = "#{prefix}.#{rand(999_999_999)}@#{Setting.get('fqdn')}"

        # fall back to the content id when no file extension is known
        filename = if file_attributes[:file_extention].present?
                     "image#{filename_counter}.#{file_attributes[:file_extention]}"
                   else
                     cid
                   end

        attachments_inline.push(
          data: file_attributes[:content],
          filename: filename,
          preferences: {
            'Content-Type' => file_attributes[:mime_type],
            'Mime-Type' => file_attributes[:mime_type],
            'Content-ID' => cid,
            'Content-Disposition' => 'inline',
          },
        )
        node['src'] = "cid:#{cid}"
      end

      Loofah::Scrubber::STOP
    end

    [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
  end
  # Sanitize the style of img tags.
  #
  #   string = HtmlSanitizer.dynamic_image_size(article.body)
  #
  # Every <img> that has a src gets a style starting with "max-width:100%;" followed by
  # its existing (lower-cased) declarations, with "height" renamed to "max-height".
  #
  # @param string [String] HTML fragment
  # @return [String] the rewritten HTML
  def self.dynamic_image_size(string)
    scrubber = Loofah::Scrubber.new do |node|
      next if node.name != 'img'

      if node['src']
        style = 'max-width:100%;'
        if node['style']
          node['style'].downcase.gsub(/\t|\n|\r/, '').split(';').each do |declaration|
            # limit 2: keep values that contain ":" themselves (e.g. url(...)) intact
            key, value = declaration.split(':', 2)
            next if !key

            key = key.strip
            key = 'max-height' if key == 'height'
            style += "#{key}:#{value};"
          end
        end
        node['style'] = style
      end

      Loofah::Scrubber::STOP
    end

    Loofah.fragment(string).scrub!(scrubber).to_s
  end
  # internal helpers, not part of the public API
  private_class_method :cleanup_target,
                       :sanitize_attachment_disposition,
                       :add_link,
                       :url_same?,
                       :html_decode
  420. end