html_sanitizer.rb 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. class HtmlSanitizer
=begin
sanitize html string based on whitelist
string = HtmlSanitizer.strict(string, external)
=end
def self.strict(string, external = false)
  # Whitelist-based sanitization of an HTML fragment. When +external+ is true,
  # bare link targets get an explicit "http://" scheme prepended.
  # NOTE: mutates +string+ in place (gsub!) before scrubbing.

  # config — tag/attribute/CSS whitelists come from Rails configuration
  tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
  tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
  tags_whitelist = Rails.configuration.html_sanitizer_tags_whitelist
  attributes_whitelist = Rails.configuration.html_sanitizer_attributes_whitelist
  css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
  css_values_blacklist = Rails.application.config.html_sanitizer_css_values_backlist
  # only this marker class survives the class-attribute cleanup below
  classes_whitelist = ['js-signatureMarker']
  # legacy presentational attributes that get converted into inline CSS
  attributes_2_css = %w[width height]

  # remove html comments
  string.gsub!(/<!--.+?-->/m, '')

  # scrubber pass 2: turn plain-text URLs into anchors and normalize links
  scrubber_link = Loofah::Scrubber.new do |node|
    # check if text has urls which need to be clickable
    # (skip text already inside an <a>, up to two ancestor levels)
    if node&.name != 'a' && node.parent && node.parent.name != 'a' && (!node.parent.parent || node.parent.parent.name != 'a')
      if node.class == Nokogiri::XML::Text
        urls = []
        # scheme-prefixed URLs (http/https/ftp/tel), trimmed at trailing punctuation
        node.content.scan(%r{((http|https|ftp|tel)://.+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)}mxi).each do |match|
          if match[0]
            urls.push match[0].to_s.strip
          end
        end
        # bare "www." URLs without a scheme
        node.content.scan(/(^|:|;|\s)(www\..+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)/mxi).each do |match|
          if match[1]
            urls.push match[1].to_s.strip
          end
        end
        next if urls.blank?
        add_link(node.content, urls, node)
      end
    end

    # prepare links
    if node['href']
      href = cleanup_target(node['href'], keep_spaces: true)
      href_without_spaces = href.gsub(/[[:space:]]/, '')
      # external content: prepend a scheme to scheme-less targets
      if external && href_without_spaces.present? && !href_without_spaces.downcase.start_with?('//') && href_without_spaces.downcase !~ %r{^.{1,6}://.+?}
        node['href'] = "http://#{node['href']}"
        href = node['href']
        href_without_spaces = href.gsub(/[[:space:]]/, '')
      end
      # only web/ftp/protocol-relative targets keep their href treatment
      next if !href_without_spaces.downcase.start_with?('http', 'ftp', '//')
      node.set_attribute('href', href)
      node.set_attribute('rel', 'nofollow noreferrer noopener')
      node.set_attribute('target', '_blank')
    end

    # drop <a> tags without a usable href, keeping their children
    if node.name == 'a' && node['href'].blank?
      node.replace node.children.to_s
      Loofah::Scrubber::STOP
    end

    # check if href is different to text — expose the real target as a tooltip
    if node.name == 'a' && !url_same?(node['href'], node.text)
      if node['title'].blank?
        node['title'] = node['href']
      end
    end
  end

  # scrubber pass 1: enforce the whitelists (tags, classes, styles, attributes)
  scrubber_wipe = Loofah::Scrubber.new do |node|
    # remove tags with subtree
    if tags_remove_content.include?(node.name)
      node.remove
      Loofah::Scrubber::STOP
    end
    # remove tag, insert quoted content
    if tags_quote_content.include?(node.name)
      string = html_decode(node.content)
      text = Nokogiri::XML::Text.new(string, node.document)
      node.add_next_sibling(text)
      node.remove
      Loofah::Scrubber::STOP
    end
    # replace tags, keep subtree
    if !tags_whitelist.include?(node.name)
      node.replace node.children.to_s
      Loofah::Scrubber::STOP
    end
    # prepare src attribute — remove nodes with script-scheme or remote sources
    if node['src']
      src = cleanup_target(node['src'])
      if src =~ /(javascript|livescript|vbscript):/i || src.downcase.start_with?('http', 'ftp', '//')
        node.remove
        Loofah::Scrubber::STOP
      end
    end
    # clean class / only use allowed classes
    if node['class']
      classes = node['class'].gsub(/\t|\n|\r/, '').split(' ')
      class_new = ''
      classes.each do |local_class|
        next if !classes_whitelist.include?(local_class.to_s.strip)
        if class_new != ''
          class_new += ' '
        end
        class_new += local_class
      end
      if class_new != ''
        node['class'] = class_new
      else
        node.delete('class')
      end
    end
    # move style attributes to css attributes (width/height → inline style)
    attributes_2_css.each do |key|
      next if !node[key]
      if node['style'].blank?
        node['style'] = ''
      else
        node['style'] += ';'
      end
      value = node[key]
      node.delete(key)
      next if value.blank?
      # bare numbers are treated as pixel values
      if value !~ /%|px|em/i
        value += 'px'
      end
      node['style'] += "#{key}:#{value}"
    end
    # clean style / only use allowed style properties
    if node['style']
      pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
      style = ''
      pears.each do |local_pear|
        prop = local_pear.split(':')
        next if !prop[0]
        key = prop[0].strip
        # the CSS whitelist is keyed per tag name
        next if !css_properties_whitelist.include?(node.name)
        next if !css_properties_whitelist[node.name].include?(key)
        next if css_values_blacklist[node.name]&.include?(local_pear.gsub(/[[:space:]]/, '').strip)
        style += "#{local_pear};"
      end
      node['style'] = style
      if style == ''
        node.delete('style')
      end
    end
    # scan for invalid link content (script schemes hidden in href/style)
    %w[href style].each do |attribute_name|
      next if !node[attribute_name]
      href = cleanup_target(node[attribute_name])
      next if href !~ /(javascript|livescript|vbscript):/i
      node.delete(attribute_name)
    end
    # remove attributes if not whitelisted (global list plus per-tag list)
    node.each do |attribute, _value| # rubocop:disable Performance/HashEachMethods
      attribute_name = attribute.downcase
      next if attributes_whitelist[:all].include?(attribute_name) || (attributes_whitelist[node.name]&.include?(attribute_name))
      node.delete(attribute)
    end
    # remove mailto links, keeping the address as plain text
    if node['href']
      href = cleanup_target(node['href'])
      if href =~ /mailto:(.*)$/i
        text = Nokogiri::XML::Text.new($1, node.document)
        node.add_next_sibling(text)
        node.remove
        Loofah::Scrubber::STOP
      end
    end
  end

  # re-run the wipe scrubber until the output is stable, since removing a tag
  # can expose new non-whitelisted structure underneath
  new_string = ''
  done = true
  while done
    new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
    if string == new_string
      done = false
    end
    string = new_string
  end
  # final pass: linkification and href normalization
  Loofah.fragment(string).scrub!(scrubber_link).to_s
end
  175. =begin
  176. cleanup html string:
  177. * remove empty nodes (p, div, span, table)
  178. * remove nodes in general (keep content - span)
  179. string = HtmlSanitizer.cleanup(string)
  180. =end
  181. def self.cleanup(string)
  182. string.gsub!(/<[A-z]:[A-z]>/, '')
  183. string.gsub!(%r{</[A-z]:[A-z]>}, '')
  184. string.delete!("\t")
  185. # remove all new lines
  186. string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, "\n")
  187. # remove double multiple empty lines
  188. string.gsub!(/\n\n\n+/, "\n\n")
  189. string = cleanup_structure(string, 'pre')
  190. string = cleanup_replace_tags(string)
  191. string = cleanup_structure(string)
  192. string
  193. end
  194. def self.cleanup_replace_tags(string)
  195. #return string
  196. tags_backlist = %w[span center]
  197. scrubber = Loofah::Scrubber.new do |node|
  198. next if !tags_backlist.include?(node.name)
  199. hit = false
  200. local_node = nil
  201. (1..5).each do |_count|
  202. local_node = if local_node
  203. local_node.parent
  204. else
  205. node.parent
  206. end
  207. break if !local_node
  208. next if local_node.name != 'td'
  209. hit = true
  210. end
  211. next if hit && node.keys.count.positive?
  212. node.replace cleanup_replace_tags(node.children.to_s)
  213. Loofah::Scrubber::STOP
  214. end
  215. Loofah.fragment(string).scrub!(scrubber).to_s
  216. end
def self.cleanup_structure(string, type = 'all')
  # Remove empty/redundant wrapper nodes and normalize text-node whitespace.
  # type == 'pre' runs a narrower pass that only dissolves spans.
  remove_empty_nodes = if type == 'pre'
                         %w[span]
                       else
                         %w[p div span small table]
                       end
  remove_empty_last_nodes = %w[b i u small table]

  # remove last empty nodes and empty - not needed - parent nodes
  scrubber_structure = Loofah::Scrubber.new do |node|
    # drop formatting tags that ended up with no children at all
    if remove_empty_last_nodes.include?(node.name) && node.children.size.zero?
      node.remove
      Loofah::Scrubber::STOP
    end
    # unwrap a content-less wrapper whose single child is also a wrapper
    if node.content.blank? && remove_empty_nodes.include?(node.name) && node.children.size == 1 && remove_empty_nodes.include?(node.children.first.name)
      node.replace node.children.to_s
      Loofah::Scrubber::STOP
    end
    # unwrap a wrapper whose single wrapper child holds the identical content
    if remove_empty_nodes.include?(node.name) && node.children.size == 1 && remove_empty_nodes.include?(node.children.first.name) && node.children.first.content == node.content
      node.replace node.children.to_s
      Loofah::Scrubber::STOP
    end
    # remove node if empty and parent was already a remove node
    if node.content.blank? && remove_empty_nodes.include?(node.name) && node.parent && node.children.size.zero? && remove_empty_nodes.include?(node.parent.name)
      node.remove
      Loofah::Scrubber::STOP
    end
  end

  # repeat the structural scrub until a fixed point is reached, because each
  # unwrap can expose a new removable parent/child pairing
  new_string = ''
  done = true
  while done
    new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
    if string == new_string
      done = false
    end
    string = new_string
  end

  scrubber_cleanup = Loofah::Scrubber.new do |node|
    # remove mailto links, keeping the address as plain text
    if node['href']
      href = cleanup_target(node['href'])
      if href =~ /mailto:(.*)$/i
        text = Nokogiri::XML::Text.new($1, node.document)
        node.add_next_sibling(text)
        node.remove
        Loofah::Scrubber::STOP
      end
    end
    # collapse not-needed whitespace in text nodes (outside <pre>/<code>)
    if node.class == Nokogiri::XML::Text
      if !node.parent || (node.parent.name != 'pre' && node.parent.name != 'code')
        content = node.content
        if content
          # squeeze any whitespace run to a single space (but keep lone " "/"\n")
          if content != ' ' && content != "\n"
            content.gsub!(/[[:space:]]+/, ' ')
          end
          if node.previous
            # text directly after a block element: drop leading/trailing blanks
            if node.previous.name == 'div' || node.previous.name == 'p'
              content.strip!
            end
          elsif node.parent && !node.previous && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
            # first text child of a block element followed by another block/br
            if (node.parent.name == 'div' || node.parent.name == 'p') && content != ' ' && content != "\n"
              content.strip!
            end
          end
          node.content = content
        end
      end
    end
  end
  Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
end
  290. def self.add_link(content, urls, node)
  291. if urls.blank?
  292. text = Nokogiri::XML::Text.new(content, node.document)
  293. node.add_next_sibling(text)
  294. return
  295. end
  296. url = urls.shift
  297. if content =~ /^(.*)#{Regexp.quote(url)}(.*)$/mx
  298. pre = $1
  299. post = $2
  300. if url.match?(/^www/i)
  301. url = "http://#{url}"
  302. end
  303. a = Nokogiri::XML::Node.new 'a', node.document
  304. a['href'] = url
  305. a['rel'] = 'nofollow noreferrer noopener'
  306. a['target'] = '_blank'
  307. a.content = url
  308. if node.class != Nokogiri::XML::Text
  309. text = Nokogiri::XML::Text.new(pre, node.document)
  310. node.add_next_sibling(text).add_next_sibling(a)
  311. return if post.blank?
  312. add_link(post, urls, a)
  313. return
  314. end
  315. node.content = pre
  316. node.add_next_sibling(a)
  317. return if post.blank?
  318. add_link(post, urls, a)
  319. end
  320. true
  321. end
  322. def self.html_decode(string)
  323. string.gsub('&amp;', '&').gsub('&lt;', '<').gsub('&gt;', '>').gsub('&quot;', '"').gsub('&nbsp;', ' ')
  324. end
  325. def self.cleanup_target(string, keep_spaces: false)
  326. string = CGI.unescape(string).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?')
  327. blank_regex = if keep_spaces
  328. /\t|\n|\r/
  329. else
  330. /[[:space:]]|\t|\n|\r/
  331. end
  332. string.strip.gsub(blank_regex, '').gsub(%r{/\*.*?\*/}, '').gsub(/<!--.*?-->/, '').gsub(/\[.+?\]/, '').delete("\u0000")
  333. end
  334. def self.url_same?(url_new, url_old)
  335. url_new = CGI.unescape(url_new.to_s).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?').downcase.gsub(%r{/$}, '').gsub(/[[:space:]]|\t|\n|\r/, '').strip
  336. url_old = CGI.unescape(url_old.to_s).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?').downcase.gsub(%r{/$}, '').gsub(/[[:space:]]|\t|\n|\r/, '').strip
  337. url_new = html_decode(url_new).sub('/?', '?')
  338. url_old = html_decode(url_old).sub('/?', '?')
  339. return true if url_new == url_old
  340. return true if url_old == "http://#{url_new}"
  341. return true if url_new == "http://#{url_old}"
  342. return true if url_old == "https://#{url_new}"
  343. return true if url_new == "https://#{url_old}"
  344. false
  345. end
  346. =begin
  347. reolace inline images with cid images
  348. string = HtmlSanitizer.replace_inline_images(article.body)
  349. =end
  350. def self.replace_inline_images(string, prefix = rand(999_999_999))
  351. attachments_inline = []
  352. scrubber = Loofah::Scrubber.new do |node|
  353. if node.name == 'img'
  354. if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
  355. file_attributes = StaticAssets.data_url_attributes($1)
  356. cid = "#{prefix}.#{rand(999_999_999)}@#{Setting.get('fqdn')}"
  357. attachment = {
  358. data: file_attributes[:content],
  359. filename: cid,
  360. preferences: {
  361. 'Content-Type' => file_attributes[:mime_type],
  362. 'Mime-Type' => file_attributes[:mime_type],
  363. 'Content-ID' => cid,
  364. 'Content-Disposition' => 'inline',
  365. },
  366. }
  367. attachments_inline.push attachment
  368. node['src'] = "cid:#{cid}"
  369. end
  370. Loofah::Scrubber::STOP
  371. end
  372. end
  373. [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
  374. end
  375. =begin
  376. satinize style of img tags
  377. string = HtmlSanitizer.dynamic_image_size(article.body)
  378. =end
  379. def self.dynamic_image_size(string)
  380. scrubber = Loofah::Scrubber.new do |node|
  381. if node.name == 'img'
  382. if node['src']
  383. style = 'max-width:100%;'
  384. if node['style']
  385. pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
  386. pears.each do |local_pear|
  387. prop = local_pear.split(':')
  388. next if !prop[0]
  389. key = prop[0].strip
  390. if key == 'height'
  391. key = 'max-height'
  392. end
  393. style += "#{key}:#{prop[1]};"
  394. end
  395. end
  396. node['style'] = style
  397. end
  398. Loofah::Scrubber::STOP
  399. end
  400. end
  401. Loofah.fragment(string).scrub!(scrubber).to_s
  402. end
  403. private_class_method :cleanup_target
  404. private_class_method :add_link
  405. private_class_method :url_same?
  406. private_class_method :html_decode
  407. end