html_sanitizer.rb 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
class HtmlSanitizer

=begin

sanitize html string based on whitelist

  string = HtmlSanitizer.strict(string, external)

=end
  # Sanitize an HTML fragment against the configured whitelists.
  #
  # string   - HTML fragment to sanitize
  # external - when true, scheme-less hrefs get an explicit "http://" prefix
  #
  # Returns the sanitized HTML string.
  def self.strict(string, external = false)
    # config
    tags_remove_content = Rails.configuration.html_sanitizer_tags_remove_content
    tags_quote_content = Rails.configuration.html_sanitizer_tags_quote_content
    tags_whitelist = Rails.configuration.html_sanitizer_tags_whitelist
    attributes_whitelist = Rails.configuration.html_sanitizer_attributes_whitelist
    css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
    classes_whitelist = ['js-signatureMarker']
    attributes_2_css = %w(width height)

    scrubber_link = Loofah::Scrubber.new do |node|
      # check if href is different to text
      if node.name == 'a' && !url_same?(node['href'], node.text)
        if node['href'].blank?
          node.replace node.children.to_s
          Loofah::Scrubber::STOP
        elsif ((node.children.empty? || node.children.first.class == Nokogiri::XML::Text) && node.text.present?) || (node.children.size == 1 && node.children.first.content == node.content && node.content.present?)
          if node.text.downcase.start_with?('http', 'ftp', '//')
            # link text is itself a URL: show it as its own link, followed by
            # " (text)" so the real target stays visible to the user
            a = Nokogiri::XML::Node.new 'a', node.document
            a['href'] = node['href']
            a['rel'] = 'nofollow noreferrer noopener'
            a['target'] = '_blank'
            a.content = node['href']
            node.add_previous_sibling(a)
            text = Nokogiri::XML::Text.new(' (', node.document)
            node.add_previous_sibling(text)
            node['href'] = cleanup_target(node.text)
          else
            # show "text (target)" so a mismatching href cannot be hidden
            text = Nokogiri::XML::Text.new("#{node.text} (", node.document)
            node.add_previous_sibling(text)
            node.content = cleanup_target(node['href'])
            node['href'] = cleanup_target(node['href'])
          end
          text = Nokogiri::XML::Text.new(')', node.document)
          node.add_next_sibling(text)
        else
          node.content = cleanup_target(node['href'])
        end
      end

      # check if text has urls which need to be clickable
      if node && node.name != 'a' && node.parent && node.parent.name != 'a' && (!node.parent.parent || node.parent.parent.name != 'a')
        if node.class == Nokogiri::XML::Text
          urls = []
          node.content.scan(%r{((http|https|ftp|tel)://.+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)}mxi).each do |match|
            if match[0]
              urls.push match[0].to_s.strip
            end
          end
          node.content.scan(/(^|:|;|\s)(www\..+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)/mxi).each do |match|
            if match[1]
              urls.push match[1].to_s.strip
            end
          end
          next if urls.empty?
          add_link(node.content, urls, node)
        end
      end

      # prepare links
      if node['href']
        href = cleanup_target(node['href'])
        if external && href.present? && !href.downcase.start_with?('//') && href.downcase !~ %r{^.{1,6}://.+?}
          node['href'] = "http://#{node['href']}"
          href = node['href']
        end
        next if !href.downcase.start_with?('http', 'ftp', '//')
        node.set_attribute('href', href)
        node.set_attribute('rel', 'nofollow noreferrer noopener')
        node.set_attribute('target', '_blank')
      end
    end

    scrubber_wipe = Loofah::Scrubber.new do |node|
      # remove tags with subtree
      if tags_remove_content.include?(node.name)
        node.remove
        Loofah::Scrubber::STOP
      end

      # remove tag, insert quoted content
      if tags_quote_content.include?(node.name)
        string = html_decode(node.content)
        text = Nokogiri::XML::Text.new(string, node.document)
        node.add_next_sibling(text)
        node.remove
        Loofah::Scrubber::STOP
      end

      # replace tags, keep subtree
      if !tags_whitelist.include?(node.name)
        node.replace node.children.to_s
        Loofah::Scrubber::STOP
      end

      # prepare src attribute
      if node['src']
        src = cleanup_target(node['src'])
        if src =~ /(javascript|livescript|vbscript):/i || src.downcase.start_with?('http', 'ftp', '//')
          node.remove
          Loofah::Scrubber::STOP
        end
      end

      # clean class / only use allowed classes
      if node['class']
        classes = node['class'].gsub(/\t|\n|\r/, '').split(' ')
        class_new = ''
        classes.each do |local_class|
          next if !classes_whitelist.include?(local_class.to_s.strip)
          if class_new != ''
            class_new += ' '
          end
          class_new += local_class
        end
        if class_new != ''
          node['class'] = class_new
        else
          node.delete('class')
        end
      end

      # move style attributes to css attributes
      attributes_2_css.each do |key|
        next if !node[key]
        # BUGFIX: node['style'] is nil when the attribute is absent, so
        # nil.empty? raised NoMethodError - blank? covers nil and ''.
        if node['style'].blank?
          node['style'] = ''
        else
          node['style'] += ';'
        end
        value = node[key]
        node.delete(key)
        next if value.blank?
        if value !~ /%|px|em/i
          value += 'px'
        end
        node['style'] += "#{key}:#{value}"
      end

      # clean style / only use allowed style properties
      if node['style']
        pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
        style = ''
        pears.each do |local_pear|
          prop = local_pear.split(':')
          next if !prop[0]
          key = prop[0].strip
          next if !css_properties_whitelist.include?(node.name)
          next if !css_properties_whitelist[node.name].include?(key)
          style += "#{local_pear};"
        end
        node['style'] = style
        if style == ''
          node.delete('style')
        end
      end

      # scan for invalid link content
      %w(href style).each do |attribute_name|
        next if !node[attribute_name]
        href = cleanup_target(node[attribute_name])
        next if href !~ /(javascript|livescript|vbscript):/i
        node.delete(attribute_name)
      end

      # remove attributes if not whitelisted
      node.each do |attribute, _value|
        attribute_name = attribute.downcase
        next if attributes_whitelist[:all].include?(attribute_name) || (attributes_whitelist[node.name] && attributes_whitelist[node.name].include?(attribute_name))
        node.delete(attribute)
      end

      # remove mailto links
      if node['href']
        href = cleanup_target(node['href'])
        if href =~ /mailto:(.*)$/i
          text = Nokogiri::XML::Text.new($1, node.document)
          node.add_next_sibling(text)
          node.remove
          Loofah::Scrubber::STOP
        end
      end
    end

    # re-apply the wipe scrubber until the output is stable, because
    # unwrapping a tag can expose new non-whitelisted children
    new_string = ''
    done = true
    while done
      new_string = Loofah.fragment(string).scrub!(scrubber_wipe).to_s
      if string == new_string
        done = false
      end
      string = new_string
    end
    Loofah.fragment(string).scrub!(scrubber_link).to_s
  end
  187. =begin
  188. cleanup html string:
  189. * remove empty nodes (p, div, span, table)
  190. * remove nodes in general (keep content - span)
  191. string = HtmlSanitizer.cleanup(string)
  192. =end
  193. def self.cleanup(string)
  194. string.gsub!(/<[A-z]:[A-z]>/, '')
  195. string.gsub!(%r{</[A-z]:[A-z]>}, '')
  196. string.delete!("\t")
  197. # remove all new lines
  198. string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, "\n")
  199. # remove double multiple empty lines
  200. string.gsub!(/\n\n\n+/, "\n\n")
  201. string = cleanup_structure(string, 'pre')
  202. string = cleanup_replace_tags(string)
  203. string = cleanup_structure(string)
  204. string
  205. end
  # Unwrap needless wrapper tags (span, center), keeping their content.
  # A wrapper that carries attributes AND sits inside a table cell (checked
  # up to five ancestors) is left untouched; everything else is replaced by
  # its recursively cleaned children.
  def self.cleanup_replace_tags(string)
    unwrap_tags = %w(span center)
    scrubber = Loofah::Scrubber.new do |node|
      next if !unwrap_tags.include?(node.name)

      # look up to five ancestors for an enclosing table cell
      inside_td = false
      ancestor = node.parent
      5.times do
        break if !ancestor
        inside_td = true if ancestor.name == 'td'
        ancestor = ancestor.parent
      end

      # keep attributed wrappers that live inside a table cell
      next if inside_td && node.keys.any?

      node.replace cleanup_replace_tags(node.children.to_s)
      Loofah::Scrubber::STOP
    end
    Loofah.fragment(string).scrub!(scrubber).to_s
  end
  # Cleanup the node structure of an HTML string:
  #  * drop trailing empty formatting nodes (b, i, u, small, table)
  #  * unwrap / remove nested empty container nodes
  #  * collapse whitespace in text nodes (outside pre/code)
  #  * replace mailto links by their plain address text
  #
  # type - 'pre' restricts empty-node handling to span; any other value
  #        (default 'all') also covers p, div, small and table.
  def self.cleanup_structure(string, type = 'all')
    remove_empty_nodes = if type == 'pre'
                           %w(span)
                         else
                           %w(p div span small table)
                         end
    remove_empty_last_nodes = %w(b i u small table)

    # remove last empty nodes and empty - not needed - parent nodes
    scrubber_structure = Loofah::Scrubber.new do |node|
      if remove_empty_last_nodes.include?(node.name) && node.children.size.zero?
        node.remove
        Loofah::Scrubber::STOP
      end

      # unwrap an empty container whose single child is also a container
      if node.content.blank? && remove_empty_nodes.include?(node.name) && node.children.size == 1 && remove_empty_nodes.include?(node.children.first.name)
        node.replace node.children.to_s
        Loofah::Scrubber::STOP
      end

      # unwrap a container whose single container child holds all its content
      if remove_empty_nodes.include?(node.name) && node.children.size == 1 && remove_empty_nodes.include?(node.children.first.name) && node.children.first.content == node.content
        node.replace node.children.to_s
        Loofah::Scrubber::STOP
      end

      # remove node if empty and parent was already a remove node
      if node.content.blank? && remove_empty_nodes.include?(node.name) && node.parent && node.children.size.zero? && remove_empty_nodes.include?(node.parent.name)
        node.remove
        Loofah::Scrubber::STOP
      end
    end

    # re-apply the structure scrubber until the output is stable, because
    # one unwrap/remove pass can expose newly empty parents
    new_string = ''
    done = true
    while done
      new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
      if string == new_string
        done = false
      end
      string = new_string
    end

    scrubber_cleanup = Loofah::Scrubber.new do |node|
      # remove mailto links, keep the address as plain text
      if node['href']
        href = cleanup_target(node['href'])
        if href =~ /mailto:(.*)$/i
          text = Nokogiri::XML::Text.new($1, node.document)
          node.add_next_sibling(text)
          node.remove
          Loofah::Scrubber::STOP
        end
      end

      # collapse not needed whitespace/new lines in text nodes
      if node.class == Nokogiri::XML::Text
        # leave pre/code content untouched - whitespace is significant there
        if !node.parent || (node.parent.name != 'pre' && node.parent.name != 'code')
          content = node.content
          if content
            if content != ' ' && content != "\n"
              content.gsub!(/[[:space:]]+/, ' ')
            end
            if node.previous
              # strip leading space after a block-level sibling
              if node.previous.name == 'div' || node.previous.name == 'p'
                content.strip!
              end
            elsif node.parent && !node.previous && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
              # first text child of a block element followed by a block/br
              if (node.parent.name == 'div' || node.parent.name == 'p') && content != ' ' && content != "\n"
                content.strip!
              end
            end
            node.content = content
          end
        end
      end
    end
    Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
  end
  302. def self.add_link(content, urls, node)
  303. if urls.empty?
  304. text = Nokogiri::XML::Text.new(content, node.document)
  305. node.add_next_sibling(text)
  306. return
  307. end
  308. url = urls.shift
  309. if content =~ /^(.*)#{Regexp.quote(url)}(.*)$/mx
  310. pre = $1
  311. post = $2
  312. if url =~ /^www/i
  313. url = "http://#{url}"
  314. end
  315. a = Nokogiri::XML::Node.new 'a', node.document
  316. a['href'] = url
  317. a['rel'] = 'nofollow noreferrer noopener'
  318. a['target'] = '_blank'
  319. a.content = url
  320. if node.class != Nokogiri::XML::Text
  321. text = Nokogiri::XML::Text.new(pre, node.document)
  322. node.add_next_sibling(text).add_next_sibling(a)
  323. return if post.blank?
  324. add_link(post, urls, a)
  325. return
  326. end
  327. node.content = pre
  328. node.add_next_sibling(a)
  329. return if post.blank?
  330. add_link(post, urls, a)
  331. end
  332. end
  # Decode a small fixed set of HTML entities back to their characters.
  # Replacements run in order, so '&amp;lt;' first becomes '&lt;', then '<'.
  def self.html_decode(string)
    [
      ['&amp;', '&'],
      ['&lt;', '<'],
      ['&gt;', '>'],
      ['&quot;', '"'],
      ['&nbsp;', ' '],
    ].inject(string) { |decoded, (entity, char)| decoded.gsub(entity, char) }
  end
  # Normalize a link/style target for inspection: URL-unescape, force valid
  # UTF-8, then strip whitespace, CSS comments, HTML comments and [..] spans
  # that could be used to smuggle "javascript:"-style payloads past checks.
  def self.cleanup_target(string)
    # URI.unescape was deprecated in Ruby 2.7 and removed in 3.0;
    # URI::DEFAULT_PARSER.unescape is the drop-in equivalent on all versions.
    string = URI::DEFAULT_PARSER.unescape(string).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?')
    string.gsub(/[[:space:]]|\t|\n|\r/, '').gsub(%r{/\*.*?\*/}, '').gsub(/<!--.*?-->/, '').gsub(/\[.+?\]/, '')
  end
  # Compare two link targets for equivalence, ignoring case, a trailing
  # slash, whitespace, HTML entities and an http(s):// prefix on either side.
  def self.url_same?(url_new, url_old)
    # URI.unescape was deprecated in Ruby 2.7 and removed in 3.0;
    # URI::DEFAULT_PARSER.unescape is the drop-in equivalent on all versions.
    url_new = URI::DEFAULT_PARSER.unescape(url_new.to_s).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?').downcase.gsub(%r{/$}, '').gsub(/[[:space:]]|\t|\n|\r/, '').strip
    url_old = URI::DEFAULT_PARSER.unescape(url_old.to_s).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?').downcase.gsub(%r{/$}, '').gsub(/[[:space:]]|\t|\n|\r/, '').strip
    url_new = html_decode(url_new).sub('/?', '?')
    url_old = html_decode(url_old).sub('/?', '?')
    return true if url_new == url_old
    return true if "http://#{url_new}" == url_old
    return true if "http://#{url_old}" == url_new
    return true if "https://#{url_new}" == url_old
    return true if "https://#{url_old}" == url_new
    false
  end
  352. =begin
  353. reolace inline images with cid images
  354. string = HtmlSanitizer.replace_inline_images(article.body)
  355. =end
  356. def self.replace_inline_images(string, prefix = rand(999_999_999))
  357. attachments_inline = []
  358. scrubber = Loofah::Scrubber.new do |node|
  359. if node.name == 'img'
  360. if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
  361. file_attributes = StaticAssets.data_url_attributes($1)
  362. cid = "#{prefix}.#{rand(999_999_999)}@#{Setting.get('fqdn')}"
  363. attachment = {
  364. data: file_attributes[:content],
  365. filename: cid,
  366. preferences: {
  367. 'Content-Type' => file_attributes[:mime_type],
  368. 'Mime-Type' => file_attributes[:mime_type],
  369. 'Content-ID' => cid,
  370. 'Content-Disposition' => 'inline',
  371. },
  372. }
  373. attachments_inline.push attachment
  374. node['src'] = "cid:#{cid}"
  375. end
  376. Loofah::Scrubber::STOP
  377. end
  378. end
  379. [Loofah.fragment(string).scrub!(scrubber).to_s, attachments_inline]
  380. end
  381. =begin
  382. satinize style of img tags
  383. string = HtmlSanitizer.dynamic_image_size(article.body)
  384. =end
  385. def self.dynamic_image_size(string)
  386. scrubber = Loofah::Scrubber.new do |node|
  387. if node.name == 'img'
  388. if node['src']
  389. style = 'max-width:100%;'
  390. if node['style']
  391. pears = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
  392. pears.each do |local_pear|
  393. prop = local_pear.split(':')
  394. next if !prop[0]
  395. key = prop[0].strip
  396. if key == 'height'
  397. key = 'max-height'
  398. end
  399. style += "#{key}:#{prop[1]};"
  400. end
  401. end
  402. node['style'] = style
  403. end
  404. Loofah::Scrubber::STOP
  405. end
  406. end
  407. Loofah.fragment(string).scrub!(scrubber).to_s
  408. end
  409. private_class_method :cleanup_target
  410. private_class_method :add_link
  411. private_class_method :url_same?
  412. private_class_method :html_decode
  413. end