html_sanitizer.rb 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. class HtmlSanitizer
  2. =begin
  3. sanitize html string based on whitelist
  4. string = HtmlSanitizer.strict(string, external)
  5. =end
  # Sanitizes an HTML fragment against the whitelists read from
  # Rails.configuration (tags, attributes and CSS properties).
  #
  #   string = HtmlSanitizer.strict(string, external)
  #
  # @param string [String] HTML fragment to sanitize
  # @param external [Boolean] when true, hrefs without a scheme get "http://"
  #   prepended and anchors whose text differs from their href are rewritten
  #   so that the real target becomes visible
  # @return [String] the sanitized HTML fragment
  def self.strict(string, external = false)

    # config
    tags_remove_content      = Rails.configuration.html_sanitizer_tags_remove_content
    tags_quote_content       = Rails.configuration.html_sanitizer_tags_quote_content
    tags_whitelist           = Rails.configuration.html_sanitizer_tags_whitelist
    attributes_whitelist     = Rails.configuration.html_sanitizer_attributes_whitelist
    css_properties_whitelist = Rails.configuration.html_sanitizer_css_properties_whitelist
    classes_whitelist        = ['js-signatureMarker']
    attributes_2_css         = %w(width height)

    # NOTE: every early exit uses `next Loofah::Scrubber::STOP`. A bare
    # `Loofah::Scrubber::STOP` in the middle of the block is only a discarded
    # expression, so the block would keep working on a node that has already
    # been removed or replaced.
    scrubber = Loofah::Scrubber.new do |node|

      # remove tags with subtree
      if tags_remove_content.include?(node.name)
        node.remove
        next Loofah::Scrubber::STOP
      end

      # remove tag, insert quoted content
      if tags_quote_content.include?(node.name)
        # use a block local name, do not overwrite the method argument
        decoded = html_decode(node.content)
        text = Nokogiri::XML::Text.new(decoded, node.document)
        node.add_next_sibling(text)
        node.remove
        next Loofah::Scrubber::STOP
      end

      # replace tags, keep subtree
      if !tags_whitelist.include?(node.name)
        # pass `external` on, otherwise content wrapped in a not whitelisted
        # tag would skip the external link handling below
        node.replace strict(node.children.to_s, external)
        next Loofah::Scrubber::STOP
      end

      # prepare src attribute: drop nodes with script or remote sources
      if node['src']
        src = cleanup_target(node['src'])
        if src =~ /(javascript|livescript|vbscript):/i || src.downcase.start_with?('http', 'ftp', '//')
          node.remove
          next Loofah::Scrubber::STOP
        end
      end

      # clean class / only use allowed classes
      if node['class']
        allowed_classes = node['class'].gsub(/\t|\n|\r/, '').split(' ').select do |local_class|
          classes_whitelist.include?(local_class.to_s.strip)
        end
        if allowed_classes.empty?
          node.delete('class')
        else
          node['class'] = allowed_classes.join(' ')
        end
      end

      # move width/height attributes into the style attribute
      attributes_2_css.each do |key|
        value = node[key]
        next if !value
        node.delete(key)
        next if value.blank?
        value += 'px' if value !~ /%|px|em/i
        declaration = "#{key}:#{value}"

        # node['style'] is nil if the element has no style attribute at all
        node['style'] = node['style'].blank? ? declaration : "#{node['style']};#{declaration}"
      end

      # clean style / only use allowed style properties
      if node['style']
        declarations = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
        style = ''
        declarations.each do |declaration|
          property = declaration.split(':').first
          next if !property
          next if !css_properties_whitelist.include?(property.strip)
          style += "#{declaration};"
        end
        if style == ''
          node.delete('style')
        else
          node['style'] = style
        end
      end

      # scan for invalid link content
      %w(href style).each do |attribute_name|
        next if !node[attribute_name]
        target = cleanup_target(node[attribute_name])
        next if target !~ /(javascript|livescript|vbscript):/i
        node.delete(attribute_name)
      end

      # remove attributes if not whitelisted
      node.each do |attribute, _value|
        attribute_name = attribute.downcase
        next if attributes_whitelist[:all].include?(attribute_name) || (attributes_whitelist[node.name] && attributes_whitelist[node.name].include?(attribute_name))
        node.delete(attribute)
      end

      # remove mailto links, keep the address as plain text
      if node['href']
        href = cleanup_target(node['href'])
        if href =~ /mailto:(.*)$/i
          text = Nokogiri::XML::Text.new(Regexp.last_match(1), node.document)
          node.add_next_sibling(text)
          node.remove
          next Loofah::Scrubber::STOP
        end
      end

      # prepare links
      if node['href']
        href = cleanup_target(node['href'])
        if external && !href.downcase.start_with?('//') && href.downcase !~ %r{^.{1,6}://.+?}
          node['href'] = "http://#{node['href']}"
          href = node['href']
        end

        # any other target ends the processing of this node here
        next if !href.downcase.start_with?('http', 'ftp', '//')
        node.set_attribute('href', href)
        node.set_attribute('rel', 'nofollow')
        node.set_attribute('target', '_blank')
      end

      # check if href is different to text
      if external && node.name == 'a' && !url_same?(node['href'], node.text)
        if node['href'].blank?
          node.replace strict(node.children.to_s, external)
          next Loofah::Scrubber::STOP
        elsif node.children.empty? || node.children.first.class == Nokogiri::XML::Text
          # result: "<original href> (<a href="<link text>">link text</a>)"
          text = Nokogiri::XML::Text.new("#{node['href']} (", node.document)
          node.add_previous_sibling(text)
          node['href'] = cleanup_target(node.text)
          text = Nokogiri::XML::Text.new(')', node.document)
          node.add_next_sibling(text)
        else
          node.content = cleanup_target(node['href'])
        end
      end

      # check if text has urls which need to be clickable
      # (only for text nodes which are not already inside of a link)
      if node && node.name != 'a' && node.parent && node.parent.name != 'a' && (!node.parent.parent || node.parent.parent.name != 'a') && node.class == Nokogiri::XML::Text
        urls = []
        node.content.scan(%r{((http|https|ftp|tel)://.+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)}mxi).each do |match|
          urls.push match[0].to_s.strip if match[0]
        end
        node.content.scan(/(^|:|;|\s)(www\..+?)([[:space:]]|\.[[:space:]]|,[[:space:]]|\.$|,$|\)|\(|$)/mxi).each do |match|
          urls.push match[1].to_s.strip if match[1]
        end
        next if urls.empty?
        add_link(node.content, urls, node)
      end
    end

    Loofah.fragment(string).scrub!(scrubber).to_s
  end
  162. =begin
  163. cleanup html string:
  164. * remove empty nodes (p, div, span)
  165. * remove nodes in general (keep content - span)
  166. string = HtmlSanitizer.cleanup(string)
  167. =end
  # Cleans up an HTML string before it is restructured:
  # * removes single letter namespaced tags like <o:p> and </o:p>
  # * removes tabs
  # * normalises line endings to "\n" and collapses three or more
  #   consecutive new lines into two
  # * hands the result to cleanup_replace_tags and cleanup_structure
  #
  #   string = HtmlSanitizer.cleanup(string)
  #
  # The text normalisation works on copies, so the given string is not
  # modified and frozen strings are accepted.
  #
  # @param string [String] HTML to clean up
  # @return [String] cleaned up HTML
  def self.cleanup(string)

    # [A-Za-z] instead of [A-z]: the range A-z also contains [ \ ] ^ _ and `
    cleaned = string.gsub(/<[A-Za-z]:[A-Za-z]>/, '')
    cleaned = cleaned.gsub(%r{</[A-Za-z]:[A-Za-z]>}, '')
    cleaned = cleaned.delete("\t")

    # normalise line endings
    cleaned = cleaned.gsub(/(\n\r|\r\r\n|\r\n|\n)/, "\n")

    # remove multiple empty lines
    cleaned = cleaned.gsub(/\n\n\n+/, "\n\n")

    cleaned = cleanup_replace_tags(cleaned)
    cleanup_structure(cleaned)
  end
  # Flattens table markup into <br> separated text and afterwards replaces
  # span, table, thead, tbody, td, tr and center elements by their
  # (recursively processed) children.
  #
  # The table rewrite is done in place on the given string.
  #
  # @param string [String] HTML to process
  # @return [String] HTML without the listed tags
  def self.cleanup_replace_tags(string)

    # applied top down to every <table>...</table> section; the order
    # matters and the <br> collapsing rules are intentionally listed twice
    table_rewrites = [
      [/<table(.+?|)>/im, '<br>'],
      [%r{</table>}im, ' '],
      [/<thead(.+?|)>/im, ''],
      [%r{</thead>}im, ' '],
      [/<tbody(.+?|)>/im, ''],
      [%r{</tbody>}im, ' '],
      [/<tr(.+?|)>/im, "<br>\n"],
      [%r{</td>}im, ' '],
      [/<td(.+?|)>/im, ''],
      [%r{</tr>}im, "\n<br>"],
      [/<br>[[:space:]]?<br>/im, '<br>'],
      [/<br>[[:space:]]?<br>/im, '<br>'],
      [%r{<br/>[[:space:]]?<br/>}im, '<br/>'],
      [%r{<br/>[[:space:]]?<br/>}im, '<br/>'],
    ]

    string.gsub!(%r{(<table(.+?|)>.+?</table>)}mxi) do |table_html|
      table_rewrites.inject(table_html) do |html, (pattern, replacement)|
        html.gsub(pattern, replacement)
      end
    end

    unwrapped_tags = %w(span table thead tbody td tr center)
    scrubber = Loofah::Scrubber.new do |node|
      if unwrapped_tags.include?(node.name)
        node.replace cleanup_replace_tags(node.children.to_s)
        Loofah::Scrubber::STOP
      end
    end

    fragment = Loofah.fragment(string)
    fragment.scrub!(scrubber)
    fragment.to_s
  end
  # Simplifies the structure of an HTML string:
  # * removes empty b, i, u and small elements
  # * unwraps p, div, span and small elements whose only child is again
  #   one of these elements
  # * replaces mailto links by the plain address
  # * makes the real target of links visible if it differs from the text
  # * collapses whitespace in text nodes outside of pre and code
  #
  # @param string [String] HTML to process
  # @return [String] restructured HTML
  def self.cleanup_structure(string)
    remove_empty_nodes = %w(p div span small)
    remove_empty_last_nodes = %w(b i u small)

    # NOTE: early exits use `next Loofah::Scrubber::STOP`, a bare STOP in the
    # middle of a block is only a discarded expression.

    # remove last empty nodes and empty -not needed- parent nodes
    scrubber_structure = Loofah::Scrubber.new do |node|
      if remove_empty_last_nodes.include?(node.name) && node.children.size.zero?
        node.remove
        next Loofah::Scrubber::STOP
      end
      if remove_empty_nodes.include?(node.name) && node.children.size == 1 && remove_empty_nodes.include?(node.children.first.name)
        node.replace node.children.to_s
        next Loofah::Scrubber::STOP
      end
    end

    # one pass may produce new removable nodes, repeat until the result
    # does not change anymore
    string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
    loop do
      new_string = Loofah.fragment(string).scrub!(scrubber_structure).to_s
      break if new_string == string
      string = new_string
    end

    scrubber_cleanup = Loofah::Scrubber.new do |node|

      # remove mailto links, keep the address as plain text
      if node['href']
        href = cleanup_target(node['href'])
        if href =~ /mailto:(.*)$/i
          text = Nokogiri::XML::Text.new(Regexp.last_match(1), node.document)
          node.add_next_sibling(text)
          node.remove
          next Loofah::Scrubber::STOP
        end
      end

      # check if href is different to text
      if node.name == 'a' && !url_same?(node['href'], node.text)
        if node['href'].blank?
          node.replace cleanup_structure(node.children.to_s)
          next Loofah::Scrubber::STOP
        elsif node.children.empty? || node.children.first.class == Nokogiri::XML::Text
          # result: "<link text> (<a href="<href>"><href></a>)"
          text = Nokogiri::XML::Text.new("#{node.text} (", node.document)
          node.add_previous_sibling(text)
          node.content = cleanup_target(node['href'])
          node['href'] = cleanup_target(node['href'])
          text = Nokogiri::XML::Text.new(')', node.document)
          node.add_next_sibling(text)
        else
          node.content = cleanup_target(node['href'])
        end
      end

      # remove not needed new lines (only text nodes outside of pre/code)
      next if node.class != Nokogiri::XML::Text
      next if node.parent && (node.parent.name == 'pre' || node.parent.name == 'code')

      content = node.content
      next if !content

      # text which is more than a single space / new line
      significant = content != ' ' && content != "\n"
      content.gsub!(/[[:space:]]+/, ' ') if significant

      if node.previous
        if node.previous.name == 'div' || node.previous.name == 'p'
          content.strip!
        end
      elsif node.parent && (!node.next || node.next.name == 'div' || node.next.name == 'p' || node.next.name == 'br')
        if (node.parent.name == 'div' || node.parent.name == 'p') && significant
          content.strip!
        end
      end
      node.content = content
    end

    Loofah.fragment(string).scrub!(scrubber_cleanup).to_s
  end
  # Turns the given urls inside of a text into links.
  #
  # The text in front of the first occurrence of the next url stays in
  # (or is inserted behind) node, the link is inserted as next sibling and
  # the remaining text is processed recursively behind the new link.
  #
  # @param content [String] text which still has to be processed
  # @param urls [Array<String>] urls to link, in the order they appear in
  #   the text; processed urls are removed from this array
  # @param node [Nokogiri::XML::Node] node behind which the result is
  #   inserted; on the first call the text node holding content
  # @return [nil]
  def self.add_link(content, urls, node)
    if urls.empty?
      text = Nokogiri::XML::Text.new(content, node.document)
      node.add_next_sibling(text)
      return
    end

    url = urls.shift

    # use the first occurrence, a greedy match would link the last one and
    # push earlier occurrences (and longer urls with the same prefix) into
    # the leading text
    position = content.index(url)

    if position.nil?
      # the original text node still holds its content, nothing to append
      return if urls.empty? && node.instance_of?(Nokogiri::XML::Text)

      # do not lose the remaining text, continue with the next url (or
      # append the text if no url is left)
      add_link(content, urls, node)
      return
    end

    pre  = content[0...position]
    post = content[(position + url.length)..-1]

    url = "http://#{url}" if url =~ /^www/i

    a = Nokogiri::XML::Node.new 'a', node.document
    a['href'] = url
    a['rel'] = 'nofollow'
    a['target'] = '_blank'
    a.content = url

    if node.instance_of?(Nokogiri::XML::Text)
      # first call: shorten the existing text node
      node.content = pre
      node.add_next_sibling(a)
    else
      # recursive call: node is the previously inserted link
      text = Nokogiri::XML::Text.new(pre, node.document)
      node.add_next_sibling(text).add_next_sibling(a)
    end

    return if post.blank?
    add_link(post, urls, a)
  end
  # Entities handled by html_decode and their replacements
  # (&nbsp; is replaced by a regular space).
  HTML_DECODE_ENTITIES = {
    '&amp;'  => '&',
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&quot;' => '"',
    '&nbsp;' => ' ',
  }.freeze

  # Decodes the entities &amp; &lt; &gt; &quot; and &nbsp;.
  #
  # All entities are replaced in a single pass, so already escaped input is
  # decoded exactly once ("&amp;lt;" becomes "&lt;" and not "<").
  #
  # @param string [String] text with HTML entities
  # @return [String] new string with the entities replaced
  def self.html_decode(string)
    string.gsub(/&(?:amp|lt|gt|quot|nbsp);/, HTML_DECODE_ENTITIES)
  end
  # Normalises a link target (href, src or style value) for the checks of
  # the sanitizer: percent escapes are decoded, non ASCII bytes are replaced
  # by "?" and all whitespace, /* */ comments, <!-- --> comments and [...]
  # sections are removed.
  #
  # @param string [String] raw attribute value
  # @return [String] normalised target
  def self.cleanup_target(string)

    # URI.unescape is obsolete and not available anymore since Ruby 3.0,
    # the parser method is what it delegated to
    string = URI::DEFAULT_PARSER.unescape(string).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?')
    string.gsub(/[[:space:]]|\t|\n|\r/, '').gsub(%r{/\*.*?\*/}, '').gsub(/<!--.*?-->/, '').gsub(/\[.+?\]/, '')
  end
  # Checks if two urls point to the same target. Both urls are compared
  # after decoding percent escapes and HTML entities, downcasing, removing
  # whitespace, removing a slash at the end of a line and replacing the
  # first "/?" by "?". A missing "http://" or "https://" prefix on one of
  # the urls is ignored.
  #
  # @param url_new [String, nil] first url
  # @param url_old [String, nil] second url
  # @return [Boolean] true if both urls are regarded as equal
  def self.url_same?(url_new, url_old)
    normalize = lambda do |url|
      # URI.unescape is obsolete and not available anymore since Ruby 3.0,
      # the parser method is what it delegated to
      decoded = URI::DEFAULT_PARSER.unescape(url.to_s).encode('utf-8', 'binary', invalid: :replace, undef: :replace, replace: '?')
      decoded = decoded.downcase.gsub(%r{/$}, '').gsub(/[[:space:]]|\t|\n|\r/, '').strip
      html_decode(decoded).sub('/?', '?')
    end

    url_new = normalize.call(url_new)
    url_old = normalize.call(url_old)

    return true if url_new == url_old

    %w(http:// https://).any? do |scheme|
      "#{scheme}#{url_new}" == url_old || "#{scheme}#{url_old}" == url_new
    end
  end
  333. =begin
  334. replace inline images with cid images
  335. string = HtmlSanitizer.replace_inline_images(article.body)
  336. =end
  # Replaces inline images (img tags with a base64 jpeg/png data url as src)
  # by cid references and collects the image data as attachments.
  #
  #   string, attachments = HtmlSanitizer.replace_inline_images(article.body)
  #
  # @param string [String] HTML to process
  # @param prefix [Object] first part of the generated content ids,
  #   defaults to a random number
  # @return [Array] two elements: the processed HTML and an array of
  #   attachment hashes (:data, :filename, :preferences)
  def self.replace_inline_images(string, prefix = rand(999_999_999))
    attachments_inline = []

    scrubber = Loofah::Scrubber.new do |node|
      next if node.name != 'img'

      if node['src'] && node['src'] =~ %r{^(data:image/(jpeg|png);base64,.+?)$}i
        data_url = Regexp.last_match(1)
        file_attributes = StaticAssets.data_url_attributes(data_url)
        cid = "#{prefix}.#{rand(999_999_999)}@#{Setting.get('fqdn')}"

        preferences = {
          'Content-Type'        => file_attributes[:mime_type],
          'Mime-Type'           => file_attributes[:mime_type],
          'Content-ID'          => cid,
          'Content-Disposition' => 'inline',
        }
        attachments_inline.push(
          data:        file_attributes[:content],
          filename:    cid,
          preferences: preferences,
        )

        node['src'] = "cid:#{cid}"
      end

      # img tags are not traversed any further
      Loofah::Scrubber::STOP
    end

    html = Loofah.fragment(string).scrub!(scrubber).to_s
    [html, attachments_inline]
  end
  362. =begin
  363. sanitize style of img tags
  364. string = HtmlSanitizer.dynamic_image_size(article.body)
  365. =end
  # Rewrites the style of img tags which have a src attribute: the style
  # starts with "max-width:100%;", followed by the existing (downcased)
  # declarations, where "height" is renamed to "max-height".
  #
  #   string = HtmlSanitizer.dynamic_image_size(article.body)
  #
  # @param string [String] HTML to process
  # @return [String] HTML with rewritten img styles
  def self.dynamic_image_size(string)
    scrubber = Loofah::Scrubber.new do |node|
      next if node.name != 'img'

      if node['src']
        style = 'max-width:100%;'
        if node['style']
          declarations = node['style'].downcase.gsub(/\t|\n|\r/, '').split(';')
          declarations.each do |declaration|

            # split at the first colon only, values may contain colons
            # by themselves (e. g. urls)
            key, value = declaration.split(':', 2)

            # skip blank fragments and declarations without a value
            next if key.nil? || value.nil?
            key = key.strip
            next if key.empty?

            key = 'max-height' if key == 'height'
            style += "#{key}:#{value};"
          end
        end
        node['style'] = style
      end

      # img tags are not traversed any further
      Loofah::Scrubber::STOP
    end

    Loofah.fragment(string).scrub!(scrubber).to_s
  end
  390. private_class_method :cleanup_target
  391. private_class_method :add_link
  392. private_class_method :url_same?
  393. private_class_method :html_decode
  394. end