string.rb 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. # Copyright (C) 2012-2024 Zammad Foundation, https://zammad-foundation.org/
  2. require 'rchardet'
  3. class String
  4. alias old_strip strip
  5. alias old_strip! strip!
  # Removes leading and trailing whitespace in place. In addition to the
  # [[:space:]] class, ZERO WIDTH SPACE (U+200B) and ZERO WIDTH NO-BREAK
  # SPACE / BOM (U+FEFF) are treated as whitespace.
  #
  # Always returns self, also when nothing had to be removed.
  def strip!
    sub!(%r{\A[[[:space:]]\u{200B}\u{FEFF}]+}, '')
    sub!(%r{[[[:space:]]\u{200B}\u{FEFF}]+\Z}, '')
    self
  rescue Encoding::CompatibilityError
    # incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string),
    # use the built-in implementation instead
    old_strip!
    self
  end
  # Returns a copy with leading and trailing whitespace removed. In addition
  # to the [[:space:]] class, ZERO WIDTH SPACE (U+200B) and ZERO WIDTH
  # NO-BREAK SPACE / BOM (U+FEFF) are treated as whitespace.
  def strip
    trimmed = sub(%r{\A[[[:space:]]\u{200B}\u{FEFF}]+}, '')
    trimmed.sub!(%r{[[[:space:]]\u{200B}\u{FEFF}]+\Z}, '')
    trimmed
  rescue Encoding::CompatibilityError
    # incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string),
    # use the built-in implementation instead
    old_strip
  end
  # Returns the string as an e-mail style quote: every line (split on "\n")
  # is prefixed with "> " and terminated with a newline.
  #
  #   "a\nb".message_quote # => "> a\n> b\n"
  def message_quote
    # append to a single buffer instead of re-creating the whole quote per line
    split("\n").each_with_object(+'') do |line, quoted|
      quoted << '> ' << line << "\n"
    end
  end
  # Wraps every line that is longer than the line width at whitespace.
  #
  # The width can be given positionally (word_wrap(40)) or as an option
  # (word_wrap(line_width: 40)); a positional value wins. Default is 82.
  # Lines that already fit are returned unchanged.
  def word_wrap(*args)
    opts = args.extract_options!
    opts[:line_width] = args[0] || 82 if args.present?
    opts.reverse_merge!(line_width: 82)

    wrapped = split("\n").map do |row|
      next row unless row.length > opts[:line_width]

      row.gsub(%r{(.{1,#{opts[:line_width]}})(\s+|$)}, "\\1\n").strip
    end

    wrapped.join("\n")
  end
  45. =begin
  46. filename = 'Some::Module'.to_filename
  47. returns
  48. 'some/module'
  49. =end
  # Converts a class/module name into its underscored file path form:
  # "::" becomes "/", camel case boundaries get an underscore, "-" becomes
  # "_" and everything is downcased.
  #
  #   'Some::Module'.to_filename # => 'some/module'
  def to_filename
    path = gsub(%r{::}, '/')
    path = path.gsub(%r{([A-Z]+)([A-Z][a-z])}, '\1_\2')
    path = path.gsub(%r{([a-z\d])([A-Z])}, '\1_\2')
    path.tr('-', '_').downcase
  end
  57. =begin
  58. filename = 'some/module.rb'.to_classname
  59. returns
  60. 'Some::Module'
  61. =end
  # Converts a file path into a class name: a trailing ".rb" is dropped,
  # every path segment is camelized and the segments are joined with "::".
  #
  #   'some/module.rb'.to_classname # => 'Some::Module'
  def to_classname
    delete_suffix('.rb')
      .split('/')
      .map(&:camelize)
      .join('::')
  end
  # Because of MySQL InnoDB limitations, strip 4-byte UTF-8 chars (e.g. emojis).
  # Unfortunately utf8mb4 brings other limitations (max varchar length and lower index sizes).
  # More details: http://pjambet.github.io/blog/emojis-and-mysql/
  # Returns a copy without characters that need more than 3 bytes in UTF-8.
  #
  # Returns self unchanged when Rails.application.config.db_4bytes_utf8 is
  # set. If characters were dropped, a warning with (at most the first 256
  # chars of) the dropped characters and the original string is logged.
  def utf8_to_3bytesutf8
    return self if Rails.application.config.db_4bytes_utf8

    kept    = +''
    dropped = +''
    each_char do |char|
      target = char.bytesize > 3 ? dropped : kept
      target << char
    end

    if dropped.present?
      Rails.logger.warn "strip out 4 bytes utf8 chars '#{dropped[0..255]}' of '#{self[0..255]}'"
    end

    kept
  end
  85. =begin
  86. text = html_string.html2text
  87. returns
  88. 'string with text only'
  89. =end
  # Converts an HTML fragment into plain text.
  #
  # Comments and style blocks are removed, structural tags are turned into
  # line breaks, links are extracted and HTML entities are decoded.
  #
  # string_only - if true, links are rewritten inline (using ######LINKEXT / ######LINKRAW
  #               markers); otherwise every link is replaced by "[n] " and the
  #               collected "[n] url" list is appended to the text.
  # strict      - if true, b/i/ul/ol/li/u/h1/h2/h3/hr tags are kept as ######tag######
  #               markers and bare URLs are wrapped in ######LINKRAW:...###### markers.
  #
  # Returns the resulting text as a new, stripped String.
  def html2text(string_only = false, strict = false)
    string = dup

    # in case of invalid encoding, strip invalid chars
    # see also test/data/mail/mail021.box
    # note: string.encode!('UTF-8', 'UTF-8', :invalid => :replace, :replace => '?') was not detecting invalid chars
    if !string.valid_encoding?
      string = string.chars.select(&:valid_encoding?).join
    end

    # remove html comments
    string.gsub!(%r{<!--.+?-->}m, '')

    # find <a href=....> and replace it with [x]
    link_list = ''
    counter   = 0
    if string_only
      string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) do |_placeholder|
        link = $3
        text = $6
        text.gsub!(%r{<.+?>}, '')

        link_compare = link.dup
        if link_compare.present?
          link.strip!
          link_compare.strip!
          link_compare.downcase!
          link_compare.sub!(%r{/$}, '')
        end

        text_compare = text.dup
        if text_compare.present?
          text.strip!
          text_compare.strip!
          text_compare.downcase!
          text_compare.sub!(%r{/$}, '')
        end

        if link_compare.present? && text_compare.blank?
          link
        elsif (link_compare.blank? && text_compare.present?) || (link_compare && link_compare =~ %r{^mailto}i)
          text
        elsif link_compare.present? && text_compare.present? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase)
          "######LINKEXT:#{link}/TEXT:#{text}######"
        elsif text !~ %r{^http}
          "#{text} (######LINKRAW:#{link}######)"
        else
          "#{link} (######LINKRAW:#{text}######)"
        end
      end
    elsif string.scan(%r{<a[[:space:]]}i).count < 5_000
      string.gsub!(%r{<a[[:space:]].*?href=("|')(.+?)("|').*?>}ix) do
        link = $2
        counter += 1
        link_list += "[#{counter}] #{link}\n"
        "[#{counter}] "
      end
    end

    # remove style tags with content
    string.gsub!(%r{<style(|[[:space:]].+?)>(.+?)</style>}im, '')

    # remove empty lines
    string.gsub!(%r{^[[:space:]]*}m, '')
    if strict
      string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######')
    end

    # pre/code handling 1/2
    # (protect the line breaks inside pre/code from the newline removal below)
    string.gsub!(%r{<pre>(.+?)</pre>}m) do |placeholder|
      placeholder.gsub(%r{\n}, '###BR###')
    end
    string.gsub!(%r{<code>(.+?)</code>}m) do |placeholder|
      placeholder.gsub(%r{\n}, '###BR###')
    end

    # insert spaces on [A-z]\n[A-z]
    # (note: the [A-z] range also covers the ASCII punctuation between "Z" and "a")
    string.gsub!(%r{([A-z])[[:space:]]([A-z])}m, '\1 \2')

    # remove all new lines
    string.gsub!(%r{(\n\r|\r\r\n|\r\n|\n)}, '')

    # blockquote handling
    string.gsub!(%r{<blockquote(| [^>]*)>(.+?)</blockquote>}m) do
      "\n#{$2.html2text(true).gsub(%r{^(.*)$}, '&gt; \1')}\n"
    end

    # pre/code handling 2/2
    string.gsub!(%r{###BR###}, "\n")

    # add counting
    string.gsub!(%r{<li(| [^>]*)>}i, "\n* ")

    # add hr
    string.gsub!(%r{<hr(|/| [^>]*)>}i, "\n___\n")

    # add h\d
    string.gsub!(%r{</h\d>}i, "\n")

    # add new lines
    string.gsub!(%r{</div><div(|[[:space:]].+?)>}im, "\n")
    string.gsub!(%r{</p><p(|[[:space:]].+?)>}im, "\n")
    string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
    string.gsub!(%r{</(p|br|div)(|[[:space:]].+?)>}i, "\n")
    string.gsub!(%r{</td>}i, ' ')

    # strip all other tags
    string.gsub!(%r{<.+?>}, '')

    # replace multiple spaces with one
    # NOTE(review): as written the pattern is a single space, so this is a no-op;
    # presumably it is meant to match a run of spaces - confirm before changing
    string.gsub!(%r{ }, ' ')

    # add hyperlinks
    if strict
      string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) do |_placeholder|
        pre     = $1
        content = $2
        post    = $5
        if content.match?(%r{^www}i)
          content = "http://#{content}"
        end

        if content =~ %r{^(http|https|ftp|tel)}i
          "#{pre}######LINKRAW:#{content}#######{post}"
        else
          "#{pre}#{content}#{post}"
        end
      end
    end

    # try HTMLEntities, if it fails on invalid signs, use manual way
    begin
      coder  = HTMLEntities.new
      string = coder.decode(string)
    rescue

      # strip all &amp; &lt; &gt; &quot;
      string.gsub!('&amp;', '&')
      string.gsub!('&lt;', '<')
      string.gsub!('&gt;', '>')
      string.gsub!('&quot;', '"')
      string.gsub!('&nbsp;', ' ')

      # Numeric character references are Unicode code points. Decode them as
      # UTF-8 and use "?" for code points that are invalid or cannot be
      # inserted into the string because of its encoding.
      decode_codepoint = lambda do |codepoint|
        char = codepoint.chr(Encoding::UTF_8)
        Encoding.compatible?(string, char) ? char : '?'
      rescue RangeError
        '?'
      end

      # decode html entities like "&#8211;"
      string.gsub!(%r{(&\#(\d+);?)}x) do
        decode_codepoint.call($2.to_i)
      end

      # decode html entities like "&#x3d;"
      string.gsub!(%r{(&\#[xX]([0-9a-fA-F]+);?)}x) do
        decode_codepoint.call($2.hex)
      end
    end

    string = string.utf8_encode(fallback: :read_as_sanitized_binary)

    # signature block handling 1/2
    # (protect the signature separator from the trailing space removal below)
    string.gsub!(%r{^-- \s{1}}, '###SIGNATURE_BLOCK###')

    # remove trailing empty spaces
    string.gsub!(%r{[[:blank:]]+$}, '')

    # remove double multiple empty lines
    string.gsub!(%r{\n\n\n+}, "\n\n")

    # add extracted links
    if link_list != ''
      string += "\n\n\n#{link_list}"
    end

    # remove double multiple empty lines
    string.gsub!(%r{\n\n\n+}, "\n\n")

    # signature block handling 2/2
    string.gsub!(%r{###SIGNATURE_BLOCK###}, "-- \n")

    string.strip
  end
  255. =begin
  256. html = text_string.text2html
  257. =end
  # Converts plain text to HTML: special characters are escaped with
  # CGI.escapeHTML, newlines become "<br>" and a doubly escaped ampersand
  # ("&amp;amp;") is reduced to "&amp;".
  def text2html
    CGI.escapeHTML(self)
       .gsub(%r{\n}, '<br>')
       .gsub('&amp;amp;', '&amp;')
       .chomp
  end
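=begin

  result = html_string.html2html_strict

returns

  [sanitized_html, { remote_content_removed: ... }]

or only the HTML string if the sanitized result was blank and the
html2text / text2html fallback was used

=end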
  # Sanitizes the HTML with HtmlSanitizer::Strict and HtmlSanitizer::Cleanup,
  # collapses runs of empty paragraphs / divs / line breaks and marks the
  # first detected signature position with a js-signatureMarker span.
  #
  # Returns [html, { remote_content_removed: ... }].
  # If the sanitized result is blank, the html2text / text2html round trip
  # of the original string is used instead and only that String is returned.
  def html2html_strict
    html = dup

    # https://github.com/zammad/zammad/issues/4112
    html.gsub!(%r{<!\[if !supportLists\]>.+?<!\[endif\]>}mi, '• ')

    strict_sanitizer = HtmlSanitizer::Strict.new
    html = strict_sanitizer.sanitize(html, external: true).strip
    html = HtmlSanitizer::Cleanup.new.sanitize(html).strip

    marker_template = '<span class="js-signatureMarker"></span>'

    # as fallback, use html2text and text2html
    if html.blank?
      html = html2text.text2html
      html.signature_identify('text')
      html.sub!(%r{######SIGNATURE_MARKER######}, marker_template)
      html.gsub!(%r{######SIGNATURE_MARKER######}, '')
      return html.chomp
    end

    # clean up substitutions, applied one after another in exactly this order
    [
      [%r{(<p>[[:space:]]*</p>([[:space:]]*)){2,}}im, '<p>&nbsp;</p>\2'],
      [%r{<div>[[:space:]]*(<br(|/)>([[:space:]]*)){2,}}im, '<div><br>\3'],
      [%r{[[:space:]]*(<br>[[:space:]]*){3,}[[:space:]]*</div>}im, '<br><br></div>'],
      [%r{<div>[[:space:]]*(<br>[[:space:]]*){1,}[[:space:]]*</div>}im, '<div>&nbsp;</div>'],
      [%r{<div>[[:space:]]*(<div>[[:space:]]*</div>[[:space:]]*){2,}</div>}im, '<div>&nbsp;</div>'],
      [%r{<p>[[:space:]]*</p>(<br(|/)>[[:space:]]*){2,}[[:space:]]*}im, '<p> </p><br>'],
      [%r{<p>[[:space:]]*</p>(<br(|/)>[[:space:]]*)+<p>[[:space:]]*</p>}im, '<p> </p><p> </p>'],
      [%r{(<div>[[:space:]]*</div>[[:space:]]*){2,}}im, '<div> </div>'],
      [%r{<div>&nbsp;</div>[[:space:]]*(<div>&nbsp;</div>){1,}}im, '<div>&nbsp;</div>'],
      [%r{(<br>[[:space:]]*){3,}}im, '<br><br>'],
      [%r{(<br(|/)>[[:space:]]*){3,}}im, '<br/><br/>'],
      [%r{<p>[[:space:]]+</p>}im, '<p>&nbsp;</p>'],
      [%r{\A(<br(|/)>[[:space:]]*)*}i, ''],
      [%r{[[:space:]]*(<br(|/)>[[:space:]]*)*\Z}i, ''],
      [%r{(<p></p>){1,10}\Z}i, ''],
    ].each do |pattern, replacement|
      html.gsub!(pattern, replacement)
    end

    # only the first signature marker is turned into the span, all others are dropped
    html.signature_identify('html')
    html.sub!(%r{######SIGNATURE_MARKER######}, marker_template)
    html.gsub!(%r{######SIGNATURE_MARKER######}, '')

    [
      html.chomp,
      {
        remote_content_removed: strict_sanitizer.remote_content_removed
      },
    ]
  end
  # Inserts "######SIGNATURE_MARKER######" in front of positions that look
  # like the start of a signature or of a quoted reply header. The string is
  # modified in place.
  #
  # type  - 'html' applies the HTML patterns, everything else the plain text patterns.
  # force - plain text only: also inspect strings with less than 10 lines
  #         and less than 300 chars.
  #
  # Returns the string itself, or nil if a plain text string was skipped
  # because it is too short (and force is not set).
  def signature_identify(type = 'text', force = false)
    string = self
    marker = '######SIGNATURE_MARKER######'

    if type == 'html'
      map = [
        '<br(|\/)>[[:space:]]*(--|__)',
        '<\/div>[[:space:]]*(--|__)',
        '<p>[[:space:]]*(--|__)',
        '(<br(|\/)>|<p>|<div>)[[:space:]]*<b>(|<span[[:space:]]lang=".{1,6}">)(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]*(|</span>)</b>',
        '(<br>|<div>)[[:space:]]*<br>[[:space:]]*(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]+',
        '<blockquote(|.+?)>[[:space:]]*<div>[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]]',
        '<div(|.+?)>[[:space:]]*<br>[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]].{1,500}<blockquote',
      ]
      map.each do |regexp|
        string.sub!(%r{#{regexp}}m) do |placeholder|
          "#{marker}#{placeholder}"
        end
      end
      return string
    end

    # if we do have less than 10 lines and less than 300 chars ignore this
    if !force
      lines = string.split("\n")
      return if lines.count < 10 && string.length < 300
    end

    # search for signature separator "--\n"
    string.sub!(%r{^\s{0,2}--\s{0,2}$}) do |placeholder|
      "#{marker}#{placeholder}"
    end

    map = {}

    # Apple Mail
    # On 01/04/15 10:55, Bob Smith wrote:
    map['apple-en'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10}[[:space:]].{1,250}[[:space:]](wrote):'

    # Am 03.04.2015 um 20:58 schrieb Martin Edenhofer <me@zammad.ink>:
    map['apple-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:'

    # Thunderbird
    # Am 04.03.2015 um 12:47 schrieb Alf Aardvark:
    map['thunderbird-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:'

    # Thunderbird default - http://kb.mozillazine.org/Reply_header_settings
    # On 01-01-2007 11:00 AM, Alf Aardvark wrote:
    map['thunderbird-en-default'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10},[[:space:]].{1,250}(wrote):'

    # http://kb.mozillazine.org/Reply_header_settings
    # Alf Aardvark wrote, on 01-01-2007 11:00 AM:
    map['thunderbird-en'] = '^.{1,250}[[:space:]](wrote),[[:space:]]on[[:space:]].{3,20}:'

    # otrs
    # 25.02.2015 10:26 - edv hotline wrote:
    # 25.02.2015 10:26 - edv hotline schrieb:
    map['otrs-en-de'] = '^.{6,10}[[:space:]].{3,10}[[:space:]]-[[:space:]].{1,250}[[:space:]](wrote|schrieb):'

    # Ms
    # From: Martin Edenhofer via Zammad Support [mailto:support@zammad.inc]
    # Send: Donnerstag, 2. April 2015 10:00
    # To/Cc/Bcc: xxx
    # Subject: xxx
    # - or -
    # From: xxx
    # To/Cc/Bcc: xxx
    # Date: 01.04.2015 12:41
    # Subject: xxx
    # - or -
    # De : xxx
    # À/?/?: xxx
    # Envoyé : mercredi 29 avril 2015 17:31
    # Objet : xxx

    # en/de/fr | sometimes ms adds a space to "xx : value"
    map['ms-en-de-fr_from'] = '^(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ)( ?):[[:space:]].+?'
    map['ms-en-de-fr_from_html'] = "\n######b######(From|Von|De)([[:space:]]?):([[:space:]]?)(######/b######)[[:space:]].+?"

    # word 14
    # edv hotline wrote:
    # edv hotline schrieb:
    # map['word-en-de'] = "[^#{marker}].{1,250}\s(wrote|schrieb):"

    map.each_value do |regexp|
      string.sub!(%r{#{regexp}}) do |placeholder|
        "#{marker}#{placeholder}"
      end
    rescue ArgumentError, EncodingError
      # The error is raised by sub! itself (e. g. UTF-8 regexp with a string in
      # another encoding), so it has to be rescued around the call and not
      # inside its block. Regexp was not possible, use next.
      Rails.logger.debug { "Invalid string/charset combination with regexp #{regexp} in string" }
    end

    string
  end
  389. # Returns a copied string whose encoding is UTF-8.
  390. # If both the provided and current encodings are invalid,
  391. # an auto-detected encoding is tried.
  392. #
  393. # Supports some fallback strategies if a valid encoding cannot be found.
  394. #
  395. # Options:
  396. #
  397. # * from: An encoding to try first.
  398. # Takes precedence over the current and auto-detected encodings.
  399. #
  400. # * fallback: The strategy to follow if no valid encoding can be found.
  401. # * `:output_to_binary` returns an ASCII-8BIT-encoded string.
  402. # * `:read_as_sanitized_binary` returns a UTF-8-encoded string with all
  403. # invalid byte sequences replaced with "?" characters.
  # Non-destructive variant of utf8_encode!: works on a copy of the string
  # and forwards all arguments unchanged.
  def utf8_encode(...)
    copy = dup
    copy.utf8_encode!(...)
  end
  # Converts the string to UTF-8 in place.
  #
  # Order of attempts:
  #   1. the bytes are already valid UTF-8 -> only the encoding label is changed
  #   2. options[:from], if it names a known encoding the bytes are valid in
  #   3. every entry of viable_encodings
  #   4. options[:fallback] (:output_to_binary or :read_as_sanitized_binary)
  #
  # Raises EncodingError if nothing worked and no known fallback was given.
  def utf8_encode!(**options)
    return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?

    # convert string to given charset, if valid_encoding? is true
    requested = options[:from]
    if requested.present?
      begin
        encoding = Encoding.find(requested)
        if encoding.present? && dup.force_encoding(encoding).valid_encoding?
          force_encoding(encoding)
          return encode!('utf-8', encoding)
        end
      rescue ArgumentError, EncodingError => e
        Rails.logger.error { e.inspect }
      end
    end

    # try to find valid encodings of string
    viable_encodings.each do |candidate|
      begin
        return encode!('utf-8', candidate)
      rescue EncodingError => e
        Rails.logger.error { e.inspect }
      end
    end

    fallback = options[:fallback]
    if fallback == :output_to_binary
      force_encoding('ascii-8bit')
    elsif fallback == :read_as_sanitized_binary
      encode!('utf-8', 'ascii-8bit', invalid: :replace, undef: :replace, replace: '?')
    else
      raise EncodingError, 'could not find a valid input encoding'
    end
  end
  436. private
  # Returns the candidate source encodings the current bytes are valid in,
  # in order of preference: try_first, the current encoding and the encoding
  # detected by CharDet. ASCII-8BIT and UTF-8 are never returned.
  #
  # try_first - optional name of an encoding to check first; if it is not a
  #             known encoding the lookup is repeated without it.
  #
  # Raises ArgumentError if a candidate name is unknown and no try_first was given.
  def viable_encodings(try_first: nil)
    # force_encoding below needs a mutable string; the method is private, so
    # the call on the copy has to go through send (an explicit receiver
    # would raise NoMethodError)
    return dup.send(:viable_encodings, try_first: try_first) if frozen?

    provided = Encoding.find(try_first) if try_first.present?
    original = encoding
    detected = CharDet.detect(self)['encoding']

    [provided, original, detected]
      .compact
      .reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT }
      .reject { |e| Encoding.find(e) == Encoding::UTF_8 }
      .select { |e| force_encoding(e).valid_encoding? }
      .tap { force_encoding(original) } # clean up changes from previous line

  # if `try_first` is not a valid encoding, try again without it
  rescue ArgumentError
    try_first.present? ? viable_encodings : raise
  end
  452. end