string.rb 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535
  1. require 'rchardet'
  2. class String
  # Keep the built-in implementations reachable under old_* names so the
  # overriding strip / strip! methods can fall back to them.
  alias_method :old_strip, :strip
  alias_method :old_strip!, :strip!
  # In-place strip that also removes zero-width space (U+200B) and BOM
  # (U+FEFF) characters from both ends of the string.
  #
  # Unlike the built-in String#strip!, this always returns the receiver,
  # even when nothing was removed.
  def strip!
    [/\A[[[:space:]]\u{200B}\u{FEFF}]+/, /[[[:space:]]\u{200B}\u{FEFF}]+\Z/].each do |edge|
      sub!(edge, '')
    end
    self
  rescue Encoding::CompatibilityError
    # the UTF-8 regexps cannot be matched against this string's encoding
    # (e.g. ASCII-8BIT with non-ASCII bytes), use the built-in strip! instead
    old_strip!
    self
  end
  # Returns a copy with whitespace, zero-width space (U+200B) and BOM
  # (U+FEFF) characters removed from both ends.
  def strip
    sub(/\A[[[:space:]]\u{200B}\u{FEFF}]+/, '').tap do |trimmed|
      trimmed.sub!(/[[[:space:]]\u{200B}\u{FEFF}]+\Z/, '')
    end
  rescue Encoding::CompatibilityError
    # the UTF-8 regexps cannot be matched against this string's encoding
    # (e.g. ASCII-8BIT with non-ASCII bytes), use the built-in strip instead
    old_strip
  end
  # Returns the text as an e-mail style quote: every line is prefixed with
  # "> " and terminated with a line feed.
  def message_quote
    split("\n").each_with_object(+'') do |line, quoted|
      quoted << '> ' << line << "\n"
    end
  end
  # Wraps every line that is longer than the configured width at whitespace.
  #
  # The width can be given as first positional argument or as :line_width
  # option and defaults to 82. Lines that fit are returned unchanged.
  def word_wrap(*args)
    options = args.extract_options!
    options[:line_width] = args[0] || 82 if args.present?
    options.reverse_merge!(line_width: 82)

    limit   = options[:line_width]
    wrapped = split("\n").map do |line|
      next line if line.length <= limit

      line.gsub(/(.{1,#{limit}})(\s+|$)/, "\\1\n").strip
    end
    wrapped.join("\n")
  end
  # Converts a constant-style name into a lower-case, underscored path.
  #
  #   filename = 'Some::Module'.to_filename
  #
  # returns
  #
  #   'some/module'
  def to_filename
    path = gsub(/::/, '/')
    path = path.gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
    path = path.gsub(/([a-z\d])([A-Z])/, '\1_\2')
    path.tr('-', '_').downcase
  end
  # Converts a relative file path into a constant-style name.
  #
  #   classname = 'some/module.rb'.to_classname
  #
  # returns
  #
  #   'Some::Module'
  def to_classname
    delete_suffix('.rb')
      .split('/')
      .map { |segment| segment.camelize }
      .join('::')
  end
  # Because of MySQL InnoDB limitations, strip 4 byte UTF-8 chars (e. g. emojis).
  # Unfortunately utf8mb4 brings other limitations (max varchar and lower index sizes).
  # More details: http://pjambet.github.io/blog/emojis-and-mysql/
  #
  # Returns the receiver itself when the application is configured for
  # 4 byte UTF-8, otherwise a new string without the oversized characters.
  # Every removed character is logged as a warning.
  def utf8_to_3bytesutf8
    return self if Rails.application.config.db_4bytes_utf8

    kept = each_char.reject do |char|
      oversized = char.bytes.count > 3
      Rails.logger.warn "strip out 4 bytes utf8 chars '#{char}' of '#{self}'" if oversized
      oversized
    end
    kept.join('')
  end
  # Converts an HTML fragment into plain text.
  #
  #   text = html_string.html2text
  #
  # returns
  #
  #   'string with text only'
  #
  # Parameters:
  #
  # * string_only: when false (default) every "<a ... href=...>" opening tag
  #   is replaced by a numbered reference ("[1] ") and the collected link
  #   targets are appended to the end of the text. When true, complete links
  #   are rewritten inline with "######LINKEXT:...######" /
  #   "######LINKRAW:...######" markers and nothing is appended.
  # * strict: when true, a small set of formatting tags (b, i, ul, ol, li, u,
  #   h1-h3, hr) is kept as "######tag######" markers and bare URLs in the
  #   text are wrapped in "######LINKRAW:...######" markers.
  #
  # Returns a new String, the receiver is not modified.
  def html2text(string_only = false, strict = false)
    string = dup

    # in case of invalid encoding, strip invalid chars
    # see also test/data/mail/mail021.box
    # note: string.encode!('UTF-8', 'UTF-8', :invalid => :replace, :replace => '?') was not detecting invalid chars
    if !string.valid_encoding?
      string = string.chars.select(&:valid_encoding?).join
    end

    # remove html comments
    string.gsub!(/<!--.+?-->/m, '')

    # find <a href=....> and replace it with [x]
    link_list = ''
    counter   = 0
    if !string_only
      # skip the link extraction for documents with an excessive amount of links
      if string.scan(/<a[[:space:]]/i).count < 5_000
        string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) do
          link = $2
          counter += 1
          link_list += "[#{counter}] #{link}\n"
          "[#{counter}] "
        end
      end
    else
      string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) do |_placeholder|
        # read the captures first, the gsub! below overwrites the match data
        link = $3
        text = $6
        text.gsub!(/<.+?>/, '')

        link_compare = link.dup
        if link_compare.present?
          link.strip!
          link_compare.strip!
          link_compare.downcase!
          link_compare.sub!(%r{/$}, '')
        end

        text_compare = text.dup
        if text_compare.present?
          text.strip!
          text_compare.strip!
          text_compare.downcase!
          text_compare.sub!(%r{/$}, '')
        end

        if link_compare.present? && text_compare.blank?
          link
        elsif link_compare.blank? && text_compare.present?
          text
        elsif link_compare && link_compare =~ /^mailto/i
          text
        elsif link_compare.present? && text_compare.present? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase)
          "######LINKEXT:#{link}/TEXT:#{text}######"
        elsif text !~ /^http/
          "#{text} (######LINKRAW:#{link}######)"
        else
          "#{link} (######LINKRAW:#{text}######)"
        end
      end
    end

    # remove style tags with content
    string.gsub!(%r{<style(|[[:space:]].+?)>(.+?)</style>}im, '')

    # remove leading whitespace of every line (this also drops empty lines)
    string.gsub!(/^[[:space:]]*/m, '')

    if strict
      string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######')
    end

    # pre/code handling 1/2 - protect line breaks from the new line removal below
    string.gsub!(%r{<pre>(.+?)</pre>}m) do |placeholder|
      placeholder.gsub(/\n/, '###BR###')
    end
    string.gsub!(%r{<code>(.+?)</code>}m) do |placeholder|
      placeholder.gsub(/\n/, '###BR###')
    end

    # insert spaces on [A-z]\n[A-z]
    # note: the range A-z also covers the characters [ \ ] ^ _ and `
    string.gsub!(/([A-z])[[:space:]]([A-z])/m, '\1 \2')

    # remove all new lines
    string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')

    # blockquote handling
    string.gsub!(%r{<blockquote(| [^>]*)>(.+?)</blockquote>}m) do
      "\n" + $2.html2text(true).gsub(/^(.*)$/, '&gt; \1') + "\n"
    end

    # pre/code handling 2/2
    string.gsub!(/###BR###/, "\n")

    # add counting
    string.gsub!(/<li(| [^>]*)>/i, "\n* ")

    # add hr
    string.gsub!(%r{<hr(|/| [^>]*)>}i, "\n___\n")

    # add h\d
    string.gsub!(%r{</h\d>}i, "\n")

    # add new lines
    string.gsub!(%r{</div><div(|[[:space:]].+?)>}im, "\n")
    string.gsub!(%r{</p><p(|[[:space:]].+?)>}im, "\n")
    string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
    string.gsub!(%r{</(p|br|div)(|[[:space:]].+?)>}i, "\n")
    string.gsub!(%r{</td>}i, ' ')

    # strip all other tags
    string.gsub!(/<.+?>/, '')

    # replace multiple spaces with one
    string.gsub!(/ {2,}/, ' ')

    # add hyperlinks
    if strict
      string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) do |_placeholder|
        pre     = $1
        content = $2
        post    = $5
        if content.match?(/^www/i)
          content = "http://#{content}"
        end

        if content =~ /^(http|https|ftp|tel)/i
          "#{pre}######LINKRAW:#{content}#######{post}"
        else
          "#{pre}#{content}#{post}"
        end
      end
    end

    # Used by the manual entity decoding below: turns a numeric code point
    # into its character. Returns "?" for code points that are not valid
    # Unicode scalar values or that can not be combined with the encoding of
    # the text currently processed.
    decode_codepoint = lambda do |codepoint|
      char = codepoint.chr(Encoding::UTF_8)
      (char.valid_encoding? && Encoding.compatible?(string, char)) ? char : '?'
    rescue RangeError
      '?'
    end

    # try HTMLEntities, if it fails on invalid signs, use manual way
    begin
      coder  = HTMLEntities.new
      string = coder.decode(string)
    rescue
      # strip all &amp; &lt; &gt; &quot;
      string.gsub!('&amp;', '&')
      string.gsub!('&lt;', '<')
      string.gsub!('&gt;', '>')
      string.gsub!('&quot;', '"')
      string.gsub!('&nbsp;', ' ')

      # decode decimal html entities like "&#8211;"
      string.gsub!(/(&\#(\d+);?)/x) do
        decode_codepoint.call($2.to_i)
      end

      # decode hexadecimal html entities like "&#x3d;"
      string.gsub!(/(&\#[xX]([0-9a-fA-F]+);?)/x) do
        decode_codepoint.call($2.hex)
      end
    end

    string = string.utf8_encode(fallback: :read_as_sanitized_binary)

    # remove trailing spaces
    string.gsub!(/[[:blank:]]+$/, '')

    # remove multiple empty lines
    string.gsub!(/\n\n\n+/, "\n\n")

    # add extracted links
    if link_list != ''
      string += "\n\n\n" + link_list
    end

    # remove multiple empty lines
    string.gsub!(/\n\n\n+/, "\n\n")

    string.strip
  end
  # Escapes the text for use in HTML and converts line feeds to <br> tags.
  #
  #   html = text_string.text2html
  def text2html
    CGI.escapeHTML(self).gsub(/\n/, '<br>').chomp
  end
  # Runs the HTML through HtmlSanitizer, collapses redundant empty
  # paragraphs / divs / line breaks and replaces the first signature marker
  # set by signature_identify with a marker <span>.
  #
  #   html = html_string.html2html_strict
  #
  # If nothing is left after sanitizing, html2text.text2html is used as
  # fallback.
  def html2html_strict
    marker_template = '<span class="js-signatureMarker"></span>'

    string = HtmlSanitizer.cleanup_replace_tags(dup)
    string = HtmlSanitizer.strict(string, true).strip
    string = HtmlSanitizer.cleanup(string).strip

    if string.blank?
      # as fallback, use html2text and text2html
      string = html2text.text2html
      string.signature_identify('text')
    else
      # applied strictly in this order
      [
        [%r{(<p>[[:space:]]*</p>([[:space:]]*)){2,}}im, '<p>&nbsp;</p>\2'],
        [%r{<div>[[:space:]]*(<br(|/)>([[:space:]]*)){2,}}im, '<div><br>\3'],
        [%r{[[:space:]]*(<br>[[:space:]]*){3,}[[:space:]]*</div>}im, '<br><br></div>'],
        [%r{<div>[[:space:]]*(<br>[[:space:]]*){1,}[[:space:]]*</div>}im, '<div>&nbsp;</div>'],
        [%r{<div>[[:space:]]*(<div>[[:space:]]*</div>[[:space:]]*){2,}</div>}im, '<div>&nbsp;</div>'],
        [%r{<p>[[:space:]]*</p>(<br(|/)>[[:space:]]*){2,}[[:space:]]*}im, '<p> </p><br>'],
        [%r{<p>[[:space:]]*</p>(<br(|/)>[[:space:]]*)+<p>[[:space:]]*</p>}im, '<p> </p><p> </p>'],
        [%r{(<div>[[:space:]]*</div>[[:space:]]*){2,}}im, '<div> </div>'],
        [%r{<div>&nbsp;</div>[[:space:]]*(<div>&nbsp;</div>){1,}}im, '<div>&nbsp;</div>'],
        [/(<br>[[:space:]]*){3,}/im, '<br><br>'],
        [%r{(<br(|/)>[[:space:]]*){3,}}im, '<br/><br/>'],
        [%r{<p>[[:space:]]+</p>}im, '<p>&nbsp;</p>'],
        [%r{\A(<br(|/)>[[:space:]]*)*}i, ''],
        [%r{[[:space:]]*(<br(|/)>[[:space:]]*)*\Z}i, ''],
        [%r{(<p></p>){1,10}\Z}i, ''],
      ].each do |pattern, replacement|
        string.gsub!(pattern, replacement)
      end

      string.signature_identify('html')
    end

    # only the first marker becomes visible, all further ones are dropped
    string.sub!(/######SIGNATURE_MARKER######/, marker_template)
    string.gsub!(/######SIGNATURE_MARKER######/, '')
    string.chomp
  end
  # Inserts "######SIGNATURE_MARKER######" in front of the places where a
  # signature or a quoted reply header seems to start. The receiver is
  # modified in place.
  #
  # Parameters:
  #
  # * type: 'html' applies the HTML patterns, any other value the plain
  #   text patterns.
  # * force: plain text only - when false, texts with less than 10 lines and
  #   less than 300 characters are skipped.
  #
  # Returns the receiver, or nil if a plain text was skipped because it is
  # too short.
  def signature_identify(type = 'text', force = false)
    string = self
    marker = '######SIGNATURE_MARKER######'

    if type == 'html'
      map = [
        '<br(|\/)>[[:space:]]*(--|__)',
        '<\/div>[[:space:]]*(--|__)',
        '<p>[[:space:]]*(--|__)',
        '(<br(|\/)>|<p>|<div>)[[:space:]]*<b>(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]*</b>',
        '(<br>|<div>)[[:space:]]*<br>[[:space:]]*(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]+',
        '<blockquote(|.+?)>[[:space:]]*<div>[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]]',
        '<div(|.+?)>[[:space:]]*<br>[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]].{1,500}<blockquote',
      ]
      map.each do |regexp|
        string.sub!(/#{regexp}/m) do |placeholder|
          "#{marker}#{placeholder}"
        end
      end
      return string
    end

    # if we do have less than 10 lines and less than 300 chars ignore this
    if !force
      lines = string.split("\n")
      return if lines.count < 10 && string.length < 300
    end

    # search for signature separator "--\n"
    string.sub!(/^\s{0,2}--\s{0,2}$/) do |placeholder|
      "#{marker}#{placeholder}"
    end

    map = {}

    # Apple Mail
    # On 01/04/15 10:55, Bob Smith wrote:
    map['apple-en'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10}[[:space:]].{1,250}[[:space:]](wrote):'

    # Am 03.04.2015 um 20:58 schrieb Martin Edenhofer <me@znuny.ink>:
    map['apple-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:'

    # Thunderbird
    # Am 04.03.2015 um 12:47 schrieb Alf Aardvark:
    map['thunderbird-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:'

    # Thunderbird default - http://kb.mozillazine.org/Reply_header_settings
    # On 01-01-2007 11:00 AM, Alf Aardvark wrote:
    map['thunderbird-en-default'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10},[[:space:]].{1,250}(wrote):'

    # http://kb.mozillazine.org/Reply_header_settings
    # Alf Aardvark wrote, on 01-01-2007 11:00 AM:
    map['thunderbird-en'] = '^.{1,250}[[:space:]](wrote),[[:space:]]on[[:space:]].{3,20}:'

    # otrs
    # 25.02.2015 10:26 - edv hotline wrote:
    # 25.02.2015 10:26 - edv hotline schrieb:
    map['otrs-en-de'] = '^.{6,10}[[:space:]].{3,10}[[:space:]]-[[:space:]].{1,250}[[:space:]](wrote|schrieb):'

    # Ms
    # rubocop:disable Style/AsciiComments
    # From: Martin Edenhofer via Znuny Support [mailto:support@znuny.inc]
    # Send: Donnerstag, 2. April 2015 10:00
    # To/Cc/Bcc: xxx
    # Subject: xxx
    # - or -
    # From: xxx
    # To/Cc/Bcc: xxx
    # Date: 01.04.2015 12:41
    # Subject: xxx
    # - or -
    # De : xxx
    # À/?/?: xxx
    # Envoyé : mercredi 29 avril 2015 17:31
    # Objet : xxx
    # rubocop:enable Style/AsciiComments

    # en/de/fr | sometimes ms adds a space to "xx : value"
    map['ms-en-de-fr_from'] = '^(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ)( ?):[[:space:]].+?'
    map['ms-en-de-fr_from_html'] = "\n######b######(From|Von|De)([[:space:]]?):([[:space:]]?)(######\/b######)[[:space:]].+?"

    # word 14
    # edv hotline wrote:
    # edv hotline schrieb:
    #map['word-en-de'] = "[^#{marker}].{1,250}\s(wrote|schrieb):"

    map.each_value do |regexp|
      # the rescue has to cover the sub! call itself: the encoding error is
      # raised while matching, not inside the replacement block
      string.sub!(/#{regexp}/) do |placeholder|
        "#{marker}#{placeholder}"
      end
    rescue EncodingError, ArgumentError
      # regexp was not possible because of some string encoding issue, use next
      Rails.logger.debug { "Invalid string/charset combination with regexp #{regexp} in string" }
    end

    string
  end
  # Returns a copied string whose encoding is UTF-8.
  # If both the provided and current encodings are invalid,
  # an auto-detected encoding is tried.
  #
  # Supports some fallback strategies if a valid encoding cannot be found.
  #
  # Options:
  #
  # * from: An encoding to try first.
  #   Takes precedence over the current and auto-detected encodings.
  #
  # * fallback: The strategy to follow if no valid encoding can be found.
  #   * `:output_to_binary` returns an ASCII-8BIT-encoded string.
  #   * `:read_as_sanitized_binary` returns a UTF-8-encoded string with all
  #     invalid byte sequences replaced with "?" characters.
  def utf8_encode(**options)
    # forward as keywords: utf8_encode! accepts keyword arguments only, a
    # positional Hash is rejected with an ArgumentError since Ruby 3.0
    dup.utf8_encode!(**options)
  end
  396. def utf8_encode!(**options)
  397. return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?
  398. # convert string to given charset, if valid_encoding? is true
  399. if options[:from].present?
  400. begin
  401. encoding = Encoding.find(options[:from])
  402. if encoding.present? && dup.force_encoding(encoding).valid_encoding?
  403. force_encoding(encoding)
  404. return encode!('utf-8', encoding)
  405. end
  406. rescue ArgumentError, EncodingError => e
  407. Rails.logger.error { e.inspect }
  408. end
  409. end
  410. # try to find valid encodings of string
  411. viable_encodings.each do |enc|
  412. return encode!('utf-8', enc)
  413. rescue EncodingError => e
  414. Rails.logger.error { e.inspect }
  415. end
  416. case options[:fallback]
  417. when :output_to_binary
  418. force_encoding('ascii-8bit')
  419. when :read_as_sanitized_binary
  420. encode!('utf-8', 'ascii-8bit', invalid: :replace, undef: :replace, replace: '?')
  421. else
  422. raise EncodingError, 'could not find a valid input encoding'
  423. end
  424. end
  425. private
  # Returns the candidate source encodings in which the bytes of this string
  # form a valid sequence, in order of preference: try_first, the current
  # encoding and the encoding detected by CharDet. ASCII-8BIT and UTF-8 are
  # never part of the result.
  #
  # Raises ArgumentError if one of the candidates is unknown to Ruby, unless
  # try_first was given - in that case the lookup is repeated without it.
  def viable_encodings(try_first: nil)
    # force_encoding below modifies the receiver, so work on a copy if it is
    # frozen; `send` is needed because this method is private
    return dup.send(:viable_encodings, try_first: try_first) if frozen?

    provided = Encoding.find(try_first) if try_first.present?
    original = encoding
    detected = CharDet.detect(self)['encoding']

    [provided, original, detected]
      .compact
      .reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT }
      .reject { |e| Encoding.find(e) == Encoding::UTF_8 }
      .select { |e| force_encoding(e).valid_encoding? }
      .tap { force_encoding(original) } # clean up changes from previous line

  # if `try_first` is not a valid encoding, try again without it
  rescue ArgumentError
    try_first.present? ? viable_encodings : raise
  end
  441. end