123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535 |
- require 'rchardet'
- class String
- alias old_strip strip
- alias old_strip! strip!
- def strip!
- begin
- sub!(/\A[[[:space:]]\u{200B}\u{FEFF}]+/, '')
- sub!(/[[[:space:]]\u{200B}\u{FEFF}]+\Z/, '')
- # if incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string) (Encoding::CompatibilityError), use default
- rescue Encoding::CompatibilityError
- old_strip!
- end
- self
- end
- def strip
- begin
- new_string = sub(/\A[[[:space:]]\u{200B}\u{FEFF}]+/, '')
- new_string.sub!(/[[[:space:]]\u{200B}\u{FEFF}]+\Z/, '')
- # if incompatible encoding regexp match (UTF-8 regexp with ASCII-8BIT string) (Encoding::CompatibilityError), use default
- rescue Encoding::CompatibilityError
- new_string = old_strip
- end
- new_string
- end
- def message_quote
- quote = split("\n")
- body_quote = ''
- quote.each do |line|
- body_quote = body_quote + '> ' + line + "\n"
- end
- body_quote
- end
- def word_wrap(*args)
- options = args.extract_options!
- if args.present?
- options[:line_width] = args[0] || 82
- end
- options.reverse_merge!(line_width: 82)
- lines = self
- lines.split("\n").collect do |line|
- line.length > options[:line_width] ? line.gsub(/(.{1,#{options[:line_width]}})(\s+|$)/, "\\1\n").strip : line
- end * "\n"
- end
- =begin
- filename = 'Some::Module'.to_filename
- returns
- 'some/module'
- =end
- def to_filename
- camel_cased_word = dup
- camel_cased_word.gsub(/::/, '/')
- .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
- .gsub(/([a-z\d])([A-Z])/, '\1_\2')
- .tr('-', '_').downcase
- end
- =begin
- filename = 'some/module.rb'.to_classname
- returns
- 'Some::Module'
- =end
- def to_classname
- camel_cased_word = dup
- camel_cased_word.delete_suffix!('.rb')
- camel_cased_word.split('/').map(&:camelize).join('::')
- end
- # because of mysql inno_db limitations, strip 4 bytes utf8 chars (e. g. emojis)
- # unfortunaly UTF8mb4 will raise other limitaions of max varchar and lower index sizes
- # More details: http://pjambet.github.io/blog/emojis-and-mysql/
- def utf8_to_3bytesutf8
- return self if Rails.application.config.db_4bytes_utf8
- each_char.select do |c|
- if c.bytes.count > 3
- Rails.logger.warn "strip out 4 bytes utf8 chars '#{c}' of '#{self}'"
- next
- end
- c
- end
- .join('')
- end
- =begin
- text = html_string.html2text
- returns
- 'string with text only'
- =end
- def html2text(string_only = false, strict = false)
- string = dup
- # in case of invalid encoding, strip invalid chars
- # see also test/data/mail/mail021.box
- # note: string.encode!('UTF-8', 'UTF-8', :invalid => :replace, :replace => '?') was not detecting invalid chars
- if !string.valid_encoding?
- string = string.chars.select(&:valid_encoding?).join
- end
- # remove html comments
- string.gsub!(/<!--.+?-->/m, '')
- # find <a href=....> and replace it with [x]
- link_list = ''
- counter = 0
- if !string_only
- if string.scan(/<a[[:space:]]/i).count < 5_000
- string.gsub!(/<a[[:space:]].*?href=("|')(.+?)("|').*?>/ix) do
- link = $2
- counter = counter + 1
- link_list += "[#{counter}] #{link}\n"
- "[#{counter}] "
- end
- end
- else
- string.gsub!(%r{<a[[:space:]]+(|\S+[[:space:]]+)href=("|')(.+?)("|')([[:space:]]*|[[:space:]]+[^>]*)>(.+?)<[[:space:]]*/a[[:space:]]*>}mxi) do |_placeholder|
- link = $3
- text = $6
- text.gsub!(/<.+?>/, '')
- link_compare = link.dup
- if link_compare.present?
- link.strip!
- link_compare.strip!
- link_compare.downcase!
- link_compare.sub!(%r{/$}, '')
- end
- text_compare = text.dup
- if text_compare.present?
- text.strip!
- text_compare.strip!
- text_compare.downcase!
- text_compare.sub!(%r{/$}, '')
- end
- if link_compare.present? && text_compare.blank?
- link
- elsif link_compare.blank? && text_compare.present?
- text
- elsif link_compare && link_compare =~ /^mailto/i
- text
- elsif link_compare.present? && text_compare.present? && (link_compare == text_compare || link_compare == "mailto:#{text}".downcase || link_compare == "http://#{text}".downcase)
- "######LINKEXT:#{link}/TEXT:#{text}######"
- elsif text !~ /^http/
- "#{text} (######LINKRAW:#{link}######)"
- else
- "#{link} (######LINKRAW:#{text}######)"
- end
- end
- end
- # remove style tags with content
- string.gsub!(%r{<style(|[[:space:]].+?)>(.+?)</style>}im, '')
- # remove empty lines
- string.gsub!(/^[[:space:]]*/m, '')
- if strict
- string.gsub!(%r{< [[:space:]]* (/*) [[:space:]]* (b|i|ul|ol|li|u|h1|h2|h3|hr) ([[:space:]]*|[[:space:]]+[^>]*) >}mxi, '######\1\2######')
- end
- # pre/code handling 1/2
- string.gsub!(%r{<pre>(.+?)</pre>}m) do |placeholder|
- placeholder.gsub(/\n/, '###BR###')
- end
- string.gsub!(%r{<code>(.+?)</code>}m) do |placeholder|
- placeholder.gsub(/\n/, '###BR###')
- end
- # insert spaces on [A-z]\n[A-z]
- string.gsub!(/([A-z])[[:space:]]([A-z])/m, '\1 \2')
- # remove all new lines
- string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')
- # blockquote handling
- string.gsub!(%r{<blockquote(| [^>]*)>(.+?)</blockquote>}m) do
- "\n" + $2.html2text(true).gsub(/^(.*)$/, '> \1') + "\n"
- end
- # pre/code handling 2/2
- string.gsub!(/###BR###/, "\n")
- # add counting
- string.gsub!(/<li(| [^>]*)>/i, "\n* ")
- # add hr
- string.gsub!(%r{<hr(|/| [^>]*)>}i, "\n___\n")
- # add h\d
- string.gsub!(%r{</h\d>}i, "\n")
- # add new lines
- string.gsub!(%r{</div><div(|[[:space:]].+?)>}im, "\n")
- string.gsub!(%r{</p><p(|[[:space:]].+?)>}im, "\n")
- string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
- string.gsub!(%r{</(p|br|div)(|[[:space:]].+?)>}i, "\n")
- string.gsub!(%r{</td>}i, ' ')
- # strip all other tags
- string.gsub!(/<.+?>/, '')
- # replace multiple spaces with one
- string.gsub!(/ /, ' ')
- # add hyperlinks
- if strict
- string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) do |_placeholder|
- pre = $1
- content = $2
- post = $5
- if content.match?(/^www/i)
- content = "http://#{content}"
- end
- if content =~ /^(http|https|ftp|tel)/i
- "#{pre}######LINKRAW:#{content}#######{post}"
- else
- "#{pre}#{content}#{post}"
- end
- end
- end
- # try HTMLEntities, if it fails on invalid signes, use manual way
- begin
- coder = HTMLEntities.new
- string = coder.decode(string)
- rescue
- # strip all & < > "
- string.gsub!('&', '&')
- string.gsub!('<', '<')
- string.gsub!('>', '>')
- string.gsub!('"', '"')
- string.gsub!(' ', ' ')
- # encode html entities like "–"
- string.gsub!(/(&\#(\d+);?)/x) do
- $2.chr
- end
- # encode html entities like "d;"
- string.gsub!(/(&\#[xX]([0-9a-fA-F]+);?)/x) do
- chr_orig = $1
- hex = $2.hex
- if hex
- chr = hex.chr
- if chr
- chr_orig = chr
- else
- chr_orig
- end
- else
- chr_orig
- end
- # check valid encoding
- begin
- if !chr_orig.encode('UTF-8').valid_encoding?
- chr_orig = '?'
- end
- rescue
- chr_orig = '?'
- end
- chr_orig
- end
- end
- string = string.utf8_encode(fallback: :read_as_sanitized_binary)
- # remove tailing empty spaces
- string.gsub!(/[[:blank:]]+$/, '')
- # remove double multiple empty lines
- string.gsub!(/\n\n\n+/, "\n\n")
- # add extracted links
- if link_list != ''
- string += "\n\n\n" + link_list
- end
- # remove double multiple empty lines
- string.gsub!(/\n\n\n+/, "\n\n")
- string.strip
- end
- =begin
- html = text_string.text2html
- =end
- def text2html
- text = CGI.escapeHTML(self)
- text.gsub!(/\n/, '<br>')
- text.chomp
- end
- =begin
- html = text_string.text2html
- =end
- def html2html_strict
- string = dup
- string = HtmlSanitizer.cleanup_replace_tags(string)
- string = HtmlSanitizer.strict(string, true).strip
- string = HtmlSanitizer.cleanup(string).strip
- # as fallback, use html2text and text2html
- if string.blank?
- string = html2text.text2html
- string.signature_identify('text')
- marker_template = '<span class="js-signatureMarker"></span>'
- string.sub!(/######SIGNATURE_MARKER######/, marker_template)
- string.gsub!(/######SIGNATURE_MARKER######/, '')
- return string.chomp
- end
- string.gsub!(%r{(<p>[[:space:]]*</p>([[:space:]]*)){2,}}im, '<p> </p>\2')
- string.gsub!(%r\<div>[[:space:]]*(<br(|/)>([[:space:]]*)){2,}\im, '<div><br>\3')
- string.gsub!(%r\[[:space:]]*(<br>[[:space:]]*){3,}[[:space:]]*</div>\im, '<br><br></div>')
- string.gsub!(%r\<div>[[:space:]]*(<br>[[:space:]]*){1,}[[:space:]]*</div>\im, '<div> </div>')
- string.gsub!(%r\<div>[[:space:]]*(<div>[[:space:]]*</div>[[:space:]]*){2,}</div>\im, '<div> </div>')
- string.gsub!(%r\<p>[[:space:]]*</p>(<br(|/)>[[:space:]]*){2,}[[:space:]]*\im, '<p> </p><br>')
- string.gsub!(%r{<p>[[:space:]]*</p>(<br(|/)>[[:space:]]*)+<p>[[:space:]]*</p>}im, '<p> </p><p> </p>')
- string.gsub!(%r\(<div>[[:space:]]*</div>[[:space:]]*){2,}\im, '<div> </div>')
- string.gsub!(%r{<div> </div>[[:space:]]*(<div> </div>){1,}}im, '<div> </div>')
- string.gsub!(/(<br>[[:space:]]*){3,}/im, '<br><br>')
- string.gsub!(%r\(<br(|/)>[[:space:]]*){3,}\im, '<br/><br/>')
- string.gsub!(%r{<p>[[:space:]]+</p>}im, '<p> </p>')
- string.gsub!(%r{\A(<br(|/)>[[:space:]]*)*}i, '')
- string.gsub!(%r{[[:space:]]*(<br(|/)>[[:space:]]*)*\Z}i, '')
- string.gsub!(%r{(<p></p>){1,10}\Z}i, '')
- string.signature_identify('html')
- marker_template = '<span class="js-signatureMarker"></span>'
- string.sub!(/######SIGNATURE_MARKER######/, marker_template)
- string.gsub!(/######SIGNATURE_MARKER######/, '')
- string.chomp
- end
- def signature_identify(type = 'text', force = false)
- string = self
- marker = '######SIGNATURE_MARKER######'
- if type == 'html'
- map = [
- '<br(|\/)>[[:space:]]*(--|__)',
- '<\/div>[[:space:]]*(--|__)',
- '<p>[[:space:]]*(--|__)',
- '(<br(|\/)>|<p>|<div>)[[:space:]]*<b>(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]*</b>',
- '(<br>|<div>)[[:space:]]*<br>[[:space:]]*(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ):[[:space:]]+',
- '<blockquote(|.+?)>[[:space:]]*<div>[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]]',
- '<div(|.+?)>[[:space:]]*<br>[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]].{1,500}<blockquote',
- ]
- map.each do |regexp|
- string.sub!(/#{regexp}/m) do |placeholder|
- "#{marker}#{placeholder}"
- end
- end
- return string
- end
- # if we do have less then 10 lines and less then 300 chars ignore this
- if !force
- lines = string.split("\n")
- return if lines.count < 10 && string.length < 300
- end
- # search for signature separator "--\n"
- string.sub!(/^\s{0,2}--\s{0,2}$/) do |placeholder|
- "#{marker}#{placeholder}"
- end
- map = {}
- # Apple Mail
- # On 01/04/15 10:55, Bob Smith wrote:
- map['apple-en'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10}[[:space:]].{1,250}[[:space:]](wrote):'
- # Am 03.04.2015 um 20:58 schrieb Martin Edenhofer <me@znuny.ink>:
- map['apple-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:'
- # Thunderbird
- # Am 04.03.2015 um 12:47 schrieb Alf Aardvark:
- map['thunderbird-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:'
- # Thunderbird default - http://kb.mozillazine.org/Reply_header_settings
- # On 01-01-2007 11:00 AM, Alf Aardvark wrote:
- map['thunderbird-en-default'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10},[[:space:]].{1,250}(wrote):'
- # http://kb.mozillazine.org/Reply_header_settings
- # Alf Aardvark wrote, on 01-01-2007 11:00 AM:
- map['thunderbird-en'] = '^.{1,250}[[:space:]](wrote),[[:space:]]on[[:space:]].{3,20}:'
- # otrs
- # 25.02.2015 10:26 - edv hotline wrote:
- # 25.02.2015 10:26 - edv hotline schrieb:
- map['otrs-en-de'] = '^.{6,10}[[:space:]].{3,10}[[:space:]]-[[:space:]].{1,250}[[:space:]](wrote|schrieb):'
- # Ms
- # rubocop:disable Style/AsciiComments
- # From: Martin Edenhofer via Znuny Support [mailto:support@znuny.inc]
- # Send: Donnerstag, 2. April 2015 10:00
- # To/Cc/Bcc: xxx
- # Subject: xxx
- # - or -
- # From: xxx
- # To/Cc/Bcc: xxx
- # Date: 01.04.2015 12:41
- # Subject: xxx
- # - or -
- # De : xxx
- # À/?/?: xxx
- # Envoyé : mercredi 29 avril 2015 17:31
- # Objet : xxx
- # rubocop:enable Style/AsciiComments
- # en/de/fr | sometimes ms adds a space to "xx : value"
- map['ms-en-de-fr_from'] = '^(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ)( ?):[[:space:]].+?'
- map['ms-en-de-fr_from_html'] = "\n######b######(From|Von|De)([[:space:]]?):([[:space:]]?)(######\/b######)[[:space:]].+?"
- # word 14
- # edv hotline wrote:
- # edv hotline schrieb:
- #map['word-en-de'] = "[^#{marker}].{1,250}\s(wrote|schrieb):"
- map.each_value do |regexp|
- string.sub!(/#{regexp}/) do |placeholder|
- "#{marker}#{placeholder}"
- rescue
- # regexp was not possible because of some string encoding issue, use next
- Rails.logger.debug { "Invalid string/charset combination with regexp #{regexp} in string" }
- end
- end
- string
- end
- # Returns a copied string whose encoding is UTF-8.
- # If both the provided and current encodings are invalid,
- # an auto-detected encoding is tried.
- #
- # Supports some fallback strategies if a valid encoding cannot be found.
- #
- # Options:
- #
- # * from: An encoding to try first.
- # Takes precedence over the current and auto-detected encodings.
- #
- # * fallback: The strategy to follow if no valid encoding can be found.
- # * `:output_to_binary` returns an ASCII-8BIT-encoded string.
- # * `:read_as_sanitized_binary` returns a UTF-8-encoded string with all
- # invalid byte sequences replaced with "?" characters.
- def utf8_encode(**options)
- dup.utf8_encode!(options)
- end
- def utf8_encode!(**options)
- return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding?
- # convert string to given charset, if valid_encoding? is true
- if options[:from].present?
- begin
- encoding = Encoding.find(options[:from])
- if encoding.present? && dup.force_encoding(encoding).valid_encoding?
- force_encoding(encoding)
- return encode!('utf-8', encoding)
- end
- rescue ArgumentError, EncodingError => e
- Rails.logger.error { e.inspect }
- end
- end
- # try to find valid encodings of string
- viable_encodings.each do |enc|
- return encode!('utf-8', enc)
- rescue EncodingError => e
- Rails.logger.error { e.inspect }
- end
- case options[:fallback]
- when :output_to_binary
- force_encoding('ascii-8bit')
- when :read_as_sanitized_binary
- encode!('utf-8', 'ascii-8bit', invalid: :replace, undef: :replace, replace: '?')
- else
- raise EncodingError, 'could not find a valid input encoding'
- end
- end
- private
- def viable_encodings(try_first: nil)
- return dup.viable_encodings(try_first: try_first) if frozen?
- provided = Encoding.find(try_first) if try_first.present?
- original = encoding
- detected = CharDet.detect(self)['encoding']
- [provided, original, detected]
- .compact
- .reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT }
- .reject { |e| Encoding.find(e) == Encoding::UTF_8 }
- .select { |e| force_encoding(e).valid_encoding? }
- .tap { force_encoding(original) } # clean up changes from previous line
- # if `try_first` is not a valid encoding, try_first again without it
- rescue ArgumentError
- try_first.present? ? viable_encodings : raise
- end
- end
|