(.+?)}m) do |placeholder| placeholder.gsub(/\n/, '###BR###') end string.gsub!(%r{
(.+?)
}m) do |placeholder|
placeholder.gsub(/\n/, '###BR###')
end
# insert spaces on [A-z]\n[A-z]
string.gsub!(/([A-z])[[:space:]]([A-z])/m, '\1 \2')
# remove all new lines
string.gsub!(/(\n\r|\r\r\n|\r\n|\n)/, '')
# blockquote handling
string.gsub!(%r{]*)>(.+?)}m) do "\n" + $2.html2text(true).gsub(/^(.*)$/, '> \1') + "\n" end # pre/code handling 2/2 string.gsub!(/###BR###/, "\n") # add counting string.gsub!(/
}im, "\n")
string.gsub!(%r{<(div|p|pre|br|table|tr|h)(|/| [^>]*)>}i, "\n")
string.gsub!(%r{(p|br|div)(|[[:space:]].+?)>}i, "\n")
string.gsub!(%r{}i, ' ')
# strip all other tags
string.gsub!(/<.+?>/, '')
# replace multiple spaces with one
string.gsub!(/ /, ' ')
# add hyperlinks
if strict
string.gsub!(%r{([[:space:]])((http|https|ftp|tel)://.+?|(www..+?))([[:space:]]|\.[[:space:]]|,[[:space:]])}mxi) do |_placeholder|
pre = $1
content = $2
post = $5
if content.match?(/^www/i)
content = "http://#{content}"
end
if content =~ /^(http|https|ftp|tel)/i
"#{pre}######LINKRAW:#{content}#######{post}"
else
"#{pre}#{content}#{post}"
end
end
end
# try HTMLEntities, if it fails on invalid signes, use manual way
begin
coder = HTMLEntities.new
string = coder.decode(string)
rescue
# strip all & < > "
string.gsub!('&', '&')
string.gsub!('<', '<')
string.gsub!('>', '>')
string.gsub!('"', '"')
string.gsub!(' ', ' ')
# encode html entities like "–"
string.gsub!(/(&\#(\d+);?)/x) do
$2.chr
end
# encode html entities like "d;"
string.gsub!(/(&\#[xX]([0-9a-fA-F]+);?)/x) do
chr_orig = $1
hex = $2.hex
if hex
chr = hex.chr
if chr
chr_orig = chr
else
chr_orig
end
else
chr_orig
end
# check valid encoding
begin
if !chr_orig.encode('UTF-8').valid_encoding?
chr_orig = '?'
end
rescue
chr_orig = '?'
end
chr_orig
end
end
string = string.utf8_encode(fallback: :read_as_sanitized_binary)
# remove tailing empty spaces
string.gsub!(/[[:blank:]]+$/, '')
# remove double multiple empty lines
string.gsub!(/\n\n\n+/, "\n\n")
# add extracted links
if link_list != ''
string += "\n\n\n" + link_list
end
# remove double multiple empty lines
string.gsub!(/\n\n\n+/, "\n\n")
string.strip
end
=begin

Converts plain text to HTML: HTML special characters are escaped and every
line break is turned into a `<br>` tag.

  html = text_string.text2html

Returns a new String; the receiver is left unchanged.

=end
def text2html
  # CGI.escapeHTML returns a fresh string, so the in-place gsub! below never
  # touches the receiver.
  text = CGI.escapeHTML(self)
  # Emit an HTML line break for each newline. Replacing "\n" with "\n" (as the
  # previous literal did) was a no-op and produced no markup at all.
  text.gsub!("\n", '<br>')
  text.chomp
end
=begin
html = html_string.html2html_strict
=end
def html2html_strict
string = dup
string = HtmlSanitizer.cleanup_replace_tags(string)
string = HtmlSanitizer.strict(string, true).strip
string = HtmlSanitizer.cleanup(string).strip
# as fallback, use html2text and text2html
if string.blank?
string = html2text.text2html
string.signature_identify('text')
marker_template = ''
string.sub!(/######SIGNATURE_MARKER######/, marker_template)
string.gsub!(/######SIGNATURE_MARKER######/, '')
return string.chomp
end
string.gsub!(%r{(
[[:space:]]*
([[:space:]]*)){2,}}im, '\2') string.gsub!(%r\
[[:space:]]*
(
[[:space:]]*
([[:space:]]*
}im, '
') string.gsub!(%r\(
[[:space:]]+
}im, '') string.gsub!(%r{\A(
[[:space:]]*(--|__)',
'(
|
|
[[:space:]]*[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]]', '[[:space:]]*
[[:space:]]*(On|Am|Le|El|Den|Dňa|W dniu|Il|Op|Dne|Dana)[[:space:]].{1,500}: map['apple-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:' # Thunderbird # Am 04.03.2015 um 12:47 schrieb Alf Aardvark: map['thunderbird-de'] = '^(Am)[[:space:]].{6,20}[[:space:]](um)[[:space:]].{3,10}[[:space:]](schrieb)[[:space:]].{1,250}:' # Thunderbird default - http://kb.mozillazine.org/Reply_header_settings # On 01-01-2007 11:00 AM, Alf Aardvark wrote: map['thunderbird-en-default'] = '^(On)[[:space:]].{6,20}[[:space:]].{3,10},[[:space:]].{1,250}(wrote):' # http://kb.mozillazine.org/Reply_header_settings # Alf Aardvark wrote, on 01-01-2007 11:00 AM: map['thunderbird-en'] = '^.{1,250}[[:space:]](wrote),[[:space:]]on[[:space:]].{3,20}:' # otrs # 25.02.2015 10:26 - edv hotline wrote: # 25.02.2015 10:26 - edv hotline schrieb: map['otrs-en-de'] = '^.{6,10}[[:space:]].{3,10}[[:space:]]-[[:space:]].{1,250}[[:space:]](wrote|schrieb):' # Ms # rubocop:disable Style/AsciiComments # From: Martin Edenhofer via Znuny Support [mailto:support@znuny.inc] # Send: Donnerstag, 2. April 2015 10:00 # To/Cc/Bcc: xxx # Subject: xxx # - or - # From: xxx # To/Cc/Bcc: xxx # Date: 01.04.2015 12:41 # Subject: xxx # - or - # De : xxx # À/?/?: xxx # Envoyé : mercredi 29 avril 2015 17:31 # Objet : xxx # rubocop:enable Style/AsciiComments # en/de/fr | sometimes ms adds a space to "xx : value" map['ms-en-de-fr_from'] = '^(Von|From|De|от|Z|Od|Ze|Fra|Van|Mistä|Από|Dal|から|Из|од|iz|Från|จาก|з|Từ)( ?):[[:space:]].+?' map['ms-en-de-fr_from_html'] = "\n######b######(From|Von|De)([[:space:]]?):([[:space:]]?)(######\/b######)[[:space:]].+?" 
# word 14 # edv hotline wrote: # edv hotline schrieb: #map['word-en-de'] = "[^#{marker}].{1,250}\s(wrote|schrieb):" map.each_value do |regexp| string.sub!(/#{regexp}/) do |placeholder| "#{marker}#{placeholder}" rescue # regexp was not possible because of some string encoding issue, use next Rails.logger.debug { "Invalid string/charset combination with regexp #{regexp} in string" } end end string end # Returns a copied string whose encoding is UTF-8. # If both the provided and current encodings are invalid, # an auto-detected encoding is tried. # # Supports some fallback strategies if a valid encoding cannot be found. # # Options: # # * from: An encoding to try first. # Takes precedence over the current and auto-detected encodings. # # * fallback: The strategy to follow if no valid encoding can be found. # * `:output_to_binary` returns an ASCII-8BIT-encoded string. # * `:read_as_sanitized_binary` returns a UTF-8-encoded string with all # invalid byte sequences replaced with "?" characters. def utf8_encode(**options) dup.utf8_encode!(options) end def utf8_encode!(**options) return force_encoding('utf-8') if dup.force_encoding('utf-8').valid_encoding? # convert string to given charset, if valid_encoding? is true if options[:from].present? begin encoding = Encoding.find(options[:from]) if encoding.present? && dup.force_encoding(encoding).valid_encoding? 
force_encoding(encoding) return encode!('utf-8', encoding) end rescue ArgumentError, EncodingError => e Rails.logger.error { e.inspect } end end # try to find valid encodings of string viable_encodings.each do |enc| return encode!('utf-8', enc) rescue EncodingError => e Rails.logger.error { e.inspect } end case options[:fallback] when :output_to_binary force_encoding('ascii-8bit') when :read_as_sanitized_binary encode!('utf-8', 'ascii-8bit', invalid: :replace, undef: :replace, replace: '?') else raise EncodingError, 'could not find a valid input encoding' end end private def viable_encodings(try_first: nil) return dup.viable_encodings(try_first: try_first) if frozen? provided = Encoding.find(try_first) if try_first.present? original = encoding detected = CharDet.detect(self)['encoding'] [provided, original, detected] .compact .reject { |e| Encoding.find(e) == Encoding::ASCII_8BIT } .reject { |e| Encoding.find(e) == Encoding::UTF_8 } .select { |e| force_encoding(e).valid_encoding? } .tap { force_encoding(original) } # clean up changes from previous line # if `try_first` is not a valid encoding, try_first again without it rescue ArgumentError try_first.present? ? viable_encodings : raise end end