Browse Source

Improved email parsing.

Martin Edenhofer 13 years ago
parent
commit
21b244bc08

+ 93 - 6
app/models/channel/email_parser.rb

@@ -2,7 +2,7 @@ require 'mail'
 require 'iconv'
 class Channel::EmailParser
   def conv (charset, string)
-    if charset == 'US-ASCII' || charset == 'ASCII-8BIT'
+    if !charset || charset == 'US-ASCII' || charset == 'ASCII-8BIT'
       charset = 'LATIN1'
     end
     return string if charset.downcase == 'utf8' || charset.downcase == 'utf-8'
@@ -25,7 +25,8 @@ class Channel::EmailParser
     data[:from_email]        = Mail::Address.new( mail[:from].value ).address
     data[:from_local]        = Mail::Address.new( mail[:from].value ).local
     data[:from_domain]       = Mail::Address.new( mail[:from].value ).domain
-    data[:from_display_name] = Mail::Address.new( mail[:from].value ).display_name
+    data[:from_display_name] = Mail::Address.new( mail[:from].value ).display_name ||
+      ( Mail::Address.new( mail[:from].value ).comments && Mail::Address.new( mail[:from].value ).comments[0] )
 
     # do extra decoding because we needed to use field.value
     data[:from_display_name] = Mail::Field.new( 'X-From', data[:from_display_name] ).to_s
@@ -36,17 +37,50 @@ class Channel::EmailParser
     # body
 #    plain_part = mail.multipart? ? (mail.text_part ? mail.text_part.body.decoded : nil) : mail.body.decoded
 #    html_part = message.html_part ? message.html_part.body.decoded : nil
+    data[:attachments] = []
     if mail.multipart?
       data[:plain_part] = mail.text_part.body.decoded
-      data[:plain_part] = conv( mail.text_part.charset || 'LATIN1', data[:plain_part] )
+      data[:plain_part] = conv( mail.text_part.charset, data[:plain_part] )
     else
-      data[:plain_part] = mail.body.decoded
-      data[:plain_part] = conv( mail.body.charset || 'LATIN1', data[:plain_part] )
+
+      # text part
+      if !mail.mime_type || mail.mime_type.to_s ==  '' || mail.mime_type.to_s.downcase == 'text/plain'
+        data[:plain_part] = mail.body.decoded
+        data[:plain_part] = conv( mail.charset, data[:plain_part] )
+      else
+
+        # html part
+        filename = '-no name-'
+        if mail.mime_type.to_s.downcase == 'text/html'
+          filename = 'html-email'
+          data[:plain_part] = mail.body.decoded
+          data[:plain_part] = conv( mail.charset, data[:plain_part] )
+          data[:plain_part] = html2ascii( data[:plain_part] )
+
+        # any other attachments
+        else
+          data[:plain_part] = 'no visible content'
+        end
+
+        # add body as attachment
+        headers_store = {}
+        if mail.mime_type
+          headers_store['Mime-Type'] = mail.mime_type
+        end
+        if mail.charset
+          headers_store['Charset'] = mail.charset
+        end
+        attachment = {
+          :data        => mail.body.decoded,
+          :filename    => mail.filename || filename,
+          :preferences => headers_store          
+        }
+        data[:attachments].push attachment
+      end
     end
 
     # attachments
     if mail.attachments
-      data[:attachments] = []
       mail.attachments.each do |attachment|
         
         # get file preferences
@@ -232,4 +266,57 @@ class Channel::EmailParser
     # return new objects
     return ticket, article, user
   end
+  
+  def html2ascii(string)
+    # Reduce an HTML body to plain text: number the links, strip tags, replace numeric entities and append the link list.
+    # replace each <a ... href="..."> opening tag with a marker "[n]" and remember "[n] url" for the list added at the end
+    link_list = ''
+    counter   = 0
+    string.gsub!( /<a\s.*?href=("|')(.+?)("|').*?>/ix ) { |item|
+      link = $2
+      counter   = counter + 1
+      link_list += "[#{counter}] #{link}\n"
+      "[#{counter}]"
+    }
+
+    # strip whitespace at the start of every line (\s also matches newlines, so whitespace-only lines vanish); NOTE(review): gsub! edits the passed-in string in place
+    string.gsub!( /^\s*/m, '' )
+
+    # normalize odd line endings (\n\r, \r\r\n, \r\n - seen from opera and others) to "\n"; NOTE(review): /s is not the Perl-style dot-all flag in Ruby - confirm it is needed
+    string.gsub!( /(\n\r|\r\r\n|\r\n)/s, "\n" )
+
+    # strip all other tags; NOTE(review): "." presumably does not match "\n" here, so a tag spanning several lines would survive - confirm
+    string.gsub!( /\<.+?\>/s, '' )
+
+    # replace decimal html entities like "&#8211;"; NOTE(review): $2 is the matched digit String, so .chr presumably yields its first character rather than the code point - was $2.to_i intended?
+    string.gsub!( /(&\#(\d+);?)/x ) { |item|
+      $2.chr
+    }
+    # NOTE(review): below, the fallbacks to chr_orig look unreachable (String#hex and Integer#chr never return nil); large code points presumably raise RangeError in Integer#chr - confirm
+    # decode hexadecimal html entities like "&#x3d;"
+    string.gsub!( /(&\#[xX]([0-9a-fA-F]+);?)/x ) { |item|
+      chr_orig = $1
+      hex      = $2.hex
+      if hex
+        chr = hex.chr
+        if chr
+          chr
+        else
+          chr_orig
+        end
+      else
+        chr_orig
+      end
+    }
+
+    # collapse two consecutive whitespace-only lines into a single newline
+    string.gsub!( /^\s*\n\s*\n/m, "\n" )
+
+    # add extracted links; NOTE(review): link_list starts as an empty string, which is truthy in Ruby, so "\n\n" is appended even when no link was found
+    if link_list
+      string += "\n\n" + link_list
+    end
+
+    return string
+  end
 end

+ 38 - 0
test/fixtures/mail4.box

@@ -0,0 +1,38 @@
+From k.guenther@example.com  Mon May  7 15:08:10 2012
+Return-Path: <k.guenther@example.com>
+X-Original-To: support@example.com
+Delivered-To: box@samba.example.com
+X-Greylist: delayed 355 seconds by postgrey-1.32 at samba; Mon, 07 May 2012 15:08:09 BST
+Received: from smtprelay05.example.com (smtprelay05.example.com [8.6.3.9])
+	by samba.example.com (Postfix) with ESMTP id 011F9500D3D
+	for <support@example.com>; Mon,  7 May 2012 15:08:09 +0100 (BST)
+Received: from [1.1.0.7] (helo=exchange.df.eu)
+	by smtprelay05.example.com with esmtps (TLSv1:RC4-MD5:128)
+	(Exim 4.68)
+	(envelope-from <k.guenther@example.com>)
+	id 1SROW2-0007tk-QP
+	for support@example.com; Mon, 07 May 2012 16:02:18 +0200
+Received: from ECCR04PUBLIC.exchange.local ([1.1.2.4]) by
+ efe04.exchange.local ([1.1.0.7]) with mapi; Mon, 7 May 2012 15:58:33 +0200
+From: =?utf-8?B?R8O8bnRoZXIgS2F0amEgfCBFeGFtcGxlIEdtYkg=?=
+	<k.guenther@example.com>
+To: Martin Edenhofer via Znuny Team <support@example.com>
+Date: Mon, 7 May 2012 15:58:32 +0200
+Subject: AW: Ticket Templates [Ticket#11168]
+Thread-Topic: Ticket Templates [Ticket#11168]
+Thread-Index: Ac0sGqTnvktNHx1lQoaTDcVI7lUxJQAPqvXA
+Message-ID: <F799DA4E63A20B4EBE9D5A412196D71D3CADBEA04E@ECCR04PUBLIC.exchange.local>
+References: <F799DA4E63A20B4EBE9D5A412196D71D3CADBE9DF6@ECCR04PUBLIC.exchange.local>
+ <20120507062840.265.107538@portal.example.com>
+In-Reply-To: <20120507062840.265.107538@portal.example.com>
+Accept-Language: de-DE
+Content-Language: de-DE
+X-MS-Has-Attach:
+X-MS-TNEF-Correlator:
+acceptlanguage: de-DE
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+MIME-Version: 1.0
+
+SGFsbG8gS2F0amEsCgpzdXBlciEgSWNoIGZyZXUgbWljaCEKCldpciB3w7xyZGVuIGdlcm5lIGRpZSBQcsOkc2VudGF0aW9uL0VpbmbDvGhydW5nIGluIGRpZSBUaWNrZXQgVGVtcGxhdGVzIHBlciBTY3JlZW5zaGFyaW5nIG9kZXIgenVtaW5kZXN0IHBlciBUZWxlZm9uIG1hY2hlbi4KCk3DtmdsaWNoZSBUZXJtaW5lOgpvIERvLCAxMC4wNS4yMDEyIDE1OjAwLTE2OjAwCm8gRnIsICAxMS4wNS4yMDEyIDEzOjAwLTE0OjAwCm8gRGksICAxNS4wNS4yMDEyIDE3OjAwLTE4OjAwCgrDnGJlciBGZWVkYmFjayB3w7xyZGUgaWNoIG1pY2ggZnJldWVuIQoKUFM6IFp1ciBiZXNzZXJlbiDDnGJlcnNpY2h0IGhhYmUgaWNoIGVpbiBUaWNrZXQgZXJzdGVsbHQuIDopIEltIEZvb3RlciBzaW5kIHVuc2VyZSBnZXNjaMOkZnRsaWNoZW4gS29udGFrdGRhdGVuIChmYWxscyBkaWVzZSBpcmdlbmR3YW5uIGVpbm1hbCBiZW7DtnRpZ3Qgd2VyZGVuIHNvbGx0ZW4pLCBtZWhyIGRhenUgaW4gZWluIHBhYXIgVGFnZW4uCgpMaWViZSBHcsO8w59lIQoKIC1NYXJ0aW4KCgo
+

+ 76 - 0
test/fixtures/mail5.box

@@ -0,0 +1,76 @@
+From marc.smith@example.com  Mon May  7 07:45:48 2012
+Return-Path: <marc.smith@example.com>
+X-Original-To: support@znuny.com
+Delivered-To: box@samba.example.com
+Received: from mailout-de.example.com (mailout-de.example.com [2.1.6.2])
+	by samba.example.com (Postfix) with SMTP id F1C9E500D3D
+	for <support@znuny.com>; Mon,  7 May 2012 07:45:47 +0100 (BST)
+Received: (qmail invoked by alias); 07 May 2012 06:45:48 -0000
+Received: from unknown (EHLO [1.2.1.2]) [7.3.2.1]
+  by mail.example.com (mp072) with SMTP; 07 May 2012 08:45:48 +0200
+X-Authenticated: #69078992
+X-Provags-ID: V01U2FsdGVkX1+IkUVPK6GIbZ2ezhmZfpCU0OVlFkuyPGDNsL0V5H
+	FxvJdecWb4ibKL
+Message-ID: <4FA76F9A.3060602@example.com>
+Date: Mon, 07 May 2012 08:45:46 +0200
+From: marc.smith@example.com (Marc Smith)
+User-Agent: Mozilla/5.0 (Windows NT 6.0; WOW64; rv:12.0) Gecko/20120428 Thunderbird/12.0.1
+MIME-Version: 1.0
+To: Martin Edenhofer via Znuny Team <support@znuny.com>
+Subject: Re: XXXX Betatest Ticket Templates [Ticket#11162]
+References: <20120507061007.259.822311@portal.znuny.com>
+In-Reply-To: <20120507061007.259.822311@portal.znuny.com>
+Content-Type: text/plain; charset=UTF-8; format=flowed
+Content-Transfer-Encoding: 8bit
+X-Y-GMX-Trusted: 0
+Status: RO
+Content-Length: 1418
+Lines: 46
+
+Am 07.05.2012 08:10, schrieb Martin Edenhofer via Znuny Team:
+> Hallo Marc,
+>
+> super! Ich freu mich!
+>
+> Wir würden gerne die Präsentation/Einführung in die Ticket Templates per Screensharing oder zumindest per Telefon machen.
+>
+> Mögliche Termine:
+> o Do, 10.05.2012 11:00-12:00
+> o Fr,  11.05.2012 09:00-10:00
+> o Di,  15.05.2012 14:00-15:00
+>
+> Über Feedback würde ich mich freuen!
+>
+> PS: Zur besseren Übersicht habe ich ein Ticket erstellt. :) Im Footer sind unsere geschäftlichen Kontaktdaten (falls diese irgendwann einmal benötigt werden sollten), mehr dazu in ein paar Tagen.
+>
+> Liebe Grüße!
+>
+>   -Martin
+>
+> --
+> Martin Edenhofer
+>
+> Znuny GmbH // Marienstraße 11 // 10117 Berlin // Germany
+>
+> P: +49 (0) 30 60 98 54 18-0
+> F: +49 (0) 30 60 98 54 18-8
+>
+> Location: Berlin - HRB 139852 B Amtsgericht Berlin-Charlottenburg
+> Managing Director: Martin Edenhofer
+Hallo Martin,
+
+John und ich könnten leider nur am Freitag, da wir Donnerstag und nächste 
+Woche bereits Termine haben.
+
+Wir würden uns dann den Freitag vormerken...;-)
+
+N Screensharing ist bei uns leider nicht so ohne Probleme möglich, bzw. 
+wir könnten einen PC aufsetzen mit nem seperaten Internetzugang auf dem 
+wir ne VM vorbereiten könnten, da wir von dem "Internet PC" nicht auf 
+unser XXXX zugreifen können. Falls ihr sonst noch irgendwas benötigt 
+einfach kurz ne Rückmeldung...;-)
+
+Grüße aus Bonn
+
+John & Marc
+

+ 32 - 0
test/fixtures/mail6.box

@@ -0,0 +1,32 @@
+From me@bogen.net  Sat Sep 13 16:50:43 2003
+Return-Path: <me@bogen.net>
+Received: from airoma.example (law10-f30.law10.airoma.example [4.4.4.4]) by esanta.edenhofer.de (Postfix) with ESMTP id 2307484296 for <demo@exampel.com>; Sat, 13 Sep 2003 16:50:43 +0200 (CEST)
+Received: from mail pickup service by airoma.example with Mc SMTPSVC; Sat, 13 Sep 2003 07:37:26 -0700
+Received: from 11.11.11.11 by lw10fd.law10.com with HTTP; Sat, 13 Sep 2003 14:37:26 GMT
+X-Originating-Ip: [5.5.5.5]
+X-Originating-Email: [me@example.com]
+From: =?Windows-1252?Q?Hans_B=C4KO?= =?iso-8859-15?q?Sch=F6nland?= <me@bogen.net>
+To: =?iso-8859-2?Q?Namedy=F1ski?= (hans@example.com)
+Subject: utf8: =?UTF-8?Q?=E4=BD=BF=E3=81=A3=E3=81=A6?= / ISO-8859-1: =?iso-8859-1?Q?Priorit=E4t=22_?= / cp-1251: =?windows-1251?B?0eXw4+XpINPj6+j26uj1?= 
+Date: Sat, 13 Sep 2003 10:37:26 -0400
+MIME-Version: 1.0
+Content-Type: text/html; charset="iso-8859-15"; format=flowed
+Message-Id: <Law10-F30dRmhKuTqtA00018823@coolair.example>
+X-Originalarrivaltime: 13 Sep 2003 14:37:26.0630 (UTC) FILETIME=[8D57B860:01C37A04]
+
+<html><div style='background-color:'><P>this is a test</P></div><br clear=all><hr> <a href="http://localhost/8HMZENUS/2737??PS=">Compare Cable, DSL or Satellite plans: As low as $2.95. </a>
+
+<br>
+
+<br>
+Test1:&#8211;
+<br>
+Test2:&amp;
+<br>
+Test3:&ni;
+<br>
+Test4:&amp;
+<br>
+Test5:&#x3d;
+
+</html>

+ 64 - 0
test/unit/email_parser_test.rb

@@ -35,6 +35,70 @@ class EmailParserTest < ActiveSupport::TestCase
           :subject            => 'Ticket Templates',
         },
       },
+      {
+        :data     => IO.read('test/fixtures/mail4.box'),
+        :body_md5 => '2f2c3a5c233dbd9658ab37d39469b7d0',
+        :params   => {
+          :from               => '"Günther Katja | Example GmbH" <k.guenther@example.com>',
+          :from_email         => 'k.guenther@example.com',
+          :from_display_name  => 'Günther Katja | Example GmbH',
+          :subject            => 'AW: Ticket Templates [Ticket#11168]',
+          :plain_part         => "Hallo Katja,
+
+super! Ich freu mich!
+
+Wir würden gerne die Präsentation/Einführung in die Ticket Templates per Screensharing oder zumindest per Telefon machen.
+
+Mögliche Termine:
+o Do, 10.05.2012 15:00-16:00
+o Fr,  11.05.2012 13:00-14:00
+o Di,  15.05.2012 17:00-18:00
+
+Über Feedback würde ich mich freuen!
+
+PS: Zur besseren Übersicht habe ich ein Ticket erstellt. :) Im Footer sind unsere geschäftlichen Kontaktdaten (falls diese irgendwann einmal benötigt werden sollten), mehr dazu in ein paar Tagen.
+
+Liebe Grüße!
+
+ -Martin
+",
+        },
+      },
+      {
+        :data     => IO.read('test/fixtures/mail5.box'),
+        :body_md5 => '51364a306362f513f53f2bbea7820f37',
+        :params   => {
+          :from               => 'marc.smith@example.com (Marc Smith)',
+          :from_email         => 'marc.smith@example.com',
+          :from_display_name  => 'Marc Smith',
+          :subject            => 'Re: XXXX Betatest Ticket Templates [Ticket#11162]',
+        },
+      },
+      {
+        :data     => IO.read('test/fixtures/mail6.box'),
+        :body_md5 => '1fc492b8d762d82f861dbb70b7cf7610',
+        :params   => {
+          :from               => '"Hans BÄKOSchönland" <me@bogen.net>',
+          :from_email         => 'me@bogen.net',
+          :from_display_name  => 'Hans BÄKOSchönland',
+          :subject            => 'utf8: 使って / ISO-8859-1: Priorität"  / cp-1251: Сергей Углицких',
+          :plain_part         => "this is a test [1]Compare Cable, DSL or Satellite plans: As low as $2.95. 
+
+Test1:8
+
+Test2:&amp;
+
+Test3:&ni;
+
+Test4:&amp;
+
+Test5:=
+
+
+[1] http://localhost/8HMZENUS/2737??PS=
+"
+        },
+      },
     ]
 
     files.each { |file|