You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2012/12/12 00:30:19 UTC

svn commit: r1420484 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java tika-parsers/src/test/resources/test-documents/testLotusEml.eml

Author: rgauss
Date: Tue Dec 11 23:30:18 2012
New Revision: 1420484

URL: http://svn.apache.org/viewvc?rev=1420484&view=rev
Log:
TIKA-1042: Lotus Notes .eml Files Not Always Detected Properly
   - Added testLotusEml.eml which demonstrates the problem (with some info redacted)
   - Added testDetectLotusNotesEml method to TestContainerAwareDetector
   - Added a new magic match to the message/rfc822 MIME type which requires X-Notes-Item: at offset 0 together with a nested Message-ID: match in the 0:8192 offset range

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testLotusEml.eml
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1420484&r1=1420483&r2=1420484&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Tue Dec 11 23:30:18 2012
@@ -4198,8 +4198,11 @@
       <match value="Return-Path:" type="string" offset="0"/>
       <match value="From:" type="string" offset="0"/>
       <match value="Received:" type="string" offset="0"/>
-      <match type="string" value="Message-ID:" offset="0"/>
-      <match type="string" value="Date:" offset="0"/>
+      <match value="Message-ID:" type="string" offset="0"/>
+      <match value="Date:" type="string" offset="0"/>
+      <match value="X-Notes-Item:" type="string" offset="0">
+        <match value="Message-ID:" type="string" offset="0:8192"/>
+      </match>
     </magic>
     <glob pattern="*.eml"/>
     <glob pattern="*.mime"/>

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1420484&r1=1420483&r2=1420484&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Tue Dec 11 23:30:18 2012
@@ -163,6 +163,12 @@ public class TestContainerAwareDetector 
        assertTypeByData("testEPUB.epub", "application/epub+zip");
        assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip");
     }
+    
+    public void testDetectLotusNotesEml() throws Exception {
+        // Lotus .eml files aren't guaranteed to have any of the magic 
+        // matches as the first line, but should have X-Notes-Item and Message-ID
+        assertTypeByData("testLotusEml.eml", "message/rfc822");
+     }
 
     public void testDetectODF() throws Exception {
         assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text");

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testLotusEml.eml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testLotusEml.eml?rev=1420484&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testLotusEml.eml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testLotusEml.eml Tue Dec 11 23:30:18 2012
@@ -0,0 +1,71 @@
+X-Notes-Item: CN=XXXX/OU=XX/OU=XXXX/O=XXX/C=XX;
+ flags=44; name=ForwardedFrom
+X-Notes-Item: Thu, 4 Oct 2012 13:01:56 +0200;
+ type=400; name=ForwardedDate
+X-Notes-Item: Thu, 4 Oct 2012 13:02:39 +0200;
+ type=400; name=OriginalModTime
+X-Notes-Item: .;
+ name=$StorageTo
+X-Notes-Item: 62AEA923:7CEE804A-C1257A8D:003C970D;
+ type=4; name=$Orig
+X-Notes-Item: 2031619;
+ name=MIMEMailHeaderCharset
+MIME-Version: 1.0
+X-Mailer: Lotus Notes Release 8.5.3 September 15, 2011
+Message-ID: <OF...@LocalDomain>
+Date: Thu, 4 Oct 2012 13:02:39 +0200
+X-Notes-Item: 0;
+ name=MAILOPTIONS
+X-Notes-Item: 1;
+ name=SaveOptions
+X-Notes-Item: Memo;
+ name=Form
+From: XXX.XXX@XXXX.de
+X-Notes-Item: CN=XXXX/OU=XX/OU=XXXX/O=XXX/C=XX;
+ name=AltFrom
+X-Notes-Item: StdNotesLtr25;
+ name=Logo
+X-Notes-Item: StdNotesLtr25;
+ name=dLogo
+X-Notes-Item: True;
+ name=useApplet
+X-Notes-Item: 1;
+ name=DefaultMailSaveOptions
+X-Notes-Item: ;
+ name=Query_String
+X-Notes-Item: 1;
+ name=ExpandPersonalGroups
+To: XXXX@alfresco.com
+X-Notes-Item: ;
+ flags=44; name=INetCopyTo
+X-Notes-Item: ;
+ flags=44; name=INetBlindCopyTo
+X-Notes-Item: ;
+ name=tmpImp
+X-Notes-Item: ;
+ name=Sign
+X-Notes-Item: ;
+ name=Encrypt
+X-Notes-Item: ;
+ name=tmpClassification
+X-Notes-Item: ;
+ name=SetClassification
+X-Notes-Item: 1;
+ name=$NoteHasNativeMIME
+X-MIMETrack: Serialize by Notes Client on XXXX(Release
+ 8.5.3|September 15, 2011) at 25.10.2012 12:14:50
+Content-type: multipart/related; 
+	Boundary="0__=4EBBF01EDFAF119D8f9e8a93df938690918c4EBBF01EDFAF119D"
+Content-Disposition: inline
+
+--0__=4EBBF01EDFAF119D8f9e8a93df938690918c4EBBF01EDFAF119D
+Content-type: text/html; charset=ISO-8859-1
+Content-Disposition: inline
+Content-transfer-encoding: quoted-printable
+
+<html><body>
+Message body
+</body></html>=
+
+--0__=4EBBF01EDFAF119D8f9e8a93df938690918c4EBBF01EDFAF119D--
+