You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/07/26 11:18:11 UTC

[3/5] tika git commit: TIKA-2042 MBOX magic and detection unit test

TIKA-2042 MBOX magic and detection unit test


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/65cc9bce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/65cc9bce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/65cc9bce

Branch: refs/heads/2.x
Commit: 65cc9bcecdc6b86294a88f3b2b6b26017f356ae5
Parents: 31374a3
Author: Nick Burch <ni...@gagravarr.org>
Authored: Tue Jul 26 11:36:29 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Tue Jul 26 12:06:50 2016 +0100

----------------------------------------------------------------------
 .../java/org/apache/tika/mime/TestMimeTypes.java     |  3 +++
 .../org/apache/tika/mime/tika-mimetypes.xml          | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/65cc9bce/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 09864b8..d4840b7 100644
--- a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -833,6 +833,9 @@ public class TestMimeTypes extends TikaTest {
         // Lotus
         assertTypeDetection("testLotusEml.eml", "message/rfc822");
         
+        // MBOX
+        assertTypeDetection("headers.mbox", "application/mbox");
+        
         // Thunderbird - doesn't currently work by name
         assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/65cc9bce/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index e07f449..1d1f70a 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -361,9 +361,22 @@
   <mime-type type="application/mbms-register+xml"/>
   <mime-type type="application/mbms-register-response+xml"/>
   <mime-type type="application/mbms-user-service-description+xml"/>
+
   <mime-type type="application/mbox">
-    <sub-class-of type="text/plain"/>
+    <!-- MBOX files start with "From [sender] [date]" -->
+    <!-- To avoid false matches, check for other headers after that -->
+    <magic priority="70">
+      <match value="From " type="string" offset="0">
+         <match value="\nFrom: " type="string" offset="32:256"/>
+         <match value="\nDate: " type="string" offset="32:256"/>
+         <match value="\nDelivered-To: " type="string" offset="32:256"/>
+         <match value="\nReceived: by " type="string" offset="32:256"/>
+         <match value="\nReceived: via " type="string" offset="32:256"/>
+         <match value="\nReceived: from " type="string" offset="32:256"/>
+      </match>
+    </magic>
     <glob pattern="*.mbox"/>
+    <sub-class-of type="text/x-tika-text-based-message"/>
   </mime-type>
   <mime-type type="application/media_control+xml"/>
   <mime-type type="application/mediaservercontrol+xml">