You are viewing a plain-text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/07/26 10:37:18 UTC

tika git commit: TIKA-2042 MBOX magic and detection unit test

Repository: tika
Updated Branches:
  refs/heads/master f00ab040d -> 72d2d88b3


TIKA-2042 MBOX magic and detection unit test


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/72d2d88b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/72d2d88b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/72d2d88b

Branch: refs/heads/master
Commit: 72d2d88b381ba75942ae791042ef54af33ee1f38
Parents: f00ab04
Author: Nick Burch <ni...@gagravarr.org>
Authored: Tue Jul 26 11:36:29 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Tue Jul 26 11:36:29 2016 +0100

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml          | 15 ++++++++++++++-
 .../java/org/apache/tika/mime/TestMimeTypes.java     |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/72d2d88b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index b39f529..22a814c 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -365,9 +365,22 @@
   <mime-type type="application/mbms-register+xml"/>
   <mime-type type="application/mbms-register-response+xml"/>
   <mime-type type="application/mbms-user-service-description+xml"/>
+
   <mime-type type="application/mbox">
-    <sub-class-of type="text/plain"/>
+    <!-- MBOX files start with "From [sender] [date]" -->
+    <!-- To avoid false matches, check for other headers after that -->
+    <magic priority="70">
+      <match value="From " type="string" offset="0">
+         <match value="\nFrom: " type="string" offset="32:256"/>
+         <match value="\nDate: " type="string" offset="32:256"/>
+         <match value="\nDelivered-To: " type="string" offset="32:256"/>
+         <match value="\nReceived: by " type="string" offset="32:256"/>
+         <match value="\nReceived: via " type="string" offset="32:256"/>
+         <match value="\nReceived: from " type="string" offset="32:256"/>
+      </match>
+    </magic>
     <glob pattern="*.mbox"/>
+    <sub-class-of type="text/x-tika-text-based-message"/>
   </mime-type>
   <mime-type type="application/media_control+xml"/>
   <mime-type type="application/mediaservercontrol+xml">

http://git-wip-us.apache.org/repos/asf/tika/blob/72d2d88b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 81b154c..d35a716 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -869,6 +869,9 @@ public class TestMimeTypes {
         // Lotus
         assertTypeDetection("testLotusEml.eml", "message/rfc822");
         
+        // MBOX
+        assertTypeDetection("headers.mbox", "application/mbox");
+        
         // Thunderbird - doesn't currently work by name
         assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
     }