You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/07/26 10:37:18 UTC
tika git commit: TIKA-2042 MBOX magic and detection unit test
Repository: tika
Updated Branches:
refs/heads/master f00ab040d -> 72d2d88b3
TIKA-2042 MBOX magic and detection unit test
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/72d2d88b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/72d2d88b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/72d2d88b
Branch: refs/heads/master
Commit: 72d2d88b381ba75942ae791042ef54af33ee1f38
Parents: f00ab04
Author: Nick Burch <ni...@gagravarr.org>
Authored: Tue Jul 26 11:36:29 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Tue Jul 26 11:36:29 2016 +0100
----------------------------------------------------------------------
.../org/apache/tika/mime/tika-mimetypes.xml | 15 ++++++++++++++-
.../java/org/apache/tika/mime/TestMimeTypes.java | 3 +++
2 files changed, 17 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/72d2d88b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index b39f529..22a814c 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -365,9 +365,22 @@
<mime-type type="application/mbms-register+xml"/>
<mime-type type="application/mbms-register-response+xml"/>
<mime-type type="application/mbms-user-service-description+xml"/>
+
<mime-type type="application/mbox">
- <sub-class-of type="text/plain"/>
+ <!-- MBOX files start with "From [sender] [date]" -->
+ <!-- To avoid false matches, check for other headers after that -->
+ <magic priority="70">
+ <match value="From " type="string" offset="0">
+ <match value="\nFrom: " type="string" offset="32:256"/>
+ <match value="\nDate: " type="string" offset="32:256"/>
+ <match value="\nDelivered-To: " type="string" offset="32:256"/>
+ <match value="\nReceived: by " type="string" offset="32:256"/>
+ <match value="\nReceived: via " type="string" offset="32:256"/>
+ <match value="\nReceived: from " type="string" offset="32:256"/>
+ </match>
+ </magic>
<glob pattern="*.mbox"/>
+ <sub-class-of type="text/x-tika-text-based-message"/>
</mime-type>
<mime-type type="application/media_control+xml"/>
<mime-type type="application/mediaservercontrol+xml">
http://git-wip-us.apache.org/repos/asf/tika/blob/72d2d88b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 81b154c..d35a716 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -869,6 +869,9 @@ public class TestMimeTypes {
// Lotus
assertTypeDetection("testLotusEml.eml", "message/rfc822");
+ // MBOX
+ assertTypeDetection("headers.mbox", "application/mbox");
+
// Thunderbird - doesn't currently work by name
assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
}