You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/26 22:29:46 UTC
[tika] branch branch_1x updated: TIKA-2578 and TIKA-2587 -- Allow
for RFC822 detection for files starting with "dkim-" and/or "x-" via
Andreas Meier
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 5e3e910 TIKA-2578 and TIKA-2587 -- Allow for RFC822 detection for files starting with "dkim-" and/or "x-" via Andreas Meier
5e3e910 is described below
commit 5e3e910d05d0427a752dda93a5341c926ec399c3
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Feb 26 17:28:47 2018 -0500
TIKA-2578 and TIKA-2587 -- Allow for RFC822 detection for files starting with "dkim-" and/or "x-" via Andreas Meier
---
CHANGES.txt | 3 ++
.../org/apache/tika/mime/tika-mimetypes.xml | 17 +++++++--
.../java/org/apache/tika/mime/TestMimeTypes.java | 11 ++++--
.../resources/test-documents/testRFC822_dkim.eml | 22 ++++++++++++
.../resources/test-documents/testRFC822_x-.eml | 41 ++++++++++++++++++++++
5 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index eb5fa7d..c99034d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.18 - ???
+ * Allow for RFC822 detection for files starting with "dkim-"
+ and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587)
+
* Extract xlsx files embedded in OLE objects within PPT and PPTX
via Brian McColgan (TIKA-2588).
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3d4284d..7432a56 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5613,9 +5613,6 @@
<magic priority="50">
<match value="Delivered-To:" type="string" offset="0"/>
<match value="Status:" type="string" offset="0"/>
- <match value="X-Mozilla-Keys:" type="string" offset="0"/>
- <match value="X-Mozilla-Status:" type="string" offset="0"/>
- <match value="X-Mozilla-Status2:" type="string" offset="0"/>
<match value="Relay-Version:" type="stringignorecase" offset="0"/>
<match value="#!\ rnews" type="string" offset="0"/>
<match value="N#!\ rnews" type="string" offset="0"/>
@@ -5632,6 +5629,20 @@
<match value="X-Notes-Item:" type="string" offset="0">
<match value="Message-ID:" type="string" offset="0:8192"/>
</match>
+ <match value="X-" type="stringignorecase" offset="0">
+ <match value="\nMessage-ID:" type="string" offset="0:8192"/>
+ <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
+ <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
+ <match value="\nSubject:" type="string" offset="0:8192"/>
+ <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
+ </match>
+ <match value="DKIM-" type="string" offset="0">
+ <match value="\nMessage-ID:" type="string" offset="0:8192"/>
+ <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
+ <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
+ <match value="\nSubject:" type="string" offset="0:8192"/>
+ <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
+ </match>
</magic>
<glob pattern="*.eml"/>
<glob pattern="*.mime"/>
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index e423fdd..e76a7d5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -897,8 +897,15 @@ public class TestMimeTypes {
// MBOX
assertTypeDetection("headers.mbox", "application/mbox");
- // Thunderbird - doesn't currently work by name
- assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
+ // Thunderbird
+ assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
+
+ //dkim header
+ assertTypeDetection("testRFC822_dkim.eml", "message/rfc822");
+
+ //x- custom header
+ assertTypeDetection("testRFC822_x-.eml", "message/rfc822");
+
}
@Test
diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822_dkim.eml b/tika-parsers/src/test/resources/test-documents/testRFC822_dkim.eml
new file mode 100644
index 0000000..aeb5ea7
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRFC822_dkim.eml
@@ -0,0 +1,22 @@
+DKIM-Signature: v=1; a=rsa-sha256; s=brisbane; d=example.com;
+ c=simple/simple; q=dns/txt; i=joe@football.example.com;
+ h=Received : From : To : Subject : Date : Message-ID;
+ bh=2jUSOH9NhtVGCQWNr9BrIAPreKQjO6Sn7XIkfJVOzv8=;
+ b=AuUoFEfDxTDkHlLXSZEpZj79LICEps6eda7W3deTVFOk4yAUoqOB
+ 4nujc7YopdG5dWLSdNg6xNAZpOPr+kHxt1IrE+NahM6L/LbvaHut
+ KVdkLLkpVaVVQPzeRDI009SO2Il5Lu7rDNH6mZckBdrIx0orEtZV
+ 4bmp/YzhwvcubU4=;
+Received: from client1.football.example.com [192.0.2.1]
+ by submitserver.example.com with SUBMISSION;
+ Fri, 11 Jul 2003 21:01:54 -0700 (PDT)
+From: Joe SixPack <jo...@football.example.com>
+To: Suzie Q <su...@shopping.example.net>
+Subject: Is dinner ready?
+Date: Fri, 11 Jul 2003 21:00:37 -0700 (PDT)
+Message-ID: <20...@football.example.com>
+
+Hi.
+
+We lost the game. Are you hungry yet?
+
+Joe.
\ No newline at end of file
diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822_x-.eml b/tika-parsers/src/test/resources/test-documents/testRFC822_x-.eml
new file mode 100644
index 0000000..6074468
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRFC822_x-.eml
@@ -0,0 +1,41 @@
+X-IronPort-AV: E=Sophos;i="1.11,500,1111111111";
+ d="xml'?bin'?scan'12,345,67,89,111,222,333,444,555?rels";a="12345678"
+From: "Julien Nioche (JIRA)" <ji...@apache.org>
+To: dev@tika.apache.org
+Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
+Reply-To: dev@tika.apache.org
+Delivered-To: mailing list dev@tika.apache.org
+Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
+In-Reply-To: <60...@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+
+
+ [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ]
+
+Julien Nioche commented on TIKA-461:
+------------------------------------
+
+I'll have a look at mime4j and try to use it in Tika
+
+> RFC822 messages not parsed
+> --------------------------
+>
+> Key: TIKA-461
+> URL: https://issues.apache.org/jira/browse/TIKA-461
+> Project: Tika
+> Issue Type: Bug
+> Components: parser
+> Affects Versions: 0.7
+> Reporter: Joshua Turner
+> Assignee: Julien Nioche
+>
+> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
+> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.