You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/26 22:29:46 UTC

[tika] branch branch_1x updated: TIKA-2578 and TIKA-2587 -- Allow for RFC822 detection for files starting with "dkim-" and/or "x-" via Andreas Meier

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 5e3e910   TIKA-2578 and TIKA-2587 -- Allow for RFC822 detection for files starting with "dkim-" and/or "x-" via Andreas Meier
5e3e910 is described below

commit 5e3e910d05d0427a752dda93a5341c926ec399c3
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Feb 26 17:28:47 2018 -0500

     TIKA-2578 and TIKA-2587 -- Allow for RFC822 detection for files starting with "dkim-" and/or "x-" via Andreas Meier
---
 CHANGES.txt                                        |  3 ++
 .../org/apache/tika/mime/tika-mimetypes.xml        | 17 +++++++--
 .../java/org/apache/tika/mime/TestMimeTypes.java   | 11 ++++--
 .../resources/test-documents/testRFC822_dkim.eml   | 22 ++++++++++++
 .../resources/test-documents/testRFC822_x-.eml     | 41 ++++++++++++++++++++++
 5 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index eb5fa7d..c99034d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.18 - ???
 
+   * Allow for RFC822 detection for files starting with "dkim-"
+     and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587).
+
    * Extract xlsx files embedded in OLE objects within PPT and PPTX
      via Brian McColgan (TIKA-2588).
 
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3d4284d..7432a56 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5613,9 +5613,6 @@
     <magic priority="50">
       <match value="Delivered-To:" type="string" offset="0"/>
       <match value="Status:" type="string" offset="0"/>
-      <match value="X-Mozilla-Keys:" type="string" offset="0"/>
-      <match value="X-Mozilla-Status:" type="string" offset="0"/>
-      <match value="X-Mozilla-Status2:" type="string" offset="0"/>
       <match value="Relay-Version:" type="stringignorecase" offset="0"/>
       <match value="#!\ rnews" type="string" offset="0"/>
       <match value="N#!\ rnews" type="string" offset="0"/>
@@ -5632,6 +5629,20 @@
       <match value="X-Notes-Item:" type="string" offset="0">
         <match value="Message-ID:" type="string" offset="0:8192"/>
       </match>
+      <match value="X-" type="stringignorecase" offset="0">
+        <match value="\nMessage-ID:" type="string" offset="0:8192"/>
+        <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
+        <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
+        <match value="\nSubject:" type="string" offset="0:8192"/>
+        <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
+      </match>
+      <match value="DKIM-" type="string" offset="0">
+        <match value="\nMessage-ID:" type="string" offset="0:8192"/>
+        <match value="\nFrom:" type="stringignorecase" offset="0:8192"/>
+        <match value="\nTo:" type="stringignorecase" offset="0:8192"/>
+        <match value="\nSubject:" type="string" offset="0:8192"/>
+        <match value="\nMIME-Version:" type="stringignorecase" offset="0:8192"/>
+      </match>
     </magic>
     <glob pattern="*.eml"/>
     <glob pattern="*.mime"/>
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index e423fdd..e76a7d5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -897,8 +897,15 @@ public class TestMimeTypes {
         // MBOX
         assertTypeDetection("headers.mbox", "application/mbox");
         
-        // Thunderbird - doesn't currently work by name
-        assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
+        // Thunderbird
+        assertTypeDetection("testThunderbirdEml.eml", "message/rfc822");
+
+        //dkim header
+        assertTypeDetection("testRFC822_dkim.eml", "message/rfc822");
+
+        //x- custom header
+        assertTypeDetection("testRFC822_x-.eml", "message/rfc822");
+
     }
     
     @Test
diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822_dkim.eml b/tika-parsers/src/test/resources/test-documents/testRFC822_dkim.eml
new file mode 100644
index 0000000..aeb5ea7
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRFC822_dkim.eml
@@ -0,0 +1,22 @@
+DKIM-Signature: v=1; a=rsa-sha256; s=brisbane; d=example.com;
+      c=simple/simple; q=dns/txt; i=joe@football.example.com;
+      h=Received : From : To : Subject : Date : Message-ID;
+      bh=2jUSOH9NhtVGCQWNr9BrIAPreKQjO6Sn7XIkfJVOzv8=;
+      b=AuUoFEfDxTDkHlLXSZEpZj79LICEps6eda7W3deTVFOk4yAUoqOB
+        4nujc7YopdG5dWLSdNg6xNAZpOPr+kHxt1IrE+NahM6L/LbvaHut
+        KVdkLLkpVaVVQPzeRDI009SO2Il5Lu7rDNH6mZckBdrIx0orEtZV
+        4bmp/YzhwvcubU4=;
+Received: from client1.football.example.com  [192.0.2.1]
+      by submitserver.example.com with SUBMISSION;
+      Fri, 11 Jul 2003 21:01:54 -0700 (PDT)
+From: Joe SixPack <jo...@football.example.com>
+To: Suzie Q <su...@shopping.example.net>
+Subject: Is dinner ready?
+Date: Fri, 11 Jul 2003 21:00:37 -0700 (PDT)
+Message-ID: <20...@football.example.com>
+
+Hi.
+
+We lost the game. Are you hungry yet?
+
+Joe.
\ No newline at end of file
diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822_x-.eml b/tika-parsers/src/test/resources/test-documents/testRFC822_x-.eml
new file mode 100644
index 0000000..6074468
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRFC822_x-.eml
@@ -0,0 +1,41 @@
+X-IronPort-AV: E=Sophos;i="1.11,500,1111111111";
+   d="xml'?bin'?scan'12,345,67,89,111,222,333,444,555?rels";a="12345678"
+From: "Julien Nioche (JIRA)" <ji...@apache.org>
+To: dev@tika.apache.org
+Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
+Reply-To: dev@tika.apache.org
+Delivered-To: mailing list dev@tika.apache.org
+Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
+In-Reply-To: <60...@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+
+
+    [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ]
+
+Julien Nioche commented on TIKA-461:
+------------------------------------
+
+I'll have a look at mime4j and try to use it in Tika
+
+> RFC822 messages not parsed
+> --------------------------
+>
+>                 Key: TIKA-461
+>                 URL: https://issues.apache.org/jira/browse/TIKA-461
+>             Project: Tika
+>          Issue Type: Bug
+>          Components: parser
+>    Affects Versions: 0.7
+>            Reporter: Joshua Turner
+>            Assignee: Julien Nioche
+>
+> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
+> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.