You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/03 19:09:39 UTC

[tika] branch main updated: [TIKA-3687] Fix email detection (#520)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 5444f80  [TIKA-3687] Fix email detection (#520)
5444f80 is described below

commit 5444f80d1b71845ff47c91376f5c90a40dae5a4f
Author: Thierry Guérin <Sc...@users.noreply.github.com>
AuthorDate: Thu Mar 3 20:09:29 2022 +0100

    [TIKA-3687] Fix email detection (#520)
    
    Thank you!
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |  3 +
 .../apache/tika/parser/mail/RFC822ParserTest.java  | 11 +++
 .../test/resources/test-documents/testRFC822-ARC   | 90 ++++++++++++++++++++++
 3 files changed, 104 insertions(+)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index b2fd332..0627921 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -6418,6 +6418,9 @@
         <match value="\nUser-Agent:" type="string" offset="0:1024"/>
         <match value="\nX-Mailer:" type="string" offset="0:1024"/>
         <match value="\nX-Originating-IP:" type="stringignorecase" offset="0:1024"/>
+        <match value="\nX-" type="string" offset="0:1024"/>
+        <match value="\nDKIM-" type="string" offset="0:1024"/>
+        <match value="\nARC-" type="string" offset="0:1024"/>        
       </match>
       <!-- match X- DKIM- ARC- at start of file and then require at least one
            of the usual: from, received, date...but look farther into the file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 9f25776..bfe625f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -600,6 +600,17 @@ public class RFC822ParserTest extends TikaTest {
     }
 
     @Test
+    public void testArc() throws Exception {
+        /*
+        Regression test for TIKA-3687: an email that contains ARC-* headers but
+        does not begin with one was previously misdetected as HTML.
+        */
+        List<Metadata> metadataList = getRecursiveMetadata("testRFC822-ARC");
+        assertEquals(1, metadataList.size());
+        assertEquals("message/rfc822", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
     public void testSimpleBodyInlined() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testRFC822_simple_inline_body.txt");
         assertEquals(1, metadataList.size());
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testRFC822-ARC b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testRFC822-ARC
new file mode 100644
index 0000000..9104fc9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testRFC822-ARC
@@ -0,0 +1,90 @@
+Received: from PR1P264MB1613.FRAP264.PROD.OUTLOOK.COM
+ ([fe80::84c8:cd64:fe3e:8fc1]) by PR1P264MB1613.FRAP264.PROD.OUTLOOK.COM
+ ([fe80::84c8:cd64:fe3e:8fc1%4]) with mapi id 15.20.4930.015; Wed, 26 Jan 2022
+ 08:14:37 +0000
+ARC-Seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none;
+ b=Sr2OMNGtNc7s2butUewmIFPhRpdbMSZfIpqWE4mK0Bq/3lsZq7UUUUUUUUUUYTlYMKdIHu7+Jfk4H0nB2QA70/H+L2suKMPgVGEp8UQkgxBcDRz77RJkgQYrp3/o0ZupDE5G+FSXVfvestOx0D8JE2loqDXGcCCF2pLqE8FD05DeTLPrApXXerdpvmsLMSHXHZ3xQD65yVg9HRFnCvKNTIGXnsOQfAHkCG9hco0nBaVdRQWKxVvU1AxG37chlXGE3PAEatepzpAZgndXglmiD2hf42BSw8l5DSoE58DwO/UxZD466n/R7bv1GVdxe7UgtgCWYBq5DHrc2PzOHPNvNA==
+ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com;
+ s=arcselector9901;
+ h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1;
+ bh=wEuQiRgrogSR8Yuuuu17RIZFydHUwx78Ibooqbq2B00=;
+ b=ZCMHAp7d52ghcG1hjEBl3zGJlHgf19dxHUtVVcKERzaIfq7rLP4Oo3ishEMX+nCz+XV0iyRc1u72+/OE1B3jithvUUUUUUUUUUOtV1SL2XKEjA+H3KpVvG0lSnC8RNfCVHBTCYtiKbCzhMCJiwwrSi+cde8VsrEBodam3y4OaT9we79ytJJBTX/Xo0OqZgKstWdmL+KSGef4yLgcD7H4KSz4zVOA80L+RwM4yJnkfFXt/7639maUucyVAq5STsXDavyd8febeqiOnl59UhdxcL+d0vpsriYQaShrAZ+v1O9MK8NlZKrGIRCp0y/VZuUL2rjSOhlVkwqNFnLQw8iCuQ==
+ARC-Authentication-Results: i=1; mx.microsoft.com 1; spf=none; dmarc=none;
+ dkim=none; arc=none
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=fooo.ba; s=selector2;
+ h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck;
+ bh=wEuQiRgrogSRUUUUUUUUUUZFydHUwx78Ibooqbq2B00=;
+ b=ihQ7DYpbIMb4TUUUUUUUUUUwSoZv+9nfA+Df+3BnFRNT+Xz29HNzpv69vQwLwPYhLNLnIGmL1RXux/8nMKFJgSiRYGA8jlDX7jnYNegD75LL4DlCAEYgMXr0V8Etc2KfUfQnhPR56Vh+qolyERTzwCMXPR+tY5tg8aXdrOPqM64=
+From: fooooooo.baaaaar@fooo.ba
+To: fooooooooo.baaaaaar@fooo.ba
+CC: foooooooooo@fooo.ba
+Subject: Facture à regler pour le cabinet ABC Avocats - Stage RC des avocats
+Thread-Topic:  Facture à regler pour le cabinet ABC Avocats - Stage RC des avocats
+Thread-Index: AdgSjKthoDPmiUM4Twu3A64tG47MDw==
+Date: Wed, 26 Jan 2022 09:14:37 +0100 (CET)
+Message-ID: <PR...@PR1P264MB1613.FRAP264.PROD.OUTLOOK.COM>
+Accept-Language: fr-FR, en-US
+Content-Language: fr-FR
+X-MS-Has-Attach: yes
+X-MS-TNEF-Correlator: 
+authentication-results: dkim=none (message not signed)
+ header.d=none;dmarc=none action=none header.from=fooo.ba;
+x-ms-publictraffictype: Email
+x-ms-office365-filtering-correlation-id: 9f47717f-776f-402b-c48e-08d9e0a3e4d0
+x-ms-traffictypediagnostic: MR1P264MB3924:EE_
+x-ld-processed: 9c9d8823-ab9e-4ac4-8251-32c4a7ae50d5,ExtAddr
+x-ms-oob-tlc-oobclassifiers: OLM:227;
+x-microsoft-antispam: BCL:0;
+x-microsoft-antispam-message-info: 7Go/5xNjnIJUaBXf4v+ab9Ir8K5/5PgmsUUUUUq+jhNduFTxz5nqlfHxt/6xKX8OzIWUvB9WOufRfRq00Gx5u/xCktydUBNWm/NPFhhIN5++XEUliNuzu9SnlslYO0gXZQ6QyLNRP2xwYcvjKenZ/tASOFy+xnwrzXBs503T+j6g/Cv/72AEfjfOfim8LEotEYXoz6e2GYo6gutH3mbY9XgOIvun7wJDqIRMBlX77j2/f3MYQoKjhBZxxuAVECQTh/M0NRL6G6thVjGODzKv2DLVSMQ153WNRmz2ZgUlZpQbPghEunco5rLfabRrhSkslrfiZsrooX3ufCyZ4qx7oDjMmf+cUXiuIS87jYaTA/OfckRBqWRdIFS3lHzUBmiJD0a9eN/D5M6PhomJk8KB9mQ5omHjxOQXaEUfNXTfHwSf1MWAuMVjK8KMMp21HZJxpPY++RL+O [...]
+x-forefront-antispam-report: CIP:255.255.255.255;CTRY:;LANG:fr;SCL:-1;SRV:;IPV:NLI;SFV:SKI;H:PR1P264MB1613.FRAP264.PROD.OUTLOOK.COM;PTR:;CAT:NONE;SFS:;DIR:INB;
+Content-Type: multipart/mixed;
+    boundary="_009_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_"
+X-MS-Exchange-CrossTenant-AuthAs: Internal
+X-MS-Exchange-CrossTenant-AuthSource: PR1P264MB1613.FRAP264.PROD.OUTLOOK.COM
+X-MS-Exchange-CrossTenant-Network-Message-Id: 9f47717f-776f-402b-c48e-08d9e0a3e4d0
+X-MS-Exchange-CrossTenant-originalarrivaltime: 26 Jan 2022 08:14:37.1875
+ (UTC)
+X-MS-Exchange-CrossTenant-fromentityheader: Hosted
+X-MS-Exchange-CrossTenant-id: 9c9d8823-ab9e-4ac4-8251-32c4a7ae50d5
+X-MS-Exchange-CrossTenant-mailboxtype: HOSTED
+X-MS-Exchange-CrossTenant-userprincipalname: krZPSCydVb1VY92MQhWJkFieP/R9DjOKt1TUUUUU6pinRMy7AaFCrhwDPcAtnL6AmU1yLWu8PhlsNyj/+TNatWZYIQ0AfiHRbbthPdUO2kk=
+X-MS-Exchange-Transport-CrossTenantHeadersStamped: MR1P264MB3924
+Return-Path: fooooooo.baaaaar@fooo.ba
+X-MS-Exchange-Organization-Network-Message-Id: 75f89c94-2a94-42a5-a1e6-08d9e0a3e55b
+X-MS-Exchange-Organization-AuthSource: SW101199.fooo.local
+X-MS-Exchange-Organization-AuthAs: Anonymous
+X-MS-Exchange-Transport-EndToEndLatency: 00:00:01.8942046
+X-MS-Exchange-Processed-By-BccFoldering: 15.01.2242.012
+MIME-Version: 1.0
+Reply-To: FOOOOOO Baaaaaar <fo...@fooo.ba>
+
+--_009_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_
+Content-Type: multipart/related;
+    boundary="_008_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_";
+    type="multipart/alternative"
+
+--_008_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_
+Content-Type: multipart/alternative;
+    boundary="_000_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_"
+
+--_000_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_
+Content-Type: text/plain; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+
+Simple body
+
+--_000_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_
+Content-Type: text/html; charset="iso-8859-1"
+Content-Transfer-Encoding: quoted-printable
+
+<html>
+<body>
+Simple <b>body</b>
+</body>
+</html>
+
+--_000_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_--
+
+--_008_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_--
+
+--_009_PR1P264MB1613BDD83232C1F0AB981DCF87209PR1P264MB1613FRAP_--