You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by bt...@apache.org on 2015/06/29 10:39:30 UTC

svn commit: r1688139 - in /james/mailbox/trunk/elasticsearch/src: main/java/org/apache/james/mailbox/elasticsearch/json/ test/java/org/apache/james/mailbox/elasticsearch/json/ test/resources/documents/

Author: btellier
Date: Mon Jun 29 08:39:30 2015
New Revision: 1688139

URL: http://svn.apache.org/r1688139
Log:
MAILBOX-234 Dates extraction from headers

Modified:
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
    james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json

Modified: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java (original)
+++ james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollection.java Mon Jun 29 08:39:30 2015
@@ -19,6 +19,7 @@
 
 package org.apache.james.mailbox.elasticsearch.json;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ArrayListMultimap;
 import com.google.common.collect.ImmutableMultimap;
@@ -39,6 +40,8 @@ import java.time.format.DateTimeFormatte
 import java.util.HashSet;
 import java.util.Optional;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -46,6 +49,14 @@ public class HeaderCollection {
 
     public static class Builder {
 
+        // Some sent e-mails have a date of this form: Wed,  3 Jun 2015 09:05:46 +0000 (UTC)
+        // The Java 8 Time library's RFC_1123_DATE_TIME corresponds only to: Wed,  3 Jun 2015 09:05:46 +0000
+        // This regexp is here to match (in order to remove) the possibly invalid end of a header date
+        // Example of matching patterns :
+        //  (UTC)
+        //  (CEST)
+        private static final Pattern DATE_SANITIZING_PATTERN = Pattern.compile(" *\\(.*\\) *");
+
         private final Set<EMailer> toAddressSet;
         private final Set<EMailer> fromAddressSet;
         private final Set<EMailer> ccAddressSet;
@@ -135,13 +146,26 @@ public class HeaderCollection {
 
         private Optional<ZonedDateTime> toISODate(String value) {
             try {
-                return Optional.of(ZonedDateTime.parse(value, DateTimeFormatter.RFC_1123_DATE_TIME));
+                return Optional.of(ZonedDateTime.parse(
+                    sanitizeDateStringHeaderValue(value),
+                    DateTimeFormatter.RFC_1123_DATE_TIME));
             } catch (Exception e) {
                 LOGGER.info("Can not parse receive date " + value);
                 return Optional.empty();
             }
         }
 
+        @VisibleForTesting String sanitizeDateStringHeaderValue(String value) {
+            // Some sent e-mails have a date of this form: Wed,  3 Jun 2015 09:05:46 +0000 (UTC)
+            // The Java 8 Time library's RFC_1123_DATE_TIME corresponds only to: Wed,  3 Jun 2015 09:05:46 +0000
+            // This method converts the first form into something parsable by the RFC_1123_DATE_TIME DateTimeFormatter
+            Matcher sanitizerMatcher = DATE_SANITIZING_PATTERN.matcher(value);
+            if (sanitizerMatcher.find()) {
+                return value.substring(0 , sanitizerMatcher.start());
+            }
+            return value;
+        }
+
     }
 
     public static final String TO = "to";

Modified: james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java (original)
+++ james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/HeaderCollectionTest.java Mon Jun 29 08:39:30 2015
@@ -130,6 +130,13 @@ public class HeaderCollectionTest {
     }
 
     @Test
+    public void nonStandardDatesShouldBeRetreived() {
+        HeaderCollection headerCollection = HeaderCollection.builder().add(new FieldImpl("Date", "Thu, 4 Jun 2015 06:08:41 +0200 (UTC)")).build();
+        assertThat(DATE_TIME_FORMATTER.format(headerCollection.getSentDate().get()))
+            .isEqualTo("2015/06/04 06:08:41");
+    }
+
+    @Test
     public void dateShouldBeAbsentOnInvalidHeader() {
         HeaderCollection headerCollection = HeaderCollection.builder().add(new FieldImpl("Date", "Not a date")).build();
         assertThat(headerCollection.getSentDate().isPresent())
@@ -148,4 +155,32 @@ public class HeaderCollectionTest {
         HeaderCollection.builder().add(null).build();
     }
 
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveCESTPart() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200 (CEST)"))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldRemoveUTCPart() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200  (UTC)  "))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldNotChangeAcceptableString() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue("Thu, 18 Jun 2015 04:09:35 +0200"))
+            .isEqualTo("Thu, 18 Jun 2015 04:09:35 +0200");
+    }
+
+    @Test
+    public void sanitizeDateStringHeaderValueShouldNotChangeEmptyString() {
+        assertThat(HeaderCollection.builder()
+            .sanitizeDateStringHeaderValue(""))
+            .isEqualTo("");
+    }
+
 }

Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml (original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail1.eml Mon Jun 29 08:39:30 2015
@@ -32,7 +32,7 @@ Delivered-To: mailing list server-dev@ja
 Received: (qmail 37236 invoked by uid 99); 4 Jun 2015 09:23:38 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 Jun 2015 09:23:38 +0000
-Date: Thu, 4 Jun 2015 09:23:37 +0000
+Date: Thu, 4 Jun 2015 09:23:37 +0000 (UTC)
 From: "Tellier Benoit (JIRA)" <ji...@apache.org>
 To: server-dev@james.apache.org
 Message-ID: <JI...@Atlassian.JIRA>

Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml (original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail2.eml Mon Jun 29 08:39:30 2015
@@ -32,7 +32,7 @@ Delivered-To: mailing list server-dev@ja
 Received: (qmail 43130 invoked by uid 99); 4 Jun 2015 09:27:38 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 04 Jun 2015 09:27:38 +0000
-Date: Thu, 4 Jun 2015 09:27:37 +0000
+Date: Thu, 4 Jun 2015 09:27:37 +0000 (UTC)
 From: "Tellier Benoit (JIRA)" <ji...@apache.org>
 To: server-dev@james.apache.org
 Message-ID: <JI...@Atlassian.JIRA>

Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml (original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail3.eml Mon Jun 29 08:39:30 2015
@@ -34,7 +34,7 @@ Delivered-To: mailing list server-dev@ja
 Received: (qmail 1132 invoked by uid 99); 2 Jun 2015 08:16:20 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 02 Jun 2015 08:16:20 +0000
-Date: Tue, 2 Jun 2015 08:16:19 +0000
+Date: Tue, 2 Jun 2015 08:16:19 +0000 (UTC)
 From: "Eric Charles (JIRA)" <ji...@apache.org>
 To: server-dev@james.apache.org
 Message-ID: <JI...@Atlassian.JIRA>

Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml (original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/mail4.eml Mon Jun 29 08:39:30 2015
@@ -34,7 +34,7 @@ Delivered-To: mailing list mailet-api@ja
 Received: (qmail 81730 invoked by uid 99); 15 May 2015 06:36:00 -0000
 Received: from arcas.apache.org (HELO arcas.apache.org) (140.211.11.28)
     by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 15 May 2015 06:36:00 +0000
-Date: Fri, 15 May 2015 06:35:59 +0000
+Date: Fri, 15 May 2015 06:35:59 +0000 (UTC)
 From: "Eric Charles (JIRA)" <ma...@james.apache.org>
 To: mailet-api@james.apache.org
 Message-ID: <JI...@Atlassian.JIRA>

Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml (original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.eml Mon Jun 29 08:39:30 2015
@@ -31,7 +31,7 @@ Content-Type: multipart/mixed; boundary=
 Content-Transfer-Encoding: 7bit
 MIME-Version: 1.0
 From: "Content-filter at spam.minet.net" <po...@minet.net>
-Date: Wed, 3 Jun 2015 09:05:46 +0000
+Date: Wed, 3 Jun 2015 09:05:46 +0000 (UTC)
 To: <ro...@listes.minet.net>
 Message-ID: <VA...@spam.minet.net>
 Subject: [root] UNCHECKED contents in mail FROM <qu...@riseup.net>

Modified: james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json?rev=1688139&r1=1688138&r2=1688139&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json (original)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/spamMail.json Mon Jun 29 08:39:30 2015
@@ -12,7 +12,7 @@
 	  "1.0"
 	],
 	"date": [
-	  "Wed, 3 Jun 2015 09:05:46 +0000"
+	  "Wed, 3 Jun 2015 09:05:46 +0000 (UTC)"
 	],
 	"x-beenthere": [
 	  "root@listes.minet.net"
@@ -103,7 +103,7 @@
   "subject": [
 	"[root] UNCHECKED contents in mail FROM <qu...@riseup.net>"
   ],
-  "sentDate": "2015-06-07T00:00:00+0200",
+  "sentDate": "2015-06-03T09:05:46+0000",
   "properties": [
 	{
 	  "namespace": "http://james.apache.org/rfc2045/Content-Type",



---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org