You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by rc...@apache.org on 2020/02/06 03:54:34 UTC

[james-project] 01/16: MAILBOX-395 ElasticSearch indexing should not fail upon invalid charset

This is an automated email from the ASF dual-hosted git repository.

rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git

commit ef87f69290a724e9f6065173f91ad051e6268d02
Author: Benoit Tellier <bt...@linagora.com>
AuthorDate: Tue Feb 4 16:27:35 2020 +0700

    MAILBOX-395 ElasticSearch indexing should not fail upon invalid charset
---
 .../mailbox/elasticsearch/json/MimePartParser.java | 17 +++++++---
 .../json/MessageToElasticSearchJsonTest.java       | 22 ++++++++++++
 .../src/test/resources/eml/invalidCharset.eml      | 10 ++++++
 .../src/test/resources/eml/invalidCharset.json     | 39 ++++++++++++++++++++++
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
index 7cd6e3a..a87ae44 100644
--- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
+++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
@@ -33,10 +33,13 @@ import org.apache.james.mime4j.message.MaximalBodyDescriptor;
 import org.apache.james.mime4j.stream.EntityState;
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.james.mime4j.stream.MimeTokenStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.google.common.base.Preconditions;
 
 public class MimePartParser {
+    private static final Logger LOGGER = LoggerFactory.getLogger(MimePartParser.class);
 
     private final Message message;
     private final TextExtractor textExtractor;
@@ -120,10 +123,16 @@ public class MimePartParser {
             .addSubType(descriptor.getSubType())
             .addContentDisposition(descriptor.getContentDispositionType())
             .addFileName(descriptor.getContentDispositionFilename());
-
-        Optional.ofNullable(descriptor.getCharset())
-            .map(Charset::forName)
-            .ifPresent(currentlyBuildMimePart::charset);
+        extractCharset(descriptor);
     }
 
+    private void extractCharset(MaximalBodyDescriptor descriptor) {
+        try {
+            Optional.ofNullable(descriptor.getCharset())
+                .map(Charset::forName)
+                .ifPresent(currentlyBuildMimePart::charset);
+        } catch (Exception e) {
+            LOGGER.info("Failed parsing charset", e);
+        }
+    }
 }
diff --git a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
index 8586c1a..7c7f5ba 100644
--- a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
+++ b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
@@ -110,6 +110,28 @@ class MessageToElasticSearchJsonTest {
     }
 
     @Test
+    void invalidCharsetShouldBeWellConvertedToJson() throws IOException {
+        MessageToElasticSearchJson messageToElasticSearchJson = new MessageToElasticSearchJson(
+            new DefaultTextExtractor(),
+            ZoneId.of("Europe/Paris"), IndexAttachments.YES);
+        MailboxMessage spamMail = new SimpleMailboxMessage(MESSAGE_ID,
+                date,
+                SIZE,
+                BODY_START_OCTET,
+                ClassLoaderUtils.getSystemResourceAsSharedStream("eml/invalidCharset.eml"),
+                new Flags(),
+                propertyBuilder,
+                MAILBOX_ID);
+        spamMail.setUid(UID);
+        spamMail.setModSeq(MOD_SEQ);
+
+        String actual = messageToElasticSearchJson.convertToJson(spamMail, ImmutableList.of(USERNAME));
+        assertThatJson(actual)
+            .when(IGNORING_ARRAY_ORDER)
+            .isEqualTo(ClassLoaderUtils.getSystemResourceAsString("eml/invalidCharset.json"));
+    }
+
+    @Test
     void htmlEmailShouldBeWellConvertedToJson() throws IOException {
         MessageToElasticSearchJson messageToElasticSearchJson = new MessageToElasticSearchJson(
             new DefaultTextExtractor(),
diff --git a/mailbox/store/src/test/resources/eml/invalidCharset.eml b/mailbox/store/src/test/resources/eml/invalidCharset.eml
new file mode 100644
index 0000000..62bc3fb
--- /dev/null
+++ b/mailbox/store/src/test/resources/eml/invalidCharset.eml
@@ -0,0 +1,10 @@
+To: Antoine DUPRAT <xyz@linagora.com>
+From: Antoine DUPRAT <xyz@linagora.com>
+Subject: Inline attachment
+Message-ID: <26...@linagora.com>
+Date: Tue, 5 Jul 2016 11:47:46 +0200
+MIME-Version: 1.0
+Content-Type: text/plain; charset=%invalid; format=flowed
+Content-Transfer-Encoding: 7bit
+
+This is an inline attachment: Cheers!
\ No newline at end of file
diff --git a/mailbox/store/src/test/resources/eml/invalidCharset.json b/mailbox/store/src/test/resources/eml/invalidCharset.json
new file mode 100644
index 0000000..eed4184
--- /dev/null
+++ b/mailbox/store/src/test/resources/eml/invalidCharset.json
@@ -0,0 +1,39 @@
+{
+  "attachments":[],
+  "bcc":[],
+  "htmlBody":null,
+  "textBody":"This is an inline attachment: Cheers!",
+  "cc":[],
+  "date":"2015-06-07T00:00:00+0200",
+  "from":[{"name":"Antoine DUPRAT","address":"xyz@linagora.com"}],
+  "hasAttachment":false,
+  "headers":[
+    {"name":"to","value":"Antoine DUPRAT <xyz@linagora.com>"},
+    {"name":"from","value":"Antoine DUPRAT <xyz@linagora.com>"},
+    {"name":"subject","value":"Inline attachment"},
+    {"name":"message-id","value":"<26...@linagora.com>"},
+    {"name":"date","value":"Tue, 5 Jul 2016 11:47:46 +0200"},
+    {"name":"mime-version","value":"1.0"},
+    {"name":"content-type","value":"text/plain; charset=%invalid; format=flowed"},
+    {"name":"content-transfer-encoding","value":"7bit"}
+  ],
+  "mailboxId":"18",
+  "mediaType":"plain",
+  "messageId":"184",
+  "modSeq":42,
+  "sentDate":"2016-07-05T11:47:46+0200",
+  "size":25,
+  "subject":["Inline attachment"],
+  "subtype":"text",
+  "text":"Antoine DUPRAT xyz@linagora.com Antoine DUPRAT xyz@linagora.com Inline attachment This is an inline attachment: Cheers!",
+  "to":[{"name":"Antoine DUPRAT","address":"xyz@linagora.com"}],
+  "uid":25,
+  "userFlags":[],
+  "mimeMessageID":"<26...@linagora.com>",
+  "isAnswered":false,
+  "isDeleted":false,
+  "isDraft":false,
+  "isFlagged":false,
+  "isRecent":false,
+  "isUnread":true
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org