You are viewing a plain text version of this content. The canonical link for it is here.
Posted to server-dev@james.apache.org by rc...@apache.org on 2020/02/06 03:54:34 UTC
[james-project] 01/16: MAILBOX-395 ElasticSearch indexing should not fail upon invalid charset
This is an automated email from the ASF dual-hosted git repository.
rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git
commit ef87f69290a724e9f6065173f91ad051e6268d02
Author: Benoit Tellier <bt...@linagora.com>
AuthorDate: Tue Feb 4 16:27:35 2020 +0700
MAILBOX-395 ElasticSearch indexing should not fail upon invalid charset
---
.../mailbox/elasticsearch/json/MimePartParser.java | 17 +++++++---
.../json/MessageToElasticSearchJsonTest.java | 22 ++++++++++++
.../src/test/resources/eml/invalidCharset.eml | 10 ++++++
.../src/test/resources/eml/invalidCharset.json | 39 ++++++++++++++++++++++
4 files changed, 84 insertions(+), 4 deletions(-)
diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
index 7cd6e3a..a87ae44 100644
--- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
+++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePartParser.java
@@ -33,10 +33,13 @@ import org.apache.james.mime4j.message.MaximalBodyDescriptor;
import org.apache.james.mime4j.stream.EntityState;
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.james.mime4j.stream.MimeTokenStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
public class MimePartParser {
+ private static final Logger LOGGER = LoggerFactory.getLogger(MimePartParser.class);
private final Message message;
private final TextExtractor textExtractor;
@@ -120,10 +123,16 @@ public class MimePartParser {
.addSubType(descriptor.getSubType())
.addContentDisposition(descriptor.getContentDispositionType())
.addFileName(descriptor.getContentDispositionFilename());
-
- Optional.ofNullable(descriptor.getCharset())
- .map(Charset::forName)
- .ifPresent(currentlyBuildMimePart::charset);
+ extractCharset(descriptor);
}
+ private void extractCharset(MaximalBodyDescriptor descriptor) {
+ try {
+ Optional.ofNullable(descriptor.getCharset())
+ .map(Charset::forName)
+ .ifPresent(currentlyBuildMimePart::charset);
+ } catch (Exception e) {
+ LOGGER.info("Failed parsing charset", e);
+ }
+ }
}
diff --git a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
index 8586c1a..7c7f5ba 100644
--- a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
+++ b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MessageToElasticSearchJsonTest.java
@@ -110,6 +110,28 @@ class MessageToElasticSearchJsonTest {
}
@Test
+ void invalidCharsetShouldBeWellConvertedToJson() throws IOException {
+ MessageToElasticSearchJson messageToElasticSearchJson = new MessageToElasticSearchJson(
+ new DefaultTextExtractor(),
+ ZoneId.of("Europe/Paris"), IndexAttachments.YES);
+ MailboxMessage spamMail = new SimpleMailboxMessage(MESSAGE_ID,
+ date,
+ SIZE,
+ BODY_START_OCTET,
+ ClassLoaderUtils.getSystemResourceAsSharedStream("eml/invalidCharset.eml"),
+ new Flags(),
+ propertyBuilder,
+ MAILBOX_ID);
+ spamMail.setUid(UID);
+ spamMail.setModSeq(MOD_SEQ);
+
+ String actual = messageToElasticSearchJson.convertToJson(spamMail, ImmutableList.of(USERNAME));
+ assertThatJson(actual)
+ .when(IGNORING_ARRAY_ORDER)
+ .isEqualTo(ClassLoaderUtils.getSystemResourceAsString("eml/invalidCharset.json"));
+ }
+
+ @Test
void htmlEmailShouldBeWellConvertedToJson() throws IOException {
MessageToElasticSearchJson messageToElasticSearchJson = new MessageToElasticSearchJson(
new DefaultTextExtractor(),
diff --git a/mailbox/store/src/test/resources/eml/invalidCharset.eml b/mailbox/store/src/test/resources/eml/invalidCharset.eml
new file mode 100644
index 0000000..62bc3fb
--- /dev/null
+++ b/mailbox/store/src/test/resources/eml/invalidCharset.eml
@@ -0,0 +1,10 @@
+To: Antoine DUPRAT <xyz@linagora.com>
+From: Antoine DUPRAT <xyz@linagora.com>
+Subject: Inline attachment
+Message-ID: <26...@linagora.com>
+Date: Tue, 5 Jul 2016 11:47:46 +0200
+MIME-Version: 1.0
+Content-Type: text/plain; charset=%invalid; format=flowed
+Content-Transfer-Encoding: 7bit
+
+This is an inline attachment: Cheers!
\ No newline at end of file
diff --git a/mailbox/store/src/test/resources/eml/invalidCharset.json b/mailbox/store/src/test/resources/eml/invalidCharset.json
new file mode 100644
index 0000000..eed4184
--- /dev/null
+++ b/mailbox/store/src/test/resources/eml/invalidCharset.json
@@ -0,0 +1,39 @@
+{
+ "attachments":[],
+ "bcc":[],
+ "htmlBody":null,
+ "textBody":"This is an inline attachment: Cheers!",
+ "cc":[],
+ "date":"2015-06-07T00:00:00+0200",
+ "from":[{"name":"Antoine DUPRAT","address":"xyz@linagora.com"}],
+ "hasAttachment":false,
+ "headers":[
+ {"name":"to","value":"Antoine DUPRAT <xyz@linagora.com>"},
+ {"name":"from","value":"Antoine DUPRAT <xyz@linagora.com>"},
+ {"name":"subject","value":"Inline attachment"},
+ {"name":"message-id","value":"<26...@linagora.com>"},
+ {"name":"date","value":"Tue, 5 Jul 2016 11:47:46 +0200"},
+ {"name":"mime-version","value":"1.0"},
+ {"name":"content-type","value":"text/plain; charset=%invalid; format=flowed"},
+ {"name":"content-transfer-encoding","value":"7bit"}
+ ],
+ "mailboxId":"18",
+ "mediaType":"plain",
+ "messageId":"184",
+ "modSeq":42,
+ "sentDate":"2016-07-05T11:47:46+0200",
+ "size":25,
+ "subject":["Inline attachment"],
+ "subtype":"text",
+ "text":"Antoine DUPRAT xyz@linagora.com Antoine DUPRAT xyz@linagora.com Inline attachment This is an inline attachment: Cheers!",
+ "to":[{"name":"Antoine DUPRAT","address":"xyz@linagora.com"}],
+ "uid":25,
+ "userFlags":[],
+ "mimeMessageID":"<26...@linagora.com>",
+ "isAnswered":false,
+ "isDeleted":false,
+ "isDraft":false,
+ "isFlagged":false,
+ "isRecent":false,
+ "isUnread":true
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org