You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@james.apache.org by rc...@apache.org on 2020/12/23 03:51:19 UTC

[james-project] 05/10: MAILBOX-403 Relax address matching for text body in ElasticSearch implementation

This is an automated email from the ASF dual-hosted git repository.

rcordier pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/james-project.git

commit a3c4ba473bd88d5bbf3266093dab15ee9e4b0188
Author: Benoit Tellier <bt...@linagora.com>
AuthorDate: Tue Dec 22 12:40:26 2020 +0700

    MAILBOX-403 Relax address matching for text body in ElasticSearch implementation
    
    Address matching is less relevant for text content.
    
    We keep strong constraints for address matching on address header fields.
    
    Relaxing this constraint enable indexing only one time each text field. Thus reclaiming performance and disk space.
---
 .../james/backends/es/NodeMappingFactory.java      |  1 -
 .../elasticsearch/MailboxMappingFactory.java       | 19 +----
 .../elasticsearch/query/CriterionConverter.java    | 10 ---
 .../ElasticSearchIntegrationTest.java              | 89 ++-------------------
 .../search/LuceneMessageSearchIndexTest.java       | 20 +++++
 .../search/AbstractMessageSearchIndexTest.java     | 91 +++++++++++++++++++++-
 6 files changed, 116 insertions(+), 114 deletions(-)

diff --git a/backends-common/elasticsearch/src/main/java/org/apache/james/backends/es/NodeMappingFactory.java b/backends-common/elasticsearch/src/main/java/org/apache/james/backends/es/NodeMappingFactory.java
index 25b82e7..e7408e9 100644
--- a/backends-common/elasticsearch/src/main/java/org/apache/james/backends/es/NodeMappingFactory.java
+++ b/backends-common/elasticsearch/src/main/java/org/apache/james/backends/es/NodeMappingFactory.java
@@ -47,7 +47,6 @@ public class NodeMappingFactory {
     public static final String NESTED = "nested";
     public static final String FIELDS = "fields";
     public static final String RAW = "raw";
-    public static final String SPLIT_EMAIL = "splitEmail";
     public static final String ANALYZER = "analyzer";
     public static final String NORMALIZER = "normalizer";
     public static final String SEARCH_ANALYZER = "search_analyzer";
diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/MailboxMappingFactory.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/MailboxMappingFactory.java
index 088a288..9d74266 100644
--- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/MailboxMappingFactory.java
+++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/MailboxMappingFactory.java
@@ -34,7 +34,6 @@ import static org.apache.james.backends.es.NodeMappingFactory.RAW;
 import static org.apache.james.backends.es.NodeMappingFactory.REQUIRED;
 import static org.apache.james.backends.es.NodeMappingFactory.ROUTING;
 import static org.apache.james.backends.es.NodeMappingFactory.SEARCH_ANALYZER;
-import static org.apache.james.backends.es.NodeMappingFactory.SPLIT_EMAIL;
 import static org.apache.james.backends.es.NodeMappingFactory.TYPE;
 import static org.apache.james.mailbox.elasticsearch.json.JsonMessageConstants.ATTACHMENTS;
 import static org.apache.james.mailbox.elasticsearch.json.JsonMessageConstants.BCC;
@@ -297,26 +296,12 @@ public class MailboxMappingFactory {
 
                         .startObject(TEXT_BODY)
                             .field(TYPE, TEXT)
-                            .field(ANALYZER, KEEP_MAIL_AND_URL)
-                            .startObject(FIELDS)
-                                .startObject(SPLIT_EMAIL)
-                                    .field(TYPE, TEXT)
-                                    .field(ANALYZER, STANDARD)
-                                    .field(SEARCH_ANALYZER, KEEP_MAIL_AND_URL)
-                                .endObject()
-                            .endObject()
+                            .field(ANALYZER, STANDARD)
                         .endObject()
 
                         .startObject(HTML_BODY)
                             .field(TYPE, TEXT)
-                            .field(ANALYZER, KEEP_MAIL_AND_URL)
-                            .startObject(FIELDS)
-                                .startObject(SPLIT_EMAIL)
-                                    .field(TYPE, TEXT)
-                                    .field(ANALYZER, STANDARD)
-                                    .field(SEARCH_ANALYZER, KEEP_MAIL_AND_URL)
-                                .endObject()
-                            .endObject()
+                            .field(ANALYZER, STANDARD)
                         .endObject()
 
                         .startObject(HAS_ATTACHMENT)
diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/query/CriterionConverter.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/query/CriterionConverter.java
index 031dd47..df39e14 100644
--- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/query/CriterionConverter.java
+++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/query/CriterionConverter.java
@@ -20,7 +20,6 @@
 package org.apache.james.mailbox.elasticsearch.query;
 
 import static org.apache.james.backends.es.NodeMappingFactory.RAW;
-import static org.apache.james.backends.es.NodeMappingFactory.SPLIT_EMAIL;
 import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
@@ -150,19 +149,10 @@ public class CriterionConverter {
         case BODY:
             return boolQuery()
                     .should(matchQuery(JsonMessageConstants.TEXT_BODY, textCriterion.getOperator().getValue()))
-                    .should(matchQuery(JsonMessageConstants.TEXT_BODY + "." + SPLIT_EMAIL,
-                        textCriterion.getOperator().getValue()))
-                    .should(matchQuery(JsonMessageConstants.HTML_BODY + "." + SPLIT_EMAIL,
-                        textCriterion.getOperator().getValue()))
                     .should(matchQuery(JsonMessageConstants.HTML_BODY, textCriterion.getOperator().getValue()));
         case FULL:
             return boolQuery()
                     .should(matchQuery(JsonMessageConstants.TEXT_BODY, textCriterion.getOperator().getValue()))
-                    .should(matchQuery(JsonMessageConstants.TEXT_BODY + "." + SPLIT_EMAIL,
-                        textCriterion.getOperator().getValue()))
-                    .should(matchQuery(JsonMessageConstants.HTML_BODY + "." + SPLIT_EMAIL,
-                        textCriterion.getOperator().getValue()))
-                    .should(matchQuery(JsonMessageConstants.HTML_BODY, textCriterion.getOperator().getValue()))
                     .should(matchQuery(JsonMessageConstants.HTML_BODY, textCriterion.getOperator().getValue()))
                     .should(matchQuery(JsonMessageConstants.ATTACHMENTS + "." + JsonMessageConstants.Attachment.TEXT_CONTENT,
                         textCriterion.getOperator().getValue()));
diff --git a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
index b25c05e..9178820 100644
--- a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
+++ b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/ElasticSearchIntegrationTest.java
@@ -20,6 +20,7 @@
 package org.apache.james.mailbox.elasticsearch;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -28,6 +29,7 @@ import java.time.ZoneId;
 import org.apache.james.backends.es.DockerElasticSearchExtension;
 import org.apache.james.backends.es.ElasticSearchIndexer;
 import org.apache.james.backends.es.ReactorElasticSearchClient;
+import org.apache.james.mailbox.MailboxManager;
 import org.apache.james.mailbox.MailboxSession;
 import org.apache.james.mailbox.MailboxSessionUtil;
 import org.apache.james.mailbox.MessageManager;
@@ -36,6 +38,7 @@ import org.apache.james.mailbox.elasticsearch.json.MessageToElasticSearchJson;
 import org.apache.james.mailbox.elasticsearch.query.CriterionConverter;
 import org.apache.james.mailbox.elasticsearch.query.QueryConverter;
 import org.apache.james.mailbox.elasticsearch.search.ElasticSearchSearcher;
+import org.apache.james.mailbox.exception.MailboxException;
 import org.apache.james.mailbox.inmemory.InMemoryId;
 import org.apache.james.mailbox.inmemory.InMemoryMessageId;
 import org.apache.james.mailbox.inmemory.manager.InMemoryIntegrationResources;
@@ -56,6 +59,7 @@ import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.RegisterExtension;
 
 import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableList;
 
 import reactor.core.publisher.Flux;
 
@@ -266,7 +270,6 @@ class ElasticSearchIntegrationTest extends AbstractMessageSearchIndexTest {
 
     @Test
     void addressMatchesShouldBeExact() throws Exception {
-        // results should not include the domain part nor the local part but the full email address
         MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
         MailboxSession session = MailboxSessionUtil.create(USERNAME);
         MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
@@ -296,90 +299,10 @@ class ElasticSearchIntegrationTest extends AbstractMessageSearchIndexTest {
             .containsOnly(messageId2.getUid());
     }
 
+    @Disabled("MAILBOX-403 Relaxed the matching constraints for email addresses in text bodies to reduce ElasticSearch disk space usage")
     @Test
-    void textShouldMatchFullEmailAddress() throws Exception {
-        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
-        MailboxSession session = MailboxSessionUtil.create(USERNAME);
-        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
-
-
-        ComposedMessageId messageId1 = messageManager.appendMessage(
-            MessageManager.AppendCommand.builder().build(
-                Message.Builder
-                    .of()
-                    .setSubject("test")
-                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
-                    .build()),
-            session).getId();
-
-        elasticSearch.awaitForElasticSearch();
-
-        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("benwa@apache.org")), session)).toStream())
-            .containsOnly(messageId1.getUid());
-    }
-
-    @Test
-    void textShouldMatchEmailAddressLocalPart() throws Exception {
-        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
-        MailboxSession session = MailboxSessionUtil.create(USERNAME);
-        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
-
-        ComposedMessageId messageId1 = messageManager.appendMessage(
-            MessageManager.AppendCommand.builder().build(
-                Message.Builder
-                    .of()
-                    .setSubject("test")
-                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
-                    .build()),
-            session).getId();
-
-        elasticSearch.awaitForElasticSearch();
-
-        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("benwa")), session)).toStream())
-            .containsOnly(messageId1.getUid());
-    }
-
-    @Test
-    void textShouldMatchEmailAddressDomainPart() throws Exception {
-        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
-        MailboxSession session = MailboxSessionUtil.create(USERNAME);
-        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
-
-        ComposedMessageId messageId1 = messageManager.appendMessage(
-            MessageManager.AppendCommand.builder().build(
-                Message.Builder
-                    .of()
-                    .setSubject("test")
-                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
-                    .build()),
-            session).getId();
-
-        elasticSearch.awaitForElasticSearch();
-
-        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("apache.org")), session)).toStream())
-            .containsOnly(messageId1.getUid());
-    }
-
-    @Test
-    void textShouldNotMatchOtherAddressesOfTheSameDomain() throws Exception {
-        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
-        MailboxSession session = MailboxSessionUtil.create(USERNAME);
-        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
-
-        messageManager.appendMessage(
-            MessageManager.AppendCommand.builder().build(
-                Message.Builder
-                    .of()
-                    .setSubject("test")
-                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
-                    .build()),
-            session).getId();
-
-
-        elasticSearch.awaitForElasticSearch();
+    public void textShouldNotMatchOtherAddressesOfTheSameDomain() throws Exception {
 
-        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("alice@apache.org")), session)).toStream())
-            .isEmpty();
     }
 
     @Disabled("MAILBOX-401 '-' causes address matching to fail")
diff --git a/mailbox/lucene/src/test/java/org/apache/james/mailbox/lucene/search/LuceneMessageSearchIndexTest.java b/mailbox/lucene/src/test/java/org/apache/james/mailbox/lucene/search/LuceneMessageSearchIndexTest.java
index 8e7b4fe..e31852e 100644
--- a/mailbox/lucene/src/test/java/org/apache/james/mailbox/lucene/search/LuceneMessageSearchIndexTest.java
+++ b/mailbox/lucene/src/test/java/org/apache/james/mailbox/lucene/search/LuceneMessageSearchIndexTest.java
@@ -19,15 +19,29 @@
 
 package org.apache.james.mailbox.lucene.search;
 
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.james.mailbox.MailboxSession;
+import org.apache.james.mailbox.MailboxSessionUtil;
+import org.apache.james.mailbox.MessageManager;
 import org.apache.james.mailbox.inmemory.InMemoryId;
 import org.apache.james.mailbox.inmemory.InMemoryMessageId;
 import org.apache.james.mailbox.inmemory.manager.InMemoryIntegrationResources;
+import org.apache.james.mailbox.model.ComposedMessageId;
+import org.apache.james.mailbox.model.MailboxPath;
+import org.apache.james.mailbox.model.SearchQuery;
 import org.apache.james.mailbox.store.search.AbstractMessageSearchIndexTest;
+import org.apache.james.mime4j.dom.Message;
 import org.apache.lucene.store.RAMDirectory;
 import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
 
 import com.github.fge.lambdas.Throwing;
 
+import reactor.core.publisher.Flux;
+
 class LuceneMessageSearchIndexTest extends AbstractMessageSearchIndexTest {
 
     @Override
@@ -120,6 +134,12 @@ class LuceneMessageSearchIndexTest extends AbstractMessageSearchIndexTest {
     public void mailsContainsShouldIncludeMailHavingAttachmentsMatchingTheRequest() {
     }
 
+    @Disabled("Domain part search is not supported by Lucene")
+    @Test
+    @Override
+    public void textShouldMatchEmailAddressDomainPart() throws Exception {
+    }
+
     @Disabled("JAMES-1799: ignoring failing test after generalizing ElasticSearch test suite to other mailbox search backends")
     @Override
     public void modSeqGreaterThanShouldReturnUidsOfMessageHavingAGreaterModSeq() {
diff --git a/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/AbstractMessageSearchIndexTest.java b/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/AbstractMessageSearchIndexTest.java
index 9492958..5cc5dba 100644
--- a/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/AbstractMessageSearchIndexTest.java
+++ b/mailbox/store/src/test/java/org/apache/james/mailbox/store/search/AbstractMessageSearchIndexTest.java
@@ -37,6 +37,7 @@ import javax.mail.Flags;
 import org.apache.james.core.Username;
 import org.apache.james.mailbox.MailboxManager;
 import org.apache.james.mailbox.MailboxSession;
+import org.apache.james.mailbox.MailboxSessionUtil;
 import org.apache.james.mailbox.MessageIdManager;
 import org.apache.james.mailbox.MessageManager;
 import org.apache.james.mailbox.exception.MailboxException;
@@ -59,10 +60,12 @@ import org.apache.james.mime4j.dom.Multipart;
 import org.apache.james.mime4j.message.BodyPartBuilder;
 import org.apache.james.mime4j.message.MultipartBuilder;
 import org.apache.james.mime4j.message.SingleBodyBuilder;
+import org.apache.james.mime4j.stream.RawField;
 import org.apache.james.util.ClassLoaderUtils;
 import org.awaitility.Awaitility;
 import org.awaitility.Duration;
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import com.google.common.collect.ImmutableList;
@@ -466,15 +469,97 @@ public abstract class AbstractMessageSearchIndexTest {
             SearchQuery.address(AddressType.To, emailToSearch),
             SearchQuery.address(AddressType.Cc, emailToSearch),
             SearchQuery.address(AddressType.Bcc, emailToSearch),
-            SearchQuery.headerContains("Subject", emailToSearch),
-            SearchQuery.attachmentContains(emailToSearch),
-            SearchQuery.bodyContains(emailToSearch))));
+            SearchQuery.headerContains("Subject", emailToSearch))));
 
         assertThat(messageSearchIndex.search(session, mailbox, searchQuery).toStream())
             .containsOnly(m11.getUid());
     }
 
     @Test
+    void textShouldMatchFullEmailAddress() throws Exception {
+        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
+        MailboxSession session = MailboxSessionUtil.create(USERNAME);
+        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
+
+        ComposedMessageId messageId1 = messageManager.appendMessage(
+            MessageManager.AppendCommand.builder().build(
+                Message.Builder
+                    .of()
+                    .setSubject("test")
+                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
+                    .build()),
+            session).getId();
+
+        await();
+
+        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("benwa@apache.org")), session)).toStream())
+            .containsOnly(messageId1.getUid());
+    }
+
+    @Test
+    void textShouldMatchEmailAddressLocalPart() throws Exception {
+        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
+        MailboxSession session = MailboxSessionUtil.create(USERNAME);
+        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
+
+        ComposedMessageId messageId1 = messageManager.appendMessage(
+            MessageManager.AppendCommand.builder().build(
+                Message.Builder
+                    .of()
+                    .setSubject("test")
+                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
+                    .build()),
+            session).getId();
+
+        await();
+
+        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("benwa")), session)).toStream())
+            .containsOnly(messageId1.getUid());
+    }
+
+    @Test
+    public void textShouldMatchEmailAddressDomainPart() throws Exception {
+        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
+        MailboxSession session = MailboxSessionUtil.create(USERNAME);
+        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
+
+        ComposedMessageId messageId1 = messageManager.appendMessage(
+            MessageManager.AppendCommand.builder().build(
+                Message.Builder
+                    .of()
+                    .setSubject("test")
+                    .setBody("benwa@mydomain.org email address do not exist", StandardCharsets.UTF_8)
+                    .build()),
+            session).getId();
+
+        await();
+
+        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("mydomain.org")), session)).toStream())
+            .containsOnly(messageId1.getUid());
+    }
+
+    @Test
+    public void textShouldNotMatchOtherAddressesOfTheSameDomain() throws Exception {
+        MailboxPath mailboxPath = MailboxPath.forUser(USERNAME, INBOX);
+        MailboxSession session = MailboxSessionUtil.create(USERNAME);
+        MessageManager messageManager = storeMailboxManager.getMailbox(mailboxPath, session);
+
+        messageManager.appendMessage(
+            MessageManager.AppendCommand.builder().build(
+                Message.Builder
+                    .of()
+                    .setSubject("test")
+                    .setBody("benwa@apache.org email address do not exist", StandardCharsets.UTF_8)
+                    .build()),
+            session).getId();
+        
+        await();
+
+        assertThat(Flux.from(messageManager.search(SearchQuery.of(SearchQuery.bodyContains("alice@apache.org")), session)).toStream())
+            .isEmpty();
+    }
+
+    @Test
     void hasNoAttachmenShouldOnlyReturnMessageThatHasNoAttachmentWhichAreNotInline() throws MailboxException {
         SearchQuery searchQuery = SearchQuery.of(SearchQuery.hasNoAttachment());
 


---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@james.apache.org
For additional commands, e-mail: notifications-help@james.apache.org