You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by fo...@apache.org on 2022/02/11 09:12:30 UTC

[jackrabbit-oak] branch trunk updated: OAK-9691 Improve fulltext query syntax support for ElasticSearch (#489)

This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 3fffce6  OAK-9691 Improve fulltext query syntax support for ElasticSearch (#489)
3fffce6 is described below

commit 3fffce60331a922084837e9924c139d7eb5496af
Author: Jun Zhang <zj...@adobe.com>
AuthorDate: Fri Feb 11 01:12:22 2022 -0800

    OAK-9691 Improve fulltext query syntax support for ElasticSearch (#489)
---
 .../lucene/dynamicBoost/DynamicBoostTest.java      | 29 ++------
 .../index/elastic/query/ElasticRequestHandler.java | 86 ++++++++--------------
 .../index/search/spi/query/FulltextIndex.java      |  2 +-
 .../oak/plugins/index/IndexQueryCommonTest.java    | 41 +++++++++++
 4 files changed, 77 insertions(+), 81 deletions(-)

diff --git a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/DynamicBoostTest.java b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/DynamicBoostTest.java
index 77f7c0d..e0a8f56 100644
--- a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/DynamicBoostTest.java
+++ b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/dynamicBoost/DynamicBoostTest.java
@@ -52,7 +52,6 @@ import org.apache.jackrabbit.oak.spi.commit.Observer;
 import org.apache.jackrabbit.oak.spi.mount.Mounts;
 import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;
 import org.apache.jackrabbit.oak.spi.security.OpenSecurityProvider;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import ch.qos.logback.classic.Level;
@@ -134,27 +133,6 @@ public class DynamicBoostTest extends AbstractQueryTest {
         assertEquals("[]", log);
     }
 
-    // verifying fulltext query syntax ability, put here for comparison with dynamic boost and dynamic boost lite
-    @Test
-    public void testQueryFullTextWithTitle() throws Exception {
-        createAssetsIndexAndProperties(false, false);
-        prepareTestAssets();
-
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'titleone')]", XPATH, Arrays.asList("/test/asset1"));
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long')]", XPATH, Arrays.asList("/test/asset1", "/test/asset2"));
-        // case insensitive
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'LONG')]", XPATH, Arrays.asList("/test/asset1", "/test/asset2"));
-        // wildcard works
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'title*')]", XPATH,
-                Arrays.asList("/test/asset1", "/test/asset2", "/test/asset3"));
-        // space is treated as AND
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long titleone')]", XPATH, Arrays.asList("/test/asset1"));
-        // OR works
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long OR titleone')]", XPATH, Arrays.asList("/test/asset1", "/test/asset2"));
-        // minus works
-        assertQuery("//element(*, dam:Asset)[jcr:contains(., 'long -titleone')]", XPATH, Arrays.asList("/test/asset2"));
-    }
-
     @Test
     public void testQueryDynamicBoostBasic() throws Exception {
         createAssetsIndexAndProperties(false, false);
@@ -224,7 +202,12 @@ public class DynamicBoostTest extends AbstractQueryTest {
         createAssetsIndexAndProperties(true, true);
         prepareTestAssets();
 
-        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'blu*')", SQL2, Arrays.asList("/test/asset3"));
+        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'blu?')", SQL2, Arrays.asList("/test/asset3"));
+        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'bl?e')", SQL2, Arrays.asList("/test/asset3"));
+        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, '?lue')", SQL2, Arrays.asList("/test/asset3"));
+        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'coff*')", SQL2, Arrays.asList("/test/asset2"));
+        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, 'co*ee')", SQL2, Arrays.asList("/test/asset2"));
+        assertQuery("select [jcr:path] from [dam:Asset] where contains(*, '*ffee')", SQL2, Arrays.asList("/test/asset2"));
     }
 
     @Test
diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
index 0960d52..93bb4f7 100644
--- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
+++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
@@ -16,12 +16,29 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic.query;
 
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.BiConsumer;
+import java.util.function.BiPredicate;
+import java.util.function.Consumer;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import javax.jcr.PropertyType;
+
 import org.apache.http.entity.ContentType;
 import org.apache.http.nio.entity.NByteArrayEntity;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
 import org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets.ElasticFacetProvider;
@@ -34,7 +51,6 @@ import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.PlanResult;
-import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
 import org.apache.jackrabbit.oak.spi.query.Filter;
 import org.apache.jackrabbit.oak.spi.query.QueryConstants;
 import org.apache.jackrabbit.oak.spi.query.QueryIndex;
@@ -48,13 +64,10 @@ import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.join.ScoreMode;
-import org.elasticsearch.common.Strings;
-import org.elasticsearch.xcontent.XContentBuilder;
-import org.elasticsearch.xcontent.json.JsonXContent;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.client.Request;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.xcontent.XContentHelper;
-import org.elasticsearch.xcontent.XContentType;
 import org.elasticsearch.index.query.BoolQueryBuilder;
 import org.elasticsearch.index.query.InnerHitBuilder;
 import org.elasticsearch.index.query.MatchBoolPrefixQueryBuilder;
@@ -66,6 +79,7 @@ import org.elasticsearch.index.query.NestedQueryBuilder;
 import org.elasticsearch.index.query.Operator;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.index.query.QueryStringQueryBuilder;
 import org.elasticsearch.index.query.WrapperQueryBuilder;
 import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
 import org.elasticsearch.search.aggregations.AggregationBuilders;
@@ -77,26 +91,14 @@ import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.search.suggest.SuggestBuilders;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder;
 import org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentType;
+import org.elasticsearch.xcontent.json.JsonXContent;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.jcr.PropertyType;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicReference;
-import java.util.function.BiConsumer;
-import java.util.function.BiPredicate;
-import java.util.function.Consumer;
-import java.util.stream.Stream;
-import java.util.stream.StreamSupport;
-
 import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
 import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
@@ -113,19 +115,17 @@ import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuil
 import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_PATH;
 import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_SCORE;
 import static org.apache.jackrabbit.util.ISO8601.parse;
-import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;
 import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
 import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
 import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.moreLikeThisQuery;
-import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
 import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
-import static org.elasticsearch.index.query.QueryBuilders.simpleQueryStringQuery;
 import static org.elasticsearch.index.query.QueryBuilders.termQuery;
 import static org.elasticsearch.index.query.QueryBuilders.wrapperQuery;
+import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;
 
 /**
  * Class to map query plans into Elastic request objects.
@@ -779,41 +779,13 @@ public class ElasticRequestHandler {
     }
 
     private static QueryBuilder fullTextQuery(String text, String fieldName, PlanResult pr) {
-        // default match query are executed in OR, we need to use AND instead to avoid that
-        // every document having at least one term in the `text` will match. If there are multiple
-        // contains clause they will go to different match queries and will be executed in OR
-        if (FieldNames.FULLTEXT.equals(fieldName) && !pr.indexingRule.getNodeScopeAnalyzedProps().isEmpty()) {
-            MultiMatchQueryBuilder multiMatchQuery = multiMatchQuery(text)
-                    .operator(Operator.AND)
-                    .type(MultiMatchQueryBuilder.Type.CROSS_FIELDS);
-            pr.indexingRule.getNodeScopeAnalyzedProps().forEach(pd -> multiMatchQuery.field(pd.name, pd.boost));
-            // Add the query for actual fulltext field also. That query would not be boosted
-            // and could contain other parts like renditions, node name, etc
-            return multiMatchQuery.field(fieldName);
-        } else {
-            // https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html
-            // simpleQueryStringQuery does not support leading wildcards whereas it's supported by default in queryStringQuery
-            // Not using queryStringQuery by default , since some functional cases break.
-            // simpleQueryStringQuery is less Strict, for instance searches for terms starting with / work, whereas
-            // with queryStringQuery, they throw an Exception (which ultimately results in an empty result set in oak),
-            // so using simpleQueryStringQuery by default would break certain functional cases.
-            // So only support this in case any term in the text String actually starts with *
-            // For example *hello or Hello *world
-            String[] textTerms = text.split(" ");
-            boolean allowLeadingWildCard = false;
-            for(String textTerm : textTerms) {
-                if (textTerm.startsWith("*")) {
-                    allowLeadingWildCard = true;
-                    break;
-                }
-            }
-
-            if (allowLeadingWildCard) {
-                return queryStringQuery(text).field(fieldName).defaultOperator(Operator.AND);
-            } else {
-                return simpleQueryStringQuery(text).analyzeWildcard(true).field(fieldName).defaultOperator(Operator.AND);
-            }
+        LOG.debug("fullTextQuery for text: '{}', fieldName: '{}'", text, fieldName);
+        QueryStringQueryBuilder queryStringQueryBuilder = queryStringQuery(FulltextIndex.rewriteQueryText(text)).defaultOperator(
+                Operator.AND).type(MultiMatchQueryBuilder.Type.CROSS_FIELDS);
+        if (FieldNames.FULLTEXT.equals(fieldName)) {
+            pr.indexingRule.getNodeScopeAnalyzedProps().forEach(pd -> queryStringQueryBuilder.field(pd.name, pd.boost));
         }
+        return queryStringQueryBuilder.field(fieldName);
     }
 
     private QueryBuilder createQuery(String propertyName, Filter.PropertyRestriction pr,
diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java
index 277715e..5272338 100644
--- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java
+++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java
@@ -288,7 +288,7 @@ public abstract class FulltextIndex implements AdvancedQueryIndex, QueryIndex, N
     /**
      * Following logic is taken from org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser#parse(java.lang.String)
      */
-    protected static String rewriteQueryText(String textsearch) {
+    public static String rewriteQueryText(String textsearch) {
         // replace escaped ' with just '
         StringBuilder rewritten = new StringBuilder();
         // most query parsers recognize 'AND' and 'NOT' as
diff --git a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
index 614da4b..670ed15 100644
--- a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
+++ b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/IndexQueryCommonTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.jackrabbit.oak.plugins.index;
 
+import org.apache.jackrabbit.oak.api.CommitFailedException;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.api.Type;
@@ -527,6 +528,46 @@ public abstract class IndexQueryCommonTest extends AbstractQueryTest {
     }
 
     @Test
+    public void fullTextQueryGeneric() throws Exception {
+        Tree test = root.getTree("/").addChild("test");
+
+        Tree testNodeA = test.addChild("nodea");
+        testNodeA.setProperty("a", "hello");
+        testNodeA.setProperty("b", "ocean");
+
+        Tree testNodeB = test.addChild("nodeb");
+        testNodeB.setProperty("a", "hello world");
+        testNodeB.setProperty("b", "soccer-shoe");
+
+        Tree testNodeC = test.addChild("nodec");
+        testNodeC.setProperty("a", "hello");
+        testNodeC.setProperty("b", "world");
+        root.commit();
+
+        assertEventually(() -> {
+            // case insensitive
+            assertQuery("//*[jcr:contains(., 'WORLD')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
+
+            // wild card
+            assertQuery("//*[jcr:contains(., 'Hell*')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
+            assertQuery("//*[jcr:contains(., 'He*o')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
+            assertQuery("//*[jcr:contains(., '*llo')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
+            assertQuery("//*[jcr:contains(., '?orld')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
+            assertQuery("//*[jcr:contains(., 'wo?ld')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
+            assertQuery("//*[jcr:contains(., 'worl?')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
+
+            // space explained as AND
+            assertQuery("//*[jcr:contains(., 'hello world')] ", XPATH, Arrays.asList("/test/nodeb", "/test/nodec"));
+
+            // exclude
+            assertQuery("//*[jcr:contains(., 'hello -world')] ", XPATH, Arrays.asList("/test/nodea"));
+
+            // explicit OR
+            assertQuery("//*[jcr:contains(., 'ocean OR world')] ", XPATH, Arrays.asList("/test/nodea", "/test/nodeb", "/test/nodec"));
+        });
+    }
+
+    @Test
     public void testInequalityQuery_native() throws Exception {
 
         Tree test = root.getTree("/").addChild("test");