You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by fo...@apache.org on 2020/11/02 09:17:10 UTC

svn commit: r1883065 - in /jackrabbit/oak/trunk: oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ oak-search-elastic/src/main/java/...

Author: fortino
Date: Mon Nov  2 09:17:10 2020
New Revision: 1883065

URL: http://svn.apache.org/viewvc?rev=1883065&view=rev
Log:
OAK-9264: add support for similarityTags in oak-search-elastic

Modified:
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
    jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
    jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
    jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java Mon Nov  2 09:17:10 2020
@@ -57,6 +57,11 @@ public class ElasticIndexDefinition exte
     public static final String PROP_INDEX_NAME_SEED = ":nameSeed";
 
     /**
+     * Hidden property to store similarity tags
+     */
+    public static final String SIMILARITY_TAGS = ":simTags";
+
+    /**
      * Node name under which various analyzers are configured
      */
     private static final String ANALYZERS = "analyzers";

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java Mon Nov  2 09:17:10 2020
@@ -18,6 +18,7 @@ package org.apache.jackrabbit.oak.plugin
 
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
 import org.elasticsearch.common.Strings;
@@ -47,6 +48,7 @@ class ElasticDocument {
     private final Map<String, Object> properties;
     private final Map<String, Object> similarityFields;
     private final Map<String, Map<String, Double>> dynamicBoostFields;
+    private final Set<String> similarityTags;
 
     ElasticDocument(String path) {
         this.path = path;
@@ -57,6 +59,7 @@ class ElasticDocument {
         this.properties = new HashMap<>();
         this.similarityFields = new HashMap<>();
         this.dynamicBoostFields = new HashMap<>();
+        this.similarityTags = new LinkedHashSet<>();
     }
 
     void addFulltext(String value) {
@@ -105,6 +108,10 @@ class ElasticDocument {
                 .putIfAbsent(value, boost);
     }
 
+    void addSimilarityTag(String value) {
+        similarityTags.add(value);
+    }
+
     public String build() {
         String ret;
         try {
@@ -144,6 +151,9 @@ class ElasticDocument {
                     }
                     builder.endArray();
                 }
+                if (!similarityTags.isEmpty()) {
+                    builder.field(ElasticIndexDefinition.SIMILARITY_TAGS, similarityTags);
+                }
             }
             builder.endObject();
 

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java Mon Nov  2 09:17:10 2020
@@ -172,7 +172,11 @@ class ElasticDocumentMaker extends Fullt
 
     @Override
     protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property) {
-        // TODO : not implemented
+        String val = property.getValue(Type.STRING);
+        if (val.length() > 0) {
+            doc.addSimilarityTag(val);
+            return true;
+        }
         return false;
     }
 
@@ -180,9 +184,7 @@ class ElasticDocumentMaker extends Fullt
     protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
         // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
         // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
-
         doc.addSimilarityField(pd.name, blob);
-
     }
 
     @Override

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java Mon Nov  2 09:17:10 2020
@@ -255,6 +255,11 @@ class ElasticIndexHelper {
             }
             mappingBuilder.endObject();
         }
+
+        mappingBuilder.startObject(ElasticIndexDefinition.SIMILARITY_TAGS)
+                .field("type", "text")
+                .field("analyzer", "oak_analyzer")
+                .endObject();
     }
 
     // we need to check if in the defined rules there are properties with the same name and different types

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java Mon Nov  2 09:17:10 2020
@@ -45,7 +45,6 @@ import org.apache.jackrabbit.oak.spi.sta
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.join.ScoreMode;
 import org.elasticsearch.index.query.BoolQueryBuilder;
-import org.elasticsearch.index.query.ExistsQueryBuilder;
 import org.elasticsearch.index.query.InnerHitBuilder;
 import org.elasticsearch.index.query.MatchBoolPrefixQueryBuilder;
 import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
@@ -89,8 +88,6 @@ import java.util.stream.StreamSupport;
 
 import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
 import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
-import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
-import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newAncestorQuery;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newDepthQuery;
@@ -113,6 +110,7 @@ import static org.elasticsearch.index.qu
 import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
+import static org.elasticsearch.index.query.QueryBuilders.moreLikeThisQuery;
 import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
 import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
@@ -172,15 +170,34 @@ public class ElasticRequestHandler {
                     sp.addAll(r.getSimilarityProperties());
                 }
                 String mltQueryString = propertyRestrictionQuery.substring("mlt?".length());
+                Map<String, String> mltParams = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
+                String text = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+
+                if (text == null) {
+                    // TODO : See if we might want to support "like text" here (it is passed as null to the MLT query builders).
+                    // It is not supported in our Lucene implementation.
+                    throw new IllegalArgumentException("Missing required field stream.body in MLT query: " + mltQueryString);
+                }
                 if (sp.isEmpty()) {
                     // SimilarityImpl in oak-core sets property restriction for sim search and the query is something like
                     // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need parse this query string and turn into a query
                     // elastic can understand.
-                    boolQuery.must(moreLikeThisQuery(mltQueryString));
+                    MoreLikeThisQueryBuilder mltqb = mltQuery(mltParams);
+                    boolQuery.must(mltqb);
+                    // add should clause to improve relevance using similarity tags
+                    boolQuery.should(moreLikeThisQuery(
+                            new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null, mltqb.likeItems())
+                            .minTermFreq(1).minDocFreq(1)
+                    );
                 } else {
-                    boolQuery.must(similarityQuery(mltQueryString, sp));
+                    boolQuery.must(similarityQuery(text, sp));
+                    // add should clause to improve relevance using similarity tags
+                    boolQuery.should(moreLikeThisQuery(
+                            new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null,
+                            new Item[]{new Item(null, ElasticIndexUtils.idFromPath(text))})
+                            .minTermFreq(1).minDocFreq(1)
+                    );
                 }
-
             } else {
                 boolQuery.must(queryStringQuery(propertyRestrictionQuery));
             }
@@ -288,12 +305,9 @@ public class ElasticRequestHandler {
                 .map(pd -> pd.name);
     }
 
-    private QueryBuilder similarityQuery(String mltQueryString, List<PropertyDefinition> sp) {
-        LOG.debug("parsing similarity query on {}", mltQueryString);
-        Map<String, String> mltParamMap = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
-        String text = mltParamMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+    private QueryBuilder similarityQuery(@NotNull String text, List<PropertyDefinition> sp) {
         BoolQueryBuilder query = boolQuery();
-        if (text != null && !sp.isEmpty()) {
+        if (!sp.isEmpty()) {
             LOG.debug("generating similarity query for {}", text);
             NodeState targetNodeState = rootState;
             for (String token : PathUtils.elements(text)) {
@@ -303,7 +317,7 @@ public class ElasticRequestHandler {
                 throw new IllegalArgumentException("Could not find node " + text);
             }
             for (PropertyDefinition pd : sp) {
-                String propertyPath = PathUtils.getParentPath(pd.name);;
+                String propertyPath = PathUtils.getParentPath(pd.name);
                 String propertyName = PathUtils.getName(pd.name);
                 NodeState tempState = targetNodeState;
                 for (String token : PathUtils.elements(propertyPath)) {
@@ -331,7 +345,7 @@ public class ElasticRequestHandler {
                 paramMap.put("field_name", similarityPropFieldName);
                 ScriptScoreQueryBuilder scriptScoreQueryBuilder = scriptScoreQuery(existsQuery(similarityPropFieldName),
                         new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG, "cosineSimilarity(params.query_vector, params.field_name) + 1.0",
-                        Collections.emptyMap(), paramMap));
+                                Collections.emptyMap(), paramMap));
                 query.should(scriptScoreQueryBuilder);
             }
         }
@@ -355,36 +369,29 @@ public class ElasticRequestHandler {
        (The above is important since this is not a one-size-fits-all situation and the default values might not
        be useful in every situation based on the type of content)
      */
-    private QueryBuilder moreLikeThisQuery(String mltQueryString) {
+    private MoreLikeThisQueryBuilder mltQuery(Map<String, String> mltParams) {
+        String text = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+
         MoreLikeThisQueryBuilder mlt;
-        Map<String, String> paramMap = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
-        String text = paramMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
-        String fields = paramMap.get(MoreLikeThisHelperUtil.MLT_FILED);
-
-        if (text != null) {
-            // It's expected the text here to be the path of the doc
-            // In case the path of a node is greater than 512 bytes,
-            // we hash it before storing it as the _id for the elastic doc
-            text = ElasticIndexUtils.idFromPath(text);
-            if (FieldNames.PATH.equals(fields) || fields == null) {
-                // Handle the case 1) where default query sent by SimilarImpl (No Custom fields)
-                // We just need to specify the doc (Item) whose similar content we need to find
-                // We store path as the _id so no need to do anything extra here
-                // We expect Similar impl to send a query where text would have evaluated to node path.
-                mlt = new MoreLikeThisQueryBuilder(null, new Item[]{new Item(null, text)});
-            } else {
-                // This is for native queries if someone send additional fields via mlt.fl=field1,field2
-                String[] fieldsArray = fields.split(",");
-                mlt = new MoreLikeThisQueryBuilder(fieldsArray, null, new Item[]{new Item(null, text)});
-            }
-            // TODO : See if we might want to support like Text here (passed as null in above constructors)
-            // IT is not supported in our lucene implementation.
+        String fields = mltParams.get(MoreLikeThisHelperUtil.MLT_FILED);
+        // It's expected the text here to be the path of the doc
+        // In case the path of a node is greater than 512 bytes,
+        // we hash it before storing it as the _id for the elastic doc
+        text = ElasticIndexUtils.idFromPath(text);
+        if (fields == null || FieldNames.PATH.equals(fields)) {
+            // Handle the case 1) where default query sent by SimilarImpl (No Custom fields)
+            // We just need to specify the doc (Item) whose similar content we need to find
+            // We store path as the _id so no need to do anything extra here
+            // We expect Similar impl to send a query where text would have evaluated to node path.
+            mlt = moreLikeThisQuery(new Item[]{new Item(null, text)});
         } else {
-            throw new RuntimeException("Missing required field stream.body in  MLT query: " + mltQueryString);
+            // This is for native queries where someone sends additional fields via mlt.fl=field1,field2
+            String[] fieldsArray = fields.split(",");
+            mlt = moreLikeThisQuery(fieldsArray, null, new Item[]{new Item(null, text)});
         }
 
-        for (String key : paramMap.keySet()) {
-            String val = paramMap.get(key);
+        for (String key : mltParams.keySet()) {
+            String val = mltParams.get(key);
             if (MoreLikeThisHelperUtil.MLT_MIN_DOC_FREQ.equals(key)) {
                 mlt.minDocFreq(Integer.parseInt(val));
             } else if (MoreLikeThisHelperUtil.MLT_MIN_TERM_FREQ.equals(key)) {
@@ -406,7 +413,7 @@ public class ElasticRequestHandler {
                 String[] stopWords = val.split(",");
                 mlt.stopWords(stopWords);
             } else {
-                LOG.warn("Unrecognized param {} in the mlt query {}", key, mltQueryString);
+                LOG.warn("Unrecognized param {} in the mlt query {}", key, mltParams);
             }
         }
 
@@ -564,7 +571,7 @@ public class ElasticRequestHandler {
                 }
                 break;
             case PARENT:
-                if (denotesRoot(path)) {
+                if (PathUtils.denotesRoot(path)) {
                     // there's no parent of the root node
                     // we add a path that can not possibly occur because there
                     // is no way to say "match no documents" in Lucene
@@ -575,10 +582,10 @@ public class ElasticRequestHandler {
                     if (planResult.isPathTransformed()) {
                         String parentPathSegment = planResult.getParentPathSegment();
                         if (!any.test(PathUtils.elements(parentPathSegment), "*")) {
-                            queries.add(newPathQuery(getParentPath(path) + parentPathSegment));
+                            queries.add(newPathQuery(PathUtils.getParentPath(path) + parentPathSegment));
                         }
                     } else {
-                        queries.add(newPathQuery(getParentPath(path)));
+                        queries.add(newPathQuery(PathUtils.getParentPath(path)));
                     }
                 }
                 break;

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java Mon Nov  2 09:17:10 2020
@@ -16,38 +16,16 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
-import org.apache.commons.io.IOUtils;
-import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.Tree;
-import org.apache.jackrabbit.oak.api.Type;
-import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.indices.GetFieldMappingsRequest;
-import org.elasticsearch.client.indices.GetFieldMappingsResponse;
 import org.junit.Test;
 
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.net.URI;
-import java.nio.charset.Charset;
 import java.util.Arrays;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
 
 import static java.util.Collections.singletonList;
-import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
-import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
 import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROPDEF_PROP_NODE_NAME;
 import static org.hamcrest.CoreMatchers.containsString;
 import static org.hamcrest.MatcherAssert.assertThat;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
 
 public class ElasticPropertyIndexTest extends ElasticAbstractQueryTest {
 
@@ -176,83 +154,4 @@ public class ElasticPropertyIndexTest ex
                 Arrays.asList("/test/a", "/test/b")));
     }
 
-    @Test
-    public void vectorSimilarityCustomVectorSize() throws Exception {
-        final String indexName = "test1";
-        final String fieldName1 = "fv1";
-        final String fieldName2 = "fv2";
-        final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
-        final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
-        IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
-        builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
-                .similaritySearchDenseVectorSize(10);
-        builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
-                .similaritySearchDenseVectorSize(20);
-        Tree index = setIndex(indexName, builder);
-        root.commit();
-        String alias =  ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
-        GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
-        fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
-        GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
-                getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
-        final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
-                mappingsResponse.mappings();
-        assertEquals("More than one index found", 1, mappings.keySet().size());
-        @SuppressWarnings("unchecked")
-        Map<String, Integer> map1 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
-                get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
-        assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
-        @SuppressWarnings("unchecked")
-        Map<String, Integer> map2 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
-                get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
-        assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
-    }
-
-
-    @Test
-    public void vectorSimilarity() throws Exception {
-        IndexDefinitionBuilder builder = createIndex("fv");
-        builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
-        Tree index = setIndex("test1", builder);
-        root.commit();
-        Tree test = root.getTree("/").addChild("test");
-
-        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
-        File file = new File(uri);
-
-        Collection<String> children = new LinkedList<>();
-        for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
-            String[] split = line.split(",");
-            List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
-            byte[] bytes = toByteArray(values);
-            List<Double> actual = toDoubles(bytes);
-            assertEquals(values, actual);
-
-            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
-            String name = split[0];
-            Tree child = test.addChild(name);
-            child.setProperty("fv", blob, Type.BINARY);
-            children.add(child.getPath());
-        }
-        root.commit();
-
-        // check that similarity changes across different feature vectors
-        List<String> baseline = new LinkedList<>();
-        for (String similarPath : children) {
-            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
-            List<String> current = new LinkedList<>();
-            assertEventually(() -> {
-                Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
-                current.clear();
-                while (result.hasNext()) {
-                    String next = result.next();
-                    current.add(next);
-                }
-                assertNotEquals(baseline, current);
-            });
-            baseline.clear();
-            baseline.addAll(current);
-        }
-    }
-
 }

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java Mon Nov  2 09:17:10 2020
@@ -16,14 +16,37 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
 import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.indices.GetFieldMappingsRequest;
+import org.elasticsearch.client.indices.GetFieldMappingsResponse;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
 import java.util.UUID;
+import java.util.stream.Collectors;
+
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 
 public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
 
@@ -34,7 +57,7 @@ public class ElasticSimilarQueryTest ext
     whereas in elastic, it doesn't.
      */
     @Test
-    public void testRepSimilarAsNativeQuery() throws Exception {
+    public void repSimilarAsNativeQuery() throws Exception {
 
         createIndex(true);
 
@@ -56,7 +79,7 @@ public class ElasticSimilarQueryTest ext
     whereas in elastic, it doesn't.
      */
     @Test
-    public void testRepSimilarQuery() throws Exception {
+    public void repSimilarQuery() throws Exception {
         createIndex(false);
 
         String query = "select [jcr:path] from [nt:base] where similar(., '/test/a')";
@@ -82,7 +105,7 @@ public class ElasticSimilarQueryTest ext
     whereas in elastic, it doesn't.
      */
     @Test
-    public void testRepSimilarXPathQuery() throws Exception {
+    public void repSimilarXPathQuery() throws Exception {
         createIndex(false);
 
         String query = "//element(*, nt:base)[rep:similar(., '/test/a')]";
@@ -101,7 +124,7 @@ public class ElasticSimilarQueryTest ext
     }
 
     @Test
-    public void testRepSimilarWithStopWords() throws Exception {
+    public void repSimilarWithStopWords() throws Exception {
         createIndex(true);
 
         String nativeQueryStringWithStopWords = "select [jcr:path] from [nt:base] where " +
@@ -129,7 +152,7 @@ public class ElasticSimilarQueryTest ext
     }
 
     @Test
-    public void testRepSimilarWithMinWordLength() throws Exception {
+    public void repSimilarWithMinWordLength() throws Exception {
         createIndex(true);
         String nativeQueryStringWithMinWordLength = "select [jcr:path] from [nt:base] where " +
                 "native('elastic-sim', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minwl=6')";
@@ -155,7 +178,7 @@ public class ElasticSimilarQueryTest ext
     }
 
     @Test
-    public void testRepSimilarQueryWithLongPath() throws Exception {
+    public void repSimilarQueryWithLongPath() throws Exception {
         createIndex(false);
         Tree test = root.getTree("/").addChild("test");
         Tree longPath = test.addChild("a");
@@ -178,12 +201,114 @@ public class ElasticSimilarQueryTest ext
                 Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h")));
     }
 
+    @Test
+    public void similarityTagsAffectRelevance() throws Exception { // expects nodes sharing the target's "tags" value to be listed first
+        createIndex(false); // false: index is created without the native "elastic-sim" function name
+
+        Tree test = root.getTree("/").addChild("test");
+        Tree a = test.addChild("a");
+        a.setProperty("text", "Hello World Hello World");
+        a.setProperty("tags", "foo");
+        Tree b = test.addChild("b");
+        b.setProperty("text", "Hello World Hello World"); // same text on a, b and c, so only "tags" differs
+        b.setProperty("tags", "bar");
+        Tree c = test.addChild("c");
+        c.setProperty("text", "Hello World Hello World");
+        c.setProperty("tags", "foo");
+        root.commit();
+
+        assertEventually(() -> assertOrderedQuery("select [jcr:path] from [nt:base] where similar(., '/test/a')",
+                Arrays.asList("/test/c", "/test/b"))); // c shares tag "foo" with a, b does not
+        assertEventually(() -> assertOrderedQuery("select [jcr:path] from [nt:base] where similar(., '/test/c')",
+                Arrays.asList("/test/a", "/test/b"))); // mirrored case: a shares tag "foo" with c
+    }
+
+    @Test
+    public void vectorSimilarityCustomVectorSize() throws Exception { // checks that per-property dense vector sizes show up as "dims" in the field mappings
+        final String indexName = "test1";
+        final String fieldName1 = "fv1";
+        final String fieldName2 = "fv2";
+        final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
+        final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
+        IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
+        builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
+                .similaritySearchDenseVectorSize(10); // fv1 is configured with vector size 10
+        builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
+                .similaritySearchDenseVectorSize(20); // fv2 is configured with vector size 20
+        setIndex(indexName, builder);
+        root.commit();
+        String alias =  ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
+        GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
+        fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2); // ask only for the two similarity fields
+        GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
+                getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
+        final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
+                mappingsResponse.mappings();
+        assertEquals("More than one index found", 1, mappings.keySet().size()); // the response must contain exactly one top-level entry
+        @SuppressWarnings("unchecked")
+        Map<String, Integer> map1 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
+                get(similarityFieldName1).sourceAsMap().get(similarityFieldName1); // mapping source of the fv1 similarity field
+        assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue()); // must equal the size set for fv1
+        @SuppressWarnings("unchecked")
+        Map<String, Integer> map2 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
+                get(similarityFieldName2).sourceAsMap().get(similarityFieldName2); // mapping source of the fv2 similarity field
+        assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue()); // must equal the size set for fv2
+    }
+
+
+    @Test
+    public void vectorSimilarity() throws Exception { // similar() results must differ between consecutive feature-vector nodes
+        IndexDefinitionBuilder builder = createIndex("fv");
+        builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
+        setIndex("test1", builder);
+        root.commit();
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : java.nio.file.Files.readAllLines(file.toPath(), Charset.defaultCharset())) { // closes the file once read
+            String[] split = line.split(","); // first column is the node name, the remaining columns are vector components
+            List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
+            byte[] bytes = toByteArray(values);
+            List<Double> actual = toDoubles(bytes);
+            assertEquals(values, actual); // sanity check: the byte encoding round-trips
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY); // the vector is stored as a binary property
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>(); // results of the previous path; empty on the first pass
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+            List<String> current = new LinkedList<>();
+            assertEventually(() -> {
+                Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
+                current.clear(); // start fresh; presumably the lambda can be retried by assertEventually
+                while (result.hasNext()) {
+                    String next = result.next();
+                    current.add(next);
+                }
+                assertNotEquals(baseline, current);
+            });
+            baseline.clear();
+            baseline.addAll(current); // this path's results become the baseline for the next one
+        }
+    }
+
     private void createIndex(boolean nativeQuery) throws Exception {
-        IndexDefinitionBuilder builder = createIndex("text");
+        IndexDefinitionBuilder builder = createIndex("text", "tags");
         if (nativeQuery) {
             builder.getBuilderTree().setProperty(FulltextIndexConstants.FUNC_NAME, "elastic-sim");
         }
         builder.indexRule("nt:base").property("text").analyzed();
+        builder.indexRule("nt:base").property("tags").similarityTags(true);
         String indexId = UUID.randomUUID().toString();
         setIndex(indexId, builder);
         root.commit();

Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java Mon Nov  2 09:17:10 2020
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-
 package org.apache.jackrabbit.oak.plugins.index.search.util;
 
 import com.google.common.collect.Iterables;
@@ -388,6 +387,11 @@ public class IndexDefinitionBuilder {
             return this;
         }
 
+        public PropertyRule similarityTags(boolean rerank) { // NOTE(review): parameter is named 'rerank' but is stored as PROP_SIMILARITY_TAGS; confirm naming
+            propTree.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, rerank); // writes the flag onto propTree
+            return this; // returns this rule so calls can be chained
+        }
+
         public PropertyRule type(String type) {
             //This would throw an IAE if type is invalid
             PropertyType.valueFromName(type);