You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by am...@apache.org on 2021/02/08 12:37:25 UTC

svn commit: r1886318 [1/3] - in /jackrabbit/oak/trunk: oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ oak-search-elastic/src/main...

Author: amrverma
Date: Mon Feb  8 12:37:23 2021
New Revision: 1886318

URL: http://svn.apache.org/viewvc?rev=1886318&view=rev
Log:
OAK-9339: Image Similarity: LSH based search
* Using elastiknn plugin

Added:
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java
    jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh
    jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/org/apache/jackrabbit/oak/query/imagedata.txt
Modified:
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
    jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java
    jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
    jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java Mon Feb  8 12:37:23 2021
@@ -81,6 +81,12 @@ public class ElasticIndexDefinition exte
      */
     private static final String INDEX_ORIGINAL_TERM = "indexOriginalTerm";
 
+    private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled";
+    private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;
+
+    private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
+    private static final float SIMILARITY_TAGS_BOOST_DEFAULT = 0.5f;
+
     private static final Function<Integer, Boolean> isAnalyzable;
 
     static {
@@ -97,12 +103,15 @@ public class ElasticIndexDefinition exte
     public final int bulkRetries;
     public final long bulkRetriesBackoff;
     private final String remoteAlias;
+    private final boolean similarityTagsEnabled;
+    private final float similarityTagsBoost;
     public final int numberOfShards;
     public final int numberOfReplicas;
     public final int[] queryFetchSizes;
 
     private final Map<String, List<PropertyDefinition>> propertiesByName;
     private final List<PropertyDefinition> dynamicBoostProperties;
+    private final List<PropertyDefinition> similarityProperties;
 
     public ElasticIndexDefinition(NodeState root, NodeState defn, String indexPath, String indexPrefix) {
         super(root, defn, determineIndexFormatVersion(defn), determineUniqueId(defn), indexPath);
@@ -114,6 +123,8 @@ public class ElasticIndexDefinition exte
         this.bulkRetriesBackoff = getOptionalValue(defn, BULK_RETRIES_BACKOFF, BULK_RETRIES_BACKOFF_DEFAULT);
         this.numberOfShards = getOptionalValue(defn, NUMBER_OF_SHARDS, NUMBER_OF_SHARDS_DEFAULT);
         this.numberOfReplicas = getOptionalValue(defn, NUMBER_OF_REPLICAS, NUMBER_OF_REPLICAS_DEFAULT);
+        this.similarityTagsEnabled = getOptionalValue(defn, SIMILARITY_TAGS_ENABLED, SIMILARITY_TAGS_ENABLED_DEFAULT);
+        this.similarityTagsBoost = getOptionalValue(defn, SIMILARITY_TAGS_BOOST, SIMILARITY_TAGS_BOOST_DEFAULT);
         this.queryFetchSizes = Arrays.stream(getOptionalValues(defn, QUERY_FETCH_SIZES, Type.LONGS, Long.class, QUERY_FETCH_SIZES_DEFAULT))
                 .mapToInt(Long::intValue).toArray();
 
@@ -128,6 +139,11 @@ public class ElasticIndexDefinition exte
                 .flatMap(IndexingRule::getNamePatternsProperties)
                 .filter(pd -> pd.dynamicBoost)
                 .collect(Collectors.toList());
+
+        this.similarityProperties = getDefinedRules()
+                .stream()
+                .flatMap(rule -> rule.getSimilarityProperties().stream())
+                .collect(Collectors.toList());
     }
 
     /**
@@ -147,6 +163,18 @@ public class ElasticIndexDefinition exte
         return dynamicBoostProperties;
     }
 
+    public List<PropertyDefinition> getSimilarityProperties() {
+        return similarityProperties;
+    }
+
+    public boolean areSimilarityTagsEnabled() {
+        return similarityTagsEnabled;
+    }
+
+    public float getSimilarityTagsBoost() {
+        return similarityTagsBoost;
+    }
+
     /**
      * Returns the keyword field name mapped in Elasticsearch for the specified property name.
      * @param propertyName the property name in the index rules
@@ -189,6 +217,11 @@ public class ElasticIndexDefinition exte
         return getOptionalValue(analyzersTree, INDEX_ORIGINAL_TERM, false);
     }
 
+    @Override
+    protected PropertyDefinition createPropertyDefinition(IndexDefinition.IndexingRule rule, String name, NodeState nodeState) {
+        return new ElasticPropertyDefinition(rule, name, nodeState);
+    }
+
     /**
      * Class to help with {@link ElasticIndexDefinition} creation.
      * The built object represents the index definition only without the node structure.

Added: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java?rev=1886318&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java (added)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java Mon Feb  8 12:37:23 2021
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.elastic;
+
+import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+
+import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;
+
+public class ElasticPropertyDefinition extends PropertyDefinition {
+
+    SimilaritySearchParameters similaritySearchParameters;
+
+    public static final String PROP_QUERY_MODEL = "queryModel";
+    public static final String PROP_NUMBER_OF_HASH_TABLES = "L";
+    public static final String PROP_NUMBER_OF_HASH_FUNCTIONS = "k";
+    public static final String PROP_NUMBER_OF_BUCKETS = "w";
+    public static final String PROP_INDEX_SIMILARITY = "indexSimilarity";
+    public static final String PROP_QUERY_SIMILARITY = "querySimilarity";
+    public static final String PROP_CANDIDATES = "candidates";
+    public static final String PROP_PROBES = "probes";
+
+    private static final int DEFAULT_NUMBER_OF_HASH_TABLES = 20;
+    private static final int DEFAULT_NO_OF_HASH_FUNCTIONS = 15;
+    private static final int DEFAULT_BUCKET_WIDTH = 500;
+    private static final String DEFAULT_SIMILARITY_QUERY_MODEL = "lsh";
+    private static final String DEFAULT_SIMILARITY_INDEX_FUNCTION = "l2";
+    private static final String DEFAULT_SIMILARITY_QUERY_FUNCTION = "l2";
+    private static final int DEFAULT_QUERY_CANDIDATES = 500;
+    private static final int DEFAULT_QUERY_PROBES = 3;
+
+
+    public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) {
+        super(idxDefn, nodeName, defn);
+        if (this.useInSimilarity) {
+            similaritySearchParameters = new SimilaritySearchParameters(
+                    getOptionalValue(defn, PROP_NUMBER_OF_HASH_TABLES, DEFAULT_NUMBER_OF_HASH_TABLES),
+                    getOptionalValue(defn, PROP_NUMBER_OF_HASH_FUNCTIONS, DEFAULT_NO_OF_HASH_FUNCTIONS),
+                    getOptionalValue(defn, PROP_NUMBER_OF_BUCKETS, DEFAULT_BUCKET_WIDTH),
+                    getOptionalValue(defn, PROP_QUERY_MODEL, DEFAULT_SIMILARITY_QUERY_MODEL),
+                    getOptionalValue(defn, PROP_INDEX_SIMILARITY, DEFAULT_SIMILARITY_INDEX_FUNCTION),
+                    getOptionalValue(defn, PROP_QUERY_SIMILARITY, DEFAULT_SIMILARITY_QUERY_FUNCTION),
+                    getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_QUERY_CANDIDATES),
+                    getOptionalValue(defn, PROP_PROBES, DEFAULT_QUERY_PROBES));
+        }
+    }
+
+    /**
+     * Class for defining parameters for similarity search based on https://elastiknn.com/api.
+     * For all possible models and query combinations, see https://elastiknn.com/api/#model-and-query-compatibility
+     */
+    public static class SimilaritySearchParameters {
+
+        /**
+         * Number of hash tables. Generally, increasing this value increases recall.
+         */
+        private final int L;
+        /**
+         * Number of hash functions combined to form a single hash value. Generally, increasing this value increases precision.
+         */
+        private final int k;
+        /**
+         * Integer bucket width.
+         */
+        private final int w;
+        /**
+         * Possible values - lsh, exact
+         */
+        private final String queryModel;
+        /**
+         * Similarity function sent with the nearest-neighbors query. Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model)
+         */
+        private final String queryTimeSimilarityFunction;
+        /**
+         * Similarity function written into the index mapping. Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model)
+         */
+        private final String indexTimeSimilarityFunction;
+        /**
+         * Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. The candidates parameter
+         * controls the number of exact similarity computations. Specifically, we compute exact similarity for the top candidates candidate vectors
+         * in each segment. As a reminder, each Elasticsearch index has >= 1 shards, and each shard has >= 1 segments. That means if you set
+         * "candidates": 200 for an index with 2 shards, each with 3 segments, then you’ll compute the exact similarity for 2 * 3 * 200 = 1200 vectors.
+         * candidates must be set to a number greater than or equal to the number of Elasticsearch results you want to get. Higher values generally mean
+         * higher recall and higher latency.
+         */
+        private final int candidates;
+        /**
+         * Number of probes for using the multiprobe search technique. The plugin's documented default value is zero; ElasticPropertyDefinition falls back to 3 when the "probes" property is not set. Max value is 3^k.
+         * Generally, increasing probes will increase recall and will allow you to use a smaller value for L with comparable recall, but introduces some additional computation at query time.
+         */
+        private final int probes;
+
+        public SimilaritySearchParameters(int l, int k, int w, String queryModel, String indexTimeSimilarityFunction,
+                                          String queryTimeSimilarityFunction, int candidates, int probes) {
+            L = l;
+            this.k = k;
+            this.w = w;
+            this.queryModel = queryModel;
+            this.indexTimeSimilarityFunction = indexTimeSimilarityFunction;
+            this.queryTimeSimilarityFunction = queryTimeSimilarityFunction;
+            this.candidates = candidates;
+            this.probes = probes;
+        }
+
+        public int getL() {
+            return L;
+        }
+
+        public int getK() {
+            return k;
+        }
+
+        public int getW() {
+            return w;
+        }
+
+        public String getQueryModel() {
+            return queryModel;
+        }
+
+        public String getQueryTimeSimilarityFunction() {
+            return queryTimeSimilarityFunction;
+        }
+
+        public String getIndexTimeSimilarityFunction() {
+            return indexTimeSimilarityFunction;
+        }
+
+        public int getCandidates() {
+            return candidates;
+        }
+
+        public int getProbes() {
+            return probes;
+        }
+    }
+
+    public SimilaritySearchParameters getSimilaritySearchParameters() {
+        return similaritySearchParameters;
+    }
+}

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java Mon Feb  8 12:37:23 2021
@@ -18,6 +18,7 @@ package org.apache.jackrabbit.oak.plugin
 
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
 import org.elasticsearch.client.indices.CreateIndexRequest;
@@ -33,8 +34,7 @@ import java.util.stream.Collectors;
  * Provides utility functions around Elasticsearch indexing
  */
 class ElasticIndexHelper {
-
-    private static final String ES_DENSE_VECTOR_TYPE = "dense_vector";
+    
     private static final String ES_DENSE_VECTOR_DIM_PROP = "dims";
 
     public static CreateIndexRequest createIndexRequest(String remoteIndexName, ElasticIndexDefinition indexDefinition) throws IOException {
@@ -63,6 +63,9 @@ class ElasticIndexHelper {
     private static XContentBuilder loadSettings(ElasticIndexDefinition indexDefinition) throws IOException {
         final XContentBuilder settingsBuilder = XContentFactory.jsonBuilder();
         settingsBuilder.startObject();
+        if (indexDefinition.getSimilarityProperties().size() > 0) {
+            settingsBuilder.field("elastiknn", true);
+        }
         settingsBuilder.field("number_of_shards", indexDefinition.numberOfShards);
         settingsBuilder.field("number_of_replicas", indexDefinition.numberOfReplicas);
         {
@@ -154,11 +157,8 @@ class ElasticIndexHelper {
         for (Map.Entry<String, List<PropertyDefinition>> entry : indexDefinition.getPropertiesByName().entrySet()) {
             final String name = entry.getKey();
             final List<PropertyDefinition> propertyDefinitions = entry.getValue();
-
             Type<?> type = null;
             boolean useInSpellCheck = false;
-            boolean useInSimilarity = false;
-            int denseVectorSize = -1;
             for (PropertyDefinition pd : propertyDefinitions) {
                 type = Type.fromTag(pd.getType(), false);
                 if (pd.useInSpellcheck) {
@@ -167,10 +167,6 @@ class ElasticIndexHelper {
                 if (pd.useInSuggest) {
                     useInSuggest = true;
                 }
-                if (pd.useInSimilarity) {
-                    useInSimilarity = true;
-                    denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
-                }
             }
 
             mappingBuilder.startObject(name);
@@ -213,13 +209,6 @@ class ElasticIndexHelper {
                 }
             }
             mappingBuilder.endObject();
-
-            if (useInSimilarity) {
-                mappingBuilder.startObject(FieldNames.createSimilarityFieldName(name));
-                mappingBuilder.field("type", ES_DENSE_VECTOR_TYPE);
-                mappingBuilder.field(ES_DENSE_VECTOR_DIM_PROP, denseVectorSize);
-                mappingBuilder.endObject();
-            }
         }
 
         if (useInSuggest) {
@@ -255,6 +244,26 @@ class ElasticIndexHelper {
                 }
                 mappingBuilder.endObject();
             }
+            mappingBuilder.endObject();
+        }
+
+        for (PropertyDefinition propertyDefinition : indexDefinition.getSimilarityProperties()) {
+            ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition;
+            int denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
+            mappingBuilder.startObject(FieldNames.createSimilarityFieldName(pd.name));
+            {
+                mappingBuilder.field("type", "elastiknn_dense_float_vector");
+                mappingBuilder.startObject("elastiknn");
+                {
+                    mappingBuilder.field(ES_DENSE_VECTOR_DIM_PROP, denseVectorSize);
+                    mappingBuilder.field("model", "lsh");
+                    mappingBuilder.field("similarity", pd.getSimilaritySearchParameters().getIndexTimeSimilarityFunction());
+                    mappingBuilder.field("L", pd.getSimilaritySearchParameters().getL());
+                    mappingBuilder.field("k", pd.getSimilaritySearchParameters().getK());
+                    mappingBuilder.field("w", pd.getSimilaritySearchParameters().getW());
+                }
+                mappingBuilder.endObject();
+            }
             mappingBuilder.endObject();
         }
 

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java Mon Feb  8 12:37:23 2021
@@ -23,6 +23,7 @@ import org.apache.jackrabbit.oak.api.Pro
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
 import org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets.ElasticFacetProvider;
 import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils;
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
@@ -46,6 +47,9 @@ import org.apache.jackrabbit.oak.spi.que
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.join.ScoreMode;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.json.JsonXContent;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.client.Request;
 import org.elasticsearch.common.xcontent.XContentHelper;
@@ -60,11 +64,9 @@ import org.elasticsearch.index.query.Nes
 import org.elasticsearch.index.query.Operator;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.index.query.functionscore.ScriptScoreQueryBuilder;
+import org.elasticsearch.index.query.WrapperQueryBuilder;
 import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
 import org.elasticsearch.index.search.MatchQuery;
-import org.elasticsearch.script.Script;
-import org.elasticsearch.script.ScriptType;
 import org.elasticsearch.search.aggregations.AggregationBuilders;
 import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
 import org.elasticsearch.search.builder.SearchSourceBuilder;
@@ -84,9 +86,7 @@ import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.HashMap;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicReference;
@@ -117,7 +117,6 @@ import static org.apache.jackrabbit.util
 import static org.elasticsearch.common.xcontent.ToXContent.EMPTY_PARAMS;
 import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
 import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
-import static org.elasticsearch.index.query.QueryBuilders.existsQuery;
 import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
@@ -125,8 +124,8 @@ import static org.elasticsearch.index.qu
 import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
 import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
-import static org.elasticsearch.index.query.QueryBuilders.scriptScoreQuery;
 import static org.elasticsearch.index.query.QueryBuilders.termQuery;
+import static org.elasticsearch.index.query.QueryBuilders.wrapperQuery;
 
 /**
  * Class to map query plans into Elastic request objects.
@@ -176,15 +175,12 @@ public class ElasticRequestHandler {
 
         if (propertyRestrictionQuery != null) {
             if (propertyRestrictionQuery.startsWith("mlt?")) {
-                List<PropertyDefinition> sp = new LinkedList<>();
-                for (IndexDefinition.IndexingRule r : elasticIndexDefinition.getDefinedRules()) {
-                    sp.addAll(r.getSimilarityProperties());
-                }
+                List<PropertyDefinition> sp = elasticIndexDefinition.getSimilarityProperties();
                 String mltQueryString = propertyRestrictionQuery.substring("mlt?".length());
                 Map<String, String> mltParams = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
-                String text = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+                String queryNodePath = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
 
-                if (text == null) {
+                if (queryNodePath == null) {
                     // TODO : See if we might want to support like Text here (passed as null in above constructors)
                     // IT is not supported in our lucene implementation.
                     throw new IllegalArgumentException("Missing required field stream.body in MLT query: " + mltQueryString);
@@ -201,13 +197,15 @@ public class ElasticRequestHandler {
                             .minTermFreq(1).minDocFreq(1)
                     );
                 } else {
-                    boolQuery.must(similarityQuery(text, sp));
-                    // add should clause to improve relevance using similarity tags
-                    boolQuery.should(moreLikeThisQuery(
-                            new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null,
-                            new Item[]{new Item(null, ElasticIndexUtils.idFromPath(text))})
-                            .minTermFreq(1).minDocFreq(1)
-                    );
+                    boolQuery.must(similarityQuery(queryNodePath, sp));
+                    if (elasticIndexDefinition.areSimilarityTagsEnabled()) {
+                        // add should clause to improve relevance using similarity tags
+                        boolQuery.should(moreLikeThisQuery(
+                                new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null,
+                                new Item[]{new Item(null, ElasticIndexUtils.idFromPath(queryNodePath))})
+                                .minTermFreq(1).minDocFreq(1).boost(elasticIndexDefinition.getSimilarityTagsBoost())
+                        );
+                    }
                 }
             } else {
                 boolQuery.must(queryStringQuery(propertyRestrictionQuery));
@@ -352,7 +350,8 @@ public class ElasticRequestHandler {
             if (!targetNodeState.exists()) {
                 throw new IllegalArgumentException("Could not find node " + text);
             }
-            for (PropertyDefinition pd : sp) {
+            for (PropertyDefinition propertyDefinition : sp) {
+                ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition;
                 String propertyPath = PathUtils.getParentPath(pd.name);
                 String propertyName = PathUtils.getName(pd.name);
                 NodeState tempState = targetNodeState;
@@ -376,13 +375,36 @@ public class ElasticRequestHandler {
                     continue;
                 }
                 String similarityPropFieldName = FieldNames.createSimilarityFieldName(pd.name);
-                Map<String, Object> paramMap = new HashMap<>();
-                paramMap.put("query_vector", toDoubles(bytes));
-                paramMap.put("field_name", similarityPropFieldName);
-                ScriptScoreQueryBuilder scriptScoreQueryBuilder = scriptScoreQuery(existsQuery(similarityPropFieldName),
-                        new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG, "cosineSimilarity(params.query_vector, params.field_name) + 1.0",
-                                Collections.emptyMap(), paramMap));
-                query.should(scriptScoreQueryBuilder);
+                try {
+                    XContentBuilder contentBuilder = JsonXContent.contentBuilder();
+                    contentBuilder.startObject();
+                    contentBuilder.field("elastiknn_nearest_neighbors");
+                    contentBuilder.startObject();
+                    {
+                        contentBuilder.field("field", similarityPropFieldName);
+                        contentBuilder.field("vec");
+                        contentBuilder.startObject();
+                        {
+                            contentBuilder.field("values");
+                            contentBuilder.startArray();
+                            for (Double d : toDoubles(bytes)) {
+                                contentBuilder.value(d);
+                            }
+                            contentBuilder.endArray();
+                        }
+                        contentBuilder.endObject();
+                        contentBuilder.field("model", pd.getSimilaritySearchParameters().getQueryModel());
+                        contentBuilder.field("similarity", pd.getSimilaritySearchParameters().getQueryTimeSimilarityFunction());
+                        contentBuilder.field("candidates", pd.getSimilaritySearchParameters().getCandidates());
+                        contentBuilder.field("probes", pd.getSimilaritySearchParameters().getProbes());
+                    }
+                    contentBuilder.endObject();
+                    contentBuilder.endObject();
+                    WrapperQueryBuilder wqb = wrapperQuery(Strings.toString(contentBuilder));
+                    query.should(wqb);
+                } catch (IOException e){
+                    LOG.error("Could not create similarity query ", e);
+                }
             }
         }
         return query;

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java Mon Feb  8 12:37:23 2021
@@ -17,6 +17,11 @@
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
 import com.github.dockerjava.api.DockerClient;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.jackrabbit.oak.commons.IOUtils;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
 import org.elasticsearch.client.RequestOptions;
@@ -27,10 +32,17 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.testcontainers.DockerClientFactory;
 import org.testcontainers.elasticsearch.ElasticsearchContainer;
+import org.testcontainers.utility.MountableFile;
 
+import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.security.DigestInputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import static org.junit.Assume.assumeNotNull;
 
@@ -43,6 +55,7 @@ public class ElasticConnectionRule exten
     private ElasticConnection elasticConnection;
     private final String elasticConnectionString;
     private static final String INDEX_PREFIX = "ElasticTest_";
+    private static final String PLUGIN_DIGEST = "c4451aa794641dd3c9b0fdc64b553b71ca2f9a44689a7784b51669b5e557046d";
     private static boolean useDocker = false;
 
     public ElasticConnectionRule(String elasticConnectionString) {
@@ -68,10 +81,16 @@ public class ElasticConnectionRule exten
     public Statement apply(Statement base, Description description) {
         Statement s = super.apply(base, description);
         // see if docker is to be used or not... initialize docker rule only if that's the case.
-
+        final String pluginVersion = "7.10.1.0";
+        final String pluginFileName = "elastiknn-" + pluginVersion + ".zip";
+        final String localPluginPath = "target/" + pluginFileName;
+        downloadSimilaritySearchPluginIfNotExists(localPluginPath, pluginVersion);
         if (elasticConnectionString == null || getElasticConnectionFromString() == null) {
             checkIfDockerClientAvailable();
-            elastic = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:" + Version.CURRENT);
+            elastic = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:" + Version.CURRENT)
+                .withCopyFileToContainer(MountableFile.forHostPath(localPluginPath), "/tmp/plugins/" + pluginFileName)
+                    .withCopyFileToContainer(MountableFile.forClasspathResource("elasticstartscript.sh"), "/tmp/elasticstartscript.sh")
+                .withCommand("bash /tmp/elasticstartscript.sh");
             s = elastic.apply(s, description);
             setUseDocker(true);
         }
@@ -83,6 +102,38 @@ public class ElasticConnectionRule exten
         //TODO: See if something needs to be cleaned up at test class level ??
     }
 
+    /**
+     * Downloads the elastiknn plugin zip for pluginVersion to localPluginPath, unless that file already exists,
+     * and verifies the download against PLUGIN_DIGEST (SHA-256). Throws RuntimeException on any failure.
+     */
+    private void downloadSimilaritySearchPluginIfNotExists(String localPluginPath, String pluginVersion) {
+        File pluginFile = new File(localPluginPath);
+        if (pluginFile.exists()) {
+            return;
+        }
+        LOG.info("Plugin file {} doesn't exist. Trying to download.", localPluginPath);
+        HttpGet get = new HttpGet("https://github.com/alexklibisz/elastiknn/releases/download/" + pluginVersion
+                +"/elastiknn-" + pluginVersion +".zip");
+        StringBuilder result = new StringBuilder();
+        // All resources are closed here, before the file is verified (and possibly deleted) below.
+        try (CloseableHttpClient client = HttpClients.createDefault();
+             CloseableHttpResponse response = client.execute(get);
+             DigestInputStream dis = new DigestInputStream(response.getEntity().getContent(), MessageDigest.getInstance("SHA-256"));
+             FileOutputStream outputStream = new FileOutputStream(pluginFile)) {
+            IOUtils.copy(dis, outputStream);
+            for (byte b : dis.getMessageDigest().digest()) {
+                result.append(String.format("%02x", b));
+            }
+        } catch (IOException|NoSuchAlgorithmException e) {
+            pluginFile.delete(); // a partial file would pass the exists() check above on the next run
+            throw new RuntimeException("Could not download similarity search plugin", e);
+        }
+        if (!PLUGIN_DIGEST.equals(result.toString())) {
+            String deleteString = pluginFile.delete() ? "Downloaded plugin file deleted." : "Could not delete downloaded plugin file.";
+            throw new RuntimeException("Plugin digest unequal. Found " + result + ". Expected " + PLUGIN_DIGEST + ". " + deleteString);
+        }
+    }
+
     public ElasticConnection getElasticConnectionFromString() {
         if (elasticConnection == null) {
             try {

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java Mon Feb  8 12:37:23 2021
@@ -33,13 +33,18 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.net.URI;
 import java.nio.charset.Charset;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
 import java.util.UUID;
 import java.util.stream.Collectors;
 
@@ -224,35 +229,34 @@ public class ElasticSimilarQueryTest ext
     }
 
     @Test
-    public void vectorSimilarityCustomVectorSize() throws Exception {
+    public void vectorSimilarityElastiknnIndexConfiguration() throws Exception {
         final String indexName = "test1";
         final String fieldName1 = "fv1";
-        final String fieldName2 = "fv2";
         final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
-        final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
-        IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
-        builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
-                .similaritySearchDenseVectorSize(10);
-        builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
-                .similaritySearchDenseVectorSize(20);
+        IndexDefinitionBuilder builder = createIndex(fieldName1);
+        Tree tree = builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
+                .similaritySearchDenseVectorSize(2048).getBuilderTree();
+        tree.setProperty(ElasticPropertyDefinition.PROP_INDEX_SIMILARITY, "angular");
+        tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_TABLES, 10);
+        tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_FUNCTIONS, 12);
+        // the settings above are expected back in the field mapping as "similarity", "L" and "k" (asserted below)
         setIndex(indexName, builder);
         root.commit();
         String alias =  ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
         GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
-        fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
+        fieldMappingsRequest.indices(alias).fields(similarityFieldName1);
         GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
                 getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
         final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
                 mappingsResponse.mappings();
         assertEquals("More than one index found", 1, mappings.keySet().size());
         @SuppressWarnings("unchecked")
-        Map<String, Integer> map1 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
-                get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
-        assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
-        @SuppressWarnings("unchecked")
-        Map<String, Integer> map2 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
-                get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
-        assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
+        Map<String, Object> map1 = (Map<String, Object>)(((Map<String, Object>)mappings.entrySet().iterator().next().getValue().
+                get(similarityFieldName1).sourceAsMap().get(similarityFieldName1)).get("elastiknn"));
+        assertEquals("Dense vector size doesn't match", 2048, (int)map1.get("dims"));
+        assertEquals("Similarity doesn't match", "angular", map1.get("similarity"));
+        assertEquals("Number of hash tables doesn't match", 10, map1.get("L"));
+        assertEquals("Number of hash functions doesn't match", 12, map1.get("k"));
     }
 
 
@@ -302,6 +306,101 @@ public class ElasticSimilarQueryTest ext
         }
     }
 
+    /** Adds child imageName under test, storing the comma separated doubles of fv as binary property "fv". */
+    private void createNodeWithFV(String imageName, String fv, Tree test) throws Exception {
+        List<Double> values = Arrays.stream(fv.split(","))
+                .map(Double::parseDouble)
+                .collect(Collectors.toList());
+        byte[] bytes = toByteArray(values);
+        assertEquals(values, toDoubles(bytes)); // sanity check: decoding the bytes must give back the parsed values
+        Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+        test.addChild(imageName).setProperty("fv", blob, Type.BINARY);
+    }
+
+    /**
+     * Reads one block from the scanner: a query image (name line, then feature vector line) followed by up to
+     * similarResultCount result images in the same two-line form; blank lines in place of a name are skipped.
+     * Each image is added as a node below test and the result names are stored in expectedResults
+     * under the query image name. Returns without indexing anything when only blank lines are left.
+     */
+    private void indexEntry(Scanner scanner, Tree test, Map<String, List<String>> expectedResults, int similarResultCount) throws Exception {
+        String queryImage = "";
+        // advance to the first non-empty line: the name of the query image
+        while (queryImage.isEmpty() && scanner.hasNextLine()) {
+            queryImage = scanner.nextLine();
+        }
+        if (queryImage.isEmpty()) {
+            // complete file read
+            return;
+        }
+        List<String> similarities = new ArrayList<>();
+        expectedResults.put(queryImage, similarities);
+        String queryVector = scanner.nextLine();
+        createNodeWithFV(queryImage, queryVector, test);
+        while (scanner.hasNextLine() && similarities.size() < similarResultCount) {
+            String resultImage = scanner.nextLine();
+            // blank separator lines do not count as results
+            if (!resultImage.isEmpty()) {
+                String resultVector = scanner.nextLine();
+                createNodeWithFV(resultImage, resultVector, test);
+                similarities.add(resultImage);
+            }
+        }
+    }
+
+    /**
+     * Runs a similarity query for each query image under /test and asserts (eventually) that the first N results
+     * contain all N expected similar images, N being the size of the expected list for that image.
+     */
+    private void verifyLSHResults(Map<String, List<String>> expectedResults) {
+        for (Map.Entry<String, List<String>> entry : expectedResults.entrySet()) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + "/test/" + entry.getKey() + "')";
+            List<String> expectedList = entry.getValue();
+            assertEventually(() -> {
+                Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
+                Set<String> found = new HashSet<>();
+                // hasNext() guard: too few results must fail the assertion below instead of throwing NoSuchElementException
+                for (int resultNum = 0; resultNum < expectedList.size() && result.hasNext(); resultNum++) {
+                    String next = result.next();
+                    found.add(next.substring(next.lastIndexOf("/") + 1));
+                }
+                assertEquals(100.0, (expectedList.stream().filter(found::contains).count() * 100.0)/expectedList.size(), 0.0);
+            });
+        }
+    }
+
+    /** Indexes the images and feature vectors from imagedata.txt and verifies the expected similar images per query image. */
+    @Test
+    public void vectorSimilarityLargeData() throws Exception {
+        final int similarImageCount = 10;
+        IndexDefinitionBuilder builder = createIndex("fv");
+        builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
+        setIndex("test1", builder);
+        root.commit();
+        Tree test = root.getTree("/").addChild("test");
+        /*
+        Image names and their feature vectors are written in this file with the image name first and its feature vector
+        in the line below.
+        This file contains test data in form of blocks and each block has following format -
+         Line 1: Query_Image_Name
+         Line 2: Feature Vector of Query_Image
+         Line 3: EMPTY_LINE
+         Lines 4-23: 10 Result images and their feature vectors
+         Line 24: EMPTY_LINE
+        Then this pattern repeats again with next Query Image name in line 25.
+         */
+        File inputFile = new File(getClass().getResource("/org/apache/jackrabbit/oak/query/imagedata.txt").toURI());
+        Map<String, List<String>> expectedResults = new HashMap<>();
+        // try-with-resources closes the scanner and its file handle; explicit charset instead of the platform default
+        try (Scanner scanner = new Scanner(inputFile, "UTF-8")) {
+            while (scanner.hasNextLine()) {
+                indexEntry(scanner, test, expectedResults, similarImageCount);
+            }
+        }
+        root.commit();
+        verifyLSHResults(expectedResults);
+    }
+
     private void createIndex(boolean nativeQuery) throws Exception {
         IndexDefinitionBuilder builder = createIndex("text", "tags");
         if (nativeQuery) {

Added: jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh?rev=1886318&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh (added)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh Mon Feb  8 12:37:23 2021
@@ -0,0 +1,4 @@
+pluginZip=$(ls /tmp/plugins | grep elastiknn-7.10 | head -n 1)  # first elastiknn 7.10 zip found in /tmp/plugins
+echo "Installing plugin /tmp/plugins/$pluginZip"
+bin/elasticsearch-plugin install --batch "file:///tmp/plugins/$pluginZip"
+su -c "bin/elasticsearch" elasticsearch  # start elasticsearch as the elasticsearch user
\ No newline at end of file