You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by am...@apache.org on 2020/10/29 16:23:30 UTC

svn commit: r1882970 [1/3] - in /jackrabbit/oak/trunk: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ oak-search-elastic/src/main/java/org...

Author: amrverma
Date: Thu Oct 29 16:23:30 2020
New Revision: 1882970

URL: http://svn.apache.org/viewvc?rev=1882970&view=rev
Log:
OAK-9213 - Support feature vector similarity / image similarity in Oak ES

Added:
    jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/org/apache/jackrabbit/oak/query/
    jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/org/apache/jackrabbit/oak/query/fvs.csv
Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
    jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
    jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
    jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
    jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java
    jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java
    jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java Thu Oct 29 16:23:30 2020
@@ -776,7 +776,7 @@ public class LucenePropertyIndex extends
     }
 
     @Override
-    protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode) {
+    protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode, NodeState root) {
         return getLuceneRequest(plan, augmentorFactory, null).toString();
     }
 

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java Thu Oct 29 16:23:30 2020
@@ -16,8 +16,10 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic.index;
 
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.commons.PathUtils;
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentFactory;
@@ -32,6 +34,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
+
 class ElasticDocument {
     private static final Logger LOG = LoggerFactory.getLogger(ElasticDocument.class);
 
@@ -41,6 +45,7 @@ class ElasticDocument {
     private final List<String> notNullProps;
     private final List<String> nullProps;
     private final Map<String, Object> properties;
+    private final Map<String, Object> similarityFields;
     private final Map<String, Map<String, Double>> dynamicBoostFields;
 
     ElasticDocument(String path) {
@@ -50,6 +55,7 @@ class ElasticDocument {
         this.notNullProps = new ArrayList<>();
         this.nullProps = new ArrayList<>();
         this.properties = new HashMap<>();
+        this.similarityFields = new HashMap<>();
         this.dynamicBoostFields = new HashMap<>();
     }
 
@@ -81,6 +87,11 @@ class ElasticDocument {
         properties.put(fieldName, value);
     }
 
+    void addSimilarityField(String name, Blob value) throws IOException {
+        // decode the blob's bytes into a list of doubles and keep it under the similarity field name built from name
+        similarityFields.put(FieldNames.createSimilarityFieldName(name), toDoubles(new BlobByteSource(value).read()));
+    }
+
     void indexAncestors(String path) {
         String parPath = PathUtils.getParentPath(path);
         int depth = PathUtils.getDepth(path);
@@ -117,6 +128,9 @@ class ElasticDocument {
                 if (nullProps.size() > 0) {
                     builder.field(FieldNames.NULL_PROPS, nullProps);
                 }
+                for (Map.Entry<String, Object> simProp: similarityFields.entrySet()) {
+                    builder.field(simProp.getKey(), simProp.getValue());
+                }
                 for (Map.Entry<String, Object> prop : properties.entrySet()) {
                     builder.field(prop.getKey(), prop.getValue());
                 }
@@ -148,4 +162,5 @@ class ElasticDocument {
     public String toString() {
         return build();
     }
+
 }

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java Thu Oct 29 16:23:30 2020
@@ -178,9 +178,11 @@ class ElasticDocumentMaker extends Fullt
 
     @Override
     protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
-        // TODO : not implemented
         // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
         // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
+
+        doc.addSimilarityField(pd.name, blob);
+
     }
 
     @Override

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java Thu Oct 29 16:23:30 2020
@@ -34,6 +34,9 @@ import java.util.stream.Collectors;
  */
 class ElasticIndexHelper {
 
+    private static final String ES_DENSE_VECTOR_TYPE = "dense_vector";
+    private static final String ES_DENSE_VECTOR_DIM_PROP = "dims";
+
     public static CreateIndexRequest createIndexRequest(String remoteIndexName, ElasticIndexDefinition indexDefinition) throws IOException {
         final CreateIndexRequest request = new CreateIndexRequest(remoteIndexName);
 
@@ -152,6 +155,8 @@ class ElasticIndexHelper {
 
             Type<?> type = null;
             boolean useInSpellCheck = false;
+            boolean useInSimilarity = false;
+            int denseVectorSize = -1;
             for (PropertyDefinition pd : propertyDefinitions) {
                 type = Type.fromTag(pd.getType(), false);
                 if (pd.useInSpellcheck) {
@@ -160,6 +165,10 @@ class ElasticIndexHelper {
                 if (pd.useInSuggest) {
                     useInSuggest = true;
                 }
+                if (pd.useInSimilarity) {
+                    useInSimilarity = true;
+                    denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
+                }
             }
 
             mappingBuilder.startObject(name);
@@ -202,6 +211,13 @@ class ElasticIndexHelper {
                 }
             }
             mappingBuilder.endObject();
+
+            if (useInSimilarity) {
+                mappingBuilder.startObject(FieldNames.createSimilarityFieldName(name));
+                mappingBuilder.field("type", ES_DENSE_VECTOR_TYPE);
+                mappingBuilder.field(ES_DENSE_VECTOR_DIM_PROP, denseVectorSize);
+                mappingBuilder.endObject();
+            }
         }
 
         if (useInSuggest) {

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java Thu Oct 29 16:23:30 2020
@@ -96,8 +96,8 @@ class ElasticIndex extends FulltextIndex
     }
 
     @Override
-    protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode) {
-        return Strings.toString(new ElasticRequestHandler(plan, getPlanResult(plan)).baseQuery());
+    protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode, NodeState rootState) {
+        return Strings.toString(new ElasticRequestHandler(plan, getPlanResult(plan), rootState).baseQuery());
     }
 
     @Override
@@ -105,7 +105,7 @@ class ElasticIndex extends FulltextIndex
         final Filter filter = plan.getFilter();
         final FulltextIndexPlanner.PlanResult planResult = getPlanResult(plan);
 
-        final ElasticRequestHandler requestHandler = new ElasticRequestHandler(plan, planResult);
+        final ElasticRequestHandler requestHandler = new ElasticRequestHandler(plan, planResult, rootState);
         final ElasticResponseHandler responseHandler = new ElasticResponseHandler(planResult, filter);
 
         final Iterator<FulltextResultRow> itr;

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java Thu Oct 29 16:23:30 2020
@@ -16,6 +16,8 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic.query;
 
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
 import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
@@ -25,6 +27,7 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
 import org.apache.jackrabbit.oak.plugins.index.search.MoreLikeThisHelperUtil;
 import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.PlanResult;
@@ -38,9 +41,11 @@ import org.apache.jackrabbit.oak.spi.que
 import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
 import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
 import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.search.join.ScoreMode;
 import org.elasticsearch.index.query.BoolQueryBuilder;
+import org.elasticsearch.index.query.ExistsQueryBuilder;
 import org.elasticsearch.index.query.InnerHitBuilder;
 import org.elasticsearch.index.query.MatchBoolPrefixQueryBuilder;
 import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
@@ -50,8 +55,11 @@ import org.elasticsearch.index.query.Nes
 import org.elasticsearch.index.query.Operator;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.index.query.functionscore.ScriptScoreQueryBuilder;
 import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
 import org.elasticsearch.index.search.MatchQuery;
+import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptType;
 import org.elasticsearch.search.aggregations.AggregationBuilders;
 import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
 import org.elasticsearch.search.sort.FieldSortBuilder;
@@ -66,8 +74,12 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import javax.jcr.PropertyType;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicReference;
@@ -79,6 +91,7 @@ import static org.apache.jackrabbit.JcrC
 import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
 import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
 import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newAncestorQuery;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newDepthQuery;
 import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newMixinTypeQuery;
@@ -96,12 +109,14 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.util.ISO8601.parse;
 import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
 import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
+import static org.elasticsearch.index.query.QueryBuilders.existsQuery;
 import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
 import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
 import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
+import static org.elasticsearch.index.query.QueryBuilders.scriptScoreQuery;
 import static org.elasticsearch.index.query.QueryBuilders.termQuery;
 
 /**
@@ -123,8 +138,9 @@ public class ElasticRequestHandler {
     private final PlanResult planResult;
     private final ElasticIndexDefinition elasticIndexDefinition;
     private final String propertyRestrictionQuery;
+    private final NodeState rootState;
 
-    ElasticRequestHandler(@NotNull IndexPlan indexPlan, @NotNull FulltextIndexPlanner.PlanResult planResult) {
+    ElasticRequestHandler(@NotNull IndexPlan indexPlan, @NotNull FulltextIndexPlanner.PlanResult planResult, NodeState rootState) {
         this.indexPlan = indexPlan;
         this.filter = indexPlan.getFilter();
         this.planResult = planResult;
@@ -137,6 +153,7 @@ public class ElasticRequestHandler {
         }
 
         this.propertyRestrictionQuery = pr != null ? String.valueOf(pr.first.getValue(pr.first.getType())) : null;
+        this.rootState = rootState;
     }
 
     public BoolQueryBuilder baseQuery() {
@@ -150,11 +167,19 @@ public class ElasticRequestHandler {
 
         if (propertyRestrictionQuery != null) {
             if (propertyRestrictionQuery.startsWith("mlt?")) {
-                // SimilarityImpl in oak-core sets property restriction for sim search and the query is something like
-                // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need parse this query string and turn into a query
-                // elastic can understand.
-                String mltQueryString = propertyRestrictionQuery.replace("mlt?", "");
-                boolQuery.must(moreLikeThisQuery(mltQueryString));
+                List<PropertyDefinition> sp = new LinkedList<>();
+                for (IndexDefinition.IndexingRule r : elasticIndexDefinition.getDefinedRules()) {
+                    sp.addAll(r.getSimilarityProperties());
+                }
+                String mltQueryString = propertyRestrictionQuery.substring("mlt?".length());
+                if (sp.isEmpty()) {
+                    // SimilarityImpl in oak-core sets property restriction for sim search and the query is something like
+                    // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need parse this query string and turn into a query
+                    // elastic can understand.
+                    boolQuery.must(moreLikeThisQuery(mltQueryString));
+                } else {
+                    boolQuery.must(similarityQuery(mltQueryString, sp));
+                }
 
             } else {
                 boolQuery.must(queryStringQuery(propertyRestrictionQuery));
@@ -263,6 +288,56 @@ public class ElasticRequestHandler {
                 .map(pd -> pd.name);
     }
 
+    /** Builds a bool query with one cosine-similarity script_score "should" clause per similarity property found on the stream.body node. */
+    private QueryBuilder similarityQuery(String mltQueryString, List<PropertyDefinition> sp) {
+        LOG.debug("parsing similarity query on {}", mltQueryString);
+        Map<String, String> mltParamMap = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
+        String text = mltParamMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+        BoolQueryBuilder query = boolQuery();
+        if (text != null && !sp.isEmpty()) {
+            LOG.debug("generating similarity query for {}", text);
+            NodeState targetNodeState = rootState;
+            for (String token : PathUtils.elements(text)) {
+                targetNodeState = targetNodeState.getChildNode(token);
+            }
+            if (!targetNodeState.exists()) {
+                throw new IllegalArgumentException("Could not find node " + text);
+            }
+            for (PropertyDefinition pd : sp) {
+                // resolve pd.name against the target node: descend its parent path, then read the named property as a binary
+                NodeState tempState = targetNodeState;
+                for (String token : PathUtils.elements(PathUtils.getParentPath(pd.name))) {
+                    if (token.isEmpty()) {
+                        break;
+                    }
+                    tempState = tempState.getChildNode(token);
+                }
+                PropertyState ps = tempState.getProperty(PathUtils.getName(pd.name));
+                Blob property = ps != null ? ps.getValue(Type.BINARY) : null;
+                if (property == null) {
+                    LOG.warn("Couldn't find property {} on {}", pd.name, text);
+                    continue;
+                }
+                byte[] bytes;
+                try {
+                    bytes = new BlobByteSource(property).read();
+                } catch (IOException e) {
+                    // best effort: skip this property and keep building clauses for the remaining ones
+                    LOG.error("Error reading bytes from property {} on {}", pd.name, text, e);
+                    continue;
+                }
+                String similarityPropFieldName = FieldNames.createSimilarityFieldName(pd.name);
+                Map<String, Object> paramMap = new HashMap<>();
+                paramMap.put("query_vector", toDoubles(bytes));
+                paramMap.put("field_name", similarityPropFieldName);
+                Script script = new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG,
+                        "cosineSimilarity(params.query_vector, params.field_name) + 1.0", Collections.emptyMap(), paramMap);
+                query.should(scriptScoreQuery(existsQuery(similarityPropFieldName), script));
+            }
+        }
+        return query;
+    }
+
     /*
     Generates mlt query builder from the given mltQueryString
     There could be 2 cases here -

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java Thu Oct 29 16:23:30 2020
@@ -18,13 +18,20 @@ package org.apache.jackrabbit.oak.plugin
 
 
 import org.jetbrains.annotations.NotNull;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.List;
 
 public class ElasticIndexUtils {
 
+    private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexUtils.class);
+
     /**
      * Transforms a path into an _id compatible with Elasticsearch specification. The path cannot be larger than 512
      * bytes. For performance reasons paths that are already compatible are returned untouched. Otherwise, SHA-256
@@ -46,4 +53,39 @@ public class ElasticIndexUtils {
         }
         return path;
     }
+
+    /**
+     * Decodes a byte array into the doubles it holds, reading consecutive 8-byte blocks through a
+     * {@link ByteBuffer}. Trailing bytes that do not fill a whole block are ignored after a warning is logged.
+     * @param array given byte array
+     * @return list of doubles
+     */
+    public static List<Double> toDoubles(byte[] array) {
+        final int bytesPerDouble = Double.SIZE / Byte.SIZE;
+        if (array.length % bytesPerDouble != 0) {
+            LOG.warn("Unexpected byte array length {}", array.length);
+        }
+        final int count = array.length / bytesPerDouble;
+        final ByteBuffer buffer = ByteBuffer.wrap(array);
+        final List<Double> result = new ArrayList<>(count);
+        for (int offset = 0; offset < count * bytesPerDouble; offset += bytesPerDouble) {
+            result.add(buffer.getDouble(offset));
+        }
+        return result;
+    }
+
+    /**
+     * Encodes a list of doubles as a byte array, writing 8 bytes per value through a {@link ByteBuffer};
+     * the result can be decoded again with {@link #toDoubles(byte[])}.
+     * @param values given list of doubles
+     * @return byte array
+     */
+    public static byte[] toByteArray(List<Double> values) {
+        byte[] bytes = new byte[values.size() * Double.BYTES];
+        ByteBuffer wrap = ByteBuffer.wrap(bytes);
+        for (double value : values) {
+            wrap.putDouble(value);
+        }
+        return bytes;
+    }
 }

Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java Thu Oct 29 16:23:30 2020
@@ -16,16 +16,38 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.elastic;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.indices.GetFieldMappingsRequest;
+import org.elasticsearch.client.indices.GetFieldMappingsResponse;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
 
 import static java.util.Collections.singletonList;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
 import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROPDEF_PROP_NODE_NAME;
 import static org.hamcrest.CoreMatchers.containsString;
 import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 
 public class ElasticPropertyIndexTest extends ElasticAbstractQueryTest {
 
@@ -154,4 +176,83 @@ public class ElasticPropertyIndexTest ex
                 Arrays.asList("/test/a", "/test/b")));
     }
 
+    @Test
+    public void vectorSimilarityCustomVectorSize() throws Exception {
+        // two similarity properties on the same index, each configured with its own dense vector size
+        final String indexName = "test1";
+        final String fieldName1 = "fv1";
+        final String fieldName2 = "fv2";
+        final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
+        final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
+        IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
+        builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
+                .similaritySearchDenseVectorSize(10);
+        builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
+                .similaritySearchDenseVectorSize(20);
+        Tree index = setIndex(indexName, builder);
+        root.commit();
+
+        // ask the Elasticsearch client for the field mappings of both similarity fields
+        String alias = ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
+        GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
+        fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
+        final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings = esConnection.getClient()
+                .indices().getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT).mappings();
+        assertEquals("More than one index found", 1, mappings.keySet().size());
+        final Map<String, GetFieldMappingsResponse.FieldMappingMetadata> fieldMappings = mappings.values().iterator().next();
+        @SuppressWarnings("unchecked")
+        Map<String, Integer> map1 = (Map<String, Integer>) fieldMappings.get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
+        assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
+        @SuppressWarnings("unchecked")
+        Map<String, Integer> map2 = (Map<String, Integer>) fieldMappings.get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
+        assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
+    }
+
+
+    @Test
+    public void vectorSimilarity() throws Exception {
+        IndexDefinitionBuilder builder = createIndex("fv");
+        builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
+        Tree index = setIndex("test1", builder);
+        root.commit();
+        Tree test = root.getTree("/").addChild("test");
+
+        // each fvs.csv line is "<node name>,<v1>,<v2>,..."; IOUtils.readLines leaves the stream open, so close it here
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        final List<String> lines;
+        try (FileInputStream csv = new FileInputStream(new File(uri))) {
+            lines = IOUtils.readLines(csv, Charset.defaultCharset());
+        }
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : lines) {
+            String[] split = line.split(",");
+            List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
+            byte[] bytes = toByteArray(values);
+            // sanity check: the vector must survive the byte[] round trip unchanged
+            assertEquals(values, toDoubles(bytes));
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            Tree child = test.addChild(split[0]);
+            child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+            List<String> current = new LinkedList<>();
+            assertEventually(() -> {
+                Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
+                current.clear();
+                result.forEachRemaining(current::add);
+                assertNotEquals(baseline, current);
+            });
+            baseline.clear();
+            baseline.addAll(current);
+        }
+    }
+
 }