You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by am...@apache.org on 2021/02/08 12:37:25 UTC
svn commit: r1886318 [1/3] - in /jackrabbit/oak/trunk:
oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/
oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/
oak-search-elastic/src/main...
Author: amrverma
Date: Mon Feb 8 12:37:23 2021
New Revision: 1886318
URL: http://svn.apache.org/viewvc?rev=1886318&view=rev
Log:
OAK-9339: Image Similarity: LSH based search
* Using elastiknn plugin
Added:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java
jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh
jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/org/apache/jackrabbit/oak/query/imagedata.txt
Modified:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java Mon Feb 8 12:37:23 2021
@@ -81,6 +81,12 @@ public class ElasticIndexDefinition exte
*/
private static final String INDEX_ORIGINAL_TERM = "indexOriginalTerm";
+ private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled";
+ private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;
+
+ private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
+ private static final float SIMILARITY_TAGS_BOOST_DEFAULT = 0.5f;
+
private static final Function<Integer, Boolean> isAnalyzable;
static {
@@ -97,12 +103,15 @@ public class ElasticIndexDefinition exte
public final int bulkRetries;
public final long bulkRetriesBackoff;
private final String remoteAlias;
+ private final boolean similarityTagsEnabled;
+ private final float similarityTagsBoost;
public final int numberOfShards;
public final int numberOfReplicas;
public final int[] queryFetchSizes;
private final Map<String, List<PropertyDefinition>> propertiesByName;
private final List<PropertyDefinition> dynamicBoostProperties;
+ private final List<PropertyDefinition> similarityProperties;
public ElasticIndexDefinition(NodeState root, NodeState defn, String indexPath, String indexPrefix) {
super(root, defn, determineIndexFormatVersion(defn), determineUniqueId(defn), indexPath);
@@ -114,6 +123,8 @@ public class ElasticIndexDefinition exte
this.bulkRetriesBackoff = getOptionalValue(defn, BULK_RETRIES_BACKOFF, BULK_RETRIES_BACKOFF_DEFAULT);
this.numberOfShards = getOptionalValue(defn, NUMBER_OF_SHARDS, NUMBER_OF_SHARDS_DEFAULT);
this.numberOfReplicas = getOptionalValue(defn, NUMBER_OF_REPLICAS, NUMBER_OF_REPLICAS_DEFAULT);
+ this.similarityTagsEnabled = getOptionalValue(defn, SIMILARITY_TAGS_ENABLED, SIMILARITY_TAGS_ENABLED_DEFAULT);
+ this.similarityTagsBoost = getOptionalValue(defn, SIMILARITY_TAGS_BOOST, SIMILARITY_TAGS_BOOST_DEFAULT);
this.queryFetchSizes = Arrays.stream(getOptionalValues(defn, QUERY_FETCH_SIZES, Type.LONGS, Long.class, QUERY_FETCH_SIZES_DEFAULT))
.mapToInt(Long::intValue).toArray();
@@ -128,6 +139,11 @@ public class ElasticIndexDefinition exte
.flatMap(IndexingRule::getNamePatternsProperties)
.filter(pd -> pd.dynamicBoost)
.collect(Collectors.toList());
+
+ this.similarityProperties = getDefinedRules()
+ .stream()
+ .flatMap(rule -> rule.getSimilarityProperties().stream())
+ .collect(Collectors.toList());
}
/**
@@ -147,6 +163,18 @@ public class ElasticIndexDefinition exte
return dynamicBoostProperties;
}
+ /**
+  * Returns the similarity property definitions of this index. The list is built once in the
+  * constructor by concatenating {@code getSimilarityProperties()} of every defined indexing rule.
+  *
+  * @return the list held by this definition (returned as-is, not copied)
+  */
+ public List<PropertyDefinition> getSimilarityProperties() {
+ return similarityProperties;
+ }
+
+ /**
+  * Tells whether similarity tags take part in similarity queries. The value is read from the
+  * {@code similarityTagsEnabled} property of the index definition and defaults to {@code true}.
+  *
+  * @return {@code true} if the similarity-tags clause should be added to similarity queries
+  */
+ public boolean areSimilarityTagsEnabled() {
+ return similarityTagsEnabled;
+ }
+
+ /**
+  * Returns the boost applied to the similarity-tags clause of a similarity query. The value is
+  * read from the {@code similarityTagsBoost} property of the index definition and defaults to
+  * {@code 0.5}.
+  *
+  * @return the configured boost
+  */
+ public float getSimilarityTagsBoost() {
+ return similarityTagsBoost;
+ }
+
/**
* Returns the keyword field name mapped in Elasticsearch for the specified property name.
* @param propertyName the property name in the index rules
@@ -189,6 +217,11 @@ public class ElasticIndexDefinition exte
return getOptionalValue(analyzersTree, INDEX_ORIGINAL_TERM, false);
}
+ /**
+  * Creates the property definition for {@code name} as an {@link ElasticPropertyDefinition}.
+  * Code in this commit (ElasticIndexHelper, ElasticRequestHandler) casts similarity property
+  * definitions to that type, so the instances created here must be of it.
+  */
+ @Override
+ protected PropertyDefinition createPropertyDefinition(IndexDefinition.IndexingRule rule, String name, NodeState nodeState) {
+ return new ElasticPropertyDefinition(rule, name, nodeState);
+ }
+
/**
* Class to help with {@link ElasticIndexDefinition} creation.
* The built object represents the index definition only without the node structure.
Added: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java?rev=1886318&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java (added)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java Mon Feb 8 12:37:23 2021
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.elastic;
+
+import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+
+import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;
+
+/**
+ * Elasticsearch-specific {@link PropertyDefinition} that additionally reads the parameters used
+ * for vector similarity search (see https://elastiknn.com/api) from the property definition node.
+ * <p>
+ * The parameters are read only when {@code useInSimilarity} is set on the property; for every
+ * other property {@link #getSimilaritySearchParameters()} returns {@code null}.
+ */
+public class ElasticPropertyDefinition extends PropertyDefinition {
+
+    /** Property selecting the query model; defaults to {@code "lsh"}. */
+    public static final String PROP_QUERY_MODEL = "queryModel";
+    /** Property holding the number of hash tables ({@code L}); defaults to 20. */
+    public static final String PROP_NUMBER_OF_HASH_TABLES = "L";
+    /** Property holding the number of hash functions ({@code k}); defaults to 15. */
+    public static final String PROP_NUMBER_OF_HASH_FUNCTIONS = "k";
+    /** Property holding the integer bucket width ({@code w}); defaults to 500. */
+    public static final String PROP_NUMBER_OF_BUCKETS = "w";
+    /** Property selecting the similarity function written to the index mapping; defaults to {@code "l2"}. */
+    public static final String PROP_INDEX_SIMILARITY = "indexSimilarity";
+    /** Property selecting the similarity function sent with queries; defaults to {@code "l2"}. */
+    public static final String PROP_QUERY_SIMILARITY = "querySimilarity";
+    /** Property holding the number of query candidates; defaults to 500. */
+    public static final String PROP_CANDIDATES = "candidates";
+    /** Property holding the number of query probes; defaults to 3. */
+    public static final String PROP_PROBES = "probes";
+
+    private static final int DEFAULT_NUMBER_OF_HASH_TABLES = 20;
+    private static final int DEFAULT_NO_OF_HASH_FUNCTIONS = 15;
+    private static final int DEFAULT_BUCKET_WIDTH = 500;
+    private static final String DEFAULT_SIMILARITY_QUERY_MODEL = "lsh";
+    private static final String DEFAULT_SIMILARITY_INDEX_FUNCTION = "l2";
+    private static final String DEFAULT_SIMILARITY_QUERY_FUNCTION = "l2";
+    private static final int DEFAULT_QUERY_CANDIDATES = 500;
+    private static final int DEFAULT_QUERY_PROBES = 3;
+
+    /**
+     * Similarity search parameters, or {@code null} when the property is not used in similarity.
+     * Assigned exactly once by the constructor.
+     */
+    final SimilaritySearchParameters similaritySearchParameters;
+
+    /**
+     * Creates the property definition and, if the property is used in similarity, reads its
+     * similarity search parameters from {@code defn}.
+     *
+     * @param idxDefn  the indexing rule this property belongs to
+     * @param nodeName the name of the property definition node
+     * @param defn     the property definition node
+     */
+    public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) {
+        super(idxDefn, nodeName, defn);
+        this.similaritySearchParameters = this.useInSimilarity ? readSimilaritySearchParameters(defn) : null;
+    }
+
+    /**
+     * Reads every similarity search parameter from {@code defn}, substituting the class default
+     * for each property that is not set.
+     */
+    private static SimilaritySearchParameters readSimilaritySearchParameters(NodeState defn) {
+        return new SimilaritySearchParameters(
+                getOptionalValue(defn, PROP_NUMBER_OF_HASH_TABLES, DEFAULT_NUMBER_OF_HASH_TABLES),
+                getOptionalValue(defn, PROP_NUMBER_OF_HASH_FUNCTIONS, DEFAULT_NO_OF_HASH_FUNCTIONS),
+                getOptionalValue(defn, PROP_NUMBER_OF_BUCKETS, DEFAULT_BUCKET_WIDTH),
+                getOptionalValue(defn, PROP_QUERY_MODEL, DEFAULT_SIMILARITY_QUERY_MODEL),
+                getOptionalValue(defn, PROP_INDEX_SIMILARITY, DEFAULT_SIMILARITY_INDEX_FUNCTION),
+                getOptionalValue(defn, PROP_QUERY_SIMILARITY, DEFAULT_SIMILARITY_QUERY_FUNCTION),
+                getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_QUERY_CANDIDATES),
+                getOptionalValue(defn, PROP_PROBES, DEFAULT_QUERY_PROBES));
+    }
+
+    /**
+     * Class for defining parameters for similarity search based on https://elastiknn.com/api.
+     * For all possible models and query combinations, see https://elastiknn.com/api/#model-and-query-compatibility
+     */
+    public static class SimilaritySearchParameters {
+
+        /**
+         * Number of hash tables. Generally, increasing this value increases recall.
+         */
+        private final int L;
+        /**
+         * Number of hash functions combined to form a single hash value. Generally, increasing this value increases precision.
+         */
+        private final int k;
+        /**
+         * Integer bucket width.
+         */
+        private final int w;
+        /**
+         * Model used at query time. Possible values - lsh, exact
+         */
+        private final String queryModel;
+        /**
+         * Similarity function sent with the query.
+         * Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model)
+         */
+        private final String queryTimeSimilarityFunction;
+        /**
+         * Similarity function written to the index mapping.
+         * Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model)
+         */
+        private final String indexTimeSimilarityFunction;
+        /**
+         * Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. The candidates parameter
+         * controls the number of exact similarity computations. Specifically, we compute exact similarity for the top candidates candidate vectors
+         * in each segment. As a reminder, each Elasticsearch index has >= 1 shards, and each shard has >= 1 segments. That means if you set
+         * "candidates": 200 for an index with 2 shards, each with 3 segments, then you'll compute the exact similarity for 2 * 3 * 200 = 1200 vectors.
+         * candidates must be set to a number greater or equal to the number of Elasticsearch results you want to get. Higher values generally mean
+         * higher recall and higher latency.
+         */
+        private final int candidates;
+        /**
+         * Number of probes for using the multiprobe search technique. The elastiknn default is zero; the enclosing class uses 3 when the
+         * property is not set. Max value is 3^k. Generally, increasing probes will increase recall, will allow you to use a smaller value
+         * for L with comparable recall, but introduces some additional computation at query time.
+         */
+        private final int probes;
+
+        /**
+         * @param l                           number of hash tables
+         * @param k                           number of hash functions
+         * @param w                           integer bucket width
+         * @param queryModel                  model used at query time
+         * @param indexTimeSimilarityFunction similarity function written to the index mapping
+         * @param queryTimeSimilarityFunction similarity function sent with the query
+         * @param candidates                  number of candidates per segment
+         * @param probes                      number of probes
+         */
+        public SimilaritySearchParameters(int l, int k, int w, String queryModel, String indexTimeSimilarityFunction,
+                                          String queryTimeSimilarityFunction, int candidates, int probes) {
+            this.L = l;
+            this.k = k;
+            this.w = w;
+            this.queryModel = queryModel;
+            this.indexTimeSimilarityFunction = indexTimeSimilarityFunction;
+            this.queryTimeSimilarityFunction = queryTimeSimilarityFunction;
+            this.candidates = candidates;
+            this.probes = probes;
+        }
+
+        public int getL() {
+            return L;
+        }
+
+        public int getK() {
+            return k;
+        }
+
+        public int getW() {
+            return w;
+        }
+
+        public String getQueryModel() {
+            return queryModel;
+        }
+
+        public String getQueryTimeSimilarityFunction() {
+            return queryTimeSimilarityFunction;
+        }
+
+        public String getIndexTimeSimilarityFunction() {
+            return indexTimeSimilarityFunction;
+        }
+
+        public int getCandidates() {
+            return candidates;
+        }
+
+        public int getProbes() {
+            return probes;
+        }
+    }
+
+    /**
+     * @return the similarity search parameters, or {@code null} if this property is not used in similarity
+     */
+    public SimilaritySearchParameters getSimilaritySearchParameters() {
+        return similaritySearchParameters;
+    }
+}
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java Mon Feb 8 12:37:23 2021
@@ -18,6 +18,7 @@ package org.apache.jackrabbit.oak.plugin
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.elasticsearch.client.indices.CreateIndexRequest;
@@ -33,8 +34,7 @@ import java.util.stream.Collectors;
* Provides utility functions around Elasticsearch indexing
*/
class ElasticIndexHelper {
-
- private static final String ES_DENSE_VECTOR_TYPE = "dense_vector";
+
private static final String ES_DENSE_VECTOR_DIM_PROP = "dims";
public static CreateIndexRequest createIndexRequest(String remoteIndexName, ElasticIndexDefinition indexDefinition) throws IOException {
@@ -63,6 +63,9 @@ class ElasticIndexHelper {
private static XContentBuilder loadSettings(ElasticIndexDefinition indexDefinition) throws IOException {
final XContentBuilder settingsBuilder = XContentFactory.jsonBuilder();
settingsBuilder.startObject();
+ if (indexDefinition.getSimilarityProperties().size() > 0) {
+ settingsBuilder.field("elastiknn", true);
+ }
settingsBuilder.field("number_of_shards", indexDefinition.numberOfShards);
settingsBuilder.field("number_of_replicas", indexDefinition.numberOfReplicas);
{
@@ -154,11 +157,8 @@ class ElasticIndexHelper {
for (Map.Entry<String, List<PropertyDefinition>> entry : indexDefinition.getPropertiesByName().entrySet()) {
final String name = entry.getKey();
final List<PropertyDefinition> propertyDefinitions = entry.getValue();
-
Type<?> type = null;
boolean useInSpellCheck = false;
- boolean useInSimilarity = false;
- int denseVectorSize = -1;
for (PropertyDefinition pd : propertyDefinitions) {
type = Type.fromTag(pd.getType(), false);
if (pd.useInSpellcheck) {
@@ -167,10 +167,6 @@ class ElasticIndexHelper {
if (pd.useInSuggest) {
useInSuggest = true;
}
- if (pd.useInSimilarity) {
- useInSimilarity = true;
- denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
- }
}
mappingBuilder.startObject(name);
@@ -213,13 +209,6 @@ class ElasticIndexHelper {
}
}
mappingBuilder.endObject();
-
- if (useInSimilarity) {
- mappingBuilder.startObject(FieldNames.createSimilarityFieldName(name));
- mappingBuilder.field("type", ES_DENSE_VECTOR_TYPE);
- mappingBuilder.field(ES_DENSE_VECTOR_DIM_PROP, denseVectorSize);
- mappingBuilder.endObject();
- }
}
if (useInSuggest) {
@@ -255,6 +244,26 @@ class ElasticIndexHelper {
}
mappingBuilder.endObject();
}
+ mappingBuilder.endObject();
+ }
+
+ for (PropertyDefinition propertyDefinition : indexDefinition.getSimilarityProperties()) {
+ ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition;
+ int denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
+ mappingBuilder.startObject(FieldNames.createSimilarityFieldName(pd.name));
+ {
+ mappingBuilder.field("type", "elastiknn_dense_float_vector");
+ mappingBuilder.startObject("elastiknn");
+ {
+ mappingBuilder.field(ES_DENSE_VECTOR_DIM_PROP, denseVectorSize);
+ mappingBuilder.field("model", "lsh");
+ mappingBuilder.field("similarity", pd.getSimilaritySearchParameters().getIndexTimeSimilarityFunction());
+ mappingBuilder.field("L", pd.getSimilaritySearchParameters().getL());
+ mappingBuilder.field("k", pd.getSimilaritySearchParameters().getK());
+ mappingBuilder.field("w", pd.getSimilaritySearchParameters().getW());
+ }
+ mappingBuilder.endObject();
+ }
mappingBuilder.endObject();
}
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java Mon Feb 8 12:37:23 2021
@@ -23,6 +23,7 @@ import org.apache.jackrabbit.oak.api.Pro
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets.ElasticFacetProvider;
import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
@@ -46,6 +47,9 @@ import org.apache.jackrabbit.oak.spi.que
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.join.ScoreMode;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.client.Request;
import org.elasticsearch.common.xcontent.XContentHelper;
@@ -60,11 +64,9 @@ import org.elasticsearch.index.query.Nes
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.index.query.functionscore.ScriptScoreQueryBuilder;
+import org.elasticsearch.index.query.WrapperQueryBuilder;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.elasticsearch.index.search.MatchQuery;
-import org.elasticsearch.script.Script;
-import org.elasticsearch.script.ScriptType;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.builder.SearchSourceBuilder;
@@ -84,9 +86,7 @@ import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.HashMap;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
@@ -117,7 +117,6 @@ import static org.apache.jackrabbit.util
import static org.elasticsearch.common.xcontent.ToXContent.EMPTY_PARAMS;
import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
-import static org.elasticsearch.index.query.QueryBuilders.existsQuery;
import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
@@ -125,8 +124,8 @@ import static org.elasticsearch.index.qu
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
-import static org.elasticsearch.index.query.QueryBuilders.scriptScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
+import static org.elasticsearch.index.query.QueryBuilders.wrapperQuery;
/**
* Class to map query plans into Elastic request objects.
@@ -176,15 +175,12 @@ public class ElasticRequestHandler {
if (propertyRestrictionQuery != null) {
if (propertyRestrictionQuery.startsWith("mlt?")) {
- List<PropertyDefinition> sp = new LinkedList<>();
- for (IndexDefinition.IndexingRule r : elasticIndexDefinition.getDefinedRules()) {
- sp.addAll(r.getSimilarityProperties());
- }
+ List<PropertyDefinition> sp = elasticIndexDefinition.getSimilarityProperties();
String mltQueryString = propertyRestrictionQuery.substring("mlt?".length());
Map<String, String> mltParams = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
- String text = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+ String queryNodePath = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
- if (text == null) {
+ if (queryNodePath == null) {
// TODO : See if we might want to support like Text here (passed as null in above constructors)
// IT is not supported in our lucene implementation.
throw new IllegalArgumentException("Missing required field stream.body in MLT query: " + mltQueryString);
@@ -201,13 +197,15 @@ public class ElasticRequestHandler {
.minTermFreq(1).minDocFreq(1)
);
} else {
- boolQuery.must(similarityQuery(text, sp));
- // add should clause to improve relevance using similarity tags
- boolQuery.should(moreLikeThisQuery(
- new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null,
- new Item[]{new Item(null, ElasticIndexUtils.idFromPath(text))})
- .minTermFreq(1).minDocFreq(1)
- );
+ boolQuery.must(similarityQuery(queryNodePath, sp));
+ if (elasticIndexDefinition.areSimilarityTagsEnabled()) {
+ // add should clause to improve relevance using similarity tags
+ boolQuery.should(moreLikeThisQuery(
+ new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null,
+ new Item[]{new Item(null, ElasticIndexUtils.idFromPath(queryNodePath))})
+ .minTermFreq(1).minDocFreq(1).boost(elasticIndexDefinition.getSimilarityTagsBoost())
+ );
+ }
}
} else {
boolQuery.must(queryStringQuery(propertyRestrictionQuery));
@@ -352,7 +350,8 @@ public class ElasticRequestHandler {
if (!targetNodeState.exists()) {
throw new IllegalArgumentException("Could not find node " + text);
}
- for (PropertyDefinition pd : sp) {
+ for (PropertyDefinition propertyDefinition : sp) {
+ ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition;
String propertyPath = PathUtils.getParentPath(pd.name);
String propertyName = PathUtils.getName(pd.name);
NodeState tempState = targetNodeState;
@@ -376,13 +375,36 @@ public class ElasticRequestHandler {
continue;
}
String similarityPropFieldName = FieldNames.createSimilarityFieldName(pd.name);
- Map<String, Object> paramMap = new HashMap<>();
- paramMap.put("query_vector", toDoubles(bytes));
- paramMap.put("field_name", similarityPropFieldName);
- ScriptScoreQueryBuilder scriptScoreQueryBuilder = scriptScoreQuery(existsQuery(similarityPropFieldName),
- new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG, "cosineSimilarity(params.query_vector, params.field_name) + 1.0",
- Collections.emptyMap(), paramMap));
- query.should(scriptScoreQueryBuilder);
+ try {
+ XContentBuilder contentBuilder = JsonXContent.contentBuilder();
+ contentBuilder.startObject();
+ contentBuilder.field("elastiknn_nearest_neighbors");
+ contentBuilder.startObject();
+ {
+ contentBuilder.field("field", similarityPropFieldName);
+ contentBuilder.field("vec");
+ contentBuilder.startObject();
+ {
+ contentBuilder.field("values");
+ contentBuilder.startArray();
+ for (Double d : toDoubles(bytes)) {
+ contentBuilder.value(d);
+ }
+ contentBuilder.endArray();
+ }
+ contentBuilder.endObject();
+ contentBuilder.field("model", pd.getSimilaritySearchParameters().getQueryModel());
+ contentBuilder.field("similarity", pd.getSimilaritySearchParameters().getQueryTimeSimilarityFunction());
+ contentBuilder.field("candidates", pd.getSimilaritySearchParameters().getCandidates());
+ contentBuilder.field("probes", pd.getSimilaritySearchParameters().getProbes());
+ }
+ contentBuilder.endObject();
+ contentBuilder.endObject();
+ WrapperQueryBuilder wqb = wrapperQuery(Strings.toString(contentBuilder));
+ query.should(wqb);
+ } catch (IOException e){
+ LOG.error("Could not create similarity query ", e);
+ }
}
}
return query;
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticConnectionRule.java Mon Feb 8 12:37:23 2021
@@ -17,6 +17,11 @@
package org.apache.jackrabbit.oak.plugins.index.elastic;
import com.github.dockerjava.api.DockerClient;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.jackrabbit.oak.commons.IOUtils;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.client.RequestOptions;
@@ -27,10 +32,17 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testcontainers.DockerClientFactory;
import org.testcontainers.elasticsearch.ElasticsearchContainer;
+import org.testcontainers.utility.MountableFile;
+import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
+import java.security.DigestInputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import static org.junit.Assume.assumeNotNull;
@@ -43,6 +55,7 @@ public class ElasticConnectionRule exten
private ElasticConnection elasticConnection;
private final String elasticConnectionString;
private static final String INDEX_PREFIX = "ElasticTest_";
+ private static final String PLUGIN_DIGEST = "c4451aa794641dd3c9b0fdc64b553b71ca2f9a44689a7784b51669b5e557046d";
private static boolean useDocker = false;
public ElasticConnectionRule(String elasticConnectionString) {
@@ -68,10 +81,16 @@ public class ElasticConnectionRule exten
public Statement apply(Statement base, Description description) {
Statement s = super.apply(base, description);
// see if docker is to be used or not... initialize docker rule only if that's the case.
-
+ final String pluginVersion = "7.10.1.0";
+ final String pluginFileName = "elastiknn-" + pluginVersion + ".zip";
+ final String localPluginPath = "target/" + pluginFileName;
+ downloadSimilaritySearchPluginIfNotExists(localPluginPath, pluginVersion);
if (elasticConnectionString == null || getElasticConnectionFromString() == null) {
checkIfDockerClientAvailable();
- elastic = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:" + Version.CURRENT);
+ elastic = new ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:" + Version.CURRENT)
+ .withCopyFileToContainer(MountableFile.forHostPath(localPluginPath), "/tmp/plugins/" + pluginFileName)
+ .withCopyFileToContainer(MountableFile.forClasspathResource("elasticstartscript.sh"), "/tmp/elasticstartscript.sh")
+ .withCommand("bash /tmp/elasticstartscript.sh");
s = elastic.apply(s, description);
setUseDocker(true);
}
@@ -83,6 +102,38 @@ public class ElasticConnectionRule exten
//TODO: See if something needs to be cleaned up at test class level ??
}
+ /**
+  * Downloads the elastiknn plugin archive for {@code pluginVersion} to {@code localPluginPath}
+  * unless a file already exists there, and verifies its SHA-256 digest against
+  * {@code PLUGIN_DIGEST}.
+  * <p>
+  * The file is removed again when the download fails or the digest does not match, so that a
+  * later run does not pick up a truncated or tampered archive just because the file exists.
+  *
+  * @param localPluginPath path of the local plugin file
+  * @param pluginVersion   elastiknn release to download
+  * @throws RuntimeException if the download fails or the digest of the downloaded file is unexpected
+  */
+ private void downloadSimilaritySearchPluginIfNotExists(String localPluginPath, String pluginVersion) {
+     File pluginFile = new File(localPluginPath);
+     if (pluginFile.exists()) {
+         return;
+     }
+     LOG.info("Plugin file {} doesn't exist. Trying to download.", localPluginPath);
+     String downloadedDigest;
+     try (CloseableHttpClient client = HttpClients.createDefault()) {
+         HttpGet get = new HttpGet("https://github.com/alexklibisz/elastiknn/releases/download/" + pluginVersion
+                 + "/elastiknn-" + pluginVersion + ".zip");
+         MessageDigest messageDigest = MessageDigest.getInstance("SHA-256");
+         try (CloseableHttpResponse response = client.execute(get)) {
+             int status = response.getStatusLine().getStatusCode();
+             if (status != 200 || response.getEntity() == null) {
+                 // fail early instead of writing an error page to disk and reporting a digest mismatch
+                 throw new IOException("Unexpected response status " + status + " for " + get.getURI());
+             }
+             // all streams are closed before the digest is evaluated and before any delete is attempted
+             try (InputStream inputStream = response.getEntity().getContent();
+                  DigestInputStream dis = new DigestInputStream(inputStream, messageDigest);
+                  FileOutputStream outputStream = new FileOutputStream(pluginFile)) {
+                 IOUtils.copy(dis, outputStream);
+             }
+         }
+         // bytes to hex
+         StringBuilder result = new StringBuilder();
+         for (byte b : messageDigest.digest()) {
+             result.append(String.format("%02x", b));
+         }
+         downloadedDigest = result.toString();
+     } catch (IOException | NoSuchAlgorithmException e) {
+         // do not leave a partial download behind: its mere existence would skip the next download
+         if (pluginFile.exists() && !pluginFile.delete()) {
+             LOG.warn("Could not delete partially downloaded plugin file {}", localPluginPath);
+         }
+         throw new RuntimeException("Could not download similarity search plugin", e);
+     }
+     if (!PLUGIN_DIGEST.equals(downloadedDigest)) {
+         String deleteString = "Downloaded plugin file deleted.";
+         if (!pluginFile.delete()) {
+             deleteString = "Could not delete downloaded plugin file.";
+         }
+         throw new RuntimeException("Plugin digest unequal. Found " + downloadedDigest + ". Expected " + PLUGIN_DIGEST + ". " + deleteString);
+     }
+ }
+
public ElasticConnection getElasticConnectionFromString() {
if (elasticConnection == null) {
try {
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java?rev=1886318&r1=1886317&r2=1886318&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java Mon Feb 8 12:37:23 2021
@@ -33,13 +33,18 @@ import java.io.File;
import java.io.FileInputStream;
import java.net.URI;
import java.nio.charset.Charset;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
@@ -224,35 +229,34 @@ public class ElasticSimilarQueryTest ext
}
@Test
- public void vectorSimilarityCustomVectorSize() throws Exception {
+ public void vectorSimilarityElastiknnIndexConfiguration() throws Exception {
final String indexName = "test1";
final String fieldName1 = "fv1";
- final String fieldName2 = "fv2";
final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
- final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
- IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
- builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
- .similaritySearchDenseVectorSize(10);
- builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
- .similaritySearchDenseVectorSize(20);
+ IndexDefinitionBuilder builder = createIndex(fieldName1);
+ Tree tree = builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
+ .similaritySearchDenseVectorSize(2048).getBuilderTree();
+ tree.setProperty(ElasticPropertyDefinition.PROP_INDEX_SIMILARITY, "angular");
+ tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_TABLES, 10);
+ tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_FUNCTIONS, 12);
+
setIndex(indexName, builder);
root.commit();
String alias = ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
- fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
+ fieldMappingsRequest.indices(alias).fields(similarityFieldName1);
GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
mappingsResponse.mappings();
assertEquals("More than one index found", 1, mappings.keySet().size());
@SuppressWarnings("unchecked")
- Map<String, Integer> map1 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
- get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
- assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
- @SuppressWarnings("unchecked")
- Map<String, Integer> map2 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
- get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
- assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
+ Map<String, Object> map1 = (Map<String, Object>)(((Map<String, Object>)mappings.entrySet().iterator().next().getValue().
+ get(similarityFieldName1).sourceAsMap().get(similarityFieldName1)).get("elastiknn"));
+ assertEquals("Dense vector size doesn't match", 2048, (int)map1.get("dims"));
+ assertEquals("Similarity doesn't match", "angular", map1.get("similarity"));
+ assertEquals("Number of hash tables doesn't match", 10, map1.get("L")); // "L" should reflect PROP_NUMBER_OF_HASH_TABLES set above
+ assertEquals("Number of hash functions doesn't match", 12, map1.get("k")); // "k" should reflect PROP_NUMBER_OF_HASH_FUNCTIONS set above
}
@@ -302,6 +306,101 @@ public class ElasticSimilarQueryTest ext
}
}
+    /** Adds a child of test named imageName whose binary "fv" property holds the comma-separated vector fv. */
+    private void createNodeWithFV(String imageName, String fv, Tree test) throws Exception {
+        List<Double> parsed = Arrays.stream(fv.split(",")).map(Double::parseDouble).collect(Collectors.toList());
+        byte[] encoded = toByteArray(parsed);
+        // the byte encoding must decode back to the same doubles before it is stored
+        List<Double> decoded = toDoubles(encoded);
+        assertEquals(parsed, decoded);
+        Blob featureBlob = root.createBlob(new ByteArrayInputStream(encoded));
+        test.addChild(imageName).setProperty("fv", featureBlob, Type.BINARY);
+    }
+
+    /**
+     * Reads the next block from the scanner and indexes it: the first non-empty line is the query
+     * image name and the line after it is that image's feature vector. Up to similarResultCount
+     * further name/vector pairs follow; each becomes a node under test and its name is recorded
+     * in expectedResults under the query image name. Returns without changes at end of input.
+     */
+    private void indexEntry(Scanner scanner, Tree test, Map<String, List<String>> expectedResults, int similarResultCount) throws Exception {
+        String queryImage = "";
+        //skip empty lines at the beginning
+        while (queryImage.isEmpty() && scanner.hasNextLine()) {
+            queryImage = scanner.nextLine();
+        }
+        if (queryImage.isEmpty()) {
+            // complete file read
+            return;
+        }
+        List<String> expectedMatches = new ArrayList<>();
+        expectedResults.put(queryImage, expectedMatches);
+        createNodeWithFV(queryImage, scanner.nextLine(), test);
+        int indexed = 0;
+        while (scanner.hasNextLine() && indexed < similarResultCount) {
+            String resultImage = scanner.nextLine();
+            if (resultImage.isEmpty()) {
+                continue; // blank line between the query image and its results
+            }
+            indexed++;
+            createNodeWithFV(resultImage, scanner.nextLine(), test);
+            expectedMatches.add(resultImage);
+        }
+    }
+
+    /**
+     * For every query image, runs a similar() query and asserts that all of its expected matches are within the first expectedList.size() rows.
+     */
+    private void verifyLSHResults(Map<String, List<String>> expectedResults) {
+        for (String similarPath : expectedResults.keySet()) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + "/test/" + similarPath + "')";
+            assertEventually(() -> {
+                Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
+                List<String> expectedList = expectedResults.get(similarPath.substring(similarPath.lastIndexOf("/") + 1));
+                Set<String> found = new HashSet<>();
+                // hasNext() guard: a short result list must fail the assertion below, not throw NoSuchElementException
+                for (int resultNum = 0; resultNum < expectedList.size() && result.hasNext(); resultNum++) {
+                    String next = result.next();
+                    found.add(next.substring(next.lastIndexOf("/") + 1));
+                }
+                double per = (expectedList.stream().filter(found::contains).count() * 100.0)/expectedList.size();
+                assertEquals(100.0, per, 0.0);
+            });
+        }
+    }
+
+    @Test
+    public void vectorSimilarityLargeData() throws Exception {
+        final int similarImageCount = 10;
+        IndexDefinitionBuilder builder = createIndex("fv");
+        builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
+        setIndex("test1", builder);
+        root.commit();
+        Tree test = root.getTree("/").addChild("test");
+        /*
+        Image names and their feature vectors are written in this file with the image name first and its feature vector
+        in the line below.
+        This file contains test data in form of blocks and each block has following format -
+        Line 1: Query_Image_Name
+        Line 2: Feature Vector of Query_Image
+        Line 3: EMPTY_LINE
+        Lines 4-23: 10 Result images and their feature vectors
+        Line 24: EMPTY_LINE
+        Then this pattern repeats again with next Query Image name in line 25.
+         */
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/imagedata.txt").toURI();
+        File inputFile = new File(uri);
+        Map<String, List<String>> expectedResults = new HashMap<>();
+        // try-with-resources closes the Scanner and its underlying file even when indexing throws
+        try (Scanner scanner = new Scanner(inputFile)) {
+            while (scanner.hasNextLine()) {
+                indexEntry(scanner, test, expectedResults, similarImageCount);
+            }
+        }
+        root.commit();
+        verifyLSHResults(expectedResults);
+    }
+
private void createIndex(boolean nativeQuery) throws Exception {
IndexDefinitionBuilder builder = createIndex("text", "tags");
if (nativeQuery) {
Added: jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh?rev=1886318&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh (added)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/elasticstartscript.sh Mon Feb 8 12:37:23 2021
@@ -0,0 +1,4 @@
+pluginZip=$(ls /tmp/plugins | grep elastiknn-7.10 | head -n 1)  # first matching plugin archive in /tmp/plugins
+echo "Installing plugin /tmp/plugins/$pluginZip"
+bin/elasticsearch-plugin install --batch "file:///tmp/plugins/$pluginZip"  # quoted to avoid word splitting/globbing
+su -c "bin/elasticsearch" elasticsearch
\ No newline at end of file