You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by am...@apache.org on 2020/10/29 16:23:30 UTC
svn commit: r1882970 [1/3] - in /jackrabbit/oak/trunk:
oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/
oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/
oak-search-elastic/src/main/java/org...
Author: amrverma
Date: Thu Oct 29 16:23:30 2020
New Revision: 1882970
URL: http://svn.apache.org/viewvc?rev=1882970&view=rev
Log:
OAK-9213 - Support feature vector similarity / image similarity in Oak ES
Added:
jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/org/apache/jackrabbit/oak/query/
jackrabbit/oak/trunk/oak-search-elastic/src/test/resources/org/apache/jackrabbit/oak/query/fvs.csv
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/query/FulltextIndex.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java Thu Oct 29 16:23:30 2020
@@ -776,7 +776,7 @@ public class LucenePropertyIndex extends
}
@Override
- protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode) {
+ protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode, NodeState root) {
return getLuceneRequest(plan, augmentorFactory, null).toString();
}
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java Thu Oct 29 16:23:30 2020
@@ -16,8 +16,10 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.index;
+import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
+import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
@@ -32,6 +34,8 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
+
class ElasticDocument {
private static final Logger LOG = LoggerFactory.getLogger(ElasticDocument.class);
@@ -41,6 +45,7 @@ class ElasticDocument {
private final List<String> notNullProps;
private final List<String> nullProps;
private final Map<String, Object> properties;
+ private final Map<String, Object> similarityFields;
private final Map<String, Map<String, Double>> dynamicBoostFields;
ElasticDocument(String path) {
@@ -50,6 +55,7 @@ class ElasticDocument {
this.notNullProps = new ArrayList<>();
this.nullProps = new ArrayList<>();
this.properties = new HashMap<>();
+ this.similarityFields = new HashMap<>();
this.dynamicBoostFields = new HashMap<>();
}
@@ -81,6 +87,11 @@ class ElasticDocument {
properties.put(fieldName, value);
}
+ /**
+  * Stores a similarity (feature vector) field on this document.
+  * The blob content is read in full, decoded with {@code toDoubles} and kept under the field name
+  * returned by {@code FieldNames.createSimilarityFieldName(name)}.
+  *
+  * @param name  property name from which the similarity field name is derived
+  * @param value blob whose bytes are decoded into the stored list of doubles
+  * @throws IOException if the blob content cannot be read
+  */
+ void addSimilarityField(String name, Blob value) throws IOException {
+     // Read the blob first so that a read failure leaves the document untouched.
+     final byte[] rawVector = new BlobByteSource(value).read();
+     final String similarityFieldName = FieldNames.createSimilarityFieldName(name);
+     similarityFields.put(similarityFieldName, toDoubles(rawVector));
+ }
+
void indexAncestors(String path) {
String parPath = PathUtils.getParentPath(path);
int depth = PathUtils.getDepth(path);
@@ -117,6 +128,9 @@ class ElasticDocument {
if (nullProps.size() > 0) {
builder.field(FieldNames.NULL_PROPS, nullProps);
}
+ for (Map.Entry<String, Object> simProp: similarityFields.entrySet()) {
+ builder.field(simProp.getKey(), simProp.getValue());
+ }
for (Map.Entry<String, Object> prop : properties.entrySet()) {
builder.field(prop.getKey(), prop.getValue());
}
@@ -148,4 +162,5 @@ class ElasticDocument {
public String toString() {
return build();
}
+
}
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java Thu Oct 29 16:23:30 2020
@@ -178,9 +178,11 @@ class ElasticDocumentMaker extends Fullt
@Override
protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
- // TODO : not implemented
// see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
// see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
+
+ doc.addSimilarityField(pd.name, blob);
+
}
@Override
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java Thu Oct 29 16:23:30 2020
@@ -34,6 +34,9 @@ import java.util.stream.Collectors;
*/
class ElasticIndexHelper {
+ private static final String ES_DENSE_VECTOR_TYPE = "dense_vector";
+ private static final String ES_DENSE_VECTOR_DIM_PROP = "dims";
+
public static CreateIndexRequest createIndexRequest(String remoteIndexName, ElasticIndexDefinition indexDefinition) throws IOException {
final CreateIndexRequest request = new CreateIndexRequest(remoteIndexName);
@@ -152,6 +155,8 @@ class ElasticIndexHelper {
Type<?> type = null;
boolean useInSpellCheck = false;
+ boolean useInSimilarity = false;
+ int denseVectorSize = -1;
for (PropertyDefinition pd : propertyDefinitions) {
type = Type.fromTag(pd.getType(), false);
if (pd.useInSpellcheck) {
@@ -160,6 +165,10 @@ class ElasticIndexHelper {
if (pd.useInSuggest) {
useInSuggest = true;
}
+ if (pd.useInSimilarity) {
+ useInSimilarity = true;
+ denseVectorSize = pd.getSimilaritySearchDenseVectorSize();
+ }
}
mappingBuilder.startObject(name);
@@ -202,6 +211,13 @@ class ElasticIndexHelper {
}
}
mappingBuilder.endObject();
+
+ if (useInSimilarity) {
+ mappingBuilder.startObject(FieldNames.createSimilarityFieldName(name));
+ mappingBuilder.field("type", ES_DENSE_VECTOR_TYPE);
+ mappingBuilder.field(ES_DENSE_VECTOR_DIM_PROP, denseVectorSize);
+ mappingBuilder.endObject();
+ }
}
if (useInSuggest) {
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java Thu Oct 29 16:23:30 2020
@@ -96,8 +96,8 @@ class ElasticIndex extends FulltextIndex
}
@Override
- protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode) {
- return Strings.toString(new ElasticRequestHandler(plan, getPlanResult(plan)).baseQuery());
+ protected String getFulltextRequestString(IndexPlan plan, IndexNode indexNode, NodeState rootState) {
+ return Strings.toString(new ElasticRequestHandler(plan, getPlanResult(plan), rootState).baseQuery());
}
@Override
@@ -105,7 +105,7 @@ class ElasticIndex extends FulltextIndex
final Filter filter = plan.getFilter();
final FulltextIndexPlanner.PlanResult planResult = getPlanResult(plan);
- final ElasticRequestHandler requestHandler = new ElasticRequestHandler(plan, planResult);
+ final ElasticRequestHandler requestHandler = new ElasticRequestHandler(plan, planResult, rootState);
final ElasticResponseHandler responseHandler = new ElasticResponseHandler(planResult, filter);
final Iterator<FulltextResultRow> itr;
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java Thu Oct 29 16:23:30 2020
@@ -16,6 +16,8 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.query;
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
@@ -25,6 +27,7 @@ import org.apache.jackrabbit.oak.plugins
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.MoreLikeThisHelperUtil;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
+import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndex;
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner;
import org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.PlanResult;
@@ -38,9 +41,11 @@ import org.apache.jackrabbit.oak.spi.que
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.index.query.BoolQueryBuilder;
+import org.elasticsearch.index.query.ExistsQueryBuilder;
import org.elasticsearch.index.query.InnerHitBuilder;
import org.elasticsearch.index.query.MatchBoolPrefixQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
@@ -50,8 +55,11 @@ import org.elasticsearch.index.query.Nes
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.index.query.functionscore.ScriptScoreQueryBuilder;
import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders;
import org.elasticsearch.index.search.MatchQuery;
+import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptType;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
@@ -66,8 +74,12 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.jcr.PropertyType;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
@@ -79,6 +91,7 @@ import static org.apache.jackrabbit.JcrC
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newAncestorQuery;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newDepthQuery;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newMixinTypeQuery;
@@ -96,12 +109,14 @@ import static org.apache.jackrabbit.oak.
import static org.apache.jackrabbit.util.ISO8601.parse;
import static org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
+import static org.elasticsearch.index.query.QueryBuilders.existsQuery;
import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
+import static org.elasticsearch.index.query.QueryBuilders.scriptScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.termQuery;
/**
@@ -123,8 +138,9 @@ public class ElasticRequestHandler {
private final PlanResult planResult;
private final ElasticIndexDefinition elasticIndexDefinition;
private final String propertyRestrictionQuery;
+ private final NodeState rootState;
- ElasticRequestHandler(@NotNull IndexPlan indexPlan, @NotNull FulltextIndexPlanner.PlanResult planResult) {
+ ElasticRequestHandler(@NotNull IndexPlan indexPlan, @NotNull FulltextIndexPlanner.PlanResult planResult, NodeState rootState) {
this.indexPlan = indexPlan;
this.filter = indexPlan.getFilter();
this.planResult = planResult;
@@ -137,6 +153,7 @@ public class ElasticRequestHandler {
}
this.propertyRestrictionQuery = pr != null ? String.valueOf(pr.first.getValue(pr.first.getType())) : null;
+ this.rootState = rootState;
}
public BoolQueryBuilder baseQuery() {
@@ -150,11 +167,19 @@ public class ElasticRequestHandler {
if (propertyRestrictionQuery != null) {
if (propertyRestrictionQuery.startsWith("mlt?")) {
- // SimilarityImpl in oak-core sets property restriction for sim search and the query is something like
- // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need parse this query string and turn into a query
- // elastic can understand.
- String mltQueryString = propertyRestrictionQuery.replace("mlt?", "");
- boolQuery.must(moreLikeThisQuery(mltQueryString));
+ List<PropertyDefinition> sp = new LinkedList<>();
+ for (IndexDefinition.IndexingRule r : elasticIndexDefinition.getDefinedRules()) {
+ sp.addAll(r.getSimilarityProperties());
+ }
+ String mltQueryString = propertyRestrictionQuery.substring("mlt?".length());
+ if (sp.isEmpty()) {
+ // SimilarityImpl in oak-core sets a property restriction for similarity search, and the query looks something like
+ // mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need to parse this query string and turn it into a query
+ // Elasticsearch can understand.
+ boolQuery.must(moreLikeThisQuery(mltQueryString));
+ } else {
+ boolQuery.must(similarityQuery(mltQueryString, sp));
+ }
} else {
boolQuery.must(queryStringQuery(propertyRestrictionQuery));
@@ -263,6 +288,56 @@ public class ElasticRequestHandler {
.map(pd -> pd.name);
}
+ /**
+  * Builds the feature-vector similarity query for an {@code mlt?} property restriction.
+  * <p>
+  * The value of the {@code MoreLikeThisHelperUtil.MLT_STREAM_BODY} parameter is treated as the path of the
+  * reference node and resolved against {@code rootState}. For each similarity property definition the binary
+  * property is read from that node (descending into child nodes for relative property names), decoded with
+  * {@code toDoubles} and turned into a {@code script_score} sub-query that is added as a {@code should}
+  * clause. Properties that are missing or cannot be read are logged and skipped.
+  *
+  * @param mltQueryString the restriction without its leading {@code mlt?} prefix
+  * @param sp             similarity property definitions to build sub-queries for
+  * @return a bool query with one {@code should} clause per usable property; it has no clauses when the
+  *         stream body parameter is absent, {@code sp} is empty or no property could be read
+  * @throws IllegalArgumentException if the node referenced by the stream body parameter does not exist
+  */
+ private QueryBuilder similarityQuery(String mltQueryString, List<PropertyDefinition> sp) {
+     LOG.debug("parsing similarity query on {}", mltQueryString);
+     Map<String, String> mltParamMap = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
+     String text = mltParamMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+     BoolQueryBuilder query = boolQuery();
+     if (text == null || sp.isEmpty()) {
+         // Nothing to compare against: hand back the bool query without any clause.
+         return query;
+     }
+
+     LOG.debug("generating similarity query for {}", text);
+     NodeState targetNodeState = rootState;
+     for (String token : PathUtils.elements(text)) {
+         targetNodeState = targetNodeState.getChildNode(token);
+     }
+     if (!targetNodeState.exists()) {
+         throw new IllegalArgumentException("Could not find node " + text);
+     }
+
+     for (PropertyDefinition pd : sp) {
+         // pd.name may be a relative path; walk to the node that actually holds the property.
+         String propertyPath = PathUtils.getParentPath(pd.name);
+         String propertyName = PathUtils.getName(pd.name);
+         NodeState tempState = targetNodeState;
+         for (String token : PathUtils.elements(propertyPath)) {
+             if (token.isEmpty()) {
+                 break;
+             }
+             tempState = tempState.getChildNode(token);
+         }
+         PropertyState ps = tempState.getProperty(propertyName);
+         Blob property = ps != null ? ps.getValue(Type.BINARY) : null;
+         if (property == null) {
+             LOG.warn("Couldn't find property {} on {}", pd.name, text);
+             continue;
+         }
+         byte[] bytes;
+         try {
+             bytes = new BlobByteSource(property).read();
+         } catch (IOException e) {
+             // Best effort: a single unreadable vector must not fail the whole query.
+             LOG.error("Error reading bytes from property {} on {}", pd.name, text, e);
+             continue;
+         }
+         String similarityPropFieldName = FieldNames.createSimilarityFieldName(pd.name);
+         Map<String, Object> paramMap = new HashMap<>();
+         paramMap.put("query_vector", toDoubles(bytes));
+         paramMap.put("field_name", similarityPropFieldName);
+         // Only documents that carry the vector field are scored; + 1.0 shifts the cosine value by one.
+         ScriptScoreQueryBuilder scriptScoreQueryBuilder = scriptScoreQuery(existsQuery(similarityPropFieldName),
+                 new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG, "cosineSimilarity(params.query_vector, params.field_name) + 1.0",
+                         Collections.emptyMap(), paramMap));
+         query.should(scriptScoreQueryBuilder);
+     }
+     return query;
+ }
+
/*
Generates mlt query builder from the given mltQueryString
There could be 2 cases here -
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java Thu Oct 29 16:23:30 2020
@@ -18,13 +18,20 @@ package org.apache.jackrabbit.oak.plugin
import org.jetbrains.annotations.NotNull;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.List;
public class ElasticIndexUtils {
+ private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexUtils.class);
+
/**
* Transforms a path into an _id compatible with Elasticsearch specification. The path cannot be larger than 512
* bytes. For performance reasons paths that are already compatible are returned untouched. Otherwise, SHA-256
@@ -46,4 +53,39 @@ public class ElasticIndexUtils {
}
return path;
}
+
+ /**
+  * Decodes a byte array that holds consecutive 8-byte double values into a list of doubles,
+  * reading them in {@link ByteBuffer}'s default byte order.
+  * If the array length is not a multiple of 8 a warning is logged and the trailing bytes that do not
+  * form a complete double are ignored.
+  *
+  * @param array given byte array
+  * @return list of doubles, in the order they appear in the array
+  */
+ public static List<Double> toDoubles(byte[] array) {
+     final int bytesPerDouble = Double.BYTES;
+     final ByteBuffer buffer = ByteBuffer.wrap(array);
+     if (array.length % bytesPerDouble != 0) {
+         LOG.warn("Unexpected byte array length {}", array.length);
+     }
+     final List<Double> decoded = new ArrayList<>(array.length / bytesPerDouble);
+     // Relative reads advance the buffer position by one double at a time.
+     while (buffer.remaining() >= bytesPerDouble) {
+         decoded.add(buffer.getDouble());
+     }
+     return decoded;
+ }
+
+ /**
+  * Encodes a list of double values as a byte array of consecutive 8-byte values, written in
+  * {@link ByteBuffer}'s default byte order.
+  *
+  * @param values given list of doubles; must not contain {@code null} elements
+  * @return byte array of length {@code values.size() * 8}
+  */
+ public static byte[] toByteArray(List<Double> values) {
+     byte[] bytes = new byte[values.size() * Double.BYTES];
+     ByteBuffer buffer = ByteBuffer.wrap(bytes);
+     // Iterate instead of indexing so that linked lists are encoded in linear time;
+     // each relative put advances the position by one double.
+     for (double value : values) {
+         buffer.putDouble(value);
+     }
+     return bytes;
+ }
}
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java?rev=1882970&r1=1882969&r2=1882970&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java Thu Oct 29 16:23:30 2020
@@ -16,16 +16,38 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
+import org.apache.commons.io.IOUtils;
+import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.indices.GetFieldMappingsRequest;
+import org.elasticsearch.client.indices.GetFieldMappingsResponse;
import org.junit.Test;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
import static java.util.Collections.singletonList;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROPDEF_PROP_NODE_NAME;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
public class ElasticPropertyIndexTest extends ElasticAbstractQueryTest {
@@ -154,4 +176,83 @@ public class ElasticPropertyIndexTest ex
Arrays.asList("/test/a", "/test/b")));
}
+ /**
+  * Checks that the dense vector size configured per similarity property shows up as {@code dims}
+  * in the Elasticsearch field mapping of the corresponding similarity field.
+  */
+ @Test
+ public void vectorSimilarityCustomVectorSize() throws Exception {
+     final String indexName = "test1";
+     final String fieldName1 = "fv1";
+     final String fieldName2 = "fv2";
+     final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
+     final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
+
+     IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
+     builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
+             .similaritySearchDenseVectorSize(10);
+     builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
+             .similaritySearchDenseVectorSize(20);
+     setIndex(indexName, builder);
+     root.commit();
+
+     // Ask Elasticsearch for the mappings of both similarity fields of the freshly created index.
+     String alias = ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
+     GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
+     fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
+     GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
+             getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
+     final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
+             mappingsResponse.mappings();
+     assertEquals("More than one index found", 1, mappings.size());
+
+     // Exactly one index is present, so its field mappings can be looked up once and reused.
+     final Map<String, GetFieldMappingsResponse.FieldMappingMetadata> fieldMappings =
+             mappings.values().iterator().next();
+     @SuppressWarnings("unchecked")
+     Map<String, Integer> map1 = (Map<String, Integer>) fieldMappings.
+             get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
+     assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
+     @SuppressWarnings("unchecked")
+     Map<String, Integer> map2 = (Map<String, Integer>) fieldMappings.
+             get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
+     assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
+ }
+
+
+ /**
+  * Indexes the feature vectors from {@code fvs.csv} (first column: node name, remaining columns: vector
+  * values) as binary properties and checks that {@code similar(...)} queries for consecutive nodes
+  * return different results.
+  */
+ @Test
+ public void vectorSimilarity() throws Exception {
+     IndexDefinitionBuilder builder = createIndex("fv");
+     builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
+     setIndex("test1", builder);
+     root.commit();
+     Tree test = root.getTree("/").addChild("test");
+
+     URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+     File file = new File(uri);
+
+     // Close the fixture stream deterministically and decode it with a fixed charset so the test does not
+     // depend on the platform default encoding.
+     List<String> lines;
+     try (FileInputStream in = new FileInputStream(file)) {
+         lines = IOUtils.readLines(in, Charset.forName("UTF-8"));
+     }
+
+     Collection<String> children = new LinkedList<>();
+     for (String line : lines) {
+         String[] split = line.split(",");
+         List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
+         byte[] bytes = toByteArray(values);
+         // Sanity check of the encoding helpers: decoding must give back the original vector.
+         List<Double> actual = toDoubles(bytes);
+         assertEquals(values, actual);
+
+         Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+         String name = split[0];
+         Tree child = test.addChild(name);
+         child.setProperty("fv", blob, Type.BINARY);
+         children.add(child.getPath());
+     }
+     root.commit();
+
+     // check that similarity changes across different feature vectors
+     List<String> baseline = new LinkedList<>();
+     for (String similarPath : children) {
+         String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+         List<String> current = new LinkedList<>();
+         assertEventually(() -> {
+             Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
+             // The assertion may be retried, so start from a clean list on every attempt.
+             current.clear();
+             while (result.hasNext()) {
+                 String next = result.next();
+                 current.add(next);
+             }
+             assertNotEquals(baseline, current);
+         });
+         baseline.clear();
+         baseline.addAll(current);
+     }
+ }
+
}