You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by fo...@apache.org on 2020/11/02 09:17:10 UTC
svn commit: r1883065 - in /jackrabbit/oak/trunk:
oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/
oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/
oak-search-elastic/src/main/java/...
Author: fortino
Date: Mon Nov 2 09:17:10 2020
New Revision: 1883065
URL: http://svn.apache.org/viewvc?rev=1883065&view=rev
Log:
OAK-9264: add support for similarityTags in oak-search-elastic
Modified:
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java Mon Nov 2 09:17:10 2020
@@ -57,6 +57,11 @@ public class ElasticIndexDefinition exte
public static final String PROP_INDEX_NAME_SEED = ":nameSeed";
/**
+ * Hidden property to store similarity tags
+ */
+ public static final String SIMILARITY_TAGS = ":simTags";
+
+ /**
* Node name under which various analyzers are configured
*/
private static final String ANALYZERS = "analyzers";
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java Mon Nov 2 09:17:10 2020
@@ -18,6 +18,7 @@ package org.apache.jackrabbit.oak.plugin
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
import org.elasticsearch.common.Strings;
@@ -47,6 +48,7 @@ class ElasticDocument {
private final Map<String, Object> properties;
private final Map<String, Object> similarityFields;
private final Map<String, Map<String, Double>> dynamicBoostFields;
+ private final Set<String> similarityTags;
ElasticDocument(String path) {
this.path = path;
@@ -57,6 +59,7 @@ class ElasticDocument {
this.properties = new HashMap<>();
this.similarityFields = new HashMap<>();
this.dynamicBoostFields = new HashMap<>();
+ this.similarityTags = new LinkedHashSet<>();
}
void addFulltext(String value) {
@@ -105,6 +108,10 @@ class ElasticDocument {
.putIfAbsent(value, boost);
}
+ void addSimilarityTag(String value) {
+ similarityTags.add(value);
+ }
+
public String build() {
String ret;
try {
@@ -144,6 +151,9 @@ class ElasticDocument {
}
builder.endArray();
}
+ if (!similarityTags.isEmpty()) {
+ builder.field(ElasticIndexDefinition.SIMILARITY_TAGS, similarityTags);
+ }
}
builder.endObject();
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java Mon Nov 2 09:17:10 2020
@@ -172,7 +172,11 @@ class ElasticDocumentMaker extends Fullt
@Override
protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property) {
- // TODO : not implemented
+ String val = property.getValue(Type.STRING);
+ if (val.length() > 0) {
+ doc.addSimilarityTag(val);
+ return true;
+ }
return false;
}
@@ -180,9 +184,7 @@ class ElasticDocumentMaker extends Fullt
protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
// see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
// see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
-
doc.addSimilarityField(pd.name, blob);
-
}
@Override
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java Mon Nov 2 09:17:10 2020
@@ -255,6 +255,11 @@ class ElasticIndexHelper {
}
mappingBuilder.endObject();
}
+
+ mappingBuilder.startObject(ElasticIndexDefinition.SIMILARITY_TAGS)
+ .field("type", "text")
+ .field("analyzer", "oak_analyzer")
+ .endObject();
}
// we need to check if in the defined rules there are properties with the same name and different types
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java Mon Nov 2 09:17:10 2020
@@ -45,7 +45,6 @@ import org.apache.jackrabbit.oak.spi.sta
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.index.query.BoolQueryBuilder;
-import org.elasticsearch.index.query.ExistsQueryBuilder;
import org.elasticsearch.index.query.InnerHitBuilder;
import org.elasticsearch.index.query.MatchBoolPrefixQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
@@ -89,8 +88,6 @@ import java.util.stream.StreamSupport;
import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES;
import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE;
-import static org.apache.jackrabbit.oak.commons.PathUtils.denotesRoot;
-import static org.apache.jackrabbit.oak.commons.PathUtils.getParentPath;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newAncestorQuery;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newDepthQuery;
@@ -113,6 +110,7 @@ import static org.elasticsearch.index.qu
import static org.elasticsearch.index.query.QueryBuilders.functionScoreQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
+import static org.elasticsearch.index.query.QueryBuilders.moreLikeThisQuery;
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.index.query.QueryBuilders.nestedQuery;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
@@ -172,15 +170,34 @@ public class ElasticRequestHandler {
sp.addAll(r.getSimilarityProperties());
}
String mltQueryString = propertyRestrictionQuery.substring("mlt?".length());
+ Map<String, String> mltParams = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
+ String text = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+
+ if (text == null) {
+ // TODO : See if we might want to support like Text here (passed as null in above constructors)
+ // IT is not supported in our lucene implementation.
+ throw new IllegalArgumentException("Missing required field stream.body in MLT query: " + mltQueryString);
+ }
if (sp.isEmpty()) {
// SimilarityImpl in oak-core sets property restriction for sim search and the query is something like
// mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need parse this query string and turn into a query
// elastic can understand.
- boolQuery.must(moreLikeThisQuery(mltQueryString));
+ MoreLikeThisQueryBuilder mltqb = mltQuery(mltParams);
+ boolQuery.must(mltqb);
+ // add should clause to improve relevance using similarity tags
+ boolQuery.should(moreLikeThisQuery(
+ new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null, mltqb.likeItems())
+ .minTermFreq(1).minDocFreq(1)
+ );
} else {
- boolQuery.must(similarityQuery(mltQueryString, sp));
+ boolQuery.must(similarityQuery(text, sp));
+ // add should clause to improve relevance using similarity tags
+ boolQuery.should(moreLikeThisQuery(
+ new String[]{ElasticIndexDefinition.SIMILARITY_TAGS}, null,
+ new Item[]{new Item(null, ElasticIndexUtils.idFromPath(text))})
+ .minTermFreq(1).minDocFreq(1)
+ );
}
-
} else {
boolQuery.must(queryStringQuery(propertyRestrictionQuery));
}
@@ -288,12 +305,9 @@ public class ElasticRequestHandler {
.map(pd -> pd.name);
}
- private QueryBuilder similarityQuery(String mltQueryString, List<PropertyDefinition> sp) {
- LOG.debug("parsing similarity query on {}", mltQueryString);
- Map<String, String> mltParamMap = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
- String text = mltParamMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+ private QueryBuilder similarityQuery(@NotNull String text, List<PropertyDefinition> sp) {
BoolQueryBuilder query = boolQuery();
- if (text != null && !sp.isEmpty()) {
+ if (!sp.isEmpty()) {
LOG.debug("generating similarity query for {}", text);
NodeState targetNodeState = rootState;
for (String token : PathUtils.elements(text)) {
@@ -303,7 +317,7 @@ public class ElasticRequestHandler {
throw new IllegalArgumentException("Could not find node " + text);
}
for (PropertyDefinition pd : sp) {
- String propertyPath = PathUtils.getParentPath(pd.name);;
+ String propertyPath = PathUtils.getParentPath(pd.name);
String propertyName = PathUtils.getName(pd.name);
NodeState tempState = targetNodeState;
for (String token : PathUtils.elements(propertyPath)) {
@@ -331,7 +345,7 @@ public class ElasticRequestHandler {
paramMap.put("field_name", similarityPropFieldName);
ScriptScoreQueryBuilder scriptScoreQueryBuilder = scriptScoreQuery(existsQuery(similarityPropFieldName),
new Script(ScriptType.INLINE, Script.DEFAULT_SCRIPT_LANG, "cosineSimilarity(params.query_vector, params.field_name) + 1.0",
- Collections.emptyMap(), paramMap));
+ Collections.emptyMap(), paramMap));
query.should(scriptScoreQueryBuilder);
}
}
@@ -355,36 +369,29 @@ public class ElasticRequestHandler {
(The above is important since this is not a one-size-fits-all situation and the default values might not
be useful in every situation based on the type of content)
*/
- private QueryBuilder moreLikeThisQuery(String mltQueryString) {
+ private MoreLikeThisQueryBuilder mltQuery(Map<String, String> mltParams) {
+ String text = mltParams.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
+
MoreLikeThisQueryBuilder mlt;
- Map<String, String> paramMap = MoreLikeThisHelperUtil.getParamMapFromMltQuery(mltQueryString);
- String text = paramMap.get(MoreLikeThisHelperUtil.MLT_STREAM_BODY);
- String fields = paramMap.get(MoreLikeThisHelperUtil.MLT_FILED);
-
- if (text != null) {
- // It's expected the text here to be the path of the doc
- // In case the path of a node is greater than 512 bytes,
- // we hash it before storing it as the _id for the elastic doc
- text = ElasticIndexUtils.idFromPath(text);
- if (FieldNames.PATH.equals(fields) || fields == null) {
- // Handle the case 1) where default query sent by SimilarImpl (No Custom fields)
- // We just need to specify the doc (Item) whose similar content we need to find
- // We store path as the _id so no need to do anything extra here
- // We expect Similar impl to send a query where text would have evaluated to node path.
- mlt = new MoreLikeThisQueryBuilder(null, new Item[]{new Item(null, text)});
- } else {
- // This is for native queries if someone send additional fields via mlt.fl=field1,field2
- String[] fieldsArray = fields.split(",");
- mlt = new MoreLikeThisQueryBuilder(fieldsArray, null, new Item[]{new Item(null, text)});
- }
- // TODO : See if we might want to support like Text here (passed as null in above constructors)
- // IT is not supported in our lucene implementation.
+ String fields = mltParams.get(MoreLikeThisHelperUtil.MLT_FILED);
+ // It's expected the text here to be the path of the doc
+ // In case the path of a node is greater than 512 bytes,
+ // we hash it before storing it as the _id for the elastic doc
+ text = ElasticIndexUtils.idFromPath(text);
+ if (fields == null || FieldNames.PATH.equals(fields)) {
+ // Handle the case 1) where default query sent by SimilarImpl (No Custom fields)
+ // We just need to specify the doc (Item) whose similar content we need to find
+ // We store path as the _id so no need to do anything extra here
+ // We expect Similar impl to send a query where text would have evaluated to node path.
+ mlt = moreLikeThisQuery(new Item[]{new Item(null, text)});
} else {
- throw new RuntimeException("Missing required field stream.body in MLT query: " + mltQueryString);
+ // This is for native queries if someone send additional fields via mlt.fl=field1,field2
+ String[] fieldsArray = fields.split(",");
+ mlt = moreLikeThisQuery(fieldsArray, null, new Item[]{new Item(null, text)});
}
- for (String key : paramMap.keySet()) {
- String val = paramMap.get(key);
+ for (String key : mltParams.keySet()) {
+ String val = mltParams.get(key);
if (MoreLikeThisHelperUtil.MLT_MIN_DOC_FREQ.equals(key)) {
mlt.minDocFreq(Integer.parseInt(val));
} else if (MoreLikeThisHelperUtil.MLT_MIN_TERM_FREQ.equals(key)) {
@@ -406,7 +413,7 @@ public class ElasticRequestHandler {
String[] stopWords = val.split(",");
mlt.stopWords(stopWords);
} else {
- LOG.warn("Unrecognized param {} in the mlt query {}", key, mltQueryString);
+ LOG.warn("Unrecognized param {} in the mlt query {}", key, mltParams);
}
}
@@ -564,7 +571,7 @@ public class ElasticRequestHandler {
}
break;
case PARENT:
- if (denotesRoot(path)) {
+ if (PathUtils.denotesRoot(path)) {
// there's no parent of the root node
// we add a path that can not possibly occur because there
// is no way to say "match no documents" in Lucene
@@ -575,10 +582,10 @@ public class ElasticRequestHandler {
if (planResult.isPathTransformed()) {
String parentPathSegment = planResult.getParentPathSegment();
if (!any.test(PathUtils.elements(parentPathSegment), "*")) {
- queries.add(newPathQuery(getParentPath(path) + parentPathSegment));
+ queries.add(newPathQuery(PathUtils.getParentPath(path) + parentPathSegment));
}
} else {
- queries.add(newPathQuery(getParentPath(path)));
+ queries.add(newPathQuery(PathUtils.getParentPath(path)));
}
}
break;
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyIndexTest.java Mon Nov 2 09:17:10 2020
@@ -16,38 +16,16 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
-import org.apache.commons.io.IOUtils;
-import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.Tree;
-import org.apache.jackrabbit.oak.api.Type;
-import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.indices.GetFieldMappingsRequest;
-import org.elasticsearch.client.indices.GetFieldMappingsResponse;
import org.junit.Test;
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.net.URI;
-import java.nio.charset.Charset;
import java.util.Arrays;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
import static java.util.Collections.singletonList;
-import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
-import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROPDEF_PROP_NODE_NAME;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.MatcherAssert.assertThat;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotEquals;
public class ElasticPropertyIndexTest extends ElasticAbstractQueryTest {
@@ -176,83 +154,4 @@ public class ElasticPropertyIndexTest ex
Arrays.asList("/test/a", "/test/b")));
}
- @Test
- public void vectorSimilarityCustomVectorSize() throws Exception {
- final String indexName = "test1";
- final String fieldName1 = "fv1";
- final String fieldName2 = "fv2";
- final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
- final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
- IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
- builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
- .similaritySearchDenseVectorSize(10);
- builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
- .similaritySearchDenseVectorSize(20);
- Tree index = setIndex(indexName, builder);
- root.commit();
- String alias = ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
- GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
- fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
- GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
- getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
- final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
- mappingsResponse.mappings();
- assertEquals("More than one index found", 1, mappings.keySet().size());
- @SuppressWarnings("unchecked")
- Map<String, Integer> map1 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
- get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
- assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
- @SuppressWarnings("unchecked")
- Map<String, Integer> map2 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
- get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
- assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
- }
-
-
- @Test
- public void vectorSimilarity() throws Exception {
- IndexDefinitionBuilder builder = createIndex("fv");
- builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
- Tree index = setIndex("test1", builder);
- root.commit();
- Tree test = root.getTree("/").addChild("test");
-
- URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
- File file = new File(uri);
-
- Collection<String> children = new LinkedList<>();
- for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
- String[] split = line.split(",");
- List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
- byte[] bytes = toByteArray(values);
- List<Double> actual = toDoubles(bytes);
- assertEquals(values, actual);
-
- Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
- String name = split[0];
- Tree child = test.addChild(name);
- child.setProperty("fv", blob, Type.BINARY);
- children.add(child.getPath());
- }
- root.commit();
-
- // check that similarity changes across different feature vectors
- List<String> baseline = new LinkedList<>();
- for (String similarPath : children) {
- String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
- List<String> current = new LinkedList<>();
- assertEventually(() -> {
- Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
- current.clear();
- while (result.hasNext()) {
- String next = result.next();
- current.add(next);
- }
- assertNotEquals(baseline, current);
- });
- baseline.clear();
- baseline.addAll(current);
- }
- }
-
}
Modified: jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java (original)
+++ jackrabbit/oak/trunk/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java Mon Nov 2 09:17:10 2020
@@ -16,14 +16,37 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;
+import org.apache.commons.io.IOUtils;
+import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants;
import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.indices.GetFieldMappingsRequest;
+import org.elasticsearch.client.indices.GetFieldMappingsResponse;
import org.junit.Test;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.URI;
+import java.nio.charset.Charset;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
import java.util.UUID;
+import java.util.stream.Collectors;
+
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
+import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
@@ -34,7 +57,7 @@ public class ElasticSimilarQueryTest ext
whereas in elastic, it doesn't.
*/
@Test
- public void testRepSimilarAsNativeQuery() throws Exception {
+ public void repSimilarAsNativeQuery() throws Exception {
createIndex(true);
@@ -56,7 +79,7 @@ public class ElasticSimilarQueryTest ext
whereas in elastic, it doesn't.
*/
@Test
- public void testRepSimilarQuery() throws Exception {
+ public void repSimilarQuery() throws Exception {
createIndex(false);
String query = "select [jcr:path] from [nt:base] where similar(., '/test/a')";
@@ -82,7 +105,7 @@ public class ElasticSimilarQueryTest ext
whereas in elastic, it doesn't.
*/
@Test
- public void testRepSimilarXPathQuery() throws Exception {
+ public void repSimilarXPathQuery() throws Exception {
createIndex(false);
String query = "//element(*, nt:base)[rep:similar(., '/test/a')]";
@@ -101,7 +124,7 @@ public class ElasticSimilarQueryTest ext
}
@Test
- public void testRepSimilarWithStopWords() throws Exception {
+ public void repSimilarWithStopWords() throws Exception {
createIndex(true);
String nativeQueryStringWithStopWords = "select [jcr:path] from [nt:base] where " +
@@ -129,7 +152,7 @@ public class ElasticSimilarQueryTest ext
}
@Test
- public void testRepSimilarWithMinWordLength() throws Exception {
+ public void repSimilarWithMinWordLength() throws Exception {
createIndex(true);
String nativeQueryStringWithMinWordLength = "select [jcr:path] from [nt:base] where " +
"native('elastic-sim', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0&mlt.minwl=6')";
@@ -155,7 +178,7 @@ public class ElasticSimilarQueryTest ext
}
@Test
- public void testRepSimilarQueryWithLongPath() throws Exception {
+ public void repSimilarQueryWithLongPath() throws Exception {
createIndex(false);
Tree test = root.getTree("/").addChild("test");
Tree longPath = test.addChild("a");
@@ -178,12 +201,114 @@ public class ElasticSimilarQueryTest ext
Arrays.asList("/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h")));
}
+ @Test
+ public void similarityTagsAffectRelevance() throws Exception {
+ createIndex(false);
+
+ Tree test = root.getTree("/").addChild("test");
+ Tree a = test.addChild("a");
+ a.setProperty("text", "Hello World Hello World");
+ a.setProperty("tags", "foo");
+ Tree b = test.addChild("b");
+ b.setProperty("text", "Hello World Hello World");
+ b.setProperty("tags", "bar");
+ Tree c = test.addChild("c");
+ c.setProperty("text", "Hello World Hello World");
+ c.setProperty("tags", "foo");
+ root.commit();
+
+ assertEventually(() -> assertOrderedQuery("select [jcr:path] from [nt:base] where similar(., '/test/a')",
+ Arrays.asList("/test/c", "/test/b")));
+ assertEventually(() -> assertOrderedQuery("select [jcr:path] from [nt:base] where similar(., '/test/c')",
+ Arrays.asList("/test/a", "/test/b")));
+ }
+
+ @Test
+ public void vectorSimilarityCustomVectorSize() throws Exception {
+ final String indexName = "test1";
+ final String fieldName1 = "fv1";
+ final String fieldName2 = "fv2";
+ final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1);
+ final String similarityFieldName2 = FieldNames.createSimilarityFieldName(fieldName2);
+ IndexDefinitionBuilder builder = createIndex(fieldName1, fieldName2);
+ builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex()
+ .similaritySearchDenseVectorSize(10);
+ builder.indexRule("nt:base").property(fieldName2).useInSimilarity(true).nodeScopeIndex()
+ .similaritySearchDenseVectorSize(20);
+ setIndex(indexName, builder);
+ root.commit();
+ String alias = ElasticIndexNameHelper.getIndexAlias(esConnection.getIndexPrefix(), "/oak:index/" + indexName);
+ GetFieldMappingsRequest fieldMappingsRequest = new GetFieldMappingsRequest();
+ fieldMappingsRequest.indices(alias).fields(similarityFieldName1, similarityFieldName2);
+ GetFieldMappingsResponse mappingsResponse = esConnection.getClient().indices().
+ getFieldMapping(fieldMappingsRequest, RequestOptions.DEFAULT);
+ final Map<String, Map<String, GetFieldMappingsResponse.FieldMappingMetadata>> mappings =
+ mappingsResponse.mappings();
+ assertEquals("More than one index found", 1, mappings.keySet().size());
+ @SuppressWarnings("unchecked")
+ Map<String, Integer> map1 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
+ get(similarityFieldName1).sourceAsMap().get(similarityFieldName1);
+ assertEquals("Dense vector size doesn't match", 10, map1.get("dims").intValue());
+ @SuppressWarnings("unchecked")
+ Map<String, Integer> map2 = (Map<String, Integer>)mappings.entrySet().iterator().next().getValue().
+ get(similarityFieldName2).sourceAsMap().get(similarityFieldName2);
+ assertEquals("Dense vector size doesn't match", 20, map2.get("dims").intValue());
+ }
+
+
+ @Test
+ public void vectorSimilarity() throws Exception {
+ IndexDefinitionBuilder builder = createIndex("fv");
+ builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex();
+ setIndex("test1", builder);
+ root.commit();
+ Tree test = root.getTree("/").addChild("test");
+
+ URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+ File file = new File(uri);
+
+ Collection<String> children = new LinkedList<>();
+ for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+ String[] split = line.split(",");
+ List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
+ byte[] bytes = toByteArray(values);
+ List<Double> actual = toDoubles(bytes);
+ assertEquals(values, actual);
+
+ Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+ String name = split[0];
+ Tree child = test.addChild(name);
+ child.setProperty("fv", blob, Type.BINARY);
+ children.add(child.getPath());
+ }
+ root.commit();
+
+ // check that similarity changes across different feature vectors
+ List<String> baseline = new LinkedList<>();
+ for (String similarPath : children) {
+ String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+ List<String> current = new LinkedList<>();
+ assertEventually(() -> {
+ Iterator<String> result = executeQuery(query, "JCR-SQL2", false, true).iterator();
+ current.clear();
+ while (result.hasNext()) {
+ String next = result.next();
+ current.add(next);
+ }
+ assertNotEquals(baseline, current);
+ });
+ baseline.clear();
+ baseline.addAll(current);
+ }
+ }
+
private void createIndex(boolean nativeQuery) throws Exception {
- IndexDefinitionBuilder builder = createIndex("text");
+ IndexDefinitionBuilder builder = createIndex("text", "tags");
if (nativeQuery) {
builder.getBuilderTree().setProperty(FulltextIndexConstants.FUNC_NAME, "elastic-sim");
}
builder.indexRule("nt:base").property("text").analyzed();
+ builder.indexRule("nt:base").property("tags").similarityTags(true);
String indexId = UUID.randomUUID().toString();
setIndex(indexId, builder);
root.commit();
Modified: jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java?rev=1883065&r1=1883064&r2=1883065&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java (original)
+++ jackrabbit/oak/trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/util/IndexDefinitionBuilder.java Mon Nov 2 09:17:10 2020
@@ -16,7 +16,6 @@
* specific language governing permissions and limitations
* under the License.
*/
-
package org.apache.jackrabbit.oak.plugins.index.search.util;
import com.google.common.collect.Iterables;
@@ -388,6 +387,11 @@ public class IndexDefinitionBuilder {
return this;
}
+ public PropertyRule similarityTags(boolean rerank) {
+ propTree.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, rerank);
+ return this;
+ }
+
public PropertyRule type(String type) {
//This would throw an IAE if type is invalid
PropertyType.valueFromName(type);