You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by fo...@apache.org on 2022/02/03 14:25:28 UTC
[jackrabbit-oak] branch trunk updated: OAK-9684: avoid ingesting feature vectors with wrong size (#480)
This is an automated email from the ASF dual-hosted git repository.
fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 55f95f2 OAK-9684: avoid ingesting feature vectors with wrong size (#480)
55f95f2 is described below
commit 55f95f28eb91b9f898685732a04b760662393732
Author: Fabrizio Fortino <fa...@gmail.com>
AuthorDate: Thu Feb 3 15:25:22 2022 +0100
OAK-9684: avoid ingesting feature vectors with wrong size (#480)
* OAK-9684: avoid ingesting feature vectors with wrong size
* OAK-9684: add node path in wrong vector size warning
---
.../index/elastic/index/ElasticDocumentMaker.java | 14 ++++++++---
.../index/elastic/ElasticSimilarQueryTest.java | 29 ++++++++++++++++++++++
2 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
index 216421d..f14f9b0 100644
--- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
+++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
@@ -90,7 +90,7 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument>
@Override
protected boolean indexFacetProperty(ElasticDocument doc, int tag, PropertyState property, String pname) {
// faceting on ES works automatically for keyword (propertyIndex) and subsequently query params.
- // We we don't need to add anything.
+ // We don't need to add anything.
// https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html
return false;
}
@@ -215,9 +215,15 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument>
@Override
protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
- // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
- // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
- doc.addSimilarityField(pd.name, blob);
+ // without this check, if the vector size is not correct, the entire document will be skipped
+ if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 8) {
+ // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
+ // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
+ doc.addSimilarityField(pd.name, blob);
+ } else {
+ LOG.warn("[{}] Ignoring binary property {} for path {}. Expected dimension is {} but got {}",
+ getIndexName(), pd.name, this.path, pd.getSimilaritySearchDenseVectorSize(), blob.length() / 8);
+ }
}
@Override
diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
index 2bfae8f..c7c9898 100644
--- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
+++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
@@ -258,6 +258,35 @@ public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
assertEquals("Similarity doesn't match", 12, map1.get("k"));
}
+ @Test
+ public void vectorSimilarityWithWrongVectorSizes() throws Exception {
+ IndexDefinitionBuilder builder = createIndex("fv");
+ builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex()
+ .similaritySearchDenseVectorSize(100);// test FVs have size 1048
+ Tree index = setIndex("test1", builder);
+ root.commit();
+ Tree test = root.getTree("/").addChild("test");
+
+ URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+ File file = new File(uri);
+
+ for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+ String[] split = line.split(",");
+ List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
+ byte[] bytes = toByteArray(values);
+ List<Double> actual = toDoubles(bytes);
+ assertEquals(values, actual);
+
+ Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+ String name = split[0];
+ Tree child = test.addChild(name);
+ child.setProperty("fv", blob, Type.BINARY);
+ }
+ root.commit();
+
+ // regardless of the wrong vectors, we should be able to index
+ assertEventually(() -> assertEquals(10, countDocuments(index)));
+ }
@Test
public void vectorSimilarity() throws Exception {