You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by fo...@apache.org on 2022/02/03 14:25:28 UTC

[jackrabbit-oak] branch trunk updated: OAK-9684: avoid ingesting feature vectors with wrong size (#480)

This is an automated email from the ASF dual-hosted git repository.

fortino pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 55f95f2  OAK-9684: avoid ingesting feature vectors with wrong size (#480)
55f95f2 is described below

commit 55f95f28eb91b9f898685732a04b760662393732
Author: Fabrizio Fortino <fa...@gmail.com>
AuthorDate: Thu Feb 3 15:25:22 2022 +0100

    OAK-9684: avoid ingesting feature vectors with wrong size (#480)
    
    * OAK-9684: avoid ingesting feature vectors with wrong size
    
    * OAK-9684: add node path in wrong vector size warning
---
 .../index/elastic/index/ElasticDocumentMaker.java  | 14 ++++++++---
 .../index/elastic/ElasticSimilarQueryTest.java     | 29 ++++++++++++++++++++++
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
index 216421d..f14f9b0 100644
--- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
+++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java
@@ -90,7 +90,7 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument>
     @Override
     protected boolean indexFacetProperty(ElasticDocument doc, int tag, PropertyState property, String pname) {
         // faceting on ES works automatically for keyword (propertyIndex) and subsequently query params.
-        // We we don't need to add anything.
+        // We don't need to add anything.
         // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html
         return false;
     }
@@ -215,9 +215,15 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument>
 
     @Override
     protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
-        // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
-        // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
-        doc.addSimilarityField(pd.name, blob);
+        // without this check, if the vector size is not correct, the entire document will be skipped
+        if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 8) {
+            // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
+            // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
+            doc.addSimilarityField(pd.name, blob);
+        } else {
+            LOG.warn("[{}] Ignoring binary property {} for path {}. Expected dimension is {} but got {}",
+                    getIndexName(), pd.name, this.path, pd.getSimilaritySearchDenseVectorSize(), blob.length() / 8);
+        }
     }
 
     @Override
diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
index 2bfae8f..c7c9898 100644
--- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
+++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java
@@ -258,6 +258,35 @@ public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {
         assertEquals("Similarity doesn't match", 12, map1.get("k"));
     }
 
+    @Test
+    public void vectorSimilarityWithWrongVectorSizes() throws Exception {
+        IndexDefinitionBuilder builder = createIndex("fv");
+        builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex()
+                .similaritySearchDenseVectorSize(100);// test FVs have size 1048
+        Tree index = setIndex("test1", builder);
+        root.commit();
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+            String[] split = line.split(",");
+            List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
+            byte[] bytes = toByteArray(values);
+            List<Double> actual = toDoubles(bytes);
+            assertEquals(values, actual);
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY);
+        }
+        root.commit();
+
+        // regardless of the wrong vectors, we should be able to index
+        assertEventually(() -> assertEquals(10, countDocuments(index)));
+    }
 
     @Test
     public void vectorSimilarity() throws Exception {