You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@jackrabbit.apache.org by GitBox <gi...@apache.org> on 2022/06/09 10:07:02 UTC

[GitHub] [jackrabbit-oak] fabriziofortino commented on a diff in pull request #589: OAK-9788: Add an index property to disable similarity for lucene index

fabriziofortino commented on code in PR #589:
URL: https://github.com/apache/jackrabbit-oak/pull/589#discussion_r893311381


##########
oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java:
##########
@@ -3032,6 +3089,50 @@ public void testRepSimilarWithStringFeatureVectors() throws Exception {
         }
     }
 
+    @Test
+    public void testRepSimilarWithStringFeatureVectorsWithDisabledStrings() throws Exception {
+
+        IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityStrings(false);
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+
+        for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+            int i1 = line.indexOf(',');
+            String name = line.substring(0, i1);
+            String value = line.substring(i1 + 1);
+            Tree child = test.addChild(name);
+            child.setProperty("fv", value, Type.STRING);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();

Review Comment:
   remove `baseline`



##########
oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java:
##########
@@ -2988,6 +2988,63 @@ public void testRepSimilarWithBinaryFeatureVectors() throws Exception {
         }
     }
 
+    @Test
+    public void testRepSimilarWithBinaryFeatureVectorsWithDisabledBinary() throws Exception {
+
+        IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityBinaries(false);
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+            String[] split = line.split(",");
+            List<Double> values = new LinkedList<>();
+            int i = 0;
+            for (String s : split) {
+                if (i > 0) {
+                    values.add(Double.parseDouble(s));
+                }
+                i++;
+            }
+
+            byte[] bytes = SimSearchUtils.toByteArray(values);
+            List<Double> actual = SimSearchUtils.toDoubles(bytes);
+            assertEquals(values, actual);
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();

Review Comment:
   `baseline` can be removed. It's updated but never queried



##########
oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java:
##########
@@ -2988,6 +2988,63 @@ public void testRepSimilarWithBinaryFeatureVectors() throws Exception {
         }
     }
 
+    @Test
+    public void testRepSimilarWithBinaryFeatureVectorsWithDisabledBinary() throws Exception {
+
+        IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityBinaries(false);
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+            String[] split = line.split(",");
+            List<Double> values = new LinkedList<>();
+            int i = 0;
+            for (String s : split) {
+                if (i > 0) {
+                    values.add(Double.parseDouble(s));
+                }
+                i++;
+            }
+
+            byte[] bytes = SimSearchUtils.toByteArray(values);
+            List<Double> actual = SimSearchUtils.toDoubles(bytes);
+            assertEquals(values, actual);
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                String next = result.next();
+                current.add(next);
+            }
+            assertTrue("binary data for similarity should not be indexed" ,current.size() == 0);

Review Comment:
   this can be improved
   ```suggestion
               assertEquals("binary data for similarity should not be indexed", 0, current.size());
   ```



##########
oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/IndexDefinition.java:
##########
@@ -488,6 +491,8 @@ protected IndexDefinition(NodeState root, NodeState defn, IndexFormatVersion ver
                     defn.getProperty(TYPE_PROPERTY_NAME) != null &&
                             DYNAMIC_BOOST_LITE.contains(defn.getProperty(TYPE_PROPERTY_NAME).getValue(Type.STRING))
             );
+            this.indexSimilarityBinaries = getOptionalValue(defn, getIndexSimilarityBinariesDefinitionKey(), true);
+            this.indexSimilarityStrings = getOptionalValue(defn, getIndexSimilarityStringsDefinitionKey(), true);

Review Comment:
   instead of doing this, we could use the same strategy used for `dynamicBoostLite` (see line above) with a single string property like `disableIndexSimilarityBinary` (similar for strings). This property can have one or more index types. Examples:
   
   ```
   disableIndexSimilarityBinary=lucene
   disableIndexSimilarityBinary=elasticsearch
   disableIndexSimilarityBinary=lucene,elasticsearch
   ```
   
   In this way we could have the same functionality for the different types without the need to introduce a specific key.



##########
oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java:
##########
@@ -3032,6 +3089,50 @@ public void testRepSimilarWithStringFeatureVectors() throws Exception {
         }
     }
 
+    @Test
+    public void testRepSimilarWithStringFeatureVectorsWithDisabledStrings() throws Exception {
+
+        IndexDefinitionBuilder idxb = new LuceneIndexDefinitionBuilder().noAsync().indexSimilarityStrings(false);
+        idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        File file = new File(uri);
+
+        Collection<String> children = new LinkedList<>();
+
+        for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) {
+            int i1 = line.indexOf(',');
+            String name = line.substring(0, i1);
+            String value = line.substring(i1 + 1);
+            Tree child = test.addChild(name);
+            child.setProperty("fv", value, Type.STRING);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        // check that similarity changes across different feature vectors
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : children) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                String next = result.next();
+                current.add(next);
+            }
+            assertTrue("String data for similarity should not be indexed",current.size() == 0);

Review Comment:
   ```suggestion
               assertEquals("String data for similarity should not be indexed", 0, current.size());
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscribe@jackrabbit.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org