You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by to...@apache.org on 2018/11/23 11:19:07 UTC

svn commit: r1847245 - in /jackrabbit/oak/trunk/oak-lucene/src: main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/ test/java/org/apache/jackrabbit/oak/plugins/index/lucene/

Author: tommaso
Date: Fri Nov 23 11:19:06 2018
New Revision: 1847245

URL: http://svn.apache.org/viewvc?rev=1847245&view=rev
Log:
OAK-7824 - make distance threshold relative to current result set

Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java Fri Nov 23 11:19:06 2018
@@ -405,7 +405,6 @@ public class LucenePropertyIndex extends
                                             long fvs = PERF_LOGGER.start();
                                             SimSearchUtils.bruteForceFVRerank(sp, docs, indexSearcher);
                                             PERF_LOGGER.end(fvs, -1, "fv reranking done");
-                                            LOG.info("reranking done");
                                             earlyStop = true;
                                         }
                                     }

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java Fri Nov 23 11:19:06 2018
@@ -234,7 +234,9 @@ public class SimSearchUtils {
     }
 
     public static void bruteForceFVRerank(List<PropertyDefinition> sp, TopDocs docs, IndexSearcher indexSearcher) throws IOException {
-        double farthestDistance = 50d;
+        double distSum = 0d;
+        double counter = 0d;
+        Map<Integer, Double> distances = new HashMap<>();
         int k = 15;
         ScoreDoc inputDoc = docs.scoreDocs[0]; // we assume the input doc is the first one returned
         List<Integer> toDiscard = new LinkedList<>();
@@ -247,26 +249,37 @@ public class SimSearchUtils {
                     double[] currentVector = toDoubleArray(indexSearcher.doc(docs.scoreDocs[j].doc)
                             .getBinaryValue(fieldName).bytes);
                     double distance = dist(inputVector, currentVector) + 1e-10; // constant term to avoid division by zero
-
-                    if (distance > farthestDistance) { // a threshold distance above which current vector is discarded
-                        toDiscard.add(docs.scoreDocs[j].doc);
-                    }
                     if (Double.isNaN(distance) || Double.isInfinite(distance)) {
                         toDiscard.add(docs.scoreDocs[j].doc);
+                    } else {
+                        distSum += distance;
+                        counter++;
+                        distances.put(docs.scoreDocs[j].doc, distance);
+                        docs.scoreDocs[j].score += (float) (1d / distance); // additive similarity boosting
                     }
-                    docs.scoreDocs[j].score += (float) (1d / distance); // additive similarity boosting
                 }
             }
         }
+
+        // remove docs having invalid distance
         if (!toDiscard.isEmpty()) {
-            docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> !toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new); // remove docs that are not close enough
+            docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> !toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new);
         }
-        Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2) -> { // rerank scoreDocs
+
+        // remove docs whose distance is one order of magnitude higher than average distance
+        final double distanceThreshold = 10 * distSum / counter;
+        docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> distances.containsKey(e.doc) && distances.get(e.doc) < distanceThreshold).toArray(ScoreDoc[]::new);
+
+        // rerank scoreDocs
+        Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2) -> {
             return -1 * Double.compare(o1.score, o2.score);
         });
+
+        // retain only the top k nearest neighbours
         if (docs.scoreDocs.length > k) {
-            docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k); // retain only the top k nearest neighbours
+            docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k);
         }
+
         if (docs.scoreDocs.length > 0) {
             docs.setMaxScore(docs.scoreDocs[0].score);
         }

Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Fri Nov 23 11:19:06 2018
@@ -393,7 +393,6 @@ public class LucenePropertyIndexTest ext
         root.commit();
 
         String propabQuery = "/jcr:root//element(*, nt:file)";
-        System.out.println(explainXpath(propabQuery));
         assertThat(explainXpath(propabQuery), containsString("nodeType"));
     }