You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by to...@apache.org on 2018/11/23 11:19:07 UTC
svn commit: r1847245 - in /jackrabbit/oak/trunk/oak-lucene/src:
main/java/org/apache/jackrabbit/oak/plugins/index/lucene/
main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/
test/java/org/apache/jackrabbit/oak/plugins/index/lucene/
Author: tommaso
Date: Fri Nov 23 11:19:06 2018
New Revision: 1847245
URL: http://svn.apache.org/viewvc?rev=1847245&view=rev
Log:
OAK-7824 - make distance threshold relative to current result set
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java Fri Nov 23 11:19:06 2018
@@ -405,7 +405,6 @@ public class LucenePropertyIndex extends
long fvs = PERF_LOGGER.start();
SimSearchUtils.bruteForceFVRerank(sp, docs, indexSearcher);
PERF_LOGGER.end(fvs, -1, "fv reranking done");
- LOG.info("reranking done");
earlyStop = true;
}
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java Fri Nov 23 11:19:06 2018
@@ -234,7 +234,9 @@ public class SimSearchUtils {
}
public static void bruteForceFVRerank(List<PropertyDefinition> sp, TopDocs docs, IndexSearcher indexSearcher) throws IOException {
- double farthestDistance = 50d;
+ double distSum = 0d;
+ double counter = 0d;
+ Map<Integer, Double> distances = new HashMap<>();
int k = 15;
ScoreDoc inputDoc = docs.scoreDocs[0]; // we assume the input doc is the first one returned
List<Integer> toDiscard = new LinkedList<>();
@@ -247,26 +249,37 @@ public class SimSearchUtils {
double[] currentVector = toDoubleArray(indexSearcher.doc(docs.scoreDocs[j].doc)
.getBinaryValue(fieldName).bytes);
double distance = dist(inputVector, currentVector) + 1e-10; // constant term to avoid division by zero
-
- if (distance > farthestDistance) { // a threshold distance above which current vector is discarded
- toDiscard.add(docs.scoreDocs[j].doc);
- }
if (Double.isNaN(distance) || Double.isInfinite(distance)) {
toDiscard.add(docs.scoreDocs[j].doc);
+ } else {
+ distSum += distance;
+ counter++;
+ distances.put(docs.scoreDocs[j].doc, distance);
+ docs.scoreDocs[j].score += (float) (1d / distance); // additive similarity boosting
}
- docs.scoreDocs[j].score += (float) (1d / distance); // additive similarity boosting
}
}
}
+
+ // remove docs having invalid distance
if (!toDiscard.isEmpty()) {
- docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> !toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new); // remove docs that are not close enough
+ docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> !toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new);
}
- Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2) -> { // rerank scoreDocs
+
+ // remove docs whose distance is one order of magnitude higher than average distance
+ final double distanceThreshold = 10 * distSum / counter;
+ docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> distances.containsKey(e.doc) && distances.get(e.doc) < distanceThreshold).toArray(ScoreDoc[]::new);
+
+ // rerank scoreDocs
+ Arrays.parallelSort(docs.scoreDocs, 0, docs.scoreDocs.length, (o1, o2) -> {
return -1 * Double.compare(o1.score, o2.score);
});
+
+ // retain only the top k nearest neighbours
if (docs.scoreDocs.length > k) {
- docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k); // retain only the top k nearest neighbours
+ docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k);
}
+
if (docs.scoreDocs.length > 0) {
docs.setMaxScore(docs.scoreDocs[0].score);
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1847245&r1=1847244&r2=1847245&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Fri Nov 23 11:19:06 2018
@@ -393,7 +393,6 @@ public class LucenePropertyIndexTest ext
root.commit();
String propabQuery = "/jcr:root//element(*, nt:file)";
- System.out.println(explainXpath(propabQuery));
assertThat(explainXpath(propabQuery), containsString("nodeType"));
}