You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by iv...@apache.org on 2019/08/23 09:16:38 UTC

[lucene-solr] branch master updated: LUCENE-8952: Use a sort key instead of true distance in NearestNeighbor. (#832)

This is an automated email from the ASF dual-hosted git repository.

ivera pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 152756f  LUCENE-8952: Use a sort key instead of true distance in NearestNeighbor. (#832)
152756f is described below

commit 152756fcbd8ae3bfd5e842f94b053715cb554588
Author: Julie Tibshirani <ju...@elastic.co>
AuthorDate: Fri Aug 23 02:16:27 2019 -0700

    LUCENE-8952: Use a sort key instead of true distance in NearestNeighbor. (#832)
---
 lucene/CHANGES.txt                                 |  2 +
 .../lucene/search/LatLonPointPrototypeQueries.java |  4 +-
 .../org/apache/lucene/search/NearestNeighbor.java  | 58 ++++++++++++----------
 .../test/org/apache/lucene/search/TestNearest.java | 29 ++++++-----
 4 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index aa37779..6ea7f94 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -102,6 +102,8 @@ Improvements
 
 * SOLR-13663: Introduce <SpanPositionRange> into XML Query Parser (Alessandro Benedetti via Mikhail Khludnev)
 
+* LUCENE-8952: Use a sort key instead of true distance in NearestNeighbor (Julie Tibshirani).
+
 Optimizations
 
 * LUCENE-8922: DisjunctionMaxQuery more efficiently leverages impacts to skip
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/LatLonPointPrototypeQueries.java b/lucene/sandbox/src/java/org/apache/lucene/search/LatLonPointPrototypeQueries.java
index 73cbbb8..3c0d7ff 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/LatLonPointPrototypeQueries.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/LatLonPointPrototypeQueries.java
@@ -27,6 +27,7 @@ import org.apache.lucene.geo.GeoUtils;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SloppyMath;
 import org.apache.lucene.util.bkd.BKDReader;
 
 /**
@@ -104,7 +105,8 @@ public class LatLonPointPrototypeQueries {
     ScoreDoc[] scoreDocs = new ScoreDoc[hits.length];
     for(int i=0;i<hits.length;i++) {
       NearestNeighbor.NearestHit hit = hits[i];
-      scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hit.distanceMeters)});
+      double hitDistance = SloppyMath.haversinMeters(hit.distanceSortKey);
+      scoreDocs[i] = new FieldDoc(hit.docID, 0.0f, new Object[] {Double.valueOf(hitDistance)});
     }
     return new TopFieldDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), scoreDocs, null);
   }
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java b/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
index 449d37d..86a0a15 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/NearestNeighbor.java
@@ -28,9 +28,9 @@ import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SloppyMath;
+import org.apache.lucene.util.bkd.BKDReader;
 import org.apache.lucene.util.bkd.BKDReader.IndexTree;
 import org.apache.lucene.util.bkd.BKDReader.IntersectState;
-import org.apache.lucene.util.bkd.BKDReader;
 
 import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
 import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
@@ -48,19 +48,23 @@ class NearestNeighbor {
     final byte[] maxPacked;
     final IndexTree index;
 
-    /** The closest possible distance of all points in this cell */
-    final double distanceMeters;
+    /**
+     * The closest distance from a point in this cell to the query point, computed as a sort key through
+     * {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest distance,
+     * and there could be a point in the cell that is closer.
+     */
+    final double distanceSortKey;
 
-    public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
+    public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceSortKey) {
       this.index = index;
       this.readerIndex = readerIndex;
       this.minPacked = minPacked.clone();
       this.maxPacked = maxPacked.clone();
-      this.distanceMeters = distanceMeters;
+      this.distanceSortKey = distanceSortKey;
     }
 
     public int compareTo(Cell other) {
-      return Double.compare(distanceMeters, other.distanceMeters);
+      return Double.compare(distanceSortKey, other.distanceSortKey);
     }
 
     @Override
@@ -69,7 +73,7 @@ class NearestNeighbor {
       double minLon = decodeLongitude(minPacked, Integer.BYTES);
       double maxLat = decodeLatitude(maxPacked, 0);
       double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
-      return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
+      return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceSortKey=" + distanceSortKey + ")";
     }
   }
 
@@ -106,7 +110,8 @@ class NearestNeighbor {
     private void maybeUpdateBBox() {
       if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) {
         NearestHit hit = hitQueue.peek();
-        Rectangle box = Rectangle.fromPointDistance(pointLat, pointLon, hit.distanceMeters);
+        Rectangle box = Rectangle.fromPointDistance(pointLat, pointLon,
+            SloppyMath.haversinMeters(hit.distanceSortKey));
         //System.out.println("    update bbox to " + box);
         minLat = box.minLat;
         maxLat = box.maxLat;
@@ -134,8 +139,6 @@ class NearestNeighbor {
         return;
       }
 
-      // TODO: work in int space, use haversinSortKey
-
       double docLatitude = decodeLatitude(packedValue, 0);
       double docLongitude = decodeLongitude(packedValue, Integer.BYTES);
 
@@ -147,21 +150,22 @@ class NearestNeighbor {
         return;
       }
 
-      double distanceMeters = SloppyMath.haversinMeters(pointLat, pointLon, docLatitude, docLongitude);
+      // Use the haversin sort key when comparing hits, as it is faster to compute than the true distance.
+      double distanceSortKey = SloppyMath.haversinSortKey(pointLat, pointLon, docLatitude, docLongitude);
 
-      //System.out.println("    visit docID=" + docID + " distanceMeters=" + distanceMeters + " docLat=" + docLatitude + " docLon=" + docLongitude);
+      //System.out.println("    visit docID=" + docID + " distanceSortKey=" + distanceSortKey + " docLat=" + docLatitude + " docLon=" + docLongitude);
 
       int fullDocID = curDocBase + docID;
 
       if (hitQueue.size() == topN) {
         // queue already full
         NearestHit hit = hitQueue.peek();
-        //System.out.println("      bottom distanceMeters=" + hit.distanceMeters);
+        //System.out.println("      bottom distanceSortKey=" + hit.distanceSortKey);
         // we don't collect docs in order here, so we must also test the tie-break case ourselves:
-        if (distanceMeters < hit.distanceMeters || (distanceMeters == hit.distanceMeters && fullDocID < hit.docID)) {
+        if (distanceSortKey < hit.distanceSortKey || (distanceSortKey == hit.distanceSortKey && fullDocID < hit.docID)) {
           hitQueue.poll();
           hit.docID = fullDocID;
-          hit.distanceMeters = distanceMeters;
+          hit.distanceSortKey = distanceSortKey;
           hitQueue.offer(hit);
           //System.out.println("      ** keep2, now bottom=" + hit);
           maybeUpdateBBox();
@@ -170,7 +174,7 @@ class NearestNeighbor {
       } else {
         NearestHit hit = new NearestHit();
         hit.docID = fullDocID;
-        hit.distanceMeters = distanceMeters;
+        hit.distanceSortKey = distanceSortKey;
         hitQueue.offer(hit);
         //System.out.println("      ** keep1, now bottom=" + hit);
       }
@@ -182,14 +186,18 @@ class NearestNeighbor {
     }
   }
 
-  /** Holds one hit from {@link LatLonPointPrototypeQueries#nearest} */
+  /** Holds one hit from {@link NearestNeighbor#nearest} */
   static class NearestHit {
     public int docID;
-    public double distanceMeters;
+
+    /**
+     * The distance from the hit to the query point, computed as a sort key through {@link SloppyMath#haversinSortKey}.
+     */
+    public double distanceSortKey;
 
     @Override
     public String toString() {
-      return "NearestHit(docID=" + docID + " distanceMeters=" + distanceMeters + ")";
+      return "NearestHit(docID=" + docID + " distanceSortKey=" + distanceSortKey + ")";
     }
   }
 
@@ -204,8 +212,8 @@ class NearestNeighbor {
     final PriorityQueue<NearestHit> hitQueue = new PriorityQueue<>(n, new Comparator<NearestHit>() {
         @Override
         public int compare(NearestHit a, NearestHit b) {
-          // sort by opposite distanceMeters natural order
-          int cmp = Double.compare(a.distanceMeters, b.distanceMeters);
+          // sort by opposite distanceSortKey natural order
+          int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey);
           if (cmp != 0) {
             return -cmp;
           }
@@ -319,10 +327,10 @@ class NearestNeighbor {
       return 0.0;
     }
 
-    double d1 = SloppyMath.haversinMeters(pointLat, pointLon, minLat, minLon);
-    double d2 = SloppyMath.haversinMeters(pointLat, pointLon, minLat, maxLon);
-    double d3 = SloppyMath.haversinMeters(pointLat, pointLon, maxLat, maxLon);
-    double d4 = SloppyMath.haversinMeters(pointLat, pointLon, maxLat, minLon);
+    double d1 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, minLon);
+    double d2 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, maxLon);
+    double d3 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, maxLon);
+    double d4 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, minLon);
     return Math.min(Math.min(d1, d2), Math.min(d3, d4));
   }
 
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestNearest.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestNearest.java
index 40c521e..627b724 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestNearest.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestNearest.java
@@ -26,7 +26,6 @@ import org.apache.lucene.document.LatLonDocValuesField;
 import org.apache.lucene.document.LatLonPoint;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
-import org.apache.lucene.search.NearestNeighbor.NearestHit;
 import org.apache.lucene.geo.GeoEncodingUtils;
 import org.apache.lucene.geo.GeoTestUtil;
 import org.apache.lucene.index.DirectoryReader;
@@ -190,23 +189,22 @@ public class TestNearest extends LuceneTestCase {
       double pointLon = GeoTestUtil.nextLongitude();
 
       // dumb brute force search to get the expected result:
-      NearestHit[] expectedHits = new NearestHit[lats.length];
+      FieldDoc[] expectedHits = new FieldDoc[lats.length];
       for(int id=0;id<lats.length;id++) {
-        NearestHit hit = new NearestHit();
-        hit.distanceMeters = SloppyMath.haversinMeters(pointLat, pointLon, lats[id], lons[id]);
-        hit.docID = id;
+        double distance = SloppyMath.haversinMeters(pointLat, pointLon, lats[id], lons[id]);
+        FieldDoc hit = new FieldDoc(id, 0.0f, new Object[] {Double.valueOf(distance)});
         expectedHits[id] = hit;
       }
 
-      Arrays.sort(expectedHits, new Comparator<NearestHit>() {
+      Arrays.sort(expectedHits, new Comparator<FieldDoc>() {
           @Override
-          public int compare(NearestHit a, NearestHit b) {
-            int cmp = Double.compare(a.distanceMeters, b.distanceMeters);
+          public int compare(FieldDoc a, FieldDoc  b) {
+            int cmp = Double.compare(((Double) a.fields[0]).doubleValue(), ((Double) b.fields[0]).doubleValue());
             if (cmp != 0) {
               return cmp;
             }
             // tie break by smaller docID:
-            return a.docID - b.docID;
+            return a.doc - b.doc;
           }
         });
 
@@ -221,22 +219,23 @@ public class TestNearest extends LuceneTestCase {
 
       ScoreDoc[] hits = LatLonPointPrototypeQueries.nearest(s, "point", pointLat, pointLon, topN).scoreDocs;
       for(int i=0;i<topN;i++) {
-        NearestHit expected = expectedHits[i];
+        FieldDoc expected = expectedHits[i];
         FieldDoc expected2 = (FieldDoc) fieldDocs.scoreDocs[i];
         FieldDoc actual = (FieldDoc) hits[i];
         Document actualDoc = r.document(actual.doc);
 
         if (VERBOSE) {
           System.out.println("hit " + i);
-          System.out.println("  expected id=" + expected.docID + " lat=" + lats[expected.docID] + " lon=" + lons[expected.docID] + " distance=" + expected.distanceMeters + " meters");
+          System.out.println("  expected id=" + expected.doc+ " lat=" + lats[expected.doc] + " lon=" + lons[expected.doc]
+              + " distance=" + ((Double) expected.fields[0]).doubleValue() + " meters");
           System.out.println("  actual id=" + actualDoc.getField("id") + " distance=" + actual.fields[0] + " meters");
         }
 
-        assertEquals(expected.docID, actual.doc);
-        assertEquals(expected.distanceMeters, ((Double) actual.fields[0]).doubleValue(), 0.0);
+        assertEquals(expected.doc, actual.doc);
+        assertEquals(((Double) expected.fields[0]).doubleValue(), ((Double) actual.fields[0]).doubleValue(), 0.0);
 
-        assertEquals(expected.docID, expected.docID);
-        assertEquals(((Double) expected2.fields[0]).doubleValue(), expected.distanceMeters, 0.0);
+        assertEquals(expected2.doc, actual.doc);
+        assertEquals(((Double) expected2.fields[0]).doubleValue(), ((Double) actual.fields[0]).doubleValue(), 0.0);
       }
     }