You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2023/02/06 17:14:07 UTC

[lucene] branch main updated: Speed up docvalues set query by making use of sortedness (#12128)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 10d9c7440b9 Speed up docvalues set query by making use of sortedness (#12128)
10d9c7440b9 is described below

commit 10d9c7440b9afe03b9371bf704b1bea75517f608
Author: Robert Muir <rm...@apache.org>
AuthorDate: Mon Feb 6 12:14:02 2023 -0500

    Speed up docvalues set query by making use of sortedness (#12128)
    
    LongHashSet is used for the set of numbers, but it has some issues:
    * tries to hard to extend AbstractSet, mostly for testing
    * causes traps with boxing if you aren't careful
    * complex hashcode/equals
    
    Practically we should take advantage of the fact numbers come in sorted
    order for multivalued fields: just like range queries do. So we use
    min/max to our advantage, including termination of docvalues iteration
    
    Actually it is generally a win to just check min/max even in the single-valued
    case: these constant time comparisons are cheap and can avoid hashing,
    etc.
    
    In the worst-case, if all of your query Sets contain both the minimum and maximum
    possible values, then it won't help, but it doesn't hurt either.
---
 .../org/apache/lucene/document/LongHashSet.java    | 125 +++++++++++----------
 .../document/SortedNumericDocValuesSetQuery.java   |  18 ++-
 .../apache/lucene/document/TestLongHashSet.java    |  33 ++++--
 3 files changed, 102 insertions(+), 74 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java b/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java
index 402009a3acc..e0e0b252a78 100644
--- a/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java
+++ b/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java
@@ -16,15 +16,16 @@
  */
 package org.apache.lucene.document;
 
-import java.util.AbstractSet;
 import java.util.Arrays;
-import java.util.Iterator;
-import java.util.NoSuchElementException;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.PackedInts;
 
-final class LongHashSet extends AbstractSet<Long> implements Accountable {
+/** Set of longs, optimized for docvalues usage */
+final class LongHashSet implements Accountable {
   private static final long BASE_RAM_BYTES =
       RamUsageEstimator.shallowSizeOfInstance(LongHashSet.class);
 
@@ -34,8 +35,12 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
   final int mask;
   final boolean hasMissingValue;
   final int size;
-  final int hashCode;
+  /** minimum value in the set, or Long.MAX_VALUE for an empty set */
+  final long minValue;
+  /** maximum value in the set, or Long.MIN_VALUE for an empty set */
+  final long maxValue;
 
+  /** Construct a set. Values must be in sorted order. */
   LongHashSet(long[] values) {
     int tableSize = Math.toIntExact(values.length * 3L / 2);
     tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
@@ -45,19 +50,21 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
     mask = tableSize - 1;
     boolean hasMissingValue = false;
     int size = 0;
-    int hashCode = 0;
+    long previousValue = Long.MIN_VALUE; // for assert
     for (long value : values) {
       if (value == MISSING || add(value)) {
         if (value == MISSING) {
           hasMissingValue = true;
         }
         ++size;
-        hashCode += Long.hashCode(value);
       }
+      assert value >= previousValue : "values must be provided in sorted order";
+      previousValue = value;
     }
     this.hasMissingValue = hasMissingValue;
     this.size = size;
-    this.hashCode = hashCode;
+    this.minValue = values.length == 0 ? Long.MAX_VALUE : values[0];
+    this.maxValue = values.length == 0 ? Long.MIN_VALUE : values[values.length - 1];
   }
 
   private boolean add(long l) {
@@ -74,6 +81,12 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
     }
   }
 
+  /**
+   * check for membership in the set.
+   *
+   * <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
+   * calling this.
+   */
   boolean contains(long l) {
     if (l == MISSING) {
       return hasMissingValue;
@@ -88,75 +101,67 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
     }
   }
 
-  @Override
-  public int size() {
-    return size;
-  }
-
   @Override
   public int hashCode() {
-    return hashCode;
+    return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
   }
 
   @Override
   public boolean equals(Object obj) {
-    if (obj != null && obj.getClass() == LongHashSet.class) {
+    if (obj != null && obj instanceof LongHashSet) {
       LongHashSet that = (LongHashSet) obj;
-      if (hashCode != that.hashCode
-          || size != that.size
-          || hasMissingValue != that.hasMissingValue) {
-        return false;
-      }
-      for (long v : table) {
-        if (v != MISSING && that.contains(v) == false) {
-          return false;
-        }
-      }
-      return true;
+      return size == that.size
+          && minValue == that.minValue
+          && maxValue == that.maxValue
+          && mask == that.mask
+          && hasMissingValue == that.hasMissingValue
+          && Arrays.equals(table, that.table);
     }
-    return super.equals(obj);
+    return false;
   }
 
   @Override
-  public long ramBytesUsed() {
-    return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
+  public String toString() {
+    StringBuilder sb = new StringBuilder("[");
+    boolean seenValue = false;
+    if (hasMissingValue) {
+      sb.append(MISSING);
+      seenValue = true;
+    }
+    for (long v : table) {
+      if (v != MISSING) {
+        if (seenValue) {
+          sb.append(", ");
+        }
+        sb.append(v);
+        seenValue = true;
+      }
+    }
+    sb.append("]");
+    return sb.toString();
   }
 
-  @Override
-  public boolean contains(Object o) {
-    return o instanceof Long && contains(((Long) o).longValue());
+  /** number of elements in the set */
+  int size() {
+    return size;
   }
 
   @Override
-  public Iterator<Long> iterator() {
-    return new Iterator<Long>() {
-
-      private boolean hasNext = hasMissingValue;
-      private int i = -1;
-      private long value = MISSING;
-
-      @Override
-      public boolean hasNext() {
-        if (hasNext) {
-          return true;
-        }
-        while (++i < table.length) {
-          value = table[i];
-          if (value != MISSING) {
-            return hasNext = true;
-          }
-        }
-        return false;
-      }
+  public long ramBytesUsed() {
+    return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
+  }
 
-      @Override
-      public Long next() {
-        if (hasNext() == false) {
-          throw new NoSuchElementException();
-        }
-        hasNext = false;
-        return value;
+  // for testing only
+  Set<Long> toSet() {
+    Set<Long> set = new HashSet<>();
+    if (hasMissingValue) {
+      set.add(MISSING);
+    }
+    for (long v : table) {
+      if (v != MISSING) {
+        set.add(v);
       }
-    };
+    }
+    return set;
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java
index 530a640bb6f..fe5e8c3afed 100644
--- a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java
@@ -17,6 +17,7 @@
 package org.apache.lucene.document;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Objects;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
@@ -45,6 +46,7 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
 
   SortedNumericDocValuesSetQuery(String field, long[] numbers) {
     this.field = Objects.requireNonNull(field);
+    Arrays.sort(numbers);
     this.numbers = new LongHashSet(numbers);
   }
 
@@ -112,12 +114,15 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
               new TwoPhaseIterator(singleton) {
                 @Override
                 public boolean matches() throws IOException {
-                  return numbers.contains(singleton.longValue());
+                  long value = singleton.longValue();
+                  return value >= numbers.minValue
+                      && value <= numbers.maxValue
+                      && numbers.contains(value);
                 }
 
                 @Override
                 public float matchCost() {
-                  return 5; // lookup in the set
+                  return 5; // 2 comparisions, possible lookup in the set
                 }
               };
         } else {
@@ -127,7 +132,12 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
                 public boolean matches() throws IOException {
                   int count = values.docValueCount();
                   for (int i = 0; i < count; i++) {
-                    if (numbers.contains(values.nextValue())) {
+                    final long value = values.nextValue();
+                    if (value < numbers.minValue) {
+                      continue;
+                    } else if (value > numbers.maxValue) {
+                      return false; // values are sorted, terminate
+                    } else if (numbers.contains(value)) {
                       return true;
                     }
                   }
@@ -136,7 +146,7 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
 
                 @Override
                 public float matchCost() {
-                  return 5; // lookup in the set
+                  return 5; // 2 comparisons, possible lookup in the set
                 }
               };
         }
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java b/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java
index 12977373c62..19c4725da4d 100644
--- a/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java
+++ b/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java
@@ -25,11 +25,10 @@ import org.apache.lucene.tests.util.LuceneTestCase;
 
 public class TestLongHashSet extends LuceneTestCase {
 
-  private void assertEquals(Set<Long> set1, LongHashSet set2) {
+  private void assertEquals(Set<Long> set1, LongHashSet longHashSet) {
+    Set<Long> set2 = longHashSet.toSet();
+
     LuceneTestCase.assertEquals(set1, set2);
-    LuceneTestCase.assertEquals(set2, set1);
-    LuceneTestCase.assertEquals(set2, set2);
-    assertEquals(set1.hashCode(), set2.hashCode());
 
     if (set1.isEmpty() == false) {
       Set<Long> set3 = new HashSet<>(set1);
@@ -40,40 +39,53 @@ public class TestLongHashSet extends LuceneTestCase {
           break;
         }
       }
-      assertNotEquals(set3, set2);
+      assertNotEquals(set3, longHashSet);
     }
   }
 
-  private void assertNotEquals(Set<Long> set1, LongHashSet set2) {
-    assertFalse(set1.equals(set2));
-    assertFalse(set2.equals(set1));
-    LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).toArray());
-    assertFalse(set2.equals(set3));
+  private void assertNotEquals(Set<Long> set1, LongHashSet longHashSet) {
+    Set<Long> set2 = longHashSet.toSet();
+
+    LuceneTestCase.assertNotEquals(set1, set2);
+
+    LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).sorted().toArray());
+
+    LuceneTestCase.assertNotEquals(set2, set3.toSet());
   }
 
   public void testEmpty() {
     Set<Long> set1 = new HashSet<>();
     LongHashSet set2 = new LongHashSet(new long[] {});
+    assertEquals(Long.MAX_VALUE, set2.minValue);
+    assertEquals(Long.MIN_VALUE, set2.maxValue);
     assertEquals(set1, set2);
   }
 
   public void testOneValue() {
     Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
     LongHashSet set2 = new LongHashSet(new long[] {42L});
+    assertEquals(42L, set2.minValue);
+    assertEquals(42L, set2.maxValue);
     assertEquals(set1, set2);
 
     set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE));
     set2 = new LongHashSet(new long[] {Long.MIN_VALUE});
+    assertEquals(Long.MIN_VALUE, set2.minValue);
+    assertEquals(Long.MIN_VALUE, set2.maxValue);
     assertEquals(set1, set2);
   }
 
   public void testTwoValues() {
     Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Long.MAX_VALUE));
     LongHashSet set2 = new LongHashSet(new long[] {42L, Long.MAX_VALUE});
+    assertEquals(42, set2.minValue);
+    assertEquals(Long.MAX_VALUE, set2.maxValue);
     assertEquals(set1, set2);
 
     set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE, 42L));
     set2 = new LongHashSet(new long[] {Long.MIN_VALUE, 42L});
+    assertEquals(Long.MIN_VALUE, set2.minValue);
+    assertEquals(42, set2.maxValue);
     assertEquals(set1, set2);
   }
 
@@ -95,6 +107,7 @@ public class TestLongHashSet extends LuceneTestCase {
           LongStream.of(values)
               .mapToObj(Long::valueOf)
               .collect(Collectors.toCollection(HashSet::new));
+      Arrays.sort(values);
       LongHashSet set2 = new LongHashSet(values);
       assertEquals(set1, set2);
     }