You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2023/02/06 17:46:44 UTC
[lucene] branch branch_9x updated: Speed up docvalues set query by making use of sortedness (#12128)
This is an automated email from the ASF dual-hosted git repository.
rmuir pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 94fb732b8a1 Speed up docvalues set query by making use of sortedness (#12128)
94fb732b8a1 is described below
commit 94fb732b8a1262284e0afa60f89f49a7f00753b3
Author: Robert Muir <rm...@apache.org>
AuthorDate: Mon Feb 6 12:14:02 2023 -0500
Speed up docvalues set query by making use of sortedness (#12128)
LongHashSet is used for the set of numbers, but it has some issues:
* tries to hard to extend AbstractSet, mostly for testing
* causes traps with boxing if you aren't careful
* complex hashcode/equals
Practically we should take advantage of the fact numbers come in sorted
order for multivalued fields: just like range queries do. So we use
min/max to our advantage, including termination of docvalues iteration
Actually it is generally a win to just check min/max even in the single-valued
case: these constant time comparisons are cheap and can avoid hashing,
etc.
In the worst-case, if all of your query Sets contain both the minimum and maximum
possible values, then it won't help, but it doesn't hurt either.
---
.../org/apache/lucene/document/LongHashSet.java | 125 +++++++++++----------
.../document/SortedNumericDocValuesSetQuery.java | 18 ++-
.../apache/lucene/document/TestLongHashSet.java | 33 ++++--
3 files changed, 102 insertions(+), 74 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java b/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java
index 402009a3acc..e0e0b252a78 100644
--- a/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java
+++ b/lucene/core/src/java/org/apache/lucene/document/LongHashSet.java
@@ -16,15 +16,16 @@
*/
package org.apache.lucene.document;
-import java.util.AbstractSet;
import java.util.Arrays;
-import java.util.Iterator;
-import java.util.NoSuchElementException;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
-final class LongHashSet extends AbstractSet<Long> implements Accountable {
+/** Set of longs, optimized for docvalues usage */
+final class LongHashSet implements Accountable {
private static final long BASE_RAM_BYTES =
RamUsageEstimator.shallowSizeOfInstance(LongHashSet.class);
@@ -34,8 +35,12 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
final int mask;
final boolean hasMissingValue;
final int size;
- final int hashCode;
+ /** minimum value in the set, or Long.MAX_VALUE for an empty set */
+ final long minValue;
+ /** maximum value in the set, or Long.MIN_VALUE for an empty set */
+ final long maxValue;
+ /** Construct a set. Values must be in sorted order. */
LongHashSet(long[] values) {
int tableSize = Math.toIntExact(values.length * 3L / 2);
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
@@ -45,19 +50,21 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
mask = tableSize - 1;
boolean hasMissingValue = false;
int size = 0;
- int hashCode = 0;
+ long previousValue = Long.MIN_VALUE; // for assert
for (long value : values) {
if (value == MISSING || add(value)) {
if (value == MISSING) {
hasMissingValue = true;
}
++size;
- hashCode += Long.hashCode(value);
}
+ assert value >= previousValue : "values must be provided in sorted order";
+ previousValue = value;
}
this.hasMissingValue = hasMissingValue;
this.size = size;
- this.hashCode = hashCode;
+ this.minValue = values.length == 0 ? Long.MAX_VALUE : values[0];
+ this.maxValue = values.length == 0 ? Long.MIN_VALUE : values[values.length - 1];
}
private boolean add(long l) {
@@ -74,6 +81,12 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
}
}
+ /**
+ * check for membership in the set.
+ *
+ * <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
+ * calling this.
+ */
boolean contains(long l) {
if (l == MISSING) {
return hasMissingValue;
@@ -88,75 +101,67 @@ final class LongHashSet extends AbstractSet<Long> implements Accountable {
}
}
- @Override
- public int size() {
- return size;
- }
-
@Override
public int hashCode() {
- return hashCode;
+ return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
}
@Override
public boolean equals(Object obj) {
- if (obj != null && obj.getClass() == LongHashSet.class) {
+ if (obj != null && obj instanceof LongHashSet) {
LongHashSet that = (LongHashSet) obj;
- if (hashCode != that.hashCode
- || size != that.size
- || hasMissingValue != that.hasMissingValue) {
- return false;
- }
- for (long v : table) {
- if (v != MISSING && that.contains(v) == false) {
- return false;
- }
- }
- return true;
+ return size == that.size
+ && minValue == that.minValue
+ && maxValue == that.maxValue
+ && mask == that.mask
+ && hasMissingValue == that.hasMissingValue
+ && Arrays.equals(table, that.table);
}
- return super.equals(obj);
+ return false;
}
@Override
- public long ramBytesUsed() {
- return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
+ public String toString() {
+ StringBuilder sb = new StringBuilder("[");
+ boolean seenValue = false;
+ if (hasMissingValue) {
+ sb.append(MISSING);
+ seenValue = true;
+ }
+ for (long v : table) {
+ if (v != MISSING) {
+ if (seenValue) {
+ sb.append(", ");
+ }
+ sb.append(v);
+ seenValue = true;
+ }
+ }
+ sb.append("]");
+ return sb.toString();
}
- @Override
- public boolean contains(Object o) {
- return o instanceof Long && contains(((Long) o).longValue());
+ /** number of elements in the set */
+ int size() {
+ return size;
}
@Override
- public Iterator<Long> iterator() {
- return new Iterator<Long>() {
-
- private boolean hasNext = hasMissingValue;
- private int i = -1;
- private long value = MISSING;
-
- @Override
- public boolean hasNext() {
- if (hasNext) {
- return true;
- }
- while (++i < table.length) {
- value = table[i];
- if (value != MISSING) {
- return hasNext = true;
- }
- }
- return false;
- }
+ public long ramBytesUsed() {
+ return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
+ }
- @Override
- public Long next() {
- if (hasNext() == false) {
- throw new NoSuchElementException();
- }
- hasNext = false;
- return value;
+ // for testing only
+ Set<Long> toSet() {
+ Set<Long> set = new HashSet<>();
+ if (hasMissingValue) {
+ set.add(MISSING);
+ }
+ for (long v : table) {
+ if (v != MISSING) {
+ set.add(v);
}
- };
+ }
+ return set;
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java
index 4c78c92404c..72d5d42b30a 100644
--- a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesSetQuery.java
@@ -17,6 +17,7 @@
package org.apache.lucene.document;
import java.io.IOException;
+import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
@@ -46,6 +47,7 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
SortedNumericDocValuesSetQuery(String field, long[] numbers) {
this.field = Objects.requireNonNull(field);
+ Arrays.sort(numbers);
this.numbers = new LongHashSet(numbers);
}
@@ -113,12 +115,15 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
new TwoPhaseIterator(singleton) {
@Override
public boolean matches() throws IOException {
- return numbers.contains(singleton.longValue());
+ long value = singleton.longValue();
+ return value >= numbers.minValue
+ && value <= numbers.maxValue
+ && numbers.contains(value);
}
@Override
public float matchCost() {
- return 5; // lookup in the set
+ return 5; // 2 comparisions, possible lookup in the set
}
};
} else {
@@ -128,7 +133,12 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
public boolean matches() throws IOException {
int count = values.docValueCount();
for (int i = 0; i < count; i++) {
- if (numbers.contains(values.nextValue())) {
+ final long value = values.nextValue();
+ if (value < numbers.minValue) {
+ continue;
+ } else if (value > numbers.maxValue) {
+ return false; // values are sorted, terminate
+ } else if (numbers.contains(value)) {
return true;
}
}
@@ -137,7 +147,7 @@ final class SortedNumericDocValuesSetQuery extends Query implements Accountable
@Override
public float matchCost() {
- return 5; // lookup in the set
+ return 5; // 2 comparisons, possible lookup in the set
}
};
}
diff --git a/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java b/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java
index 12977373c62..19c4725da4d 100644
--- a/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java
+++ b/lucene/core/src/test/org/apache/lucene/document/TestLongHashSet.java
@@ -25,11 +25,10 @@ import org.apache.lucene.tests.util.LuceneTestCase;
public class TestLongHashSet extends LuceneTestCase {
- private void assertEquals(Set<Long> set1, LongHashSet set2) {
+ private void assertEquals(Set<Long> set1, LongHashSet longHashSet) {
+ Set<Long> set2 = longHashSet.toSet();
+
LuceneTestCase.assertEquals(set1, set2);
- LuceneTestCase.assertEquals(set2, set1);
- LuceneTestCase.assertEquals(set2, set2);
- assertEquals(set1.hashCode(), set2.hashCode());
if (set1.isEmpty() == false) {
Set<Long> set3 = new HashSet<>(set1);
@@ -40,40 +39,53 @@ public class TestLongHashSet extends LuceneTestCase {
break;
}
}
- assertNotEquals(set3, set2);
+ assertNotEquals(set3, longHashSet);
}
}
- private void assertNotEquals(Set<Long> set1, LongHashSet set2) {
- assertFalse(set1.equals(set2));
- assertFalse(set2.equals(set1));
- LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).toArray());
- assertFalse(set2.equals(set3));
+ private void assertNotEquals(Set<Long> set1, LongHashSet longHashSet) {
+ Set<Long> set2 = longHashSet.toSet();
+
+ LuceneTestCase.assertNotEquals(set1, set2);
+
+ LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).sorted().toArray());
+
+ LuceneTestCase.assertNotEquals(set2, set3.toSet());
}
public void testEmpty() {
Set<Long> set1 = new HashSet<>();
LongHashSet set2 = new LongHashSet(new long[] {});
+ assertEquals(Long.MAX_VALUE, set2.minValue);
+ assertEquals(Long.MIN_VALUE, set2.maxValue);
assertEquals(set1, set2);
}
public void testOneValue() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
LongHashSet set2 = new LongHashSet(new long[] {42L});
+ assertEquals(42L, set2.minValue);
+ assertEquals(42L, set2.maxValue);
assertEquals(set1, set2);
set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE));
set2 = new LongHashSet(new long[] {Long.MIN_VALUE});
+ assertEquals(Long.MIN_VALUE, set2.minValue);
+ assertEquals(Long.MIN_VALUE, set2.maxValue);
assertEquals(set1, set2);
}
public void testTwoValues() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Long.MAX_VALUE));
LongHashSet set2 = new LongHashSet(new long[] {42L, Long.MAX_VALUE});
+ assertEquals(42, set2.minValue);
+ assertEquals(Long.MAX_VALUE, set2.maxValue);
assertEquals(set1, set2);
set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE, 42L));
set2 = new LongHashSet(new long[] {Long.MIN_VALUE, 42L});
+ assertEquals(Long.MIN_VALUE, set2.minValue);
+ assertEquals(42, set2.maxValue);
assertEquals(set1, set2);
}
@@ -95,6 +107,7 @@ public class TestLongHashSet extends LuceneTestCase {
LongStream.of(values)
.mapToObj(Long::valueOf)
.collect(Collectors.toCollection(HashSet::new));
+ Arrays.sort(values);
LongHashSet set2 = new LongHashSet(values);
assertEquals(set1, set2);
}