You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2018/12/03 13:32:04 UTC
[3/4] lucene-solr:master: LUCENE-8374 part 3/4: Reduce reads for
sparse DocValues
LUCENE-8374 part 3/4: Reduce reads for sparse DocValues
Offset jump-table for variable bits per value blocks.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7949b98f
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7949b98f
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7949b98f
Branch: refs/heads/master
Commit: 7949b98f802c9ab3a588a33cdb1771b83c9fcafb
Parents: 7ad0276
Author: Toke Eskildsen <to...@apache.org>
Authored: Mon Dec 3 14:29:54 2018 +0100
Committer: Toke Eskildsen <to...@apache.org>
Committed: Mon Dec 3 14:29:54 2018 +0100
----------------------------------------------------------------------
.../lucene/codecs/lucene70/IndexedDISI.java | 49 ++++++
.../codecs/lucene70/IndexedDISICache.java | 2 +-
.../lucene70/IndexedDISICacheFactory.java | 90 +++++++++-
.../lucene70/Lucene70DocValuesProducer.java | 165 ++++++++-----------
.../lucene/codecs/lucene70/TestIndexedDISI.java | 36 ++--
.../org/apache/lucene/index/TestDocValues.java | 4 +-
6 files changed, 232 insertions(+), 114 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7949b98f/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
index 114710e..5eaefbc 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
@@ -303,6 +303,9 @@ final class IndexedDISI extends DocIdSetIterator {
final int targetInBlock = target & 0xFFFF;
final int targetWordIndex = targetInBlock >>> 6;
+ // If possible, skip ahead using the rank cache
+ disi.rankSkip(disi, target);
+
for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
disi.word = disi.slice.readLong();
disi.numberOfOnes += Long.bitCount(disi.word);
@@ -337,6 +340,9 @@ final class IndexedDISI extends DocIdSetIterator {
final int targetInBlock = target & 0xFFFF;
final int targetWordIndex = targetInBlock >>> 6;
+ // If possible, skip ahead using the rank cache
+ disi.rankSkip(disi, target);
+
for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
disi.word = disi.slice.readLong();
disi.numberOfOnes += Long.bitCount(disi.word);
@@ -373,4 +379,47 @@ final class IndexedDISI extends DocIdSetIterator {
abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException;
}
+ /**
+ * If the distance between the current position and the target is > 8 words, the rank cache will
+ * be used to guarantee a worst-case of 1 rank-lookup and 7 word-read-and-count-bits operations.
+ * Note: This does not guarantee a skip up to target, only up to nearest rank boundary. It is the
+ * responsibility of the caller to iterate further to reach target.
+ * @param disi standard DISI.
+ * @param target the wanted docID for which to calculate set-flag and index.
+ * @throws IOException if a disi seek failed.
+ */
+ private void rankSkip(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ final int targetWordIndex = targetInBlock >>> 6;
+
+ // If the distance between the current position and the target is >= 8
+ // then it pays to use the rank to jump
+ if (!(disi.cache.hasRank() && targetWordIndex - disi.wordIndex >= IndexedDISICache.RANK_BLOCK_LONGS)) {
+ return;
+ }
+
+ int rankPos = disi.cache.denseRankPosition(target);
+ if (rankPos == -1) {
+ return;
+ }
+ int rank = disi.cache.getRankInBlock(rankPos);
+ if (rank == -1) {
+ return;
+ }
+ int rankIndex = disi.denseOrigoIndex + rank;
+ int rankWordIndex = (rankPos & 0xFFFF) >> 6;
+ long rankOffset = disi.blockStart + 4 + (rankWordIndex * 8);
+
+ long mark = disi.slice.getFilePointer();
+ disi.slice.seek(rankOffset);
+ long rankWord = disi.slice.readLong();
+ int rankNOO = rankIndex + Long.bitCount(rankWord);
+ rankOffset += Long.BYTES;
+
+
+ //disi.slice.seek(mark);
+ disi.wordIndex = rankWordIndex;
+ disi.word = rankWord;
+ disi.numberOfOnes = rankNOO;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7949b98f/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICache.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICache.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICache.java
index be0beb7..fdf93cf 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICache.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICache.java
@@ -319,4 +319,4 @@ public class IndexedDISICache implements Accountable {
RamUsageEstimator.NUM_BYTES_OBJECT_REF*3 +
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + creationStats.length()*2;
}
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7949b98f/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICacheFactory.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICacheFactory.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICacheFactory.java
index 7a85727..610529a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICacheFactory.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISICacheFactory.java
@@ -44,6 +44,8 @@ public class IndexedDISICacheFactory implements Accountable {
// jump-table and rank for DISI blocks
private final Map<Long, IndexedDISICache> disiPool = new HashMap<>();
+ // jump-table for numerics with variable bits per value (dates, longs...)
+ private final Map<String, VaryingBPVJumpTable> vBPVPool = new HashMap<>();
/**
* Create a cached {@link IndexedDISI} instance.
@@ -75,6 +77,24 @@ public class IndexedDISICacheFactory implements Accountable {
}
/**
+ * Creates a cache (jump table) for variable bits per value numerics and returns it.
+ * If the cache has previously been created, the old cache is returned.
+ * @param name the name for the cache, typically the field name. Used as key for later retrieval.
+ * @param slice the long values with varying bits per value.
+ * @param valuesLength the length in bytes of the slice.
+ * @return a jump table for the longs in the given slice or null if the structure is not suitable for caching.
+ */
+ VaryingBPVJumpTable getVBPVJumpTable(String name, RandomAccessInput slice, long valuesLength) throws IOException {
+ VaryingBPVJumpTable jumpTable = vBPVPool.get(name);
+ if (jumpTable == null) {
+ // TODO: Avoid overlapping builds of the same jump table for performance reasons
+ jumpTable = new VaryingBPVJumpTable(slice, name, valuesLength);
+ vBPVPool.put(name, jumpTable);
+ }
+ return jumpTable;
+ }
+
+ /**
* Creates a cache (jump table) for {@link IndexedDISI}.
* If the cache has previously been created, the old cache is returned.
* @param data the slice to create a cache for.
@@ -133,16 +153,26 @@ public class IndexedDISICacheFactory implements Accountable {
public long getDISIBlocksWithRankCount() {
return disiPool.values().stream().filter(IndexedDISICache::hasRank).count();
}
+ public long getVaryingBPVCount() {
+ return vBPVPool.size();
+ }
@Override
public long ramBytesUsed() {
long mem = RamUsageEstimator.shallowSizeOf(this) +
- RamUsageEstimator.shallowSizeOf(disiPool);
+ RamUsageEstimator.shallowSizeOf(disiPool) +
+ RamUsageEstimator.shallowSizeOf(vBPVPool);
for (Map.Entry<Long, IndexedDISICache> cacheEntry: disiPool.entrySet()) {
mem += RamUsageEstimator.shallowSizeOf(cacheEntry);
mem += RamUsageEstimator.sizeOf(cacheEntry.getKey());
mem += cacheEntry.getValue().ramBytesUsed();
}
+ for (Map.Entry<String, VaryingBPVJumpTable> cacheEntry: vBPVPool.entrySet()) {
+ String key = cacheEntry.getKey();
+ mem += RamUsageEstimator.shallowSizeOf(cacheEntry);
+ mem += RamUsageEstimator.shallowSizeOf(key)+key.length()*2;
+ mem += cacheEntry.getValue().ramBytesUsed();
+ }
return mem;
}
@@ -151,5 +181,63 @@ public class IndexedDISICacheFactory implements Accountable {
*/
void releaseAll() {
disiPool.clear();
+ vBPVPool.clear();
+ }
+
+ /**
+ * Jump table used by Lucene70DocValuesProducer.VaryingBPVReader to avoid iterating all blocks from
+ * current to wanted index for numerics with variable bits per value. The jump table holds offsets for all blocks.
+ */
+ public static class VaryingBPVJumpTable implements Accountable {
+ // TODO: It is way overkill to use longs here for practically all indexes. Maybe a PackedInts representation?
+ long[] offsets = new long[10];
+ final String creationStats;
+
+ VaryingBPVJumpTable(RandomAccessInput slice, String name, long valuesLength) throws IOException {
+ final long startTime = System.nanoTime();
+
+ int block = -1;
+ long offset;
+ long blockEndOffset = 0;
+
+ int bitsPerValue;
+ do {
+ offset = blockEndOffset;
+
+ offsets = ArrayUtil.grow(offsets, block+2); // No-op if large enough
+ offsets[block+1] = offset;
+
+ bitsPerValue = slice.readByte(offset++);
+ offset += Long.BYTES; // Skip over delta as we do not resolve the values themselves at this point
+ if (bitsPerValue == 0) {
+ blockEndOffset = offset;
+ } else {
+ final int length = slice.readInt(offset);
+ offset += Integer.BYTES;
+ blockEndOffset = offset + length;
+ }
+ block++;
+ } while (blockEndOffset < valuesLength-Byte.BYTES);
+ offsets = ArrayUtil.copyOfSubArray(offsets, 0, block+1);
+ creationStats = String.format(Locale.ENGLISH,
+ "name=%s, blocks=%d, time=%dms",
+ name, offsets.length, (System.nanoTime()-startTime)/1000000);
+ }
+
+ /**
+ * @param block the logical block in the vBPV structure ( valueindex/16384 ).
+ * @return the index slice offset for the vBPV block (1 block = 16384 values) or -1 if not available.
+ */
+ long getBlockOffset(long block) {
+ // Technically a limitation in caching vs. VaryingBPVReader to limit to 2b blocks of 16K values
+ return offsets[(int) block];
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return RamUsageEstimator.shallowSizeOf(this) +
+ (offsets == null ? 0 : RamUsageEstimator.sizeOf(offsets)) + // offsets
+ RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + creationStats.length()*2; // creationStats
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7949b98f/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
index 812caba..9b2b9e0 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
@@ -465,44 +465,18 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
}
};
} else {
- final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
if (entry.blockShift >= 0) {
// dense but split into blocks of different bits per value
- final int shift = entry.blockShift;
- final long mul = entry.gcd;
- final int mask = (1 << shift) - 1;
return new DenseNumericDocValues(maxDoc) {
- int block = -1;
- long delta;
- long offset;
- long blockEndOffset;
- LongValues values;
+ final VaryingBPVReader vBPVReader = new VaryingBPVReader(entry);
@Override
public long longValue() throws IOException {
- final int block = doc >>> shift;
- if (this.block != block) {
- int bitsPerValue;
- do {
- offset = blockEndOffset;
- bitsPerValue = slice.readByte(offset++);
- delta = slice.readLong(offset);
- offset += Long.BYTES;
- if (bitsPerValue == 0) {
- blockEndOffset = offset;
- } else {
- final int length = slice.readInt(offset);
- offset += Integer.BYTES;
- blockEndOffset = offset + length;
- }
- this.block ++;
- } while (this.block != block);
- values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
- }
- return mul * values.get(doc & mask) + delta;
+ return vBPVReader.getLongValue(doc);
}
};
} else {
+ final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
@@ -536,45 +510,19 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
}
};
} else {
- final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
if (entry.blockShift >= 0) {
// sparse and split into blocks of different bits per value
- final int shift = entry.blockShift;
- final long mul = entry.gcd;
- final int mask = (1 << shift) - 1;
return new SparseNumericDocValues(disi) {
- int block = -1;
- long delta;
- long offset;
- long blockEndOffset;
- LongValues values;
+ final VaryingBPVReader vBPVReader = new VaryingBPVReader(entry);
@Override
public long longValue() throws IOException {
final int index = disi.index();
- final int block = index >>> shift;
- if (this.block != block) {
- int bitsPerValue;
- do {
- offset = blockEndOffset;
- bitsPerValue = slice.readByte(offset++);
- delta = slice.readLong(offset);
- offset += Long.BYTES;
- if (bitsPerValue == 0) {
- blockEndOffset = offset;
- } else {
- final int length = slice.readInt(offset);
- offset += Integer.BYTES;
- blockEndOffset = offset + length;
- }
- this.block ++;
- } while (this.block != block);
- values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
- }
- return mul * values.get(index & mask) + delta;
+ return vBPVReader.getLongValue(index);
}
};
} else {
+ final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
@@ -608,47 +556,20 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
}
};
} else {
- final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
if (entry.blockShift >= 0) {
- final int shift = entry.blockShift;
- final long mul = entry.gcd;
- final long mask = (1L << shift) - 1;
return new LongValues() {
- long block = -1;
- long delta;
- long offset;
- long blockEndOffset;
- LongValues values;
-
+ final VaryingBPVReader vBPVReader = new VaryingBPVReader(entry);
+ @Override
public long get(long index) {
- final long block = index >>> shift;
- if (this.block != block) {
- assert block > this.block : "Reading backwards is illegal: " + this.block + " < " + block;
- int bitsPerValue;
- do {
- offset = blockEndOffset;
- try {
- bitsPerValue = slice.readByte(offset++);
- delta = slice.readLong(offset);
- offset += Long.BYTES;
- if (bitsPerValue == 0) {
- blockEndOffset = offset;
- } else {
- final int length = slice.readInt(offset);
- offset += Integer.BYTES;
- blockEndOffset = offset + length;
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- this.block ++;
- } while (this.block != block);
- values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
+ try {
+ return vBPVReader.getLongValue(index);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
}
- return mul * values.get(index & mask) + delta;
}
};
} else {
+ final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
@@ -1457,4 +1378,64 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
CodecUtil.checksumEntireFile(data);
}
+ /**
+ * Reader for longs split into blocks of different bits per values.
+ * The longs are requested by index and must be accessed in monotonically increasing order.
+ */
+ // Note: The order requirement could be removed as the jump-tables allow for backwards iteration.
+ private class VaryingBPVReader {
+ final RandomAccessInput slice;
+ final NumericEntry entry;
+ final int shift;
+ final long mul;
+ final int mask;
+
+ long block = -1;
+ long delta;
+ long offset;
+ long blockEndOffset;
+ LongValues values;
+
+ VaryingBPVReader(NumericEntry entry) throws IOException {
+ this.entry = entry;
+ slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
+ shift = entry.blockShift;
+ mul = entry.gcd;
+ mask = (1 << shift) - 1;
+ }
+
+ long getLongValue(long index) throws IOException {
+ final long block = index >>> shift;
+ if (this.block != block) {
+ int bitsPerValue;
+ do {
+ // If the needed block is the one directly following the current block, it is cheaper to avoid the cache
+ if (block != this.block+1) {
+ IndexedDISICacheFactory.VaryingBPVJumpTable cache;
+ if ((cache = disiCacheFactory.getVBPVJumpTable(entry.name, slice, entry.valuesLength)) != null) {
+ long candidateOffset;
+ if ((candidateOffset = cache.getBlockOffset(block)) != -1) {
+ blockEndOffset = candidateOffset;
+ this.block = block - 1;
+ }
+ }
+ }
+ offset = blockEndOffset;
+ bitsPerValue = slice.readByte(offset++);
+ delta = slice.readLong(offset);
+ offset += Long.BYTES;
+ if (bitsPerValue == 0) {
+ blockEndOffset = offset;
+ } else {
+ final int length = slice.readInt(offset);
+ offset += Integer.BYTES;
+ blockEndOffset = offset + length;
+ }
+ this.block++;
+ } while (this.block != block);
+ values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
+ }
+ return mul * values.get(index & mask) + delta;
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7949b98f/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
index aae3a7f..5201aad 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
@@ -246,26 +246,26 @@ public class TestIndexedDISI extends LuceneTestCase {
private void assertAdvanceExactRandomized(IndexedDISI disi, BitSetIterator disi2, int disi2length, int step)
throws IOException {
- int index = -1;
+ int index = -1;
for (int target = 0; target < disi2length; ) {
- target += TestUtil.nextInt(random(), 0, step);
- int doc = disi2.docID();
- while (doc < target) {
- doc = disi2.nextDoc();
- index++;
- }
+ target += TestUtil.nextInt(random(), 0, step);
+ int doc = disi2.docID();
+ while (doc < target) {
+ doc = disi2.nextDoc();
+ index++;
+ }
- boolean exists = disi.advanceExact(target);
- assertEquals(doc == target, exists);
- if (exists) {
- assertEquals(index, disi.index());
- } else if (random().nextBoolean()) {
- assertEquals(doc, disi.nextDoc());
- assertEquals(index, disi.index());
- target = doc;
- }
- }
+ boolean exists = disi.advanceExact(target);
+ assertEquals(doc == target, exists);
+ if (exists) {
+ assertEquals(index, disi.index());
+ } else if (random().nextBoolean()) {
+ assertEquals(doc, disi.nextDoc());
+ assertEquals(index, disi.index());
+ target = doc;
}
+ }
+ }
private void assertSingleStepEquality(IndexedDISI disi, BitSetIterator disi2) throws IOException {
int i = 0;
@@ -274,7 +274,7 @@ public class TestIndexedDISI extends LuceneTestCase {
assertEquals(i++, disi.index());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
- }
+ }
private void assertAdvanceEquality(IndexedDISI disi, BitSetIterator disi2, int step) throws IOException {
int index = -1;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7949b98f/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java b/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
index daebad9..3010047 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
@@ -228,8 +228,8 @@ public class TestDocValues extends LuceneTestCase {
}
dir.close();
}
-
- /**
+
+ /**
* field with binary docvalues
*/
public void testBinaryField() throws Exception {