You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/06/11 17:54:59 UTC
svn commit: r1601936 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/codecs/ lucene/codecs/src/java/org/apache/lucene/codecs/memory/
lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene49/
Author: rmuir
Date: Wed Jun 11 15:54:58 2014
New Revision: 1601936
URL: http://svn.apache.org/r1601936
Log:
LUCENE-5751: Bring MemoryDocValues up to speed
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/codecs/ (props changed)
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java
lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Jun 11 15:54:58 2014
@@ -172,6 +172,8 @@ Optimizations
* LUCENE-5750: Speed up monotonic addressing for BINARY and SORTED_SET
docvalues. (Robert Muir)
+* LUCENE-5751: Speed up MemoryDocValues. (Adrien Grand, Robert Muir)
+
Bug fixes
* LUCENE-5738: Ensure NativeFSLock prevents opening the file channel for the
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java Wed Jun 11 15:54:58 2014
@@ -51,9 +51,9 @@ import static org.apache.lucene.codecs.m
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED;
+import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_COMPRESSED;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED;
import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED;
-import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.UNCOMPRESSED;
/**
* Writer for {@link MemoryDocValuesFormat}
@@ -93,6 +93,7 @@ class MemoryDocValuesConsumer extends Do
meta.writeLong(data.getFilePointer());
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
+ long blockSum = 0;
long gcd = 0;
boolean missing = false;
// TODO: more efficient?
@@ -101,6 +102,8 @@ class MemoryDocValuesConsumer extends Do
uniqueValues = new HashSet<>();
long count = 0;
+ long currentBlockMin = Long.MAX_VALUE;
+ long currentBlockMax = Long.MIN_VALUE;
for (Number nv : values) {
final long v;
if (nv == null) {
@@ -121,6 +124,9 @@ class MemoryDocValuesConsumer extends Do
}
}
+ currentBlockMin = Math.min(minValue, v);
+ currentBlockMax = Math.max(maxValue, v);
+
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
@@ -133,8 +139,22 @@ class MemoryDocValuesConsumer extends Do
}
++count;
+ if (count % BLOCK_SIZE == 0) {
+ final long blockDelta = currentBlockMax - currentBlockMin;
+ final int blockDeltaRequired = blockDelta < 0 ? 64 : PackedInts.bitsRequired(blockDelta);
+ final int blockBPV = PackedInts.fastestFormatAndBits(BLOCK_SIZE, blockDeltaRequired, acceptableOverheadRatio).bitsPerValue;
+ blockSum += blockBPV;
+ currentBlockMax = Long.MIN_VALUE;
+ currentBlockMin = Long.MAX_VALUE;
+ }
}
assert count == maxDoc;
+ } else {
+ for (Number nv : values) {
+ long v = nv.longValue();
+ maxValue = Math.max(v, maxValue);
+ minValue = Math.min(v, minValue);
+ }
}
if (missing) {
@@ -145,60 +165,99 @@ class MemoryDocValuesConsumer extends Do
} else {
meta.writeLong(-1L);
}
-
+
+ final long delta = maxValue - minValue;
+ final int deltaRequired = delta < 0 ? 64 : PackedInts.bitsRequired(delta);
+ final FormatAndBits deltaBPV = PackedInts.fastestFormatAndBits(maxDoc, deltaRequired, acceptableOverheadRatio);
+
+ final FormatAndBits tableBPV;
if (uniqueValues != null) {
+ tableBPV = PackedInts.fastestFormatAndBits(maxDoc, PackedInts.bitsRequired(uniqueValues.size()-1), acceptableOverheadRatio);
+ } else {
+ tableBPV = null;
+ }
+
+ final FormatAndBits gcdBPV;
+ if (gcd != 0 && gcd != 1) {
+ final long gcdDelta = (maxValue - minValue) / gcd;
+ final int gcdRequired = gcdDelta < 0 ? 64 : PackedInts.bitsRequired(gcdDelta);
+ gcdBPV = PackedInts.fastestFormatAndBits(maxDoc, gcdRequired, acceptableOverheadRatio);
+ } else {
+ gcdBPV = null;
+ }
+
+ boolean doBlock = false;
+ if (blockSum != 0) {
+ int numBlocks = maxDoc / BLOCK_SIZE;
+ float avgBPV = blockSum / (float)numBlocks;
+ // just a heuristic, with tiny amounts of blocks our estimate is skewed as we ignore the final "incomplete" block.
+ // with at least 4 blocks its pretty accurate. The difference must also be significant (according to acceptable overhead).
+ if (numBlocks >= 4 && (avgBPV+avgBPV*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
+ doBlock = true;
+ }
+ }
+
+ if (tableBPV != null && (tableBPV.bitsPerValue+tableBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
// small number of unique values
- final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
- FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
- if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
- meta.writeByte(UNCOMPRESSED); // uncompressed
- for (Number nv : values) {
- data.writeByte(nv == null ? 0 : (byte) nv.longValue());
- }
- } else {
- meta.writeByte(TABLE_COMPRESSED); // table-compressed
- Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
- final HashMap<Long,Integer> encode = new HashMap<>();
- data.writeVInt(decode.length);
- for (int i = 0; i < decode.length; i++) {
- data.writeLong(decode[i]);
- encode.put(decode[i], i);
- }
-
- meta.writeVInt(PackedInts.VERSION_CURRENT);
- data.writeVInt(formatAndBits.format.getId());
- data.writeVInt(formatAndBits.bitsPerValue);
-
- final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
- for(Number nv : values) {
- writer.add(encode.get(nv == null ? 0 : nv.longValue()));
- }
- writer.finish();
+ meta.writeByte(TABLE_COMPRESSED); // table-compressed
+ Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
+ final HashMap<Long,Integer> encode = new HashMap<>();
+ int length = 1 << tableBPV.bitsPerValue;
+ data.writeVInt(length);
+ for (int i = 0; i < decode.length; i++) {
+ data.writeLong(decode[i]);
+ encode.put(decode[i], i);
}
- } else if (gcd != 0 && gcd != 1) {
+ for (int i = decode.length; i < length; i++) {
+ data.writeLong(0);
+ }
+
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ data.writeVInt(tableBPV.format.getId());
+ data.writeVInt(tableBPV.bitsPerValue);
+
+ final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, tableBPV.format, maxDoc, tableBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+ for(Number nv : values) {
+ writer.add(encode.get(nv == null ? 0 : nv.longValue()));
+ }
+ writer.finish();
+ } else if (gcdBPV != null && (gcdBPV.bitsPerValue+gcdBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
meta.writeByte(GCD_COMPRESSED);
meta.writeVInt(PackedInts.VERSION_CURRENT);
data.writeLong(minValue);
data.writeLong(gcd);
- data.writeVInt(BLOCK_SIZE);
+ data.writeVInt(gcdBPV.format.getId());
+ data.writeVInt(gcdBPV.bitsPerValue);
- final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
+ final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, gcdBPV.format, maxDoc, gcdBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
for (Number nv : values) {
long value = nv == null ? 0 : nv.longValue();
writer.add((value - minValue) / gcd);
}
writer.finish();
- } else {
- meta.writeByte(DELTA_COMPRESSED); // delta-compressed
-
+ } else if (doBlock) {
+ meta.writeByte(BLOCK_COMPRESSED); // block delta-compressed
meta.writeVInt(PackedInts.VERSION_CURRENT);
data.writeVInt(BLOCK_SIZE);
-
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
for (Number nv : values) {
writer.add(nv == null ? 0 : nv.longValue());
}
writer.finish();
+ } else {
+ meta.writeByte(DELTA_COMPRESSED); // delta-compressed
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ final long minDelta = deltaBPV.bitsPerValue == 64 ? 0 : minValue;
+ data.writeLong(minDelta);
+ data.writeVInt(deltaBPV.format.getId());
+ data.writeVInt(deltaBPV.bitsPerValue);
+
+ final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, deltaBPV.format, maxDoc, deltaBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+ for (Number nv : values) {
+ long v = nv == null ? 0 : nv.longValue();
+ writer.add(v - minDelta);
+ }
+ writer.finish();
}
}
Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java Wed Jun 11 15:54:58 2014
@@ -90,13 +90,14 @@ class MemoryDocValuesProducer extends Do
static final byte DELTA_COMPRESSED = 0;
static final byte TABLE_COMPRESSED = 1;
- static final byte UNCOMPRESSED = 2;
+ static final byte BLOCK_COMPRESSED = 2;
static final byte GCD_COMPRESSED = 3;
static final int VERSION_START = 0;
static final int VERSION_GCD_COMPRESSION = 1;
static final int VERSION_CHECKSUM = 2;
- static final int VERSION_CURRENT = VERSION_CHECKSUM;
+ static final int VERSION_BLOCKDETECTION = 3;
+ static final int VERSION_CURRENT = VERSION_BLOCKDETECTION;
MemoryDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
maxDoc = state.segmentInfo.getDocCount();
@@ -163,15 +164,13 @@ class MemoryDocValuesProducer extends Do
switch(entry.format) {
case DELTA_COMPRESSED:
case TABLE_COMPRESSED:
+ case BLOCK_COMPRESSED:
case GCD_COMPRESSED:
- case UNCOMPRESSED:
break;
default:
throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
}
- if (entry.format != UNCOMPRESSED) {
- entry.packedIntsVersion = meta.readVInt();
- }
+ entry.packedIntsVersion = meta.readVInt();
numerics.put(fieldNumber, entry);
} else if (fieldType == BYTES) {
BinaryEntry entry = new BinaryEntry();
@@ -248,25 +247,28 @@ class MemoryDocValuesProducer extends Do
}
};
case DELTA_COMPRESSED:
- final int blockSize = data.readVInt();
- final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
- ramBytesUsed.addAndGet(reader.ramBytesUsed());
- return reader;
- case UNCOMPRESSED:
- final byte bytes[] = new byte[maxDoc];
- data.readBytes(bytes, 0, bytes.length);
- ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(bytes));
+ final long minDelta = data.readLong();
+ final int formatIDDelta = data.readVInt();
+ final int bitsPerValueDelta = data.readVInt();
+ final PackedInts.Reader deltaReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDDelta), entry.packedIntsVersion, maxDoc, bitsPerValueDelta);
+ ramBytesUsed.addAndGet(deltaReader.ramBytesUsed());
return new NumericDocValues() {
@Override
public long get(int docID) {
- return bytes[docID];
+ return minDelta + deltaReader.get(docID);
}
};
+ case BLOCK_COMPRESSED:
+ final int blockSize = data.readVInt();
+ final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
+ ramBytesUsed.addAndGet(reader.ramBytesUsed());
+ return reader;
case GCD_COMPRESSED:
final long min = data.readLong();
final long mult = data.readLong();
- final int quotientBlockSize = data.readVInt();
- final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false);
+ final int formatIDGCD = data.readVInt();
+ final int bitsPerValueGCD = data.readVInt();
+ final PackedInts.Reader quotientReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDGCD), entry.packedIntsVersion, maxDoc, bitsPerValueGCD);
ramBytesUsed.addAndGet(quotientReader.ramBytesUsed());
return new NumericDocValues() {
@Override
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java Wed Jun 11 15:54:58 2014
@@ -150,13 +150,15 @@ public class Lucene49DocValuesConsumer e
}
final long delta = maxValue - minValue;
+ final int deltaBitsRequired = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
final int format;
- if (uniqueValues != null
- && (delta < 0L || PackedInts.bitsRequired(uniqueValues.size() - 1) < PackedInts.bitsRequired(delta))) {
+ if (uniqueValues != null && DirectWriter.bitsRequired(uniqueValues.size() - 1) < deltaBitsRequired) {
format = TABLE_COMPRESSED;
} else if (gcd != 0 && gcd != 1) {
- format = GCD_COMPRESSED;
+ final long gcdDelta = (maxValue - minValue) / gcd;
+ final long gcdBitsRequired = gcdDelta < 0 ? 64 : DirectWriter.bitsRequired(gcdDelta);
+ format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED;
} else {
format = DELTA_COMPRESSED;
}
@@ -189,9 +191,8 @@ public class Lucene49DocValuesConsumer e
case DELTA_COMPRESSED:
final long minDelta = delta < 0 ? 0 : minValue;
meta.writeLong(minDelta);
- final int bpv = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
- meta.writeVInt(bpv);
- final DirectWriter writer = DirectWriter.getInstance(data, count, bpv);
+ meta.writeVInt(deltaBitsRequired);
+ final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired);
for (Number nv : values) {
long v = nv == null ? 0 : nv.longValue();
writer.add(v - minDelta);