Posted to commits@lucene.apache.org by jp...@apache.org on 2012/11/07 17:24:24 UTC
svn commit: r1406704 - in
/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing:
CompressingStoredFieldsReader.java CompressingStoredFieldsWriter.java
Author: jpountz
Date: Wed Nov 7 16:24:24 2012
New Revision: 1406704
URL: http://svn.apache.org/viewvc?rev=1406704&view=rev
Log:
LUCENE-4527: CompressingStoredFieldsFormat: encode numStoredFields more efficiently.
Modified:
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
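For readers skimming the diff below: the commit replaces the old per-chunk header (docBase, chunkDocs, a single bitsPerValue and one packed-ints block of lengths) with a header that stores, per document, both the number of stored fields and the length of the compressed data. Both arrays go through a new saveInts helper that writes a single VInt for one-document chunks, a 0 marker plus one VInt when every document shares the same value, and a packed-ints block otherwise. The sketch below is a condensed restatement of that helper as it appears in the writer diff; it assumes the Lucene 4.x trunk APIs visible in the patch (org.apache.lucene.store.DataOutput, org.apache.lucene.util.packed.PackedInts), and the ChunkHeaderSketch class name is illustrative only.

import java.io.IOException;

import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.packed.PackedInts;

final class ChunkHeaderSketch {

  // Write `length` non-negative ints: one VInt if there is a single value,
  // a 0 marker plus one VInt if all values are equal, otherwise the bit
  // width followed by a packed-ints block.
  static void saveInts(int[] values, int length, DataOutput out) throws IOException {
    assert length > 0;
    if (length == 1) {
      out.writeVInt(values[0]);
      return;
    }
    boolean allEqual = true;
    for (int i = 1; i < length; ++i) {
      if (values[i] != values[0]) {
        allEqual = false;
        break;
      }
    }
    if (allEqual) {
      out.writeVInt(0);           // 0 bits per value: a single shared value follows
      out.writeVInt(values[0]);
    } else {
      long max = 0;
      for (int i = 0; i < length; ++i) {
        max |= values[i];         // OR of all values gives the required bit width
      }
      final int bitsRequired = PackedInts.bitsRequired(max);
      out.writeVInt(bitsRequired);
      final PackedInts.Writer w = PackedInts.getWriterNoHeader(
          out, PackedInts.Format.PACKED, length, bitsRequired, 1);
      for (int i = 0; i < length; ++i) {
        w.add(values[i]);
      }
      w.finish();
    }
  }
}

The practical effect is that chunks where every document has the same number of stored fields (a very common case) pay two VInts instead of a full packed-ints block, which is what makes the numStoredFields encoding cheaper.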
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java?rev=1406704&r1=1406703&r2=1406704&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsReader.java Wed Nov 7 16:24:24 2012
@@ -19,7 +19,7 @@ package org.apache.lucene.codecs.compres
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_NAME_DAT;
-import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.*;
+import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_NAME_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HEADER_LENGTH_DAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HEADER_LENGTH_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE;
@@ -27,6 +27,7 @@ import static org.apache.lucene.codecs.c
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
+import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
@@ -34,6 +35,7 @@ import static org.apache.lucene.codecs.l
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -65,7 +67,7 @@ final class CompressingStoredFieldsReade
private final BytesRef bytes;
private final int numDocs;
private boolean closed;
-
+
// used by clone
private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader) {
this.fieldInfos = reader.fieldInfos;
@@ -78,7 +80,7 @@ final class CompressingStoredFieldsReade
this.bytes = new BytesRef(reader.bytes.bytes.length);
this.closed = false;
}
-
+
public CompressingStoredFieldsReader(Directory d, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
final String segment = si.name;
boolean success = false;
@@ -187,31 +189,63 @@ final class CompressingStoredFieldsReade
final int docBase = fieldsStream.readVInt();
final int chunkDocs = fieldsStream.readVInt();
- final int bitsPerValue = fieldsStream.readVInt();
if (docID < docBase
|| docID >= docBase + chunkDocs
- || docBase + chunkDocs > numDocs
- || bitsPerValue > 31) {
+ || docBase + chunkDocs > numDocs) {
throw new CorruptIndexException("Corrupted: docID=" + docID
+ ", docBase=" + docBase + ", chunkDocs=" + chunkDocs
- + ", numDocs=" + numDocs + ", bitsPerValue=" + bitsPerValue);
+ + ", numDocs=" + numDocs);
}
- final long filePointer = fieldsStream.getFilePointer();
- final PackedInts.ReaderIterator lengths = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue, 1);
- int offset = 0;
- for (int i = docBase; i < docID; ++i) {
- offset += lengths.next();
- }
- final int length = (int) lengths.next();
- // skip the last values
- fieldsStream.seek(filePointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerValue));
+ final int numStoredFields, offset, length;
+ if (chunkDocs == 1) {
+ numStoredFields = fieldsStream.readVInt();
+ offset = 0;
+ length = fieldsStream.readVInt();
+ } else {
+ final int bitsPerStoredFields = fieldsStream.readVInt();
+ if (bitsPerStoredFields == 0) {
+ numStoredFields = fieldsStream.readVInt();
+ } else if (bitsPerStoredFields > 31) {
+ throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields);
+ } else {
+ final long filePointer = fieldsStream.getFilePointer();
+ final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
+ numStoredFields = (int) (reader.get(docID - docBase));
+ fieldsStream.seek(filePointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
+ }
+
+ final int bitsPerLength = fieldsStream.readVInt();
+ if (bitsPerLength == 0) {
+ length = fieldsStream.readVInt();
+ offset = (docID - docBase) * length;
+ } else if (bitsPerLength > 31) {
+ throw new CorruptIndexException("bitsPerLength=" + bitsPerLength);
+ } else {
+ final long filePointer = fieldsStream.getFilePointer();
+ final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
+ int off = 0;
+ for (int i = 0; i < docID - docBase; ++i) {
+ off += it.next();
+ }
+ offset = off;
+ length = (int) it.next();
+ fieldsStream.seek(filePointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerLength));
+ }
+ }
+
+ if ((length == 0) != (numStoredFields == 0)) {
+ throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields);
+ }
+ if (numStoredFields == 0) {
+ // nothing to do
+ return;
+ }
decompressor.decompress(fieldsStream, offset, length, bytes);
final ByteArrayDataInput documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
- final int numFields = documentInput.readVInt();
- for (int fieldIDX = 0; fieldIDX < numFields; fieldIDX++) {
+ for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
final long infoAndBits = documentInput.readVLong();
final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
@@ -222,14 +256,17 @@ final class CompressingStoredFieldsReade
switch(visitor.needsField(fieldInfo)) {
case YES:
readField(documentInput, visitor, fieldInfo, bits);
+ assert documentInput.getPosition() <= bytes.offset + bytes.length : documentInput.getPosition() + " " + bytes.offset + " " + bytes.length;
break;
- case NO:
+ case NO:
skipField(documentInput, bits);
+ assert documentInput.getPosition() <= bytes.offset + bytes.length : documentInput.getPosition() + " " + bytes.offset + " " + bytes.length;
break;
- case STOP:
+ case STOP:
return;
}
}
+ assert documentInput.getPosition() == bytes.offset + bytes.length : documentInput.getPosition() + " " + bytes.offset + " " + bytes.length;
}
@Override
@@ -253,28 +290,14 @@ final class CompressingStoredFieldsReade
BytesRef bytes;
int docBase;
int chunkDocs;
+ int[] numStoredFields;
int[] lengths;
private ChunkIterator() {
this.docBase = -1;
bytes = new BytesRef();
- lengths = new int[0];
- }
-
- private int readHeader() throws IOException {
- final int docBase = fieldsStream.readVInt();
- final int chunkDocs = fieldsStream.readVInt();
- final int bitsPerValue = fieldsStream.readVInt();
- if (docBase < this.docBase + this.chunkDocs
- || docBase + chunkDocs > numDocs
- || bitsPerValue > 31) {
- throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
- + ", current numDocs=" + this.chunkDocs + ", new docBase=" + docBase
- + ", new numDocs=" + chunkDocs + ", bitsPerValue=" + bitsPerValue);
- }
- this.docBase = docBase;
- this.chunkDocs = chunkDocs;
- return bitsPerValue;
+ numStoredFields = new int[1];
+ lengths = new int[1];
}
/**
@@ -293,26 +316,52 @@ final class CompressingStoredFieldsReade
*/
void next(int doc) throws IOException {
assert doc >= docBase + chunkDocs : doc + " " + docBase + " " + chunkDocs;
- // try next chunk
- int bitsPerValue = readHeader();
- if (docBase + chunkDocs <= doc) {
- // doc is not in the next chunk, use seek to skip to the next document chunk
- fieldsStream.seek(indexReader.getStartPointer(doc));
- bitsPerValue = readHeader();
- }
- if (doc < docBase
- || doc >= docBase + chunkDocs) {
- throw new CorruptIndexException("Corrupted: docID=" + doc
- + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs);
+ fieldsStream.seek(indexReader.getStartPointer(doc));
+
+ final int docBase = fieldsStream.readVInt();
+ final int chunkDocs = fieldsStream.readVInt();
+ if (docBase < this.docBase + this.chunkDocs
+ || docBase + chunkDocs > numDocs) {
+ throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
+ + ", current numDocs=" + this.chunkDocs + ", new docBase=" + docBase
+ + ", new numDocs=" + chunkDocs);
}
+ this.docBase = docBase;
+ this.chunkDocs = chunkDocs;
- // decode lengths
- if (lengths.length < chunkDocs) {
- lengths = new int[ArrayUtil.oversize(chunkDocs, 4)];
+ if (chunkDocs > numStoredFields.length) {
+ final int newLength = ArrayUtil.oversize(chunkDocs, 4);
+ numStoredFields = new int[newLength];
+ lengths = new int[newLength];
}
- final PackedInts.ReaderIterator iterator = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue, 0);
- for (int i = 0; i < chunkDocs; ++i) {
- lengths[i] = (int) iterator.next();
+
+ if (chunkDocs == 1) {
+ numStoredFields[0] = fieldsStream.readVInt();
+ lengths[0] = fieldsStream.readVInt();
+ } else {
+ final int bitsPerStoredFields = fieldsStream.readVInt();
+ if (bitsPerStoredFields == 0) {
+ Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt());
+ } else if (bitsPerStoredFields > 31) {
+ throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields);
+ } else {
+ final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
+ for (int i = 0; i < chunkDocs; ++i) {
+ numStoredFields[i] = (int) it.next();
+ }
+ }
+
+ final int bitsPerLength = fieldsStream.readVInt();
+ if (bitsPerLength == 0) {
+ Arrays.fill(lengths, 0, chunkDocs, fieldsStream.readVInt());
+ } else if (bitsPerLength > 31) {
+ throw new CorruptIndexException("bitsPerLength=" + bitsPerLength);
+ } else {
+ final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
+ for (int i = 0; i < chunkDocs; ++i) {
+ lengths[i] = (int) it.next();
+ }
+ }
}
}
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1406704&r1=1406703&r2=1406704&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Wed Nov 7 16:24:24 2012
@@ -21,6 +21,7 @@ import static org.apache.lucene.codecs.l
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import java.io.IOException;
+import java.util.Arrays;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -36,6 +37,7 @@ import org.apache.lucene.index.SegmentIn
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
@@ -74,6 +76,7 @@ final class CompressingStoredFieldsWrite
private final int chunkSize;
private final GrowableByteArrayDataOutput bufferedDocs;
+ private int[] numStoredFields; // number of stored fields
private int[] endOffsets; // end offsets in bufferedDocs
private int docBase; // doc ID at the beginning of the chunk
private int numBufferedDocs; // docBase + numBufferedDocs == current doc ID
@@ -88,6 +91,7 @@ final class CompressingStoredFieldsWrite
this.chunkSize = chunkSize;
this.docBase = 0;
this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
+ this.numStoredFields = new int[16];
this.endOffsets = new int[16];
this.numBufferedDocs = 0;
@@ -128,50 +132,72 @@ final class CompressingStoredFieldsWrite
private void endWithPreviousDocument() throws IOException {
if (numBufferedDocs > 0) {
- assert bufferedDocs.length > 0;
- if (numBufferedDocs == endOffsets.length) {
- endOffsets = ArrayUtil.grow(endOffsets);
- }
endOffsets[numBufferedDocs - 1] = bufferedDocs.length;
}
}
- private void addRawDocument(byte[] buf, int off, int len) throws IOException {
+ @Override
+ public void startDocument(int numStoredFields) throws IOException {
endWithPreviousDocument();
- if (bufferedDocs.length >= chunkSize) {
+ if (triggerFlush()) {
flush();
}
- bufferedDocs.writeBytes(buf, off, len);
+
+ if (numBufferedDocs == this.numStoredFields.length) {
+ final int newLength = ArrayUtil.oversize(numBufferedDocs + 1, 4);
+ this.numStoredFields = Arrays.copyOf(this.numStoredFields, newLength);
+ endOffsets = Arrays.copyOf(endOffsets, newLength);
+ }
+ this.numStoredFields[numBufferedDocs] = numStoredFields;
++numBufferedDocs;
}
- @Override
- public void startDocument(int numStoredFields) throws IOException {
- endWithPreviousDocument();
- if (bufferedDocs.length >= chunkSize) {
- flush();
+ private static void saveInts(int[] values, int length, DataOutput out) throws IOException {
+ assert length > 0;
+ if (length == 1) {
+ out.writeVInt(values[0]);
+ } else {
+ boolean allEqual = true;
+ for (int i = 1; i < length; ++i) {
+ if (values[i] != values[0]) {
+ allEqual = false;
+ break;
+ }
+ }
+ if (allEqual) {
+ out.writeVInt(0);
+ out.writeVInt(values[0]);
+ } else {
+ long max = 0;
+ for (int i = 0; i < length; ++i) {
+ max |= values[i];
+ }
+ final int bitsRequired = PackedInts.bitsRequired(max);
+ out.writeVInt(bitsRequired);
+ final PackedInts.Writer w = PackedInts.getWriterNoHeader(out, PackedInts.Format.PACKED, length, bitsRequired, 1);
+ for (int i = 0; i < length; ++i) {
+ w.add(values[i]);
+ }
+ w.finish();
+ }
}
- bufferedDocs.writeVInt(numStoredFields);
- ++numBufferedDocs;
}
- private void writeHeader(int docBase, int numBufferedDocs, int[] lengths) throws IOException {
+ private void writeHeader(int docBase, int numBufferedDocs, int[] numStoredFields, int[] lengths) throws IOException {
// save docBase and numBufferedDocs
fieldsStream.writeVInt(docBase);
fieldsStream.writeVInt(numBufferedDocs);
+ // save numStoredFields
+ saveInts(numStoredFields, numBufferedDocs, fieldsStream);
+
// save lengths
- final int bitsRequired = bitsRequired(lengths, numBufferedDocs);
- assert bitsRequired <= 31;
- fieldsStream.writeVInt(bitsRequired);
-
- final PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsStream, PackedInts.Format.PACKED, numBufferedDocs, bitsRequired, 1);
- for (int i = 0; i < numBufferedDocs; ++i) {
- assert lengths[i] > 0;
- writer.add(lengths[i]);
- }
- assert writer.ord() + 1 == numBufferedDocs;
- writer.finish();
+ saveInts(lengths, numBufferedDocs, fieldsStream);
+ }
+
+ private boolean triggerFlush() {
+ return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
+ numBufferedDocs >= chunkSize; // can be necessary if most docs are empty
}
private void flush() throws IOException {
@@ -181,9 +207,9 @@ final class CompressingStoredFieldsWrite
final int[] lengths = endOffsets;
for (int i = numBufferedDocs - 1; i > 0; --i) {
lengths[i] = endOffsets[i] - endOffsets[i - 1];
- assert lengths[i] > 0;
+ assert lengths[i] >= 0;
}
- writeHeader(docBase, numBufferedDocs, lengths);
+ writeHeader(docBase, numBufferedDocs, numStoredFields, lengths);
// compress stored fields to fieldsStream
compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
@@ -194,14 +220,6 @@ final class CompressingStoredFieldsWrite
bufferedDocs.length = 0;
}
- private static int bitsRequired(int[] data, int length) {
- int or = data[0];
- for (int i = 1; i < length; ++i) {
- or |= data[i];
- }
- return PackedInts.bitsRequired(or);
- }
-
@Override
public void writeField(FieldInfo info, StorableField field)
throws IOException {
@@ -327,7 +345,7 @@ final class CompressingStoredFieldsWrite
}
if (compressionMode == matchingFieldsReader.getCompressionMode() // same compression mode
- && (numBufferedDocs == 0 || bufferedDocs.length >= chunkSize) // starting a new chunk
+ && (numBufferedDocs == 0 || triggerFlush()) // starting a new chunk
&& startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
&& startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
&& nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk
@@ -335,11 +353,11 @@ final class CompressingStoredFieldsWrite
// no need to decompress, just copy data
endWithPreviousDocument();
- if (bufferedDocs.length >= chunkSize) {
+ if (triggerFlush()) {
flush();
}
indexWriter.writeIndex(it.chunkDocs, fieldsStream.getFilePointer());
- writeHeader(this.docBase, it.chunkDocs, it.lengths);
+ writeHeader(this.docBase, it.chunkDocs, it.numStoredFields, it.lengths);
it.copyCompressedData(fieldsStream);
this.docBase += it.chunkDocs;
docID = nextLiveDoc(it.docBase + it.chunkDocs, liveDocs, maxDoc);
@@ -354,7 +372,8 @@ final class CompressingStoredFieldsWrite
// copy non-deleted docs
for (; docID < it.docBase + it.chunkDocs; docID = nextLiveDoc(docID + 1, liveDocs, maxDoc)) {
final int diff = docID - it.docBase;
- addRawDocument(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]);
+ startDocument(it.numStoredFields[diff]);
+ bufferedDocs.writeBytes(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]);
++docCount;
mergeState.checkAbort.work(300);
}
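For completeness, a sketch of how the reader side recovers one per-document value (e.g. numStoredFields) from a multi-document chunk's saveInts block while leaving the stream positioned right after it, condensed from the visitDocument() changes above. It assumes the same Lucene 4.x trunk APIs as the patch; the readOneInt helper and its class are illustrative, not part of the commit.

import java.io.IOException;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.packed.PackedInts;

final class ChunkHeaderReadSketch {

  // Read the value stored for document `index` (0-based within the chunk)
  // out of a saveInts block of `chunkDocs` values, then skip to the end of
  // the block. Only applies when chunkDocs > 1; single-document chunks store
  // a bare VInt with no bits-per-value marker.
  static int readOneInt(IndexInput in, int packedIntsVersion, int chunkDocs, int index)
      throws IOException {
    final int bitsPerValue = in.readVInt();
    if (bitsPerValue == 0) {
      return in.readVInt();                      // all documents share this value
    } else if (bitsPerValue > 31) {
      throw new CorruptIndexException("bitsPerValue=" + bitsPerValue);
    } else {
      final long filePointer = in.getFilePointer();
      final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(
          in, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerValue);
      final int value = (int) reader.get(index); // random access into the packed block
      // seek past the rest of the packed block
      in.seek(filePointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerValue));
      return value;
    }
  }
}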