You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/10/11 02:08:55 UTC
svn commit: r1396867 - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/block/
Author: rmuir
Date: Thu Oct 11 00:08:54 2012
New Revision: 1396867
URL: http://svn.apache.org/viewvc?rev=1396867&view=rev
Log:
LUCENE-4473: encode low-freq terms offsets more efficiently in blockPF
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Oct 11 00:08:54 2012
@@ -75,6 +75,9 @@ Optimizations
failures in TestWeakIdentityMap disappear, too.
(Uwe Schindler, Mike McCandless, Robert Muir)
+* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+ for low frequency terms (< 128 occurrences). (Robert Muir)
+
Build
* LUCENE-4451: Memory leak per unique thread caused by
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java Thu Oct 11 00:08:54 2012
@@ -306,10 +306,10 @@ import org.apache.lucene.util.packed.Pac
* PayloadLength is stored at the current position, then it indicates the length
* of this payload. If PayloadLength is not stored, then this payload has the same
* length as the payload at the previous position.</li>
- * <li>OffsetDelta is the difference between this position's startOffset from the
+ * <li>OffsetDelta/2 is the difference between this position's startOffset and that of the
* previous occurrence (or zero, if this is the first occurrence in this document).
- * OffsetLength follows, encoding the difference between endOffset and startOffset.
- * Offset data is only written for
+ * If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
+ * previous occurrence and an OffsetLength follows. Offset data is only written for
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.</li>
* </ul>
* </dd>
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java Thu Oct 11 00:08:54 2012
@@ -729,8 +729,10 @@ final class BlockPostingsReader extends
posDeltaBuffer[i] = code;
}
if (indexHasOffsets) {
- posIn.readVInt();
- posIn.readVInt();
+ if ((posIn.readVInt() & 1) != 0) {
+ // offset length changed
+ posIn.readVInt();
+ }
}
}
} else {
@@ -1149,6 +1151,7 @@ final class BlockPostingsReader extends
// }
final int count = posIn.readVInt();
int payloadLength = 0;
+ int offsetLength = 0;
payloadByteUpto = 0;
for(int i=0;i<count;i++) {
int code = posIn.readVInt();
@@ -1177,8 +1180,12 @@ final class BlockPostingsReader extends
// if (DEBUG) {
// System.out.println(" i=" + i + " read offsets from posIn.fp=" + posIn.getFilePointer());
// }
- offsetStartDeltaBuffer[i] = posIn.readVInt();
- offsetLengthBuffer[i] = posIn.readVInt();
+ int deltaCode = posIn.readVInt();
+ if ((deltaCode & 1) != 0) {
+ offsetLength = posIn.readVInt();
+ }
+ offsetStartDeltaBuffer[i] = deltaCode >>> 1;
+ offsetLengthBuffer[i] = offsetLength;
// if (DEBUG) {
// System.out.println(" startOffDelta=" + offsetStartDeltaBuffer[i] + " offsetLen=" + offsetLengthBuffer[i]);
// }
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java Thu Oct 11 00:08:54 2012
@@ -424,7 +424,8 @@ final class BlockPostingsWriter extends
// majority)
// vInt encode the remaining positions/payloads/offsets:
- int lastPayloadLength = -1;
+ int lastPayloadLength = -1; // force first payload length to be written
+ int lastOffsetLength = -1; // force first offset length to be written
int payloadBytesReadUpto = 0;
for(int i=0;i<posBufferUpto;i++) {
final int posDelta = posDeltaBuffer[i];
@@ -457,8 +458,15 @@ final class BlockPostingsWriter extends
// if (DEBUG) {
// System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer());
// }
- posOut.writeVInt(offsetStartDeltaBuffer[i]);
- posOut.writeVInt(offsetLengthBuffer[i]);
+ int delta = offsetStartDeltaBuffer[i];
+ int length = offsetLengthBuffer[i];
+ if (length == lastOffsetLength) {
+ posOut.writeVInt(delta << 1);
+ } else {
+ posOut.writeVInt(delta << 1 | 1);
+ posOut.writeVInt(length);
+ lastOffsetLength = length;
+ }
}
}