You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/10/11 02:08:55 UTC

svn commit: r1396867 - in /lucene/dev/trunk/lucene: ./ codecs/src/java/org/apache/lucene/codecs/block/

Author: rmuir
Date: Thu Oct 11 00:08:54 2012
New Revision: 1396867

URL: http://svn.apache.org/viewvc?rev=1396867&view=rev
Log:
LUCENE-4473: encode low-freq terms offsets more efficiently in blockPF

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
    lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
    lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Oct 11 00:08:54 2012
@@ -75,6 +75,9 @@ Optimizations
   failures in TestWeakIdentityMap disappear, too.
   (Uwe Schindler, Mike McCandless, Robert Muir)
 
+* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+  for low frequency terms (< 128 occurrences).  (Robert Muir)
+
 Build
 
 * LUCENE-4451: Memory leak per unique thread caused by 

Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsFormat.java Thu Oct 11 00:08:54 2012
@@ -306,10 +306,10 @@ import org.apache.lucene.util.packed.Pac
  *       PayloadLength is stored at the current position, then it indicates the length
  *       of this payload. If PayloadLength is not stored, then this payload has the same
  *       length as the payload at the previous position.</li>
- *   <li>OffsetDelta is the difference between this position's startOffset from the
+ *   <li>OffsetDelta/2 is the difference between this position's startOffset from the
  *       previous occurrence (or zero, if this is the first occurrence in this document).
- *       OffsetLength follows, encoding the difference between endOffset and startOffset. 
- *       Offset data is only written for
+ *       If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
+ *       previous occurrence and an OffsetLength follows. Offset data is only written for
  *       {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.</li>
  * </ul>
  * </dd>

Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsReader.java Thu Oct 11 00:08:54 2012
@@ -729,8 +729,10 @@ final class BlockPostingsReader extends 
             posDeltaBuffer[i] = code;
           }
           if (indexHasOffsets) {
-            posIn.readVInt();
-            posIn.readVInt();
+            if ((posIn.readVInt() & 1) != 0) {
+              // offset length changed
+              posIn.readVInt();
+            }
           }
         }
       } else {
@@ -1149,6 +1151,7 @@ final class BlockPostingsReader extends 
         // }
         final int count = posIn.readVInt();
         int payloadLength = 0;
+        int offsetLength = 0;
         payloadByteUpto = 0;
         for(int i=0;i<count;i++) {
           int code = posIn.readVInt();
@@ -1177,8 +1180,12 @@ final class BlockPostingsReader extends 
             // if (DEBUG) {
             //   System.out.println("        i=" + i + " read offsets from posIn.fp=" + posIn.getFilePointer());
             // }
-            offsetStartDeltaBuffer[i] = posIn.readVInt();
-            offsetLengthBuffer[i] = posIn.readVInt();
+            int deltaCode = posIn.readVInt();
+            if ((deltaCode & 1) != 0) {
+              offsetLength = posIn.readVInt();
+            }
+            offsetStartDeltaBuffer[i] = deltaCode >>> 1;
+            offsetLengthBuffer[i] = offsetLength;
             // if (DEBUG) {
             //   System.out.println("          startOffDelta=" + offsetStartDeltaBuffer[i] + " offsetLen=" + offsetLengthBuffer[i]);
             // }

Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java?rev=1396867&r1=1396866&r2=1396867&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/block/BlockPostingsWriter.java Thu Oct 11 00:08:54 2012
@@ -424,7 +424,8 @@ final class BlockPostingsWriter extends 
         // majority)
 
         // vInt encode the remaining positions/payloads/offsets:
-        int lastPayloadLength = -1;
+        int lastPayloadLength = -1;  // force first payload length to be written
+        int lastOffsetLength = -1;   // force first offset length to be written
         int payloadBytesReadUpto = 0;
         for(int i=0;i<posBufferUpto;i++) {
           final int posDelta = posDeltaBuffer[i];
@@ -457,8 +458,15 @@ final class BlockPostingsWriter extends 
             // if (DEBUG) {
             //   System.out.println("          write offset @ pos.fp=" + posOut.getFilePointer());
             // }
-            posOut.writeVInt(offsetStartDeltaBuffer[i]);
-            posOut.writeVInt(offsetLengthBuffer[i]);
+            int delta = offsetStartDeltaBuffer[i];
+            int length = offsetLengthBuffer[i];
+            if (length == lastOffsetLength) {
+              posOut.writeVInt(delta << 1);
+            } else {
+              posOut.writeVInt(delta << 1 | 1);
+              posOut.writeVInt(length);
+              lastOffsetLength = length;
+            }
           }
         }