You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/10/30 12:17:21 UTC

svn commit: r1029012 - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/index/codecs/intblock/ java/org/apache/lucene/index/codecs/sep/ test/org/apache/lucene/index/codecs/mocksep/

Author: mikemccand
Date: Sat Oct 30 10:17:20 2010
New Revision: 1029012

URL: http://svn.apache.org/viewvc?rev=1029012&view=rev
Log:
LUCENE-2722: fix sep codec to store less in the terms dict

Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java Sat Oct 30 10:17:20 2010
@@ -168,6 +168,25 @@ public abstract class FixedIntBlockIndex
     }
 
     @Override
+    public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException {
+      if (absolute) {
+        fp = indexIn.readVLong();
+        upto = indexIn.next();
+      } else {
+        final long delta = indexIn.readVLong();
+        if (delta == 0) {
+          // same block
+          upto += indexIn.next();
+        } else {
+          // new block
+          fp += delta;
+          upto = indexIn.next();
+        }
+      }
+      assert upto < blockSize;
+    }
+
+    @Override
     public void seek(final IntIndexInput.Reader other) throws IOException {
       ((Reader) other).seek(fp, upto);
     }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java Sat Oct 30 10:17:20 2010
@@ -83,11 +83,30 @@ public abstract class FixedIntBlockIndex
         // same block
         indexOut.writeVLong(0);
         assert upto >= lastUpto;
-        indexOut.writeVLong(upto - lastUpto);
+        indexOut.writeVInt(upto - lastUpto);
       } else {      
         // new block
         indexOut.writeVLong(fp - lastFP);
-        indexOut.writeVLong(upto);
+        indexOut.writeVInt(upto);
+      }
+      lastUpto = upto;
+      lastFP = fp;
+    }
+
+    @Override
+    public void write(IntIndexOutput indexOut, boolean absolute) throws IOException {
+      if (absolute) {
+        indexOut.writeVLong(fp);
+        indexOut.write(upto);
+      } else if (fp == lastFP) {
+        // same block
+        indexOut.writeVLong(0);
+        assert upto >= lastUpto;
+        indexOut.write(upto - lastUpto);
+      } else {      
+        // new block
+        indexOut.writeVLong(fp - lastFP);
+        indexOut.write(upto);
       }
       lastUpto = upto;
       lastFP = fp;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java Sat Oct 30 10:17:20 2010
@@ -189,6 +189,24 @@ public abstract class VariableIntBlockIn
     }
 
     @Override
+    public void read(final IntIndexInput.Reader indexIn, final boolean absolute) throws IOException {
+      if (absolute) {
+        fp = indexIn.readVLong();
+        upto = indexIn.next()&0xFF;
+      } else {
+        final long delta = indexIn.readVLong();
+        if (delta == 0) {
+          // same block
+          upto = indexIn.next()&0xFF;
+        } else {
+          // new block
+          fp += delta;
+          upto = indexIn.next()&0xFF;
+        }
+      }
+    }
+
+    @Override
     public String toString() {
       return "VarIntBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize;
     }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java Sat Oct 30 10:17:20 2010
@@ -103,6 +103,26 @@ public abstract class VariableIntBlockIn
       lastUpto = upto;
       lastFP = fp;
     }
+
+    @Override
+    public void write(IntIndexOutput indexOut, boolean absolute) throws IOException {
+      assert upto >= 0;
+      if (absolute) {
+        indexOut.writeVLong(fp);
+        indexOut.write(upto);
+      } else if (fp == lastFP) {
+        // same block
+        indexOut.writeVLong(0);
+        assert upto >= lastUpto;
+        indexOut.write(upto);
+      } else {      
+        // new block
+        indexOut.writeVLong(fp - lastFP);
+        indexOut.write(upto);
+      }
+      lastUpto = upto;
+      lastFP = fp;
+    }
   }
 
   @Override

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexInput.java Sat Oct 30 10:17:20 2010
@@ -41,6 +41,8 @@ public abstract class IntIndexInput impl
 
     public abstract void read(IndexInput indexIn, boolean absolute) throws IOException;
 
+    public abstract void read(IntIndexInput.Reader indexIn, boolean absolute) throws IOException;
+
     /** Seeks primary stream to the last read offset */
     public abstract void seek(IntIndexInput.Reader stream) throws IOException;
 
@@ -54,6 +56,18 @@ public abstract class IntIndexInput impl
     /** Reads next single int */
     public abstract int next() throws IOException;
 
+    /** Decodes a long written as 1 or 2 ints; the encoding can only
+     *  use 61 of the 64 long bits. */
+    public long readVLong() throws IOException {
+      final int v = next();
+      if ((v & 1) == 0) {
+        return v >> 1;
+      } else {
+        final long v2 = next();
+        return (v2 << 30) | (v >> 1);
+      }
+    }
+
     /** Reads next chunk of ints */
     private IntsRef bulkResult;
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/IntIndexOutput.java Sat Oct 30 10:17:20 2010
@@ -34,9 +34,27 @@ import java.io.Closeable;
  * @lucene.experimental */
 public abstract class IntIndexOutput implements Closeable {
 
-  /** Write an int to the primary file */
+  /** Write an int to the primary file.  The value must be
+   * >= 0.  */
   public abstract void write(int v) throws IOException;
 
+  public static final long MAX_SINGLE_INT_VLONG = Integer.MAX_VALUE - (1<<30);
+  public static final long MAX_VLONG = Long.MAX_VALUE - (1L<<62) - (1L<<61);
+
+  /** Encodes as 1 or 2 ints, and can only use 61 of the 64
+   *  long bits. */
+  public void writeVLong(long v) throws IOException {
+    assert v >= 0: "v=" + v;
+    assert v < MAX_VLONG: "v=" + v;
+    // we cannot pass a negative int 
+    if (v <= MAX_SINGLE_INT_VLONG) {
+      write(((int) v)<<1);
+    } else {
+      write(((int) ((v & MAX_SINGLE_INT_VLONG))<<1) | 1);
+      write(((int) (v >> 30)));
+    }
+  }
+
   public abstract static class Index {
 
     /** Internally records the current location */
@@ -46,8 +64,10 @@ public abstract class IntIndexOutput imp
     public abstract void set(Index other) throws IOException;
 
     /** Writes "location" of current output pointer of primary
-     * output to different output (out) */
+     *  output to different output (out) */
     public abstract void write(IndexOutput indexOut, boolean absolute) throws IOException;
+
+    public abstract void write(IntIndexOutput indexOut, boolean absolute) throws IOException;
   }
 
   /** If you are indexing the primary output file, call

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java Sat Oct 30 10:17:20 2010
@@ -130,21 +130,14 @@ public class SepPostingsReaderImpl exten
   }
 
   private static class SepTermState extends TermState {
+    // We store only the seek point to the docs file because
+    // the rest of the info (freqIndex, posIndex, etc.) is
+    // stored in the docs file:
     IntIndexInput.Index docIndex;
-    IntIndexInput.Index freqIndex;
-    IntIndexInput.Index posIndex;
-    long skipOffset;
-    long payloadOffset;
 
     public Object clone() {
       SepTermState other = (SepTermState) super.clone();
       other.docIndex = (IntIndexInput.Index) docIndex.clone();
-      if (freqIndex != null) {
-        other.freqIndex = (IntIndexInput.Index) freqIndex.clone();
-      }
-      if (posIndex != null) {
-        other.posIndex = (IntIndexInput.Index) posIndex.clone();
-      }
       return other;
     }
 
@@ -152,22 +145,6 @@ public class SepPostingsReaderImpl exten
       super.copy(_other);
       SepTermState other = (SepTermState) _other;
       docIndex.set(other.docIndex);
-      if (other.posIndex != null) {
-        if (posIndex == null) {
-          posIndex = (IntIndexInput.Index) other.posIndex.clone();
-        } else {
-          posIndex.set(other.posIndex);
-        }
-      }
-      if (other.freqIndex != null) {
-        if (freqIndex == null) {
-          freqIndex = (IntIndexInput.Index) other.freqIndex.clone();
-        } else {
-          freqIndex.set(other.freqIndex);
-        }
-      }
-      skipOffset = other.skipOffset;
-      payloadOffset = other.payloadOffset;
     }
 
     @Override
@@ -184,39 +161,8 @@ public class SepPostingsReaderImpl exten
   }
 
   @Override
-  public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
-    final SepTermState termState = (SepTermState) _termState;
-
-    // read freq index
-    if (!fieldInfo.omitTermFreqAndPositions) {
-      if (termState.freqIndex == null) {
-        assert isIndexTerm;
-        termState.freqIndex = freqIn.index();
-        termState.posIndex = posIn.index();
-      }
-      termState.freqIndex.read(termsIn, isIndexTerm);
-    }
-
-    // read doc index
-    termState.docIndex.read(termsIn, isIndexTerm);
-
-    // read skip index
-    if (isIndexTerm) {    
-      termState.skipOffset = termsIn.readVLong();
-    } else if (termState.docFreq >= skipInterval) {
-      termState.skipOffset += termsIn.readVLong();
-    }
-
-    // read pos, payload index
-    if (!fieldInfo.omitTermFreqAndPositions) {
-      termState.posIndex.read(termsIn, isIndexTerm);
-      final long v = termsIn.readVLong();
-      if (isIndexTerm) {
-        termState.payloadOffset = v;
-      } else {
-        termState.payloadOffset += v;
-      }
-    }
+  public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) throws IOException {
+    ((SepTermState) termState).docIndex.read(termsIn, isIndexTerm);
   }
 
   @Override
@@ -311,14 +257,18 @@ public class SepPostingsReaderImpl exten
       docIndex.set(termState.docIndex);
       docIndex.seek(docReader);
 
-      skipOffset = termState.skipOffset;
-
       if (!omitTF) {
-        freqIndex.set(termState.freqIndex);
+        freqIndex.read(docReader, true);
         freqIndex.seek(freqReader);
+        
+        posIndex.read(docReader, true);
+        // skip payload offset
+        docReader.readVLong();
       } else {
         freq = 1;
       }
+      skipOffset = docReader.readVLong();
+
       docFreq = termState.docFreq;
       count = 0;
       doc = 0;
@@ -498,17 +448,15 @@ public class SepPostingsReaderImpl exten
       docIndex.set(termState.docIndex);
       docIndex.seek(docReader);
 
-      freqIndex.set(termState.freqIndex);
+      freqIndex.read(docReader, true);
       freqIndex.seek(freqReader);
 
-      posIndex.set(termState.posIndex);
+      posIndex.read(docReader, true);
       posSeekPending = true;
-      //posIndex.seek(posReader);
       payloadPending = false;
 
-      skipOffset = termState.skipOffset;
-      payloadOffset = termState.payloadOffset;
-      //payloadIn.seek(payloadOffset);
+      payloadOffset = docReader.readVLong();
+      skipOffset = docReader.readVLong();
 
       docFreq = termState.docFreq;
       count = 0;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsWriterImpl.java Sat Oct 30 10:17:20 2010
@@ -79,6 +79,7 @@ public final class SepPostingsWriterImpl
   long lastPayloadStart;
   int lastDocID;
   int df;
+  private boolean firstDoc;
 
   public SepPostingsWriterImpl(SegmentWriteState state, IntStreamFactory factory) throws IOException {
     super();
@@ -147,6 +148,7 @@ public final class SepPostingsWriterImpl
       payloadStart = payloadOut.getFilePointer();
       lastPayloadLength = -1;
     }
+    firstDoc = true;
     skipListWriter.resetSkip(docIndex, freqIndex, posIndex);
   }
 
@@ -169,6 +171,20 @@ public final class SepPostingsWriterImpl
   @Override
   public void startDoc(int docID, int termDocFreq) throws IOException {
 
+    if (firstDoc) {
+      // TODO: we are writing absolute file pointers below,
+      // which is wasteful.  It'd be better compression to
+      // write the "baseline" into each indexed term, then
+      // write only the delta here.
+      if (!omitTF) {
+        freqIndex.write(docOut, true);
+        posIndex.write(docOut, true);
+        docOut.writeVLong(payloadStart);
+      }
+      docOut.writeVLong(skipOut.getFilePointer());
+      firstDoc = false;
+    }
+
     final int delta = docID - lastDocID;
 
     if (docID < 0 || (df > 0 && delta <= 0)) {
@@ -229,42 +245,16 @@ public final class SepPostingsWriterImpl
   @Override
   public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
 
-    long skipPos = skipOut.getFilePointer();
-
     // TODO: -- wasteful we are counting this in two places?
     assert docCount > 0;
     assert docCount == df;
 
-    // TODO: -- only do this if once (consolidate the
-    // conditional things that are written)
-    if (!omitTF) {
-      freqIndex.write(termsOut, isIndexTerm);
-    }
     docIndex.write(termsOut, isIndexTerm);
 
     if (df >= skipInterval) {
       skipListWriter.writeSkip(skipOut);
     }
 
-    if (isIndexTerm) {
-      termsOut.writeVLong(skipPos);
-      lastSkipStart = skipPos;
-    } else if (df >= skipInterval) {
-      termsOut.writeVLong(skipPos-lastSkipStart);
-      lastSkipStart = skipPos;
-    }
-
-    if (!omitTF) {
-      posIndex.write(termsOut, isIndexTerm);
-      if (isIndexTerm) {
-        // Write absolute at seek points
-        termsOut.writeVLong(payloadStart);
-      } else {
-        termsOut.writeVLong(payloadStart-lastPayloadStart);
-      }
-      lastPayloadStart = payloadStart;
-    }
-
     lastDocID = 0;
     df = 0;
   }

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexInput.java Sat Oct 30 10:17:20 2010
@@ -81,6 +81,16 @@ public class MockSingleIntIndexInput ext
     }
 
     @Override
+    public void read(IntIndexInput.Reader indexIn, boolean absolute)
+      throws IOException {
+      if (absolute) {
+        fp = indexIn.readVLong();
+      } else {
+        fp += indexIn.readVLong();
+      }
+    }
+
+    @Override
     public void set(IntIndexInput.Index other) {
       fp = ((Index) other).fp;
     }

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java?rev=1029012&r1=1029011&r2=1029012&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/mocksep/MockSingleIntIndexOutput.java Sat Oct 30 10:17:20 2010
@@ -76,6 +76,18 @@ public class MockSingleIntIndexOutput ex
       }
       lastFP = fp;
     }
+
+    @Override
+    public void write(IntIndexOutput indexOut, boolean absolute) 
+      throws IOException {
+      if (absolute) {
+        indexOut.writeVLong(fp);
+      } else {
+        indexOut.writeVLong(fp - lastFP);
+      }
+      lastFP = fp;
+    }
+      
     @Override
     public String toString() {
       return Long.toString(fp);