Posted to commits@lucene.apache.org by si...@apache.org on 2011/02/09 10:36:03 UTC

svn commit: r1068809 [9/36] - in /lucene/dev/branches/docvalues: ./ dev-tools/eclipse/ dev-tools/idea/.idea/ dev-tools/idea/.idea/copyright/ dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/ dev-tools/idea/lucene/contrib/queryparser/ dev-tools/...

Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsReader.java Wed Feb  9 09:35:27 2011
@@ -20,15 +20,18 @@ package org.apache.lucene.index.codecs.s
 import java.io.IOException;
 import java.util.Collection;
 
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.codecs.BlockTermState;
 import org.apache.lucene.index.codecs.PostingsReaderBase;
-import org.apache.lucene.index.codecs.TermState;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
@@ -45,9 +48,12 @@ public class StandardPostingsReader exte
   int skipInterval;
   int maxSkipLevels;
 
+  //private String segment;
+
   public StandardPostingsReader(Directory dir, SegmentInfo segmentInfo, int readBufferSize, String codecId) throws IOException {
     freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, codecId, StandardCodec.FREQ_EXTENSION),
                            readBufferSize);
+    //this.segment = segmentInfo.name;
     if (segmentInfo.getHasProx()) {
       boolean success = false;
       try {
@@ -83,33 +89,46 @@ public class StandardPostingsReader exte
   }
 
   // Must keep final because we do non-standard clone
-  private final static class DocTermState extends TermState {
+  private final static class StandardTermState extends BlockTermState {
     long freqOffset;
     long proxOffset;
     int skipOffset;
 
+    // Only used by the "primary" TermState -- clones don't
+    // copy this (basically they are "transient"):
+    ByteArrayDataInput bytesReader;
+    byte[] bytes;
+
+    @Override
     public Object clone() {
-      DocTermState other = new DocTermState();
-      other.copy(this);
+      StandardTermState other = new StandardTermState();
+      other.copyFrom(this);
       return other;
     }
 
-    public void copy(TermState _other) {
-      super.copy(_other);
-      DocTermState other = (DocTermState) _other;
+    @Override
+    public void copyFrom(TermState _other) {
+      super.copyFrom(_other);
+      StandardTermState other = (StandardTermState) _other;
       freqOffset = other.freqOffset;
       proxOffset = other.proxOffset;
       skipOffset = other.skipOffset;
+
+      // Do not copy bytes, bytesReader (else TermState is
+      // very heavy, ie drags around the entire block's
+      // byte[]).  On seek back, if next() is in fact used
+      // (rare!), they will be re-read from disk.
     }
 
+    @Override
     public String toString() {
       return super.toString() + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " skipOffset=" + skipOffset;
     }
   }
 
   @Override
-  public TermState newTermState() {
-    return new DocTermState();
+  public BlockTermState newTermState() {
+    return new StandardTermState();
   }
 
   @Override
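
A note on the StandardTermState hunk above: the intent of the copyFrom() comment is that clones stay cheap because the per-block byte[] blob and its reader are treated as transient and are simply not copied. A minimal sketch of that idiom, with illustrative names rather than the actual Lucene class, might look like:

    // Hypothetical sketch: clones copy only the small per-term fields and
    // deliberately drop the heavy per-block blob, which can be re-read from
    // disk in the rare case the clone is later advanced with next().
    class BlockStateSketch {
      long freqOffset;      // cheap, always copied
      long proxOffset;      // cheap, always copied
      byte[] blockBytes;    // heavy, NOT copied ("transient")

      @Override
      public BlockStateSketch clone() {
        BlockStateSketch other = new BlockStateSketch();
        other.freqOffset = freqOffset;
        other.proxOffset = proxOffset;
        // other.blockBytes stays null on purpose
        return other;
      }
    }
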
@@ -125,35 +144,61 @@ public class StandardPostingsReader exte
     }
   }
 
+  /* Reads but does not decode the byte[] blob holding
+     metadata for the current terms block */
   @Override
-  public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm)
-    throws IOException {
+  public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
+    final StandardTermState termState = (StandardTermState) _termState;
+
+    final int len = termsIn.readVInt();
+    //System.out.println("SPR.readTermsBlock termsIn.fp=" + termsIn.getFilePointer());
+    if (termState.bytes == null) {
+      termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+      termState.bytesReader = new ByteArrayDataInput(null);
+    } else if (termState.bytes.length < len) {
+      termState.bytes = new byte[ArrayUtil.oversize(len, 1)];
+    }
+
+    termsIn.readBytes(termState.bytes, 0, len);
+    termState.bytesReader.reset(termState.bytes, 0, len);
+  }
 
-    final DocTermState docTermState = (DocTermState) termState;
+  @Override
+  public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState)
+    throws IOException {
+    final StandardTermState termState = (StandardTermState) _termState;
+    //System.out.println("StandardR.nextTerm seg=" + segment);
+    final boolean isFirstTerm = termState.termCount == 0;
 
-    if (isIndexTerm) {
-      docTermState.freqOffset = termsIn.readVLong();
+    if (isFirstTerm) {
+      termState.freqOffset = termState.bytesReader.readVLong();
     } else {
-      docTermState.freqOffset += termsIn.readVLong();
+      termState.freqOffset += termState.bytesReader.readVLong();
     }
-
-    if (docTermState.docFreq >= skipInterval) {
-      docTermState.skipOffset = termsIn.readVInt();
+    //System.out.println("  dF=" + termState.docFreq);
+    //System.out.println("  freqFP=" + termState.freqOffset);
+    assert termState.freqOffset < freqIn.length();
+
+    if (termState.docFreq >= skipInterval) {
+      termState.skipOffset = termState.bytesReader.readVInt();
+      //System.out.println("  skipOffset=" + termState.skipOffset + " vs freqIn.length=" + freqIn.length());
+      assert termState.freqOffset + termState.skipOffset < freqIn.length();
     } else {
-      docTermState.skipOffset = 0;
+      // undefined
     }
 
     if (!fieldInfo.omitTermFreqAndPositions) {
-      if (isIndexTerm) {
-        docTermState.proxOffset = termsIn.readVLong();
+      if (isFirstTerm) {
+        termState.proxOffset = termState.bytesReader.readVLong();
       } else {
-        docTermState.proxOffset += termsIn.readVLong();
+        termState.proxOffset += termState.bytesReader.readVLong();
       }
+      //System.out.println("  proxFP=" + termState.proxOffset);
     }
   }
     
   @Override
-  public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
+  public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
     SegmentDocsEnum docsEnum;
     if (reuse == null || !(reuse instanceof SegmentDocsEnum)) {
       docsEnum = new SegmentDocsEnum(freqIn);
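
For readers following the new readTermsBlock()/nextTerm() split in the hunk above: the terms dictionary now carries one length-prefixed byte[] blob of metadata per terms block. readTermsBlock() only slurps that blob into a reusable buffer, and nextTerm() decodes one term at a time, with the first term in a block carrying absolute file pointers and later terms carrying deltas. A condensed, self-contained sketch of that pattern (class and method names here are illustrative, not the codec API; the store/util calls are the ones used in the diff):

    import java.io.IOException;
    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.util.ArrayUtil;

    class TermsBlockDecodeSketch {
      private byte[] bytes;
      private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(null);
      private long freqOffset;

      // Read the whole block's metadata blob, but do not decode it yet.
      void readTermsBlock(IndexInput termsIn) throws IOException {
        final int len = termsIn.readVInt();
        if (bytes == null || bytes.length < len) {
          bytes = new byte[ArrayUtil.oversize(len, 1)];   // grow with headroom, reuse later
        }
        termsIn.readBytes(bytes, 0, len);
        bytesReader.reset(bytes, 0, len);
      }

      // Decode one term's freq pointer: absolute for the first term in the
      // block, a delta against the previous term otherwise.
      void nextTerm(boolean isFirstTermInBlock) throws IOException {
        if (isFirstTermInBlock) {
          freqOffset = bytesReader.readVLong();
        } else {
          freqOffset += bytesReader.readVLong();
        }
      }
    }

The payoff, per the comment above the method, is that seeking the terms dictionary only costs a vInt plus one readBytes per block; per-term decoding happens lazily as terms are actually visited.
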
@@ -166,11 +211,11 @@ public class StandardPostingsReader exte
         docsEnum = new SegmentDocsEnum(freqIn);
       }
     }
-    return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
+    return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
   }
 
   @Override
-  public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+  public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
     if (fieldInfo.omitTermFreqAndPositions) {
       return null;
     }
@@ -189,7 +234,7 @@ public class StandardPostingsReader exte
           docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
         }
       }
-      return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
+      return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
     } else {
       SegmentDocsAndPositionsEnum docsEnum;
       if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) {
@@ -203,7 +248,7 @@ public class StandardPostingsReader exte
           docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
         }
       }
-      return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
+      return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
     }
   }
 
@@ -233,7 +278,7 @@ public class StandardPostingsReader exte
       this.freqIn = (IndexInput) freqIn.clone();
     }
 
-    public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
+    public SegmentDocsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
       omitTF = fieldInfo.omitTermFreqAndPositions;
       if (omitTF) {
         freq = 1;
@@ -248,8 +293,10 @@ public class StandardPostingsReader exte
       // cases
       freqIn.seek(termState.freqOffset);
       limit = termState.docFreq;
+      assert limit > 0;
       ord = 0;
       doc = 0;
+      //System.out.println("  sde limit=" + limit + " freqFP=" + freqOffset);
 
       skipped = false;
 
@@ -331,13 +378,10 @@ public class StandardPostingsReader exte
     @Override
     public int advance(int target) throws IOException {
 
-      // TODO: jump right to next() if target is < X away
-      // from where we are now?
-
-      if (skipOffset > 0) {
+      if ((target - skipInterval) >= doc && limit >= skipInterval) {
 
         // There are enough docs in the posting to have
-        // skip data
+        // skip data, and it isn't too close.
 
         if (skipper == null) {
           // This is the first time this enum has ever been used for skipping -- do lazy init
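
Regarding the reworked guard in advance() above (the same change appears in all three enums in this file): instead of keying off skipOffset, the enum now consults skip data only when the target is at least one skip interval ahead of the current doc and the term's docFreq is large enough for skip data to exist at all; otherwise it just scans forward. A one-method paraphrase of that decision (illustrative, not the actual API):

    // Sketch: should advance(target) bother with the skip list?
    static boolean shouldUseSkipData(int target, int doc, int limit, int skipInterval) {
      // limit == docFreq for the current term; skip data is only written when
      // docFreq >= skipInterval, and a jump shorter than one interval is
      // cheaper to cover by simply scanning with nextDoc().
      return (target - skipInterval) >= doc && limit >= skipInterval;
    }
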
@@ -407,7 +451,7 @@ public class StandardPostingsReader exte
       this.proxIn = (IndexInput) proxIn.clone();
     }
 
-    public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
+    public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
       assert !fieldInfo.omitTermFreqAndPositions;
       assert !fieldInfo.storePayloads;
 
@@ -420,6 +464,8 @@ public class StandardPostingsReader exte
       lazyProxPointer = termState.proxOffset;
 
       limit = termState.docFreq;
+      assert limit > 0;
+
       ord = 0;
       doc = 0;
       position = 0;
@@ -430,6 +476,7 @@ public class StandardPostingsReader exte
       freqOffset = termState.freqOffset;
       proxOffset = termState.proxOffset;
       skipOffset = termState.skipOffset;
+      //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset);
 
       return this;
     }
@@ -438,6 +485,7 @@ public class StandardPostingsReader exte
     public int nextDoc() throws IOException {
       while(true) {
         if (ord == limit) {
+          //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END");
           return doc = NO_MORE_DOCS;
         }
 
@@ -461,6 +509,7 @@ public class StandardPostingsReader exte
 
       position = 0;
 
+      //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
       return doc;
     }
 
@@ -477,13 +526,12 @@ public class StandardPostingsReader exte
     @Override
     public int advance(int target) throws IOException {
 
-      // TODO: jump right to next() if target is < X away
-      // from where we are now?
+      //System.out.println("StandardR.D&PE advance target=" + target);
 
-      if (skipOffset > 0) {
+      if ((target - skipInterval) >= doc && limit >= skipInterval) {
 
         // There are enough docs in the posting to have
-        // skip data
+        // skip data, and it isn't too close
 
         if (skipper == null) {
           // This is the first time this enum has ever been used for skipping -- do lazy init
@@ -524,6 +572,7 @@ public class StandardPostingsReader exte
       return doc;
     }
 
+    @Override
     public int nextPosition() throws IOException {
 
       if (lazyProxPointer != -1) {
@@ -552,10 +601,12 @@ public class StandardPostingsReader exte
 
     /** Returns the payload at this position, or null if no
      *  payload was indexed. */
+    @Override
     public BytesRef getPayload() throws IOException {
       throw new IOException("No payloads exist for this field!");
     }
 
+    @Override
     public boolean hasPayload() {
       return false;
     }
@@ -594,7 +645,7 @@ public class StandardPostingsReader exte
       this.proxIn = (IndexInput) proxIn.clone();
     }
 
-    public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
+    public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
       assert !fieldInfo.omitTermFreqAndPositions;
       assert fieldInfo.storePayloads;
       if (payload == null) {
@@ -622,6 +673,7 @@ public class StandardPostingsReader exte
       freqOffset = termState.freqOffset;
       proxOffset = termState.proxOffset;
       skipOffset = termState.skipOffset;
+      //System.out.println("StandardR.D&PE reset seg=" + segment + " limit=" + limit + " freqFP=" + freqOffset + " proxFP=" + proxOffset + " this=" + this);
 
       return this;
     }
@@ -630,6 +682,7 @@ public class StandardPostingsReader exte
     public int nextDoc() throws IOException {
       while(true) {
         if (ord == limit) {
+          //System.out.println("StandardR.D&PE seg=" + segment + " nextDoc return doc=END");
           return doc = NO_MORE_DOCS;
         }
 
@@ -653,6 +706,7 @@ public class StandardPostingsReader exte
 
       position = 0;
 
+      //System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
       return doc;
     }
 
@@ -669,13 +723,12 @@ public class StandardPostingsReader exte
     @Override
     public int advance(int target) throws IOException {
 
-      // TODO: jump right to next() if target is < X away
-      // from where we are now?
+      //System.out.println("StandardR.D&PE advance seg=" + segment + " target=" + target + " this=" + this);
 
-      if (skipOffset > 0) {
+      if ((target - skipInterval) >= doc && limit >= skipInterval) {
 
         // There are enough docs in the posting to have
-        // skip data
+        // skip data, and it isn't too close
 
         if (skipper == null) {
           // This is the first time this enum has ever been used for skipping -- do lazy init
@@ -687,7 +740,7 @@ public class StandardPostingsReader exte
           // This is the first time this posting has
           // skipped, since reset() was called, so now we
           // load the skip data for this posting
-
+          //System.out.println("  init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
           skipper.init(freqOffset+skipOffset,
                        freqOffset, proxOffset,
                        limit, true);
@@ -718,6 +771,7 @@ public class StandardPostingsReader exte
       return doc;
     }
 
+    @Override
     public int nextPosition() throws IOException {
 
       if (lazyProxPointer != -1) {
@@ -748,6 +802,7 @@ public class StandardPostingsReader exte
         posPendingCount--;
         position = 0;
         payloadPending = false;
+        //System.out.println("StandardR.D&PE skipPos");
       }
 
       // read next position
@@ -771,11 +826,13 @@ public class StandardPostingsReader exte
 
       assert posPendingCount >= 0: "nextPosition() was called too many times (more than freq() times) posPendingCount=" + posPendingCount;
 
+      //System.out.println("StandardR.D&PE nextPos   return pos=" + position);
       return position;
     }
 
     /** Returns the payload at this position, or null if no
      *  payload was indexed. */
+    @Override
     public BytesRef getPayload() throws IOException {
       assert lazyProxPointer == -1;
       assert posPendingCount < freq;
@@ -785,6 +842,7 @@ public class StandardPostingsReader exte
       if (payloadLength > payload.bytes.length) {
         payload.grow(payloadLength);
       }
+
       proxIn.readBytes(payload.bytes, 0, payloadLength);
       payload.length = payloadLength;
       payloadPending = false;
@@ -792,6 +850,7 @@ public class StandardPostingsReader exte
       return payload;
     }
 
+    @Override
     public boolean hasPayload() {
       return payloadPending && payloadLength > 0;
     }

Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardPostingsWriter.java Wed Feb  9 09:35:27 2011
@@ -22,12 +22,14 @@ package org.apache.lucene.index.codecs.s
 
 import java.io.IOException;
 
-import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.TermStats;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
@@ -58,8 +60,15 @@ public final class StandardPostingsWrite
   int lastPayloadLength;
   int lastPosition;
 
+  private int pendingCount;
+
+  //private String segment;
+
+  private RAMOutputStream bytesWriter = new RAMOutputStream();
+
   public StandardPostingsWriter(SegmentWriteState state) throws IOException {
     super();
+    //this.segment = state.segmentName;
     String fileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, StandardCodec.FREQ_EXTENSION);
     freqOut = state.directory.createOutput(fileName);
 
@@ -95,6 +104,7 @@ public final class StandardPostingsWrite
 
   @Override
   public void startTerm() {
+    //System.out.println("StandardW: startTerm seg=" + segment + " pendingCount=" + pendingCount);
     freqStart = freqOut.getFilePointer();
     if (proxOut != null) {
       proxStart = proxOut.getFilePointer();
@@ -108,9 +118,12 @@ public final class StandardPostingsWrite
   // our parent calls setField whenever the field changes
   @Override
   public void setField(FieldInfo fieldInfo) {
+    //System.out.println("SPW: setField");
     this.fieldInfo = fieldInfo;
     omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
     storePayloads = fieldInfo.storePayloads;
+    //System.out.println("  set init blockFreqStart=" + freqStart);
+    //System.out.println("  set init blockProxStart=" + proxStart);
   }
 
   int lastDocID;
@@ -120,6 +133,7 @@ public final class StandardPostingsWrite
    *  then we just skip consuming positions/payloads. */
   @Override
   public void startDoc(int docID, int termDocFreq) throws IOException {
+    //System.out.println("StandardW:   startDoc seg=" + segment + " docID=" + docID + " tf=" + termDocFreq);
 
     final int delta = docID - lastDocID;
     
@@ -150,6 +164,7 @@ public final class StandardPostingsWrite
   /** Add a new position & payload */
   @Override
   public void addPosition(int position, BytesRef payload) throws IOException {
+    //System.out.println("StandardW:     addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
     assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true";
     assert proxOut != null;
 
@@ -184,40 +199,51 @@ public final class StandardPostingsWrite
 
   /** Called when we are done adding docs to this term */
   @Override
-  public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
-    assert docCount > 0;
+  public void finishTerm(TermStats stats) throws IOException {
+    //System.out.println("StandardW.finishTerm seg=" + segment);
+    assert stats.docFreq > 0;
 
     // TODO: wasteful we are counting this (counting # docs
     // for this term) in two places?
-    assert docCount == df;
+    assert stats.docFreq == df;
 
-    if (isIndexTerm) {
-      // Write absolute at seek points
-      termsOut.writeVLong(freqStart);
+    final boolean isFirstTerm = pendingCount == 0;
+    //System.out.println("  isFirstTerm=" + isFirstTerm);
+
+    //System.out.println("  freqFP=" + freqStart);
+    if (isFirstTerm) {
+      bytesWriter.writeVLong(freqStart);
     } else {
-      // Write delta between seek points
-      termsOut.writeVLong(freqStart - lastFreqStart);
+      bytesWriter.writeVLong(freqStart-lastFreqStart);
     }
-
     lastFreqStart = freqStart;
 
     if (df >= skipInterval) {
-      termsOut.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart));
+      bytesWriter.writeVInt((int) (skipListWriter.writeSkip(freqOut)-freqStart));
     }
-     
+
     if (!omitTermFreqAndPositions) {
-      if (isIndexTerm) {
-        // Write absolute at seek points
-        termsOut.writeVLong(proxStart);
+      //System.out.println("  proxFP=" + proxStart);
+      if (isFirstTerm) {
+        bytesWriter.writeVLong(proxStart);
       } else {
-        // Write delta between seek points
-        termsOut.writeVLong(proxStart - lastProxStart);
+        bytesWriter.writeVLong(proxStart - lastProxStart);
       }
       lastProxStart = proxStart;
     }
-
+     
     lastDocID = 0;
     df = 0;
+    pendingCount++;
+  }
+
+  @Override
+  public void flushTermsBlock() throws IOException {
+    //System.out.println("SPW.flushBlock pendingCount=" + pendingCount);
+    termsOut.writeVInt((int) bytesWriter.getFilePointer());
+    bytesWriter.writeTo(termsOut);
+    bytesWriter.reset();
+    pendingCount = 0;
   }
 
   @Override
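
On the writer side (hunks above): per-term metadata is no longer written straight to termsOut. finishTerm() appends it to an in-memory RAMOutputStream, writing absolute pointers for the first pending term and deltas afterwards, and flushTermsBlock() then emits the buffered bytes as a single vInt-length-prefixed blob, which is what readTermsBlock() consumes on the read side. A condensed, self-contained sketch of that buffering pattern (illustrative names, not the codec API; the store calls are the ones used in the diff):

    import java.io.IOException;
    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.store.RAMOutputStream;

    class TermsBlockEncodeSketch {
      private final RAMOutputStream bytesWriter = new RAMOutputStream();
      private int pendingCount;
      private long lastFreqStart;

      // Called once per term: buffer this term's metadata in memory.
      void finishTerm(long freqStart) throws IOException {
        if (pendingCount == 0) {
          bytesWriter.writeVLong(freqStart);                  // absolute for the first term
        } else {
          bytesWriter.writeVLong(freqStart - lastFreqStart);  // delta for later terms
        }
        lastFreqStart = freqStart;
        pendingCount++;
      }

      // Called once per terms block: emit one length-prefixed blob.
      void flushTermsBlock(IndexOutput termsOut) throws IOException {
        termsOut.writeVInt((int) bytesWriter.getFilePointer());
        bytesWriter.writeTo(termsOut);
        bytesWriter.reset();
        pendingCount = 0;
      }
    }

As the read-side comment suggests, buffering a whole block and length-prefixing it lets the reader grab the blob in one pass and defer all per-term decoding to nextTerm().
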