You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/12 13:37:46 UTC

svn commit: r1299651 - in /lucene/dev/trunk/lucene/core/src: java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java test/org/apache/lucene/index/TestPostingsOffsets.java

Author: rmuir
Date: Mon Mar 12 12:37:46 2012
New Revision: 1299651

URL: http://svn.apache.org/viewvc?rev=1299651&view=rev
Log:
LUCENE-3864: support offsets in memorypostings

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java?rev=1299651&r1=1299650&r2=1299651&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java Mon Mar 12 12:37:46 2012
@@ -98,8 +98,6 @@ public class MemoryPostingsFormat extend
     return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
   }
 
-  private static final boolean VERBOSE = false;
-
   private final static class TermsWriter extends TermsConsumer {
     private final IndexOutput out;
     private final FieldInfo field;
@@ -123,10 +121,13 @@ public class MemoryPostingsFormat extend
       // NOTE: not private so we don't pay access check at runtime:
       int docCount;
       RAMOutputStream buffer = new RAMOutputStream();
+      
+      int lastOffsetLength;
+      int lastOffset;
 
       @Override
       public void startDoc(int docID, int termDocFreq) throws IOException {
-        if (VERBOSE) System.out.println("    startDoc docID=" + docID + " freq=" + termDocFreq);
+        //System.out.println("    startDoc docID=" + docID + " freq=" + termDocFreq);
         final int delta = docID - lastDocID;
         assert docID == 0 || delta > 0;
         lastDocID = docID;
@@ -143,20 +144,23 @@ public class MemoryPostingsFormat extend
         }
 
         lastPos = 0;
+        lastOffset = 0;
       }
 
       @Override
       public void addPosition(int pos, BytesRef payload, int startOffset, int endOffset) throws IOException {
         assert payload == null || field.storePayloads;
 
-        if (VERBOSE) System.out.println("      addPos pos=" + pos + " payload=" + payload);
+        //System.out.println("      addPos pos=" + pos + " payload=" + payload);
 
         final int delta = pos - lastPos;
         assert delta >= 0;
         lastPos = pos;
         
+        int payloadLen = 0;
+        
         if (field.storePayloads) {
-          final int payloadLen = payload == null ? 0 : payload.length;
+          payloadLen = payload == null ? 0 : payload.length;
           if (payloadLen != lastPayloadLen) {
             lastPayloadLen = payloadLen;
             buffer.writeVInt((delta<<1)|1);
@@ -164,13 +168,28 @@ public class MemoryPostingsFormat extend
           } else {
             buffer.writeVInt(delta<<1);
           }
-
-          if (payloadLen > 0) {
-            buffer.writeBytes(payload.bytes, payload.offset, payloadLen);
-          }
         } else {
           buffer.writeVInt(delta);
         }
+        
+        if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
+          // don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
+          // and the numbers aren't that much smaller anyways.
+          int offsetDelta = startOffset - lastOffset;
+          int offsetLength = endOffset - startOffset;
+          if (offsetLength != lastOffsetLength) {
+            buffer.writeVInt(offsetDelta << 1 | 1);
+            buffer.writeVInt(offsetLength);
+          } else {
+            buffer.writeVInt(offsetDelta << 1);
+          }
+          lastOffset = startOffset;
+          lastOffsetLength = offsetLength;
+        }
+        
+        if (payloadLen > 0) {
+          buffer.writeBytes(payload.bytes, payload.offset, payloadLen);
+        }
       }
 
       @Override
@@ -182,6 +201,8 @@ public class MemoryPostingsFormat extend
         lastDocID = 0;
         docCount = 0;
         lastPayloadLen = 0;
+        // force first offset to write its length
+        lastOffsetLength = -1;
         return this;
       }
     }
@@ -190,7 +211,7 @@ public class MemoryPostingsFormat extend
 
     @Override
     public PostingsConsumer startTerm(BytesRef text) {
-      if (VERBOSE) System.out.println("  startTerm term=" + text.utf8ToString());
+      //System.out.println("  startTerm term=" + text.utf8ToString());
       return postingsWriter.reset();
     }
 
@@ -224,12 +245,12 @@ public class MemoryPostingsFormat extend
 
       spare.bytes = finalBuffer;
       spare.length = totalBytes;
-      if (VERBOSE) {
-        System.out.println("    finishTerm term=" + text.utf8ToString() + " " + totalBytes + " bytes totalTF=" + stats.totalTermFreq);
-        for(int i=0;i<totalBytes;i++) {
-          System.out.println("      " + Integer.toHexString(finalBuffer[i]&0xFF));
-        }
-      }
+
+      //System.out.println("    finishTerm term=" + text.utf8ToString() + " " + totalBytes + " bytes totalTF=" + stats.totalTermFreq);
+      //for(int i=0;i<totalBytes;i++) {
+      //  System.out.println("      " + Integer.toHexString(finalBuffer[i]&0xFF));
+      //}
+
       builder.add(Util.toIntsRef(text, scratchIntsRef), BytesRef.deepCopyOf(spare));
       termCount++;
     }
@@ -249,7 +270,7 @@ public class MemoryPostingsFormat extend
           fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4));
         }
         fst.save(out);
-        if (VERBOSE) System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
+        //System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer());
       }
     }
 
@@ -270,10 +291,7 @@ public class MemoryPostingsFormat extend
     return new FieldsConsumer() {
       @Override
       public TermsConsumer addField(FieldInfo field) {
-        if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
-          throw new UnsupportedOperationException("this codec cannot index offsets");
-        }
-        if (VERBOSE) System.out.println("\naddField field=" + field.name);
+        //System.out.println("\naddField field=" + field.name);
         return new TermsWriter(out, field, doPackFST);
       }
 
@@ -331,11 +349,9 @@ public class MemoryPostingsFormat extend
     @Override
     public int nextDoc() {
       while(true) {
-        if (VERBOSE) System.out.println("  nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
+        //System.out.println("  nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
         if (docUpto == numDocs) {
-          if (VERBOSE) {
-            System.out.println("    END");
-          }
+          // System.out.println("    END");
           return docID = NO_MORE_DOCS;
         }
         docUpto++;
@@ -344,7 +360,7 @@ public class MemoryPostingsFormat extend
         } else {
           final int code = in.readVInt();
           accum += code >>> 1;
-          if (VERBOSE) System.out.println("  docID=" + accum + " code=" + code);
+          //System.out.println("  docID=" + accum + " code=" + code);
           if ((code & 1) != 0) {
             freq = 1;
           } else {
@@ -352,8 +368,8 @@ public class MemoryPostingsFormat extend
             assert freq > 0;
           }
 
-          if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
-            // Skip positions
+          if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
+            // Skip positions/payloads
             for(int posUpto=0;posUpto<freq;posUpto++) {
               if (!storePayloads) {
                 in.readVInt();
@@ -365,11 +381,26 @@ public class MemoryPostingsFormat extend
                 in.skipBytes(payloadLen);
               }
             }
+          } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+            // Skip positions/offsets/payloads
+            for(int posUpto=0;posUpto<freq;posUpto++) {
+              int posCode = in.readVInt();
+              if (storePayloads && ((posCode & 1) != 0)) {
+                payloadLen = in.readVInt();
+              }
+              if ((in.readVInt() & 1) != 0) {
+                // new offset length
+                in.readVInt();
+              }
+              if (storePayloads) {
+                in.skipBytes(payloadLen);
+              }
+            }
           }
         }
 
         if (liveDocs == null || liveDocs.get(accum)) {
-          if (VERBOSE) System.out.println("    return docID=" + accum + " freq=" + freq);
+          //System.out.println("    return docID=" + accum + " freq=" + freq);
           return (docID = accum);
         }
       }
@@ -413,26 +444,30 @@ public class MemoryPostingsFormat extend
     private int posPending;
     private int payloadLength;
     private boolean payloadRetrieved;
+    final boolean storeOffsets;
+    int offsetLength;
+    int startOffset;
 
     private int pos;
     private final BytesRef payload = new BytesRef();
 
-    public FSTDocsAndPositionsEnum(boolean storePayloads) {
+    public FSTDocsAndPositionsEnum(boolean storePayloads, boolean storeOffsets) {
       this.storePayloads = storePayloads;
+      this.storeOffsets = storeOffsets;
     }
 
-    public boolean canReuse(boolean storePayloads) {
-      return storePayloads == this.storePayloads;
+    public boolean canReuse(boolean storePayloads, boolean storeOffsets) {
+      return storePayloads == this.storePayloads && storeOffsets == this.storeOffsets;
     }
     
     public FSTDocsAndPositionsEnum reset(BytesRef bufferIn, Bits liveDocs, int numDocs) {
       assert numDocs > 0;
-      if (VERBOSE) {
-        System.out.println("D&P reset bytes this=" + this);
-        for(int i=bufferIn.offset;i<bufferIn.length;i++) {
-          System.out.println("  " + Integer.toHexString(bufferIn.bytes[i]&0xFF));
-        }
-      }
+
+      // System.out.println("D&P reset bytes this=" + this);
+      // for(int i=bufferIn.offset;i<bufferIn.length;i++) {
+      //   System.out.println("  " + Integer.toHexString(bufferIn.bytes[i]&0xFF));
+      // }
+
       if (buffer.length < bufferIn.length - bufferIn.offset) {
         buffer = ArrayUtil.grow(buffer, bufferIn.length - bufferIn.offset);
       }
@@ -447,6 +482,8 @@ public class MemoryPostingsFormat extend
       this.numDocs = numDocs;
       posPending = 0;
       payloadRetrieved = false;
+      startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
+      offsetLength = 0;
       return this;
     }
 
@@ -456,9 +493,9 @@ public class MemoryPostingsFormat extend
         nextPosition();
       }
       while(true) {
-        if (VERBOSE) System.out.println("  nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
+        //System.out.println("  nextDoc cycle docUpto=" + docUpto + " numDocs=" + numDocs + " fp=" + in.getPosition() + " this=" + this);
         if (docUpto == numDocs) {
-          if (VERBOSE) System.out.println("    END");
+          //System.out.println("    END");
           return docID = NO_MORE_DOCS;
         }
         docUpto++;
@@ -474,8 +511,9 @@ public class MemoryPostingsFormat extend
 
         if (liveDocs == null || liveDocs.get(accum)) {
           pos = 0;
+          startOffset = storeOffsets ? 0 : -1;
           posPending = freq;
-          if (VERBOSE) System.out.println("    return docID=" + accum + " freq=" + freq);
+          //System.out.println("    return docID=" + accum + " freq=" + freq);
           return (docID = accum);
         }
 
@@ -487,8 +525,18 @@ public class MemoryPostingsFormat extend
             final int skipCode = in.readVInt();
             if ((skipCode & 1) != 0) {
               payloadLength = in.readVInt();
-              if (VERBOSE) System.out.println("    new payloadLen=" + payloadLength);
+              //System.out.println("    new payloadLen=" + payloadLength);
             }
+          }
+          
+          if (storeOffsets) {
+            if ((in.readVInt() & 1) != 0) {
+              // new offset length
+              offsetLength = in.readVInt();
+            }
+          }
+          
+          if (storePayloads) {
             in.skipBytes(payloadLength);
           }
         }
@@ -497,7 +545,7 @@ public class MemoryPostingsFormat extend
 
     @Override
     public int nextPosition() {
-      if (VERBOSE) System.out.println("    nextPos storePayloads=" + storePayloads + " this=" + this);
+      //System.out.println("    nextPos storePayloads=" + storePayloads + " this=" + this);
       assert posPending > 0;
       posPending--;
       if (!storePayloads) {
@@ -511,6 +559,18 @@ public class MemoryPostingsFormat extend
           //} else {
           //System.out.println("      same payloadLen=" + payloadLength);
         }
+      }
+      
+      if (storeOffsets) {
+        int offsetCode = in.readVInt();
+        if ((offsetCode & 1) != 0) {
+          // new offset length
+          offsetLength = in.readVInt();
+        }
+        startOffset += offsetCode >>> 1;
+      }
+      
+      if (storePayloads) {
         payload.offset = in.getPosition();
         in.skipBytes(payloadLength);
         payload.length = payloadLength;
@@ -520,18 +580,18 @@ public class MemoryPostingsFormat extend
         payloadRetrieved = false;
       }
 
-      if (VERBOSE) System.out.println("      pos=" + pos + " payload=" + payload + " fp=" + in.getPosition());
+      //System.out.println("      pos=" + pos + " payload=" + payload + " fp=" + in.getPosition());
       return pos;
     }
 
     @Override
     public int startOffset() {
-      return -1;
+      return startOffset;
     }
 
     @Override
     public int endOffset() {
-      return -1;
+      return startOffset + offsetLength;
     }
 
     @Override
@@ -594,14 +654,14 @@ public class MemoryPostingsFormat extend
           totalTermFreq = -1;
         }
         current.output.offset = buffer.getPosition();
-        if (VERBOSE) System.out.println("  df=" + docFreq + " totTF=" + totalTermFreq + " offset=" + buffer.getPosition() + " len=" + current.output.length);
+        //System.out.println("  df=" + docFreq + " totTF=" + totalTermFreq + " offset=" + buffer.getPosition() + " len=" + current.output.length);
         didDecode = true;
       }
     }
 
     @Override
     public boolean seekExact(BytesRef text, boolean useCache /* ignored */) throws IOException {
-      if (VERBOSE) System.out.println("te.seekExact text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
+      //System.out.println("te.seekExact text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
       current = fstEnum.seekExact(text);
       didDecode = false;
       return current != null;
@@ -609,25 +669,24 @@ public class MemoryPostingsFormat extend
 
     @Override
     public SeekStatus seekCeil(BytesRef text, boolean useCache /* ignored */) throws IOException {
-      if (VERBOSE) System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
+      //System.out.println("te.seek text=" + field.name + ":" + text.utf8ToString() + " this=" + this);
       current = fstEnum.seekCeil(text);
       if (current == null) {
         return SeekStatus.END;
       } else {
-        if (VERBOSE) {
-          System.out.println("  got term=" + current.input.utf8ToString());
-          for(int i=0;i<current.output.length;i++) {
-            System.out.println("    " + Integer.toHexString(current.output.bytes[i]&0xFF));
-          }
-        }
+
+        // System.out.println("  got term=" + current.input.utf8ToString());
+        // for(int i=0;i<current.output.length;i++) {
+        //   System.out.println("    " + Integer.toHexString(current.output.bytes[i]&0xFF));
+        // }
 
         didDecode = false;
 
         if (text.equals(current.input)) {
-          if (VERBOSE) System.out.println("  found!");
+          //System.out.println("  found!");
           return SeekStatus.FOUND;
         } else {
-          if (VERBOSE) System.out.println("  not found: " + current.input.utf8ToString());
+          //System.out.println("  not found: " + current.input.utf8ToString());
           return SeekStatus.NOT_FOUND;
         }
       }
@@ -654,9 +713,9 @@ public class MemoryPostingsFormat extend
     @Override
     public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
 
-      if (needsOffsets) {
-        // Not until we can index offsets...
-        return null;
+      boolean hasOffsets = field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+      if (needsOffsets && !hasOffsets) {
+        return null; // not available
       }
       
       if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
@@ -665,14 +724,14 @@ public class MemoryPostingsFormat extend
       decodeMetaData();
       FSTDocsAndPositionsEnum docsAndPositionsEnum;
       if (reuse == null || !(reuse instanceof FSTDocsAndPositionsEnum)) {
-        docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads);
+        docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads, hasOffsets);
       } else {
         docsAndPositionsEnum = (FSTDocsAndPositionsEnum) reuse;        
-        if (!docsAndPositionsEnum.canReuse(field.storePayloads)) {
-          docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads);
+        if (!docsAndPositionsEnum.canReuse(field.storePayloads, hasOffsets)) {
+          docsAndPositionsEnum = new FSTDocsAndPositionsEnum(field.storePayloads, hasOffsets);
         }
       }
-      if (VERBOSE) System.out.println("D&P reset this=" + this);
+      //System.out.println("D&P reset this=" + this);
       return docsAndPositionsEnum.reset(current.output, liveDocs, docFreq);
     }
 
@@ -683,14 +742,14 @@ public class MemoryPostingsFormat extend
 
     @Override
     public BytesRef next() throws IOException {
-      if (VERBOSE) System.out.println("te.next");
+      //System.out.println("te.next");
       current = fstEnum.next();
       if (current == null) {
-        if (VERBOSE) System.out.println("  END");
+        //System.out.println("  END");
         return null;
       }
       didDecode = false;
-      if (VERBOSE) System.out.println("  term=" + field.name + ":" + current.input.utf8ToString());
+      //System.out.println("  term=" + field.name + ":" + current.input.utf8ToString());
       return current.input;
     }
 
@@ -794,9 +853,7 @@ public class MemoryPostingsFormat extend
           break;
         }
         final TermsReader termsReader = new TermsReader(state.fieldInfos, in, termCount);
-        if (VERBOSE) {
-          System.out.println("load field=" + termsReader.field.name);
-        }
+        // System.out.println("load field=" + termsReader.field.name);
         fields.put(termsReader.field.name, termsReader);
       }
     } finally {

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java?rev=1299651&r1=1299650&r2=1299651&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java Mon Mar 12 12:37:46 2012
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.MockPa
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
+import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -43,6 +44,8 @@ import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 
+// TODO: we really need to test indexing offsets, but then retrieving only docs / docs + freqs.
+// Not all codecs store prx separately...
 public class TestPostingsOffsets extends LuceneTestCase {
   IndexWriterConfig iwc;
   
@@ -54,7 +57,11 @@ public class TestPostingsOffsets extends
     
     if (Codec.getDefault().getName().equals("Lucene40")) {
       // pulsing etc are not implemented
-      iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+      if (random.nextBoolean()) {
+        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+      } else {
+        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+      }
     }
   }
 
@@ -126,7 +133,11 @@ public class TestPostingsOffsets extends
     iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
     if (Codec.getDefault().getName().equals("Lucene40")) {
       // pulsing etc are not implemented
-      iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+      if (random.nextBoolean()) {
+        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
+      } else {
+        iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat()));
+      }
     }
     iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
     RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);