Posted to commits@lucene.apache.org by rm...@apache.org on 2012/10/23 06:47:10 UTC

svn commit: r1401160 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene41/ lucene/core/src/test/org/apache/lucene/codecs/lucene41/

Author: rmuir
Date: Tue Oct 23 04:47:10 2012
New Revision: 1401160

URL: http://svn.apache.org/viewvc?rev=1401160&view=rev
Log:
LUCENE-4496: don't decode unnecessary blocks in 4.1 codec

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1401160&r1=1401159&r2=1401160&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Tue Oct 23 04:47:10 2012
@@ -57,8 +57,8 @@ Bug Fixes
 * LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
   (Alex Vigdor, Arcadius Ahouansou, Koji Sekiguchi)
 
-* LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings 
-  formats (Robert Muir)
+* LUCENE-4468: Fix rareish integer overflows in Lucene41 postings 
+  format. (Robert Muir)
   
 * LUCENE-4486: Add support for ConstantScoreQuery in Highlighter.
  (Simon Willnauer)
@@ -75,16 +75,15 @@ Bug Fixes
 
 Optimizations
 
-* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets 
-  into the skipdata. You need to reindex any indexes created with
-  this experimental codec.  (Robert Muir)
+* LUCENE-4443: Lucene41PostingsFormat no longer writes unnecessary offsets 
+  into the skipdata. (Robert Muir)
 
 * LUCENE-4459: Improve WeakIdentityMap.keyIterator() to remove GCed keys
   from backing map early instead of waiting for reap(). This makes test
   failures in TestWeakIdentityMap disappear, too.
   (Uwe Schindler, Mike McCandless, Robert Muir)
 
-* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+* LUCENE-4473: Lucene41PostingsFormat encodes offsets more efficiently
   for low frequency terms (< 128 occurrences).  (Robert Muir)
 
 * LUCENE-4462: DocumentsWriter now flushes deletes, segment infos and builds
@@ -92,6 +91,10 @@ Optimizations
   was a single threaded process while now all IO and CPU heavy computation is done
   concurrently in DocumentsWriterPerThread. (Simon Willnauer)
 
+* LUCENE-4496: Optimize Lucene41PostingsFormat when requesting a subset of
+  the postings data (via flags to TermsEnum.docs/docsAndPositions) to use
+  ForUtil.skipBlock.  (Robert Muir)
+
 Build
 
 * LUCENE-4451: Memory leak per unique thread caused by 
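
The LUCENE-4496 entry is the heart of this commit: the Lucene41 postings format now honors the flags passed to TermsEnum.docs/docsAndPositions and skips, rather than decodes, blocks the caller did not ask for. A minimal caller-side sketch (not part of this commit; the class and method names are illustrative, and only the TermsEnum methods and flag constants that appear in the patch and test below are assumed):

    // Minimal caller-side sketch (not part of this commit): class and method names are
    // illustrative; only the Lucene 4.1 APIs used in the patch and test are assumed.
    import java.io.IOException;

    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.Bits;

    class PostingsFlagsExample {
      /** Counts matching docs; flags = 0 means the codec need not decode freq blocks. */
      static int countDocs(TermsEnum termsEnum, Bits liveDocs) throws IOException {
        DocsEnum docs = termsEnum.docs(liveDocs, null, 0); // same "w/o freqs" form the test uses
        int count = 0;
        while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          count++;
        }
        return count;
      }

      /** A caller that does need frequencies asks for them explicitly. */
      static long sumFreqs(TermsEnum termsEnum, Bits liveDocs) throws IOException {
        DocsEnum docs = termsEnum.docs(liveDocs, null, DocsEnum.FLAG_FREQS);
        long sum = 0;
        while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          sum += docs.freq();
        }
        return sum;
      }
    }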

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java?rev=1401160&r1=1401159&r2=1401160&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java Tue Oct 23 04:47:10 2012
@@ -275,10 +275,10 @@ public final class Lucene41PostingsReade
     } else {
       docsEnum = new BlockDocsEnum(fieldInfo);
     }
-    return docsEnum.reset(liveDocs, (IntBlockTermState) termState);
+    return docsEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
   }
 
-  // TODO: specialize to liveDocs vs not, and freqs vs not
+  // TODO: specialize to liveDocs vs not
   
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
@@ -310,7 +310,7 @@ public final class Lucene41PostingsReade
       } else {
         everythingEnum = new EverythingEnum(fieldInfo);
       }
-      return everythingEnum.reset(liveDocs, (IntBlockTermState) termState);
+      return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
     }
   }
 
@@ -352,6 +352,8 @@ public final class Lucene41PostingsReade
     private int nextSkipDoc;
 
     private Bits liveDocs;
+    
+    private boolean needsFreq; // true if the caller actually needs frequencies
 
     public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
       this.startDocIn = Lucene41PostingsReader.this.docIn;
@@ -370,7 +372,7 @@ public final class Lucene41PostingsReade
         indexHasPayloads == fieldInfo.hasPayloads();
     }
     
-    public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+    public DocsEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
       this.liveDocs = liveDocs;
       // if (DEBUG) {
       //   System.out.println("  FPR.reset: termState=" + termState);
@@ -381,6 +383,7 @@ public final class Lucene41PostingsReade
       skipOffset = termState.skipOffset;
 
       doc = -1;
+      this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
       if (!indexHasFreq) {
         Arrays.fill(freqBuffer, 1);
       }
@@ -416,7 +419,11 @@ public final class Lucene41PostingsReade
           // if (DEBUG) {
           //   System.out.println("    fill freq block from fp=" + docIn.getFilePointer());
           // }
-          forUtil.readBlock(docIn, encoded, freqBuffer);
+          if (needsFreq) {
+            forUtil.readBlock(docIn, encoded, freqBuffer);
+          } else {
+            forUtil.skipBlock(docIn); // skip over freqs
+          }
         }
       } else {
         // Read vInts:
@@ -1044,6 +1051,9 @@ public final class Lucene41PostingsReade
 
     private Bits liveDocs;
     
+    private boolean needsOffsets; // true if we actually need offsets
+    private boolean needsPayloads; // true if we actually need payloads
+    
     public EverythingEnum(FieldInfo fieldInfo) throws IOException {
       this.startDocIn = Lucene41PostingsReader.this.docIn;
       this.docIn = startDocIn.clone();
@@ -1079,7 +1089,7 @@ public final class Lucene41PostingsReade
         indexHasPayloads == fieldInfo.hasPayloads();
     }
     
-    public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+    public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
       this.liveDocs = liveDocs;
       // if (DEBUG) {
       //   System.out.println("  FPR.reset: termState=" + termState);
@@ -1101,6 +1111,9 @@ public final class Lucene41PostingsReade
         lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
       }
 
+      this.needsOffsets = (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0;
+      this.needsPayloads = (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0;
+
       doc = -1;
       accum = 0;
       docUpto = 0;
@@ -1203,15 +1216,22 @@ public final class Lucene41PostingsReade
           // if (DEBUG) {
           //   System.out.println("        bulk payload block @ pay.fp=" + payIn.getFilePointer());
           // }
-          forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
-          int numBytes = payIn.readVInt();
-          // if (DEBUG) {
-          //   System.out.println("        " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
-          // }
-          if (numBytes > payloadBytes.length) {
-            payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+          if (needsPayloads) {
+            forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
+            int numBytes = payIn.readVInt();
+            // if (DEBUG) {
+            //   System.out.println("        " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
+            // }
+            if (numBytes > payloadBytes.length) {
+              payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+            }
+            payIn.readBytes(payloadBytes, 0, numBytes);
+          } else {
+            // this works, because when writing a vint block we always force the first length to be written
+            forUtil.skipBlock(payIn); // skip over lengths
+            int numBytes = payIn.readVInt(); // read length of payloadBytes
+            payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
           }
-          payIn.readBytes(payloadBytes, 0, numBytes);
           payloadByteUpto = 0;
         }
 
@@ -1219,8 +1239,14 @@ public final class Lucene41PostingsReade
           // if (DEBUG) {
           //   System.out.println("        bulk offset block @ pay.fp=" + payIn.getFilePointer());
           // }
-          forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
-          forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+          if (needsOffsets) {
+            forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
+            forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+          } else {
+            // this works, because when writing a vint block we always force the first length to be written
+            forUtil.skipBlock(payIn); // skip over starts
+            forUtil.skipBlock(payIn); // skip over lengths
+          }
         }
       }
     }
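
The reader changes above thread the caller's flags down into BlockDocsEnum and EverythingEnum, which now use ForUtil.skipBlock (and a seek past the payload bytes) instead of decoding freq, offset, or payload data that was not requested. A hedged caller-side sketch of the positions case (not taken from this commit; class and method names are illustrative, and only the TermsEnum/DocsAndPositionsEnum APIs and flag constants visible in the patch are assumed):

    // Hedged sketch (not from this commit): class and method names are illustrative;
    // only TermsEnum.docsAndPositions and the flag constants seen in the patch are assumed.
    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.Bits;

    class PositionsOnlyExample {
      /** Iterates positions only; offset/payload blocks not requested via flags are skipped. */
      static void walkPositions(TermsEnum termsEnum, Bits liveDocs) throws IOException {
        // 0 = neither DocsAndPositionsEnum.FLAG_OFFSETS nor FLAG_PAYLOADS requested
        DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, 0);
        if (postings == null) {
          return; // the field was not indexed with positions
        }
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          for (int i = 0; i < freq; i++) {
            int position = postings.nextPosition();
            // consume the position; startOffset()/endOffset()/getPayload() should not be
            // relied on here, since offsets and payloads were not requested
          }
        }
      }
    }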

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java?rev=1401160&r1=1401159&r2=1401160&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java Tue Oct 23 04:47:10 2012
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.MockTo
 import org.apache.lucene.analysis.MockVariableLengthPayloadFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41Codec;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -61,12 +60,12 @@ import org.apache.lucene.util.automaton.
 import org.apache.lucene.util.automaton.RegExp;
 
 /** 
- * Tests partial enumeration (only pulling a subset of the prox data) 
+ * Tests partial enumeration (only pulling a subset of the indexed data) 
  */
 public class TestBlockPostingsFormat3 extends LuceneTestCase {
   static final int MAXDOC = Lucene41PostingsFormat.BLOCK_SIZE * 20;
   
-  // creates 6 fields with different options and does "duels" of fields against each other
+  // creates 8 fields with different options and does "duels" of fields against each other
   public void test() throws Exception {
     Directory dir = newDirectory();
     Analyzer analyzer = new Analyzer(new Analyzer.PerFieldReuseStrategy()) {
@@ -85,35 +84,45 @@ public class TestBlockPostingsFormat3 ex
       }
     };
     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
-    iwc.setCodec(new Lucene41Codec() {
-      @Override
-      public PostingsFormat getPostingsFormatForField(String field) {
-        return PostingsFormat.forName("Lucene41");
-        // TODO: we could actually add more fields implemented with different PFs
-      }
-    });
+    iwc.setCodec(new Lucene41Codec()); 
+    // TODO we could actually add more fields implemented with different PFs
+    // or, just put this test into the usual rotation?
     RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
     Document doc = new Document();
-    FieldType bareType = new FieldType(TextField.TYPE_NOT_STORED);
+    FieldType docsOnlyType = new FieldType(TextField.TYPE_NOT_STORED);
+    // turn this on for a cross-check
+    docsOnlyType.setStoreTermVectors(true);
+    docsOnlyType.setIndexOptions(IndexOptions.DOCS_ONLY);
+    
+    FieldType docsAndFreqsType = new FieldType(TextField.TYPE_NOT_STORED);
+    // turn this on for a cross-check
+    docsAndFreqsType.setStoreTermVectors(true);
+    docsAndFreqsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
+    
+    FieldType positionsType = new FieldType(TextField.TYPE_NOT_STORED);
     // turn these on for a cross-check
-    bareType.setStoreTermVectors(true);
-    bareType.setStoreTermVectorPositions(true);
-    bareType.setStoreTermVectorOffsets(true);
-    bareType.setStoreTermVectorPayloads(true);
-    FieldType offsetsType = new FieldType(bareType);
+    positionsType.setStoreTermVectors(true);
+    positionsType.setStoreTermVectorPositions(true);
+    positionsType.setStoreTermVectorOffsets(true);
+    positionsType.setStoreTermVectorPayloads(true);
+    FieldType offsetsType = new FieldType(positionsType);
     offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-    Field field1 = new Field("field1bare", "", bareType);
-    Field field2 = new Field("field2offsets", "", offsetsType);
-    Field field3 = new Field("field3payloadsFixed", "", bareType);
-    Field field4 = new Field("field4payloadsVariable", "", bareType);
-    Field field5 = new Field("field5payloadsFixedOffsets", "", offsetsType);
-    Field field6 = new Field("field6payloadsVariableOffsets", "", offsetsType);
+    Field field1 = new Field("field1docs", "", docsOnlyType);
+    Field field2 = new Field("field2freqs", "", docsAndFreqsType);
+    Field field3 = new Field("field3positions", "", positionsType);
+    Field field4 = new Field("field4offsets", "", offsetsType);
+    Field field5 = new Field("field5payloadsFixed", "", positionsType);
+    Field field6 = new Field("field6payloadsVariable", "", positionsType);
+    Field field7 = new Field("field7payloadsFixedOffsets", "", offsetsType);
+    Field field8 = new Field("field8payloadsVariableOffsets", "", offsetsType);
     doc.add(field1);
     doc.add(field2);
     doc.add(field3);
     doc.add(field4);
     doc.add(field5);
     doc.add(field6);
+    doc.add(field7);
+    doc.add(field8);
     for (int i = 0; i < MAXDOC; i++) {
       String stringValue = Integer.toString(i) + " verycommon " + English.intToEnglish(i).replace('-', ' ') + " " + _TestUtil.randomSimpleString(random());
       field1.setStringValue(stringValue);
@@ -122,6 +131,8 @@ public class TestBlockPostingsFormat3 ex
       field4.setStringValue(stringValue);
       field5.setStringValue(stringValue);
       field6.setStringValue(stringValue);
+      field7.setStringValue(stringValue);
+      field8.setStringValue(stringValue);
       iw.addDocument(doc);
     }
     iw.close();
@@ -139,11 +150,12 @@ public class TestBlockPostingsFormat3 ex
     DirectoryReader ir = DirectoryReader.open(dir);
     for (AtomicReaderContext leaf : ir.leaves()) {
       AtomicReader leafReader = leaf.reader();
-      assertTerms(leafReader.terms("field1bare"), leafReader.terms("field2offsets"), true);
-      assertTerms(leafReader.terms("field2offsets"), leafReader.terms("field3payloadsFixed"), true);
-      assertTerms(leafReader.terms("field3payloadsFixed"), leafReader.terms("field4payloadsVariable"), true);
-      assertTerms(leafReader.terms("field4payloadsVariable"), leafReader.terms("field5payloadsFixedOffsets"), true);
-      assertTerms(leafReader.terms("field5payloadsFixedOffsets"), leafReader.terms("field6payloadsVariableOffsets"), true);
+      assertTerms(leafReader.terms("field1docs"), leafReader.terms("field2freqs"), true);
+      assertTerms(leafReader.terms("field3positions"), leafReader.terms("field4offsets"), true);
+      assertTerms(leafReader.terms("field4offsets"), leafReader.terms("field5payloadsFixed"), true);
+      assertTerms(leafReader.terms("field5payloadsFixed"), leafReader.terms("field6payloadsVariable"), true);
+      assertTerms(leafReader.terms("field6payloadsVariable"), leafReader.terms("field7payloadsFixedOffsets"), true);
+      assertTerms(leafReader.terms("field7payloadsFixedOffsets"), leafReader.terms("field8payloadsVariableOffsets"), true);
     }
     ir.close();
   }
@@ -334,39 +346,31 @@ public class TestBlockPostingsFormat3 ex
         
         // with freqs:
         assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs),
-            rightDocs = rightTermsEnum.docs(null, rightDocs),
-            true);
+            rightDocs = rightTermsEnum.docs(null, rightDocs));
         assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
-            rightDocs = rightTermsEnum.docs(randomBits, rightDocs),
-            true);
+            rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
 
         // w/o freqs:
         assertDocsEnum(leftDocs = leftTermsEnum.docs(null, leftDocs, 0),
-            rightDocs = rightTermsEnum.docs(null, rightDocs, 0),
-            false);
+            rightDocs = rightTermsEnum.docs(null, rightDocs, 0));
         assertDocsEnum(leftDocs = leftTermsEnum.docs(randomBits, leftDocs, 0),
-            rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0),
-            false);
+            rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0));
         
         // with freqs:
         assertDocsSkipping(leftTermsEnum.docFreq(), 
             leftDocs = leftTermsEnum.docs(null, leftDocs),
-            rightDocs = rightTermsEnum.docs(null, rightDocs),
-            true);
+            rightDocs = rightTermsEnum.docs(null, rightDocs));
         assertDocsSkipping(leftTermsEnum.docFreq(), 
             leftDocs = leftTermsEnum.docs(randomBits, leftDocs),
-            rightDocs = rightTermsEnum.docs(randomBits, rightDocs),
-            true);
+            rightDocs = rightTermsEnum.docs(randomBits, rightDocs));
 
         // w/o freqs:
         assertDocsSkipping(leftTermsEnum.docFreq(), 
             leftDocs = leftTermsEnum.docs(null, leftDocs, 0),
-            rightDocs = rightTermsEnum.docs(null, rightDocs, 0),
-            false);
+            rightDocs = rightTermsEnum.docs(null, rightDocs, 0));
         assertDocsSkipping(leftTermsEnum.docFreq(), 
             leftDocs = leftTermsEnum.docs(randomBits, leftDocs, 0),
-            rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0),
-            false);
+            rightDocs = rightTermsEnum.docs(randomBits, rightDocs, 0));
       }
     }
     assertNull(rightTermsEnum.next());
@@ -409,7 +413,7 @@ public class TestBlockPostingsFormat3 ex
   /**
    * checks docs + freqs, sequentially
    */
-  public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws Exception {
+  public void assertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
     if (leftDocs == null) {
       assertNull(rightDocs);
       return;
@@ -419,9 +423,7 @@ public class TestBlockPostingsFormat3 ex
     int docid;
     while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
       assertEquals(docid, rightDocs.nextDoc());
-      if (hasFreqs) {
-        assertEquals(leftDocs.freq(), rightDocs.freq());
-      }
+      // we don't assert freqs, they are allowed to be different
     }
     assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
   }
@@ -429,7 +431,7 @@ public class TestBlockPostingsFormat3 ex
   /**
    * checks advancing docs
    */
-  public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs, boolean hasFreqs) throws Exception {
+  public void assertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs) throws Exception {
     if (leftDocs == null) {
       assertNull(rightDocs);
       return;
@@ -453,9 +455,7 @@ public class TestBlockPostingsFormat3 ex
       if (docid == DocIdSetIterator.NO_MORE_DOCS) {
         return;
       }
-      if (hasFreqs) {
-        assertEquals(leftDocs.freq(), rightDocs.freq());
-      }
+      // we don't assert freqs, they are allowed to be different
     }
   }
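
The updated test now duels eight fields indexed with progressively richer IndexOptions (docs only, docs+freqs, positions, offsets, with and without payloads), and assertDocsEnum/assertDocsSkipping no longer take a hasFreqs flag because the two sides of a duel are not required to agree on freq(). A plain-Java sketch of that duel idea (not part of the test; names are illustrative):

    // Plain-Java sketch of the duel (illustrative names, no JUnit): only the DocsEnum API
    // exercised by the test above is assumed.
    import java.io.IOException;

    import org.apache.lucene.index.DocsEnum;
    import org.apache.lucene.search.DocIdSetIterator;

    class DocsDuelSketch {
      /** Checks that two enums over "equivalent" fields agree on doc ids. */
      static void assertSameDocs(DocsEnum left, DocsEnum right) throws IOException {
        int docid;
        while ((docid = left.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          if (right.nextDoc() != docid) {
            throw new AssertionError("doc ids diverged at " + docid);
          }
          // freq() is deliberately not compared: one side may be a DOCS_ONLY field or an
          // enum opened with flags = 0, so the values are allowed to differ
        }
        if (right.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          throw new AssertionError("right enum has docs left over");
        }
      }
    }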