You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2011/01/25 17:12:35 UTC

svn commit: r1063332 - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java test/org/apache/lucene/index/TestDocsAndPositions.java

Author: simonw
Date: Tue Jan 25 16:12:35 2011
New Revision: 1063332

URL: http://svn.apache.org/viewvc?rev=1063332&view=rev
Log:
LUCENE-2888: Several DocsEnum / DocsAndPositionsEnum return wrong docID when next() / advance(int) return NO_MORE_DOCS

Added:
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java   (with props)
Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=1063332&r1=1063331&r2=1063332&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Tue Jan 25 16:12:35 2011
@@ -980,7 +980,7 @@ public class PreFlexFields extends Field
 
   private final class PreDocsEnum extends DocsEnum {
     final private SegmentTermDocs docs;
-
+    private int docID = -1;
     PreDocsEnum() throws IOException {
       docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
     }
@@ -998,18 +998,18 @@ public class PreFlexFields extends Field
     @Override
     public int nextDoc() throws IOException {
       if (docs.next()) {
-        return docs.doc();
+        return docID = docs.doc();
       } else {
-        return NO_MORE_DOCS;
+        return docID = NO_MORE_DOCS;
       }
     }
 
     @Override
     public int advance(int target) throws IOException {
       if (docs.skipTo(target)) {
-        return docs.doc();
+        return docID = docs.doc();
       } else {
-        return NO_MORE_DOCS;
+        return docID = NO_MORE_DOCS;
       }
     }
 
@@ -1020,7 +1020,7 @@ public class PreFlexFields extends Field
 
     @Override
     public int docID() {
-      return docs.doc();
+      return docID;
     }
 
     @Override
@@ -1036,7 +1036,7 @@ public class PreFlexFields extends Field
 
   private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
     final private SegmentTermPositions pos;
-
+    private int docID = -1;
     PreDocsAndPositionsEnum() throws IOException {
       pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
     }
@@ -1054,18 +1054,18 @@ public class PreFlexFields extends Field
     @Override
     public int nextDoc() throws IOException {
       if (pos.next()) {
-        return pos.doc();
+        return docID = pos.doc();
       } else {
-        return NO_MORE_DOCS;
+        return docID = NO_MORE_DOCS;
       }
     }
 
     @Override
     public int advance(int target) throws IOException {
       if (pos.skipTo(target)) {
-        return pos.doc();
+        return docID = pos.doc();
       } else {
-        return NO_MORE_DOCS;
+        return docID = NO_MORE_DOCS;
       }
     }
 
@@ -1076,16 +1076,18 @@ public class PreFlexFields extends Field
 
     @Override
     public int docID() {
-      return pos.doc();
+      return docID;
     }
 
     @Override
     public int nextPosition() throws IOException {
+      assert docID != NO_MORE_DOCS;
       return pos.nextPosition();
     }
 
     @Override
     public boolean hasPayload() {
+      assert docID != NO_MORE_DOCS;
       return pos.isPayloadAvailable();
     }
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=1063332&r1=1063331&r2=1063332&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Tue Jan 25 16:12:35 2011
@@ -261,7 +261,7 @@ public class PulsingPostingsReaderImpl e
       while(true) {
         if (postings.eof()) {
           //System.out.println("PR   END");
-          return NO_MORE_DOCS;
+          return docID = NO_MORE_DOCS;
         }
 
         final int code = postings.readVInt();
@@ -319,7 +319,7 @@ public class PulsingPostingsReaderImpl e
         if (doc >= target)
           return doc;
       }
-      return NO_MORE_DOCS;
+      return docID = NO_MORE_DOCS;
     }
   }
 
@@ -368,7 +368,7 @@ public class PulsingPostingsReaderImpl e
 
         if (postings.eof()) {
           //System.out.println("PR   END");
-          return NO_MORE_DOCS;
+          return docID = NO_MORE_DOCS;
         }
 
         final int code = postings.readVInt();
@@ -406,7 +406,7 @@ public class PulsingPostingsReaderImpl e
           return doc;
         }
       }
-      return NO_MORE_DOCS;
+      return docID = NO_MORE_DOCS;
     }
 
     @Override

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java?rev=1063332&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java Tue Jan 25 16:12:35 2011
@@ -0,0 +1,327 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader.ReaderContext;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.ReaderUtil;
+
+public class TestDocsAndPositions extends LuceneTestCase {
+  private String fieldName;
+  private boolean usePayload;
+
+  /**
+   * Prepares per-test random state: a field name with a random numeric
+   * suffix and a random choice for the {@code usePayload} flag that the test
+   * methods hand to their analyzer.
+   *
+   * @throws Exception if the parent set-up fails
+   */
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    fieldName = "field" + random.nextInt();
+    usePayload = random.nextBoolean();
+  }
+
+  /**
+   * Simple testcase for {@link DocsAndPositionsEnum}: every document holds the
+   * sequence "1 .. 10" four times, so the term "1" must report a frequency of
+   * 4 and the positions 0, 10, 20 and 30 in each document that is visited.
+   */
+  public void testPositionsSimple() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+            MockTokenizer.WHITESPACE, true, usePayload)));
+    // four space-separated repetitions of the numbers one to ten
+    final String oneToTen = "1 2 3 4 5 6 7 8 9 10";
+    final String content = oneToTen + " " + oneToTen + " " + oneToTen + " "
+        + oneToTen;
+    for (int added = 0; added < 39; added++) {
+      Document doc = new Document();
+      doc.add(newField(fieldName, content, Field.Store.YES,
+          Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+    IndexReader reader = writer.getReader();
+    writer.close();
+
+    for (int round = 0; round < 39 * RANDOM_MULTIPLIER; round++) {
+      BytesRef term = new BytesRef("1");
+      ReaderContext top = reader.getTopReaderContext();
+      for (AtomicReaderContext leaf : ReaderUtil.leaves(top)) {
+        DocsAndPositionsEnum postings = getDocsAndPositions(leaf.reader, term,
+            null);
+        assertNotNull(postings);
+        final int leafMaxDoc = leaf.reader.maxDoc();
+        if (leafMaxDoc == 0) {
+          continue;
+        }
+        // start from a random document of this leaf, then walk to the end
+        final int advance = postings.advance(random.nextInt(leafMaxDoc));
+        while (true) {
+          String msg = "Advanced to: " + advance + " current doc: "
+              + postings.docID() + " usePayloads: " + usePayload;
+          // the k-th occurrence of "1" sits at position k * 10
+          for (int occurrence = 0; occurrence < 4; occurrence++) {
+            assertEquals(msg, 4, postings.freq());
+            assertEquals(msg, occurrence * 10, postings.nextPosition());
+          }
+          if (postings.nextDoc() == DocsAndPositionsEnum.NO_MORE_DOCS) {
+            break;
+          }
+        }
+      }
+    }
+    reader.close();
+    directory.close();
+  }
+
+  /**
+   * Looks up the positions enum for a term of this test's field.
+   *
+   * @param reader the reader to ask for the enum
+   * @param bytes the term to look up
+   * @param skipDocs passed through to the reader as its skip-docs argument;
+   *        callers in this test pass {@code null}
+   * @return the result of {@code reader.termPositionsEnum} for the term
+   * @throws IOException if the reader fails
+   */
+  public DocsAndPositionsEnum getDocsAndPositions(IndexReader reader,
+      BytesRef bytes, Bits skipDocs) throws IOException {
+    // forward skipDocs instead of silently dropping it, as getDocsEnum does
+    return reader.termPositionsEnum(skipDocs, fieldName, bytes);
+  }
+
+  /**
+   * Returns an enum over the documents of a term of this test's field. One
+   * random draw out of ten hands back a positions enum (obtained through
+   * {@link #getDocsAndPositions}) instead of a plain docs enum. The
+   * {@code freqs} argument is not consulted by this method.
+   */
+  public DocsEnum getDocsEnum(IndexReader reader, BytesRef bytes,
+      boolean freqs, Bits skipDocs) throws IOException {
+    // once in a while throw in a positions enum
+    final boolean usePositionsEnum = random.nextInt(10) == 0;
+    if (!usePositionsEnum) {
+      return reader.termDocsEnum(skipDocs, fieldName, bytes);
+    }
+    return getDocsAndPositions(reader, bytes, skipDocs);
+  }
+
+  /**
+   * this test indexes random numbers within a range into a field and checks
+   * their occurrences by searching for a number from that range selected at
+   * random. All positions for that number are saved up front and compared to
+   * the enums positions.
+   * <p>
+   * The enum is never stepped again once {@code advance} has reported
+   * {@code NO_MORE_DOCS}; the iteration stops at that point.
+   */
+  public void testRandomPositons() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+            MockTokenizer.WHITESPACE, true, usePayload)));
+    int numDocs = 131;
+    int max = 1051;
+    int term = random.nextInt(max);
+    // positionsInDoc[d] holds the token positions of `term` in document d
+    Integer[][] positionsInDoc = new Integer[numDocs][];
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      ArrayList<Integer> positions = new ArrayList<Integer>();
+      StringBuilder builder = new StringBuilder();
+      for (int j = 0; j < 3049; j++) {
+        int nextInt = random.nextInt(max);
+        builder.append(nextInt).append(" ");
+        if (nextInt == term) {
+          positions.add(Integer.valueOf(j));
+        }
+      }
+      doc.add(newField(fieldName, builder.toString(), Field.Store.YES,
+          Field.Index.ANALYZED));
+      positionsInDoc[i] = positions.toArray(new Integer[0]);
+      writer.addDocument(doc);
+    }
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+
+    for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) {
+      BytesRef bytes = new BytesRef("" + term);
+      ReaderContext topReaderContext = reader.getTopReaderContext();
+      AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
+      for (AtomicReaderContext atomicReaderContext : leaves) {
+        DocsAndPositionsEnum docsAndPosEnum = getDocsAndPositions(
+            atomicReaderContext.reader, bytes, null);
+        assertNotNull(docsAndPosEnum);
+        int initDoc = 0;
+        int maxDoc = atomicReaderContext.reader.maxDoc();
+        // initially advance or do next doc
+        if (random.nextBoolean()) {
+          initDoc = docsAndPosEnum.nextDoc();
+        } else {
+          initDoc = docsAndPosEnum.advance(random.nextInt(maxDoc));
+        }
+        // now run through the scorer and check if all positions are there...
+        do {
+          int docID = docsAndPosEnum.docID();
+          if (docID == DocsAndPositionsEnum.NO_MORE_DOCS) {
+            break;
+          }
+          Integer[] pos = positionsInDoc[atomicReaderContext.docBase + docID];
+          assertEquals(pos.length, docsAndPosEnum.freq());
+          // number of positions read should be random - don't read all of them
+          // always
+          final int howMany = random.nextInt(20) == 0 ? pos.length
+              - random.nextInt(pos.length) : pos.length;
+          for (int j = 0; j < howMany; j++) {
+            assertEquals("iteration: " + i + " initDoc: " + initDoc + " doc: "
+                + docID + " base: " + atomicReaderContext.docBase
+                + " positions: " + Arrays.toString(pos) + " usePayloads: "
+                + usePayload, pos[j].intValue(), docsAndPosEnum.nextPosition());
+          }
+
+          if (random.nextInt(10) == 0) { // once in a while advance
+            final int target = docID + 1 + random.nextInt((maxDoc - docID));
+            if (docsAndPosEnum.advance(target) == DocsAndPositionsEnum.NO_MORE_DOCS) {
+              // the enum is exhausted; calling nextDoc() on it again would be
+              // outside the iterator contract, so stop here
+              break;
+            }
+          }
+
+        } while (docsAndPosEnum.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS);
+      }
+
+    }
+    reader.close();
+    dir.close();
+  }
+
+  /**
+   * Indexes documents of random numbers, records per document how often one
+   * randomly chosen number occurs, and then walks each leaf's docs enum for
+   * that number: every document with a non-zero recorded count must be
+   * reported with that count as its frequency. Occasionally the enum is
+   * advanced to the next matching document instead of stepped. Once all
+   * documents of a leaf were visited, {@code docID()} must report
+   * {@code NO_MORE_DOCS}.
+   */
+  public void testRandomDocs() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+            MockTokenizer.WHITESPACE, true, usePayload)));
+    int numDocs = 499;
+    int max = 15678;
+    int term = random.nextInt(max);
+    // freqInDoc[d] counts the occurrences of `term` in document d
+    int[] freqInDoc = new int[numDocs];
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      StringBuilder builder = new StringBuilder();
+      for (int j = 0; j < 199; j++) {
+        int nextInt = random.nextInt(max);
+        builder.append(nextInt).append(" ");
+        if (nextInt == term) {
+          freqInDoc[i]++;
+        }
+      }
+      doc.add(newField(fieldName, builder.toString(), Field.Store.YES,
+          Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    IndexReader reader = writer.getReader();
+    writer.close();
+
+    for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) {
+      BytesRef bytes = new BytesRef("" + term);
+      ReaderContext topReaderContext = reader.getTopReaderContext();
+      AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
+      for (AtomicReaderContext context : leaves) {
+        int maxDoc = context.reader.maxDoc();
+        DocsEnum docsAndPosEnum = getDocsEnum(context.reader, bytes, true, null);
+        // a leaf without any occurrence of the term is expected to yield no enum
+        if (findNext(freqInDoc, context.docBase, context.docBase + maxDoc) == Integer.MAX_VALUE) {
+          assertNull(docsAndPosEnum);
+          continue;
+        }
+        assertNotNull(docsAndPosEnum);
+        docsAndPosEnum.nextDoc();
+        for (int j = 0; j < maxDoc; j++) {
+          if (freqInDoc[context.docBase + j] != 0) {
+            assertEquals(j, docsAndPosEnum.docID());
+            // expected value first, so a failure message reads correctly
+            assertEquals(freqInDoc[context.docBase + j], docsAndPosEnum.freq());
+            if (i % 2 == 0 && random.nextInt(10) == 0) {
+              // leaf-relative id of the next matching doc; >= maxDoc if none is left
+              int next = findNext(freqInDoc, context.docBase+j+1, context.docBase + maxDoc) - context.docBase;
+              int advancedTo = docsAndPosEnum.advance(next);
+              if (next >= maxDoc) {
+                assertEquals(DocsEnum.NO_MORE_DOCS, advancedTo);
+              } else {
+                assertTrue("advanced to: " +advancedTo + " but should be <= " + next, next >= advancedTo);  
+              }
+            } else {
+              docsAndPosEnum.nextDoc();
+            }
+          } 
+        }
+        assertEquals("docBase: " + context.docBase + " maxDoc: " + maxDoc + " " + docsAndPosEnum.getClass(), DocsEnum.NO_MORE_DOCS, docsAndPosEnum.docID());
+      }
+      
+    }
+
+    reader.close();
+    dir.close();
+  }
+  
+  /**
+   * Scans {@code docs} from index {@code pos} (inclusive) up to {@code max}
+   * (exclusive) and returns the first index whose value is non-zero, or
+   * {@link Integer#MAX_VALUE} if there is no such index in that range.
+   */
+  private static int findNext(int[] docs, int pos, int max) {
+    int candidate = pos;
+    // skip over zero entries until a hit or the end of the range
+    while (candidate < max && docs[candidate] == 0) {
+      candidate++;
+    }
+    return candidate < max ? candidate : Integer.MAX_VALUE;
+  }
+
+  /**
+   * Checks position retrieval for a term with many occurrences per document;
+   * the large count is meant to force a buffer refill while positions are
+   * iterated. Each document alternates "even" and "odd" tokens, so "even"
+   * must be found at positions 0, 2, 4, ...
+   */
+  public void testLargeNumberOfPositions() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+            MockTokenizer.WHITESPACE, true, usePayload)));
+    final int howMany = 1000;
+    for (int added = 0; added < 39; added++) {
+      Document doc = new Document();
+      StringBuilder text = new StringBuilder();
+      for (int token = 0; token < howMany; token++) {
+        text.append(token % 2 == 0 ? "even " : "odd ");
+      }
+      doc.add(newField(fieldName, text.toString(), Field.Store.YES,
+          Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    // now run the searches
+    IndexReader reader = writer.getReader();
+    writer.close();
+
+    for (int iteration = 0; iteration < 39 * RANDOM_MULTIPLIER; iteration++) {
+      BytesRef term = new BytesRef("even");
+
+      ReaderContext top = reader.getTopReaderContext();
+      for (AtomicReaderContext leaf : ReaderUtil.leaves(top)) {
+        DocsAndPositionsEnum postings = getDocsAndPositions(leaf.reader, term,
+            null);
+        assertNotNull(postings);
+
+        final int leafMaxDoc = leaf.reader.maxDoc();
+        // position the enum on a document, either by stepping or by advancing
+        final int initDoc = random.nextBoolean()
+            ? postings.nextDoc()
+            : postings.advance(random.nextInt(leafMaxDoc));
+        final String msg = "Iteration: " + iteration + " initDoc: " + initDoc
+            + " payloads: " + usePayload;
+        assertEquals(howMany / 2, postings.freq());
+        int expectedPosition = 0;
+        while (expectedPosition < howMany) {
+          assertEquals("position missmatch index: " + expectedPosition
+              + " with freq: " + postings.freq() + " -- " + msg,
+              expectedPosition, postings.nextPosition());
+          expectedPosition += 2;
+        }
+      }
+    }
+    reader.close();
+    dir.close();
+  }
+
+}