You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2011/01/25 17:12:35 UTC
svn commit: r1063332 - in /lucene/dev/trunk/lucene/src:
java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
test/org/apache/lucene/index/TestDocsAndPositions.java
Author: simonw
Date: Tue Jan 25 16:12:35 2011
New Revision: 1063332
URL: http://svn.apache.org/viewvc?rev=1063332&view=rev
Log:
LUCENE-2888: Several DocsEnum / DocsAndPositionsEnum return wrong docID when next() / advance(int) return NO_MORE_DOCS
Added:
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java (with props)
Modified:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=1063332&r1=1063331&r2=1063332&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Tue Jan 25 16:12:35 2011
@@ -980,7 +980,7 @@ public class PreFlexFields extends Field
private final class PreDocsEnum extends DocsEnum {
final private SegmentTermDocs docs;
-
+ private int docID = -1;
PreDocsEnum() throws IOException {
docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
}
@@ -998,18 +998,18 @@ public class PreFlexFields extends Field
@Override
public int nextDoc() throws IOException {
if (docs.next()) {
- return docs.doc();
+ return docID = docs.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (docs.skipTo(target)) {
- return docs.doc();
+ return docID = docs.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -1020,7 +1020,7 @@ public class PreFlexFields extends Field
@Override
public int docID() {
- return docs.doc();
+ return docID;
}
@Override
@@ -1036,7 +1036,7 @@ public class PreFlexFields extends Field
private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
final private SegmentTermPositions pos;
-
+ private int docID = -1;
PreDocsAndPositionsEnum() throws IOException {
pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
}
@@ -1054,18 +1054,18 @@ public class PreFlexFields extends Field
@Override
public int nextDoc() throws IOException {
if (pos.next()) {
- return pos.doc();
+ return docID = pos.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (pos.skipTo(target)) {
- return pos.doc();
+ return docID = pos.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -1076,16 +1076,18 @@ public class PreFlexFields extends Field
@Override
public int docID() {
- return pos.doc();
+ return docID;
}
@Override
public int nextPosition() throws IOException {
+ assert docID != NO_MORE_DOCS;
return pos.nextPosition();
}
@Override
public boolean hasPayload() {
+ assert docID != NO_MORE_DOCS;
return pos.isPayloadAvailable();
}
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java?rev=1063332&r1=1063331&r2=1063332&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java Tue Jan 25 16:12:35 2011
@@ -261,7 +261,7 @@ public class PulsingPostingsReaderImpl e
while(true) {
if (postings.eof()) {
//System.out.println("PR END");
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
final int code = postings.readVInt();
@@ -319,7 +319,7 @@ public class PulsingPostingsReaderImpl e
if (doc >= target)
return doc;
}
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -368,7 +368,7 @@ public class PulsingPostingsReaderImpl e
if (postings.eof()) {
//System.out.println("PR END");
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
final int code = postings.readVInt();
@@ -406,7 +406,7 @@ public class PulsingPostingsReaderImpl e
return doc;
}
}
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
@Override
Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java?rev=1063332&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestDocsAndPositions.java Tue Jan 25 16:12:35 2011
@@ -0,0 +1,327 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.index.IndexReader.ReaderContext;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.ReaderUtil;
+
+public class TestDocsAndPositions extends LuceneTestCase {
+ private String fieldName;
+ private boolean usePayload;
+
+ public void setUp() throws Exception {
+ super.setUp();
+ fieldName = "field" + random.nextInt();
+ usePayload = random.nextBoolean();
+ }
+
+ /**
+ * Simple testcase for {@link DocsAndPositionsEnum}
+ */
+ public void testPositionsSimple() throws IOException {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+ MockTokenizer.WHITESPACE, true, usePayload)));
+ for (int i = 0; i < 39; i++) {
+ Document doc = new Document();
+ doc.add(newField(fieldName, "1 2 3 4 5 6 7 8 9 10 "
+ + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 "
+ + "1 2 3 4 5 6 7 8 9 10", Field.Store.YES, Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+ IndexReader reader = writer.getReader();
+ writer.close();
+
+ for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) {
+ BytesRef bytes = new BytesRef("1");
+ ReaderContext topReaderContext = reader.getTopReaderContext();
+ AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
+ for (AtomicReaderContext atomicReaderContext : leaves) {
+ DocsAndPositionsEnum docsAndPosEnum = getDocsAndPositions(
+ atomicReaderContext.reader, bytes, null);
+ assertNotNull(docsAndPosEnum);
+ if (atomicReaderContext.reader.maxDoc() == 0) {
+ continue;
+ }
+ final int advance = docsAndPosEnum.advance(random.nextInt(atomicReaderContext.reader.maxDoc()));
+ do {
+ String msg = "Advanced to: " + advance + " current doc: "
+ + docsAndPosEnum.docID() + " usePayloads: " + usePayload;
+ assertEquals(msg, 4, docsAndPosEnum.freq());
+ assertEquals(msg, 0, docsAndPosEnum.nextPosition());
+ assertEquals(msg, 4, docsAndPosEnum.freq());
+ assertEquals(msg, 10, docsAndPosEnum.nextPosition());
+ assertEquals(msg, 4, docsAndPosEnum.freq());
+ assertEquals(msg, 20, docsAndPosEnum.nextPosition());
+ assertEquals(msg, 4, docsAndPosEnum.freq());
+ assertEquals(msg, 30, docsAndPosEnum.nextPosition());
+ } while (docsAndPosEnum.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS);
+ }
+ }
+ reader.close();
+ directory.close();
+ }
+
+ public DocsAndPositionsEnum getDocsAndPositions(IndexReader reader,
+ BytesRef bytes, Bits skipDocs) throws IOException {
+ return reader.termPositionsEnum(null, fieldName, bytes);
+ }
+
+ public DocsEnum getDocsEnum(IndexReader reader, BytesRef bytes,
+ boolean freqs, Bits skipDocs) throws IOException {
+ int randInt = random.nextInt(10);
+ if (randInt == 0) { // once in a while throw in a positions enum
+ return getDocsAndPositions(reader, bytes, skipDocs);
+ } else {
+ return reader.termDocsEnum(skipDocs, fieldName, bytes);
+ }
+ }
+
+ /**
+ * this test indexes random numbers within a range into a field and checks
+ * their occurrences by searching for a number from that range selected at
+ * random. All positions for that number are saved up front and compared to
+ * the enum's positions.
+ */
+ public void testRandomPositons() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+ MockTokenizer.WHITESPACE, true, usePayload)));
+ int numDocs = 131;
+ int max = 1051;
+ int term = random.nextInt(max);
+ Integer[][] positionsInDoc = new Integer[numDocs][];
+ for (int i = 0; i < numDocs; i++) {
+ Document doc = new Document();
+ ArrayList<Integer> positions = new ArrayList<Integer>();
+ StringBuilder builder = new StringBuilder();
+ for (int j = 0; j < 3049; j++) {
+ int nextInt = random.nextInt(max);
+ builder.append(nextInt).append(" ");
+ if (nextInt == term) {
+ positions.add(Integer.valueOf(j));
+ }
+ }
+ doc.add(newField(fieldName, builder.toString(), Field.Store.YES,
+ Field.Index.ANALYZED));
+ positionsInDoc[i] = positions.toArray(new Integer[0]);
+ writer.addDocument(doc);
+ }
+
+ IndexReader reader = writer.getReader();
+ writer.close();
+
+ for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) {
+ BytesRef bytes = new BytesRef("" + term);
+ ReaderContext topReaderContext = reader.getTopReaderContext();
+ AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
+ for (AtomicReaderContext atomicReaderContext : leaves) {
+ DocsAndPositionsEnum docsAndPosEnum = getDocsAndPositions(
+ atomicReaderContext.reader, bytes, null);
+ assertNotNull(docsAndPosEnum);
+ int initDoc = 0;
+ int maxDoc = atomicReaderContext.reader.maxDoc();
+ // initially advance or do next doc
+ if (random.nextBoolean()) {
+ initDoc = docsAndPosEnum.nextDoc();
+ } else {
+ initDoc = docsAndPosEnum.advance(random.nextInt(maxDoc));
+ }
+ // now run through the scorer and check if all positions are there...
+ do {
+ int docID = docsAndPosEnum.docID();
+ if (docID == DocsAndPositionsEnum.NO_MORE_DOCS) {
+ break;
+ }
+ Integer[] pos = positionsInDoc[atomicReaderContext.docBase + docID];
+ assertEquals(pos.length, docsAndPosEnum.freq());
+ // number of positions read should be random - don't read all of them
+ // always
+ final int howMany = random.nextInt(20) == 0 ? pos.length
+ - random.nextInt(pos.length) : pos.length;
+ for (int j = 0; j < howMany; j++) {
+ assertEquals("iteration: " + i + " initDoc: " + initDoc + " doc: "
+ + docID + " base: " + atomicReaderContext.docBase
+ + " positions: " + Arrays.toString(pos) + " usePayloads: "
+ + usePayload, pos[j].intValue(), docsAndPosEnum.nextPosition());
+ }
+
+ if (random.nextInt(10) == 0) { // once in a while advance
+ docsAndPosEnum
+ .advance(docID + 1 + random.nextInt((maxDoc - docID)));
+ }
+
+ } while (docsAndPosEnum.nextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS);
+ }
+
+ }
+ reader.close();
+ dir.close();
+ }
+
+ public void testRandomDocs() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+ MockTokenizer.WHITESPACE, true, usePayload)));
+ int numDocs = 499;
+ int max = 15678;
+ int term = random.nextInt(max);
+ int[] freqInDoc = new int[numDocs];
+ for (int i = 0; i < numDocs; i++) {
+ Document doc = new Document();
+ StringBuilder builder = new StringBuilder();
+ for (int j = 0; j < 199; j++) {
+ int nextInt = random.nextInt(max);
+ builder.append(nextInt).append(" ");
+ if (nextInt == term) {
+ freqInDoc[i]++;
+ }
+ }
+ doc.add(newField(fieldName, builder.toString(), Field.Store.YES,
+ Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+
+ IndexReader reader = writer.getReader();
+ writer.close();
+
+ for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) {
+ BytesRef bytes = new BytesRef("" + term);
+ ReaderContext topReaderContext = reader.getTopReaderContext();
+ AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
+ for (AtomicReaderContext context : leaves) {
+ int maxDoc = context.reader.maxDoc();
+ DocsEnum docsAndPosEnum = getDocsEnum(context.reader, bytes, true, null);
+ if (findNext(freqInDoc, context.docBase, context.docBase + maxDoc) == Integer.MAX_VALUE) {
+ assertNull(docsAndPosEnum);
+ continue;
+ }
+ assertNotNull(docsAndPosEnum);
+ docsAndPosEnum.nextDoc();
+ for (int j = 0; j < maxDoc; j++) {
+ if (freqInDoc[context.docBase + j] != 0) {
+ assertEquals(j, docsAndPosEnum.docID());
+ assertEquals(docsAndPosEnum.freq(), freqInDoc[context.docBase +j]);
+ if (i % 2 == 0 && random.nextInt(10) == 0) {
+ int next = findNext(freqInDoc, context.docBase+j+1, context.docBase + maxDoc) - context.docBase;
+ int advancedTo = docsAndPosEnum.advance(next);
+ if (next >= maxDoc) {
+ assertEquals(DocsEnum.NO_MORE_DOCS, advancedTo);
+ } else {
+ assertTrue("advanced to: " +advancedTo + " but should be <= " + next, next >= advancedTo);
+ }
+ } else {
+ docsAndPosEnum.nextDoc();
+ }
+ }
+ }
+ assertEquals("docBase: " + context.docBase + " maxDoc: " + maxDoc + " " + docsAndPosEnum.getClass(), DocsEnum.NO_MORE_DOCS, docsAndPosEnum.docID());
+ }
+
+ }
+
+ reader.close();
+ dir.close();
+ }
+
+ private static int findNext(int[] docs, int pos, int max) {
+ for (int i = pos; i < max; i++) {
+ if( docs[i] != 0) {
+ return i;
+ }
+ }
+ return Integer.MAX_VALUE;
+ }
+
+ /**
+ * tests retrieval of positions for terms that have a large number of
+ * occurrences to force test of buffer refill during positions iteration.
+ */
+ public void testLargeNumberOfPositions() throws IOException {
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(
+ MockTokenizer.WHITESPACE, true, usePayload)));
+ int howMany = 1000;
+ for (int i = 0; i < 39; i++) {
+ Document doc = new Document();
+ StringBuilder builder = new StringBuilder();
+ for (int j = 0; j < howMany; j++) {
+ if (j % 2 == 0) {
+ builder.append("even ");
+ } else {
+ builder.append("odd ");
+ }
+ }
+ doc.add(newField(fieldName, builder.toString(), Field.Store.YES,
+ Field.Index.ANALYZED));
+ writer.addDocument(doc);
+ }
+
+ // now do searches
+ IndexReader reader = writer.getReader();
+ writer.close();
+
+ for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) {
+ BytesRef bytes = new BytesRef("even");
+
+ ReaderContext topReaderContext = reader.getTopReaderContext();
+ AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
+ for (AtomicReaderContext atomicReaderContext : leaves) {
+ DocsAndPositionsEnum docsAndPosEnum = getDocsAndPositions(
+ atomicReaderContext.reader, bytes, null);
+ assertNotNull(docsAndPosEnum);
+
+ int initDoc = 0;
+ int maxDoc = atomicReaderContext.reader.maxDoc();
+ // initially advance or do next doc
+ if (random.nextBoolean()) {
+ initDoc = docsAndPosEnum.nextDoc();
+ } else {
+ initDoc = docsAndPosEnum.advance(random.nextInt(maxDoc));
+ }
+ String msg = "Iteration: " + i + " initDoc: " + initDoc + " payloads: "
+ + usePayload;
+ assertEquals(howMany / 2, docsAndPosEnum.freq());
+ for (int j = 0; j < howMany; j += 2) {
+ assertEquals("position missmatch index: " + j + " with freq: "
+ + docsAndPosEnum.freq() + " -- " + msg, j,
+ docsAndPosEnum.nextPosition());
+ }
+ }
+ }
+ reader.close();
+ dir.close();
+ }
+
+}