You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/06/12 19:15:02 UTC
svn commit: r1349446 - in /lucene/dev/branches/branch_4x/lucene:
core/src/java/org/apache/lucene/codecs/lucene40/
core/src/java/org/apache/lucene/index/ core/src/test/org/apache/lucene/index/
test-framework/src/java/org/apache/lucene/analysis/
Author: rmuir
Date: Tue Jun 12 17:15:01 2012
New Revision: 1349446
URL: http://svn.apache.org/viewvc?rev=1349446&view=rev
Log:
LUCENE-4139: compute offsets correctly for multivalued fields
Modified:
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java?rev=1349446&r1=1349445&r2=1349446&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40PostingsWriter.java Tue Jun 12 17:15:01 2012
@@ -252,6 +252,7 @@ public final class Lucene40PostingsWrite
// and the numbers aren't that much smaller anyways.
int offsetDelta = startOffset - lastOffset;
int offsetLength = endOffset - startOffset;
+ assert offsetDelta >= 0 && offsetLength >= 0 : "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset;
if (offsetLength != lastOffsetLength) {
proxOut.writeVInt(offsetDelta << 1 | 1);
proxOut.writeVInt(offsetLength);
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1349446&r1=1349445&r2=1349446&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Tue Jun 12 17:15:01 2012
@@ -148,15 +148,16 @@ final class FreqProxTermsWriterPerField
postings.lastPositions[termID] = fieldState.position;
}
- void writeOffsets(final int termID, int prevOffset) {
+ void writeOffsets(final int termID, int offsetAccum) {
assert hasOffsets;
- final int startOffset = offsetAttribute.startOffset();
- final int endOffset = offsetAttribute.endOffset();
+ final int startOffset = offsetAccum + offsetAttribute.startOffset();
+ final int endOffset = offsetAccum + offsetAttribute.endOffset();
//System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
- termsHashPerField.writeVInt(1, startOffset - prevOffset);
+ FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
+ assert startOffset - postings.lastOffsets[termID] >= 0;
+ termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
termsHashPerField.writeVInt(1, endOffset - startOffset);
- FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastOffsets[termID] = startOffset;
}
@@ -224,6 +225,7 @@ final class FreqProxTermsWriterPerField
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {
+ postings.lastOffsets[termID] = 0;
writeOffsets(termID, fieldState.offset);
}
} else {
@@ -236,7 +238,7 @@ final class FreqProxTermsWriterPerField
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
if (hasOffsets) {
- writeOffsets(termID, postings.lastOffsets[termID]);
+ writeOffsets(termID, fieldState.offset);
}
}
}
@@ -523,14 +525,15 @@ final class FreqProxTermsWriterPerField
if (readOffsets) {
final int startOffset = offset + prox.readVInt();
final int endOffset = startOffset + prox.readVInt();
- offset = startOffset;
if (writePositions) {
if (writeOffsets) {
+ assert startOffset >=0 && endOffset >= startOffset : "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset;
postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
} else {
postingsConsumer.addPosition(position, thisPayload, -1, -1);
}
}
+ offset = startOffset;
} else if (writePositions) {
postingsConsumer.addPosition(position, thisPayload, -1, -1);
}
Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java?rev=1349446&r1=1349445&r2=1349446&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestPostingsOffsets.java Tue Jun 12 17:15:01 2012
@@ -385,6 +385,22 @@ public class TestPostingsOffsets extends
dir.close();
}
+ public void testAddFieldTwice() throws Exception {
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+ FieldType customType3 = new FieldType(TextField.TYPE_STORED);
+ customType3.setStoreTermVectors(true);
+ customType3.setStoreTermVectorPositions(true);
+ customType3.setStoreTermVectorOffsets(true);
+ customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
+ doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
+ iw.addDocument(doc);
+ iw.close();
+ dir.close(); // checkindex
+ }
+
// NOTE: the next two tests aren't that good as we need an EvilToken...
public void testNegativeOffsets() throws Exception {
try {
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1349446&r1=1349445&r2=1349446&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Tue Jun 12 17:15:01 2012
@@ -26,12 +26,22 @@ import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Map;
import java.util.HashMap;
+import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.IOUtils;
@@ -384,13 +394,14 @@ public abstract class BaseTokenStreamTes
final boolean useCharFilter;
final boolean simple;
final boolean offsetsAreCorrect;
+ final RandomIndexWriter iw;
// NOTE: not volatile because we don't want the tests to
// add memory barriers (ie alter how threads
// interact)... so this is just "best effort":
public boolean failed;
- AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) {
+ AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
this.seed = seed;
this.a = a;
this.iterations = iterations;
@@ -398,6 +409,7 @@ public abstract class BaseTokenStreamTes
this.useCharFilter = useCharFilter;
this.simple = simple;
this.offsetsAreCorrect = offsetsAreCorrect;
+ this.iw = iw;
}
@Override
@@ -406,7 +418,7 @@ public abstract class BaseTokenStreamTes
try {
// see the part in checkRandomData where it replays the same text again
// to verify reproducability/reuse: hopefully this would catch thread hazards.
- checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
+ checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
success = true;
} catch (IOException e) {
Rethrow.rethrow(e);
@@ -423,34 +435,88 @@ public abstract class BaseTokenStreamTes
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
long seed = random.nextLong();
boolean useCharFilter = random.nextBoolean();
- checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
- // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
- // so this should only really fail from another thread if its an actual thread problem
- int numThreads = _TestUtil.nextInt(random, 2, 4);
- AnalysisThread threads[] = new AnalysisThread[numThreads];
- for (int i = 0; i < threads.length; i++) {
- threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
+ Directory dir = null;
+ RandomIndexWriter iw = null;
+ if (rarely(random)) {
+ dir = newFSDirectory(_TestUtil.getTempDir("bttc"));
+ iw = new RandomIndexWriter(new Random(seed), dir, a);
}
- for (int i = 0; i < threads.length; i++) {
- threads[i].start();
- }
- for (int i = 0; i < threads.length; i++) {
- try {
- threads[i].join();
- } catch (InterruptedException e) {
- throw new RuntimeException(e);
+ boolean success = false;
+ try {
+ checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+ // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
+ // so this should only really fail from another thread if its an actual thread problem
+ int numThreads = _TestUtil.nextInt(random, 2, 4);
+ AnalysisThread threads[] = new AnalysisThread[numThreads];
+ for (int i = 0; i < threads.length; i++) {
+ threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
}
- }
- for (int i = 0; i < threads.length; i++) {
- if (threads[i].failed) {
- throw new RuntimeException("some thread(s) failed");
+ for (int i = 0; i < threads.length; i++) {
+ threads[i].start();
+ }
+ for (int i = 0; i < threads.length; i++) {
+ try {
+ threads[i].join();
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ for (int i = 0; i < threads.length; i++) {
+ if (threads[i].failed) {
+ throw new RuntimeException("some thread(s) failed");
+ }
+ }
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(iw, dir);
+ } else {
+ IOUtils.closeWhileHandlingException(iw, dir); // checkindex
}
}
}
- private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
+ static final Set<String> doesntSupportOffsets = new HashSet<String>() {{
+ add("Lucene3x");
+ add("MockFixedIntBlock");
+ add("MockVariableIntBlock");
+ add("MockSep");
+ add("MockRandom");
+ }};
+
+ private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {
final LineFileDocs docs = new LineFileDocs(random);
+ Document doc = null;
+ Field field = null, currentField = null;
+ StringReader bogus = new StringReader("");
+ if (iw != null) {
+ doc = new Document();
+ FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+ if (random.nextBoolean()) {
+ ft.setStoreTermVectors(true);
+ ft.setStoreTermVectorOffsets(random.nextBoolean());
+ ft.setStoreTermVectorPositions(random.nextBoolean());
+ }
+ if (random.nextBoolean()) {
+ ft.setOmitNorms(true);
+ }
+ String pf = _TestUtil.getPostingsFormat("dummy");
+ boolean supportsOffsets = !doesntSupportOffsets.contains(pf);
+ switch(random.nextInt(4)) {
+ case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break;
+ case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
+ case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
+ default:
+ if (supportsOffsets && offsetsAreCorrect) {
+ ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ } else {
+ ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+ }
+ }
+ currentField = field = new Field("dummy", bogus, ft);
+ doc.add(currentField);
+ }
try {
for (int i = 0; i < iterations; i++) {
@@ -481,7 +547,23 @@ public abstract class BaseTokenStreamTes
}
try {
- checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
+ checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
+ if (iw != null) {
+ if (random.nextInt(7) == 0) {
+ // pile up a multivalued field
+ FieldType ft = field.fieldType();
+ currentField = new Field("dummy", bogus, ft);
+ doc.add(currentField);
+ } else {
+ iw.addDocument(doc);
+ if (doc.getFields().size() > 1) {
+ // back to 1 field
+ currentField = field;
+ doc.removeFields("dummy");
+ doc.add(currentField);
+ }
+ }
+ }
} catch (Throwable t) {
// TODO: really we should pass a random seed to
// checkAnalysisConsistency then print it here too:
@@ -528,6 +610,10 @@ public abstract class BaseTokenStreamTes
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
+ checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
+ }
+
+ private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -649,6 +735,8 @@ public abstract class BaseTokenStreamTes
}
reader = new StringReader(text);
+ long seed = random.nextLong();
+ random = new Random(seed);
if (random.nextInt(30) == 7) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
@@ -718,6 +806,20 @@ public abstract class BaseTokenStreamTes
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
+
+ if (field != null) {
+ reader = new StringReader(text);
+ random = new Random(seed);
+ if (random.nextInt(30) == 7) {
+ if (VERBOSE) {
+ System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: indexing using spoon-feed reader");
+ }
+
+ reader = new MockReaderWrapper(random, reader);
+ }
+
+ field.setReaderValue(useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+ }
}
private static String randomAnalysisString(Random random, int maxLength, boolean simple) {