You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2018/03/27 02:15:33 UTC
lucene-solr:branch_7x: LUCENE-8192: always enforce index-time offsets
are correct with BaseTokenStreamTestCase
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x 43796e516 -> e80ee7fff
LUCENE-8192: always enforce index-time offsets are correct with BaseTokenStreamTestCase
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e80ee7ff
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e80ee7ff
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e80ee7ff
Branch: refs/heads/branch_7x
Commit: e80ee7fff85918e68c212757c0e6c4bddbdb5ab6
Parents: 43796e5
Author: Robert Muir <rm...@apache.org>
Authored: Mon Mar 26 22:02:14 2018 -0400
Committer: Robert Muir <rm...@apache.org>
Committed: Mon Mar 26 22:12:34 2018 -0400
----------------------------------------------------------------------
.../analysis/BaseTokenStreamTestCase.java | 93 +++++++++-----------
1 file changed, 42 insertions(+), 51 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e80ee7ff/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 8c0a295..a87148f 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -120,15 +120,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
- // offsetsAreCorrect also validates:
+ // graphOffsetsAreCorrect validates:
// - graph offsets are correct (all tokens leaving from
// pos X have the same startOffset; all tokens
// arriving to pos Y have the same endOffset)
- // - offsets only move forwards (startOffset >=
- // lastStartOffset)
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
- boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
+ boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -224,7 +222,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertNull("payloads " + i, payloads[i]);
}
}
-
+ if (posIncrAtt != null) {
+ if (i == 0) {
+ assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+ } else {
+ assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+ }
+ }
+ if (posLengthAtt != null) {
+ assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
+ }
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
final int startOffset = offsetAtt.startOffset();
@@ -235,12 +242,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
endOffset <= finalOffset.intValue());
}
- if (offsetsAreCorrect) {
- assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
- lastStartOffset = offsetAtt.startOffset();
- }
+ assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
+ lastStartOffset = offsetAtt.startOffset();
- if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
+ if (graphOffsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
// Validate offset consistency in the graph, ie
// all tokens leaving from a certain pos have the
// same startOffset, and all tokens arriving to a
@@ -275,16 +280,6 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
}
- if (posIncrAtt != null) {
- if (i == 0) {
- assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
- } else {
- assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
- }
- }
- if (posLengthAtt != null) {
- assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
- }
}
if (ts.incrementToken()) {
@@ -321,12 +316,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
int posLengths[], Integer finalOffset, boolean[] keywordAtts,
- boolean offsetsAreCorrect) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect, null);
+ boolean graphOffsetsAreCorrect) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, graphOffsetsAreCorrect, null);
}
- public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
- assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean graphOffsetsAreCorrect) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, graphOffsetsAreCorrect);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
@@ -389,15 +384,15 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
- public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
+ public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
checkResetException(a, input);
- checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
- assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
+ checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
+ assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
}
- public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
+ public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
checkResetException(a, input);
- assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, offsetsAreCorrect, payloads);
+ assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
@@ -505,7 +500,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
final Analyzer a;
final boolean useCharFilter;
final boolean simple;
- final boolean offsetsAreCorrect;
+ final boolean graphOffsetsAreCorrect;
final RandomIndexWriter iw;
final CountDownLatch latch;
@@ -514,14 +509,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// interact)... so this is just "best effort":
public boolean failed;
- AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
+ AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) {
this.seed = seed;
this.a = a;
this.iterations = iterations;
this.maxWordLength = maxWordLength;
this.useCharFilter = useCharFilter;
this.simple = simple;
- this.offsetsAreCorrect = offsetsAreCorrect;
+ this.graphOffsetsAreCorrect = graphOffsetsAreCorrect;
this.iw = iw;
this.latch = latch;
}
@@ -533,7 +528,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
latch.await();
// see the part in checkRandomData where it replays the same text again
// to verify reproducibility/reuse: hopefully this would catch thread hazards.
- checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+ checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
success = true;
} catch (Exception e) {
Rethrow.rethrow(e);
@@ -547,7 +542,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
checkRandomData(random, a, iterations, maxWordLength, simple, true);
}
- public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
+ public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean graphOffsetsAreCorrect) throws IOException {
checkResetException(a, "best effort");
long seed = random.nextLong();
boolean useCharFilter = random.nextBoolean();
@@ -563,14 +558,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
boolean success = false;
try {
- checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+ checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
// now test with multiple threads: note we do the EXACT same thing we did before in each thread,
// so this should only really fail from another thread if it's an actual thread problem
int numThreads = TestUtil.nextInt(random, 2, 4);
final CountDownLatch startingGun = new CountDownLatch(1);
AnalysisThread threads[] = new AnalysisThread[numThreads];
for (int i = 0; i < threads.length; i++) {
- threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+ threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
@@ -601,7 +596,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
- private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {
+ private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) throws IOException {
final LineFileDocs docs = new LineFileDocs(random);
Document doc = null;
@@ -626,11 +621,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
default:
- if (offsetsAreCorrect) {
- ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- } else {
- ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
- }
+ ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
}
currentField = field = new Field("dummy", bogus, ft);
doc.add(currentField);
@@ -665,7 +656,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
try {
- checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
+ checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, currentField);
if (iw != null) {
if (random.nextInt(7) == 0) {
// pile up a multivalued field
@@ -727,11 +718,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
checkAnalysisConsistency(random, a, useCharFilter, text, true);
}
- public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
- checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
+ public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect) throws IOException {
+ checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, null);
}
- private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {
+ private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect, Field field) throws IOException {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -874,7 +865,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
toIntArray(positionLengths),
text.length(),
- offsetsAreCorrect);
+ graphOffsetsAreCorrect);
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
@@ -885,7 +876,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
null,
text.length(),
- offsetsAreCorrect);
+ graphOffsetsAreCorrect);
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
@@ -896,7 +887,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
toIntArray(positionLengths),
text.length(),
- offsetsAreCorrect);
+ graphOffsetsAreCorrect);
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
@@ -907,7 +898,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(positions),
null,
text.length(),
- offsetsAreCorrect);
+ graphOffsetsAreCorrect);
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
@@ -918,7 +909,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
null,
null,
text.length(),
- offsetsAreCorrect);
+ graphOffsetsAreCorrect);
} else {
// terms only
assertTokenStreamContents(ts,