You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2018/03/27 02:15:33 UTC

lucene-solr:branch_7x: LUCENE-8192: always enforce index-time offsets are correct with BaseTokenStreamTestCase

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7x 43796e516 -> e80ee7fff


LUCENE-8192: always enforce index-time offsets are correct with BaseTokenStreamTestCase


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e80ee7ff
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e80ee7ff
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e80ee7ff

Branch: refs/heads/branch_7x
Commit: e80ee7fff85918e68c212757c0e6c4bddbdb5ab6
Parents: 43796e5
Author: Robert Muir <rm...@apache.org>
Authored: Mon Mar 26 22:02:14 2018 -0400
Committer: Robert Muir <rm...@apache.org>
Committed: Mon Mar 26 22:12:34 2018 -0400

----------------------------------------------------------------------
 .../analysis/BaseTokenStreamTestCase.java       | 93 +++++++++-----------
 1 file changed, 42 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e80ee7ff/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 8c0a295..a87148f 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -120,15 +120,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
   }
 
-  // offsetsAreCorrect also validates:
+  // graphOffsetsAreCorrect validates:
   //   - graph offsets are correct (all tokens leaving from
   //     pos X have the same startOffset; all tokens
   //     arriving to pos Y have the same endOffset)
-  //   - offsets only move forwards (startOffset >=
-  //     lastStartOffset)
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
                                                int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
-                                               boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
+                                               boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
     
@@ -224,7 +222,16 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
           assertNull("payloads " + i, payloads[i]);
         }
       }
-
+      if (posIncrAtt != null) {
+        if (i == 0) {
+          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
+        } else {
+          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
+        }
+      }
+      if (posLengthAtt != null) {
+        assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
+      }
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
         final int startOffset = offsetAtt.startOffset();
@@ -235,12 +242,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                      endOffset <= finalOffset.intValue());
         }
 
-        if (offsetsAreCorrect) {
-          assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
-          lastStartOffset = offsetAtt.startOffset();
-        }
+        assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
+        lastStartOffset = offsetAtt.startOffset();
 
-        if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
+        if (graphOffsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
           // Validate offset consistency in the graph, ie
           // all tokens leaving from a certain pos have the
           // same startOffset, and all tokens arriving to a
@@ -275,16 +280,6 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
           }
         }
       }
-      if (posIncrAtt != null) {
-        if (i == 0) {
-          assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
-        } else {
-          assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
-        }
-      }
-      if (posLengthAtt != null) {
-        assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
-      }
     }
 
     if (ts.incrementToken()) {
@@ -321,12 +316,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
                                                int posLengths[], Integer finalOffset, boolean[] keywordAtts,
-                                               boolean offsetsAreCorrect) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect, null);
+                                               boolean graphOffsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, graphOffsetsAreCorrect, null);
   }
 
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean graphOffsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, graphOffsetsAreCorrect);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
@@ -389,15 +384,15 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
 
-  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect) throws IOException {
     checkResetException(a, input);
-    checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
+    checkAnalysisConsistency(random(), a, true, input, graphOffsetsAreCorrect);
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), graphOffsetsAreCorrect);
   }
 
-  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect, byte[][] payloads) throws IOException {
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean graphOffsetsAreCorrect, byte[][] payloads) throws IOException {
     checkResetException(a, input);
-    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, offsetsAreCorrect, payloads);
+    assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), null, null, graphOffsetsAreCorrect, payloads);
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
@@ -505,7 +500,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     final Analyzer a;
     final boolean useCharFilter;
     final boolean simple;
-    final boolean offsetsAreCorrect;
+    final boolean graphOffsetsAreCorrect;
     final RandomIndexWriter iw;
     final CountDownLatch latch;
 
@@ -514,14 +509,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     // interact)... so this is just "best effort":
     public boolean failed;
     
-    AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
+    AnalysisThread(long seed, CountDownLatch latch, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) {
       this.seed = seed;
       this.a = a;
       this.iterations = iterations;
       this.maxWordLength = maxWordLength;
       this.useCharFilter = useCharFilter;
       this.simple = simple;
-      this.offsetsAreCorrect = offsetsAreCorrect;
+      this.graphOffsetsAreCorrect = graphOffsetsAreCorrect;
       this.iw = iw;
       this.latch = latch;
     }
@@ -533,7 +528,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         latch.await();
         // see the part in checkRandomData where it replays the same text again
         // to verify reproducability/reuse: hopefully this would catch thread hazards.
-        checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+        checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
         success = true;
       } catch (Exception e) {
         Rethrow.rethrow(e);
@@ -547,7 +542,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     checkRandomData(random, a, iterations, maxWordLength, simple, true);
   }
 
-  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
+  public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean graphOffsetsAreCorrect) throws IOException {
     checkResetException(a, "best effort");
     long seed = random.nextLong();
     boolean useCharFilter = random.nextBoolean();
@@ -563,14 +558,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     boolean success = false;
     try {
-      checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+      checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
       // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
       // so this should only really fail from another thread if it's an actual thread problem
       int numThreads = TestUtil.nextInt(random, 2, 4);
       final CountDownLatch startingGun = new CountDownLatch(1);
       AnalysisThread threads[] = new AnalysisThread[numThreads];
       for (int i = 0; i < threads.length; i++) {
-        threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+        threads[i] = new AnalysisThread(seed, startingGun, a, iterations, maxWordLength, useCharFilter, simple, graphOffsetsAreCorrect, iw);
       }
       for (int i = 0; i < threads.length; i++) {
         threads[i].start();
@@ -601,7 +596,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
   }
 
-  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {
+  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean graphOffsetsAreCorrect, RandomIndexWriter iw) throws IOException {
 
     final LineFileDocs docs = new LineFileDocs(random);
     Document doc = null;
@@ -626,11 +621,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
         case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
         default:
-          if (offsetsAreCorrect) {
-            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
-          } else {
-            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
-          }
+          ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
       }
       currentField = field = new Field("dummy", bogus, ft);
       doc.add(currentField);
@@ -665,7 +656,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
 
         try {
-          checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
+          checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, currentField);
           if (iw != null) {
             if (random.nextInt(7) == 0) {
               // pile up a multivalued field
@@ -727,11 +718,11 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     checkAnalysisConsistency(random, a, useCharFilter, text, true);
   }
 
-  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
-    checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
+  public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect) throws IOException {
+    checkAnalysisConsistency(random, a, useCharFilter, text, graphOffsetsAreCorrect, null);
   }
   
-  private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {
+  private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean graphOffsetsAreCorrect, Field field) throws IOException {
 
     if (VERBOSE) {
       System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
@@ -874,7 +865,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                 toIntArray(positions),
                                 toIntArray(positionLengths),
                                 text.length(),
-                                offsetsAreCorrect);
+                                graphOffsetsAreCorrect);
     } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
       // offset + pos + type
       assertTokenStreamContents(ts, 
@@ -885,7 +876,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                 toIntArray(positions),
                                 null,
                                 text.length(),
-                                offsetsAreCorrect);
+                                graphOffsetsAreCorrect);
     } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
       // offset + pos + posLength
       assertTokenStreamContents(ts, 
@@ -896,7 +887,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                 toIntArray(positions),
                                 toIntArray(positionLengths),
                                 text.length(),
-                                offsetsAreCorrect);
+                                graphOffsetsAreCorrect);
     } else if (posIncAtt != null && offsetAtt != null) {
       // offset + pos
       assertTokenStreamContents(ts, 
@@ -907,7 +898,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                 toIntArray(positions),
                                 null,
                                 text.length(),
-                                offsetsAreCorrect);
+                                graphOffsetsAreCorrect);
     } else if (offsetAtt != null) {
       // offset
       assertTokenStreamContents(ts, 
@@ -918,7 +909,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
                                 null,
                                 null,
                                 text.length(),
-                                offsetsAreCorrect);
+                                graphOffsetsAreCorrect);
     } else {
       // terms only
       assertTokenStreamContents(ts,