You are viewing a plain text version of this content; the canonical HTML version (with its original hyperlink) is available in the mailing-list archive.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/01/28 09:14:49 UTC

[lucene-solr] branch master updated: LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 7713a4f  LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
7713a4f is described below

commit 7713a4f2458c77de08193dc548807b9e90214aaf
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Tue Jan 22 09:19:48 2019 +0000

    LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
---
 lucene/CHANGES.txt                                 |  4 +++
 .../miscellaneous/ConcatenatingTokenStream.java    | 20 +++++++++++++++
 .../TestConcatenatingTokenStream.java              | 29 ++++++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fd35d8e..e2d955a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -286,6 +286,10 @@ Bug fixes:
 * LUCENE-8654: Polygon2D#relateTriangle returns the wrong answer if polygon is inside
   the triangle. (Ignacio Vera)
 
+* LUCENE-8650: ConcatenatingTokenStream did not correctly clear its state in reset(), and
+  was not propagating final position increments from its child streams correctly.
+  (Dan Meehl, Alan Woodward)
+
 New Features
 
 * LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java
index 960cae1..e32bda4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java
@@ -22,6 +22,7 @@ import java.util.Iterator;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.IOUtils;
@@ -39,10 +40,13 @@ public final class ConcatenatingTokenStream extends TokenStream {
 
   private final TokenStream[] sources;
   private final OffsetAttribute[] sourceOffsets;
+  private final PositionIncrementAttribute[] sourceIncrements;
   private final OffsetAttribute offsetAtt;
+  private final PositionIncrementAttribute posIncAtt;
 
   private int currentSource;
   private int offsetIncrement;
+  private int initialPositionIncrement = 1;
 
   /**
    * Create a new ConcatenatingTokenStream from a set of inputs
@@ -52,9 +56,12 @@ public final class ConcatenatingTokenStream extends TokenStream {
     super(combineSources(sources));
     this.sources = sources;
     this.offsetAtt = addAttribute(OffsetAttribute.class);
+    this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
     this.sourceOffsets = new OffsetAttribute[sources.length];
+    this.sourceIncrements = new PositionIncrementAttribute[sources.length];
     for (int i = 0; i < sources.length; i++) {
       this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
+      this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class);
     }
   }
 
@@ -78,19 +85,26 @@ public final class ConcatenatingTokenStream extends TokenStream {
 
   @Override
   public boolean incrementToken() throws IOException {
+    boolean newSource = false;
     while (sources[currentSource].incrementToken() == false) {
       if (currentSource >= sources.length - 1)
         return false;
       sources[currentSource].end();
+      initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
       OffsetAttribute att = sourceOffsets[currentSource];
       if (att != null)
         offsetIncrement += att.endOffset();
       currentSource++;
+      newSource = true;
     }
 
     clearAttributes();
     sources[currentSource].copyTo(this);
     offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
+    if (newSource) {
+      int posInc = posIncAtt.getPositionIncrement();
+      posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
+    }
 
     return true;
   }
@@ -98,7 +112,11 @@ public final class ConcatenatingTokenStream extends TokenStream {
   @Override
   public void end() throws IOException {
     sources[currentSource].end();
+    int finalOffset = sourceOffsets[currentSource].endOffset() + offsetIncrement;
+    int finalPosInc = sourceIncrements[currentSource].getPositionIncrement();
     super.end();
+    offsetAtt.setOffset(finalOffset, finalOffset);
+    posIncAtt.setPositionIncrement(finalPosInc);
   }
 
   @Override
@@ -107,6 +125,8 @@ public final class ConcatenatingTokenStream extends TokenStream {
       source.reset();
     }
     super.reset();
+    currentSource = 0;
+    offsetIncrement = 0;
   }
 
   @Override
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
index 258f9b8..19542e4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
@@ -21,7 +21,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@@ -46,6 +48,33 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
         new int[]{ 0, 6, 12, 19, 25, 31 },
         new int[]{ 5, 11, 18, 24, 30, 36 });
 
+    // test re-use
+    first.setReader(new StringReader("first words "));
+    second.setReader(new StringReader("second words"));
+    third.setReader(new StringReader(" third words"));
+    assertTokenStreamContents(ts,
+        new String[] { "first", "words", "second", "words", "third", "words" },
+        new int[]{ 0, 6, 12, 19, 25, 31 },
+        new int[]{ 5, 11, 18, 24, 30, 36 },
+        new int[]{ 1, 1, 1, 1, 1, 1 });
+
+  }
+
+  public void testOffsetGaps() throws IOException {
+    CannedTokenStream cts1 = new CannedTokenStream(2, 10,
+        new Token("a", 0, 1), new Token("b", 2, 3));
+    CannedTokenStream cts2 = new CannedTokenStream(2, 10,
+        new Token("c", 0, 1), new Token("d", 2, 3));
+
+    TokenStream ts = new ConcatenatingTokenStream(cts1, cts2);
+    assertTokenStreamContents(ts,
+        new String[] { "a", "b", "c", "d" },
+        new int[]{      0,   2,   10,  12 },
+        new int[]{      1,   3,   11,  13 },
+        null,
+        new int[]{      1,   1,   3,   1 },
+        null, 20, 2, null, false, null
+        );
   }
 
   public void testInconsistentAttributes() throws IOException {