You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/01/28 09:14:49 UTC
[lucene-solr] branch master updated: LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
This is an automated email from the ASF dual-hosted git repository.
romseygeek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 7713a4f LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
7713a4f is described below
commit 7713a4f2458c77de08193dc548807b9e90214aaf
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Tue Jan 22 09:19:48 2019 +0000
LUCENE-8650: Fix end() and reset() in ConcatenatingTokenStream
---
lucene/CHANGES.txt | 4 +++
.../miscellaneous/ConcatenatingTokenStream.java | 20 +++++++++++++++
.../TestConcatenatingTokenStream.java | 29 ++++++++++++++++++++++
3 files changed, 53 insertions(+)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fd35d8e..e2d955a 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -286,6 +286,10 @@ Bug fixes:
* LUCENE-8654: Polygon2D#relateTriangle returns the wrong answer if polygon is inside
the triangle. (Ignacio Vera)
+* LUCENE-8650: ConcatenatingTokenStream did not correctly clear its state in reset(), and
+ was not propagating final position increments from its child streams correctly.
+ (Dan Meehl, Alan Woodward)
+
New Features
* LUCENE-8026: ExitableDirectoryReader may now time out queries that run on
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java
index 960cae1..e32bda4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConcatenatingTokenStream.java
@@ -22,6 +22,7 @@ import java.util.Iterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IOUtils;
@@ -39,10 +40,13 @@ public final class ConcatenatingTokenStream extends TokenStream {
private final TokenStream[] sources;
private final OffsetAttribute[] sourceOffsets;
+ private final PositionIncrementAttribute[] sourceIncrements;
private final OffsetAttribute offsetAtt;
+ private final PositionIncrementAttribute posIncAtt;
private int currentSource;
private int offsetIncrement;
+ private int initialPositionIncrement = 1;
/**
* Create a new ConcatenatingTokenStream from a set of inputs
@@ -52,9 +56,12 @@ public final class ConcatenatingTokenStream extends TokenStream {
super(combineSources(sources));
this.sources = sources;
this.offsetAtt = addAttribute(OffsetAttribute.class);
+ this.posIncAtt = addAttribute(PositionIncrementAttribute.class);
this.sourceOffsets = new OffsetAttribute[sources.length];
+ this.sourceIncrements = new PositionIncrementAttribute[sources.length];
for (int i = 0; i < sources.length; i++) {
this.sourceOffsets[i] = sources[i].addAttribute(OffsetAttribute.class);
+ this.sourceIncrements[i] = sources[i].addAttribute(PositionIncrementAttribute.class);
}
}
@@ -78,19 +85,26 @@ public final class ConcatenatingTokenStream extends TokenStream {
@Override
public boolean incrementToken() throws IOException {
+ boolean newSource = false;
while (sources[currentSource].incrementToken() == false) {
if (currentSource >= sources.length - 1)
return false;
sources[currentSource].end();
+ initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
OffsetAttribute att = sourceOffsets[currentSource];
if (att != null)
offsetIncrement += att.endOffset();
currentSource++;
+ newSource = true;
}
clearAttributes();
sources[currentSource].copyTo(this);
offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
+ if (newSource) {
+ int posInc = posIncAtt.getPositionIncrement();
+ posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
+ }
return true;
}
@@ -98,7 +112,11 @@ public final class ConcatenatingTokenStream extends TokenStream {
@Override
public void end() throws IOException {
sources[currentSource].end();
+ int finalOffset = sourceOffsets[currentSource].endOffset() + offsetIncrement;
+ int finalPosInc = sourceIncrements[currentSource].getPositionIncrement();
super.end();
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ posIncAtt.setPositionIncrement(finalPosInc);
}
@Override
@@ -107,6 +125,8 @@ public final class ConcatenatingTokenStream extends TokenStream {
source.reset();
}
super.reset();
+ currentSource = 0;
+ offsetIncrement = 0;
}
@Override
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
index 258f9b8..19542e4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConcatenatingTokenStream.java
@@ -21,7 +21,9 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@@ -46,6 +48,33 @@ public class TestConcatenatingTokenStream extends BaseTokenStreamTestCase {
new int[]{ 0, 6, 12, 19, 25, 31 },
new int[]{ 5, 11, 18, 24, 30, 36 });
+ // test re-use
+ first.setReader(new StringReader("first words "));
+ second.setReader(new StringReader("second words"));
+ third.setReader(new StringReader(" third words"));
+ assertTokenStreamContents(ts,
+ new String[] { "first", "words", "second", "words", "third", "words" },
+ new int[]{ 0, 6, 12, 19, 25, 31 },
+ new int[]{ 5, 11, 18, 24, 30, 36 },
+ new int[]{ 1, 1, 1, 1, 1, 1 });
+
+ }
+
+ public void testOffsetGaps() throws IOException {
+ CannedTokenStream cts1 = new CannedTokenStream(2, 10,
+ new Token("a", 0, 1), new Token("b", 2, 3));
+ CannedTokenStream cts2 = new CannedTokenStream(2, 10,
+ new Token("c", 0, 1), new Token("d", 2, 3));
+
+ TokenStream ts = new ConcatenatingTokenStream(cts1, cts2);
+ assertTokenStreamContents(ts,
+ new String[] { "a", "b", "c", "d" },
+ new int[]{ 0, 2, 10, 12 },
+ new int[]{ 1, 3, 11, 13 },
+ null,
+ new int[]{ 1, 1, 3, 1 },
+ null, 20, 2, null, false, null
+ );
}
public void testInconsistentAttributes() throws IOException {