You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/03/14 11:22:15 UTC
[lucene-solr] branch master updated: LUCENE-8719: Traverse all
paths at the end of a TokenStream in FixedShingleFilter
This is an automated email from the ASF dual-hosted git repository.
romseygeek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 9096b84 LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter
9096b84 is described below
commit 9096b846af3aadcb18957aef31c4e97481d2cfef
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Thu Mar 14 11:10:02 2019 +0000
LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter
---
lucene/CHANGES.txt | 3 +
.../analysis/shingle/FixedShingleFilter.java | 94 +++++++++++-----------
.../analysis/shingle/FixedShingleFilterTest.java | 17 ++++
3 files changed, 69 insertions(+), 45 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c821eeb..cf9f100 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -28,6 +28,9 @@ Bug fixes
* LUCENE-8726: ValueSource.asDoubleValuesSource() could leak a reference to
IndexSearcher (Alan Woodward, Yury Pakhomov)
+* LUCENE-8719: FixedShingleFilter can miss shingles at the end of a token stream if
+ there are multiple paths with different lengths. (Alan Woodward)
+
Improvements
* LUCENE-8673: Use radix partitioning when merging dimensional points instead
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
index 8f7eb95..77be29b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -92,63 +92,67 @@ public final class FixedShingleFilter extends GraphTokenFilter {
@Override
public boolean incrementToken() throws IOException {
- int shinglePosInc;
- if (incrementGraph() == false) {
- if (incrementBaseToken() == false) {
- return false;
- }
- // starting a shingle at a new base position, use base position increment
- shinglePosInc = incAtt.getPositionIncrement();
- }
- else {
- // starting a new shingle at the same base with a different graph, use a 0
- // position increment
- shinglePosInc = 0;
- }
+ int shinglePosInc, startOffset, endOffset;
- final int startOffset = offsetAtt.startOffset();
- int endOffset = offsetAtt.endOffset();
- this.buffer.setEmpty();
- this.buffer.append(termAtt);
-
- // build the shingle by iterating over the current graph, adding
- // filler tokens if we encounter gaps
- for (int i = 1; i < shingleSize; i++) {
- if (incrementGraphToken() == false) {
- // we've reached the end of the token stream, check for trailing
- // positions and add fillers if necessary
- int trailingPositions = getTrailingPositions();
- if (i + trailingPositions < shingleSize) {
- // not enough trailing positions to make a full shingle
+ outer: while (true) {
+ if (incrementGraph() == false) {
+ if (incrementBaseToken() == false) {
return false;
}
- while (i < shingleSize) {
- this.buffer.append(tokenSeparator).append(fillerToken);
- i++;
- }
- break;
+ // starting a shingle at a new base position, use base position increment
+ shinglePosInc = incAtt.getPositionIncrement();
+ } else {
+ // starting a new shingle at the same base with a different graph, use a 0
+ // position increment
+ shinglePosInc = 0;
}
- int posInc = incAtt.getPositionIncrement();
- if (posInc > 1) {
- // if we have a posInc > 1, we need to fill in the gaps
- if (i + posInc > shingleSize) {
- // if the posInc is greater than the shingle size, we need to add fillers
- // up to the shingle size but no further
+
+ startOffset = offsetAtt.startOffset();
+ endOffset = offsetAtt.endOffset();
+ this.buffer.setEmpty();
+ this.buffer.append(termAtt);
+
+ // build the shingle by iterating over the current graph, adding
+ // filler tokens if we encounter gaps
+ for (int i = 1; i < shingleSize; i++) {
+ if (incrementGraphToken() == false) {
+ // we've reached the end of the token stream, check for trailing
+ // positions and add fillers if necessary
+ int trailingPositions = getTrailingPositions();
+ if (i + trailingPositions < shingleSize) {
+ // not enough trailing positions to make a full shingle
+ // start again at a different graph
+ continue outer;
+ }
while (i < shingleSize) {
this.buffer.append(tokenSeparator).append(fillerToken);
i++;
}
break;
}
- // otherwise just add them in as far as we need
- while (posInc > 1) {
- this.buffer.append(tokenSeparator).append(fillerToken);
- posInc--;
- i++;
+ int posInc = incAtt.getPositionIncrement();
+ if (posInc > 1) {
+ // if we have a posInc > 1, we need to fill in the gaps
+ if (i + posInc > shingleSize) {
+ // if the posInc is greater than the shingle size, we need to add fillers
+ // up to the shingle size but no further
+ while (i < shingleSize) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
+ i++;
+ }
+ break;
+ }
+ // otherwise just add them in as far as we need
+ while (posInc > 1) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
+ posInc--;
+ i++;
+ }
}
+ this.buffer.append(tokenSeparator).append(termAtt);
+ endOffset = offsetAtt.endOffset();
}
- this.buffer.append(tokenSeparator).append(termAtt);
- endOffset = offsetAtt.endOffset();
+ break;
}
clearAttributes();
this.offsetAtt.setOffset(startOffset, endOffset);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
index 85c7dc6..3c7ffe0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -199,6 +199,23 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
new int[] { 1, 0, 0, 0, 1, 0, });
}
+ public void testTrailingGraphsOfDifferingLengths() throws IOException {
+
+ // a b:3/c d e f
+ TokenStream ts = new CannedTokenStream(
+ new Token("a", 0, 1),
+ new Token("b", 1, 2, 3, 3),
+ new Token("c", 0, 2, 3),
+ new Token("d", 2, 3),
+ new Token("e", 2, 3),
+ new Token("f", 4, 5)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+ new String[]{ "a b f", "a c d", "c d e", "d e f"});
+
+ }
+
public void testParameterLimits() {
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
new FixedShingleFilter(new CannedTokenStream(), 1);