You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/03/14 11:22:14 UTC

[lucene-solr] branch branch_8x updated: LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 6571d06  LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter
6571d06 is described below

commit 6571d06c25d7c883162f49f0955f8c0616241c52
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Thu Mar 14 11:10:02 2019 +0000

    LUCENE-8719: Traverse all paths at the end of a TokenStream in FixedShingleFilter
---
 lucene/CHANGES.txt                                 |  3 +
 .../analysis/shingle/FixedShingleFilter.java       | 94 +++++++++++-----------
 .../analysis/shingle/FixedShingleFilterTest.java   | 17 ++++
 3 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4f19cc9..bbef971 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -16,6 +16,9 @@ Bug fixes
 * LUCENE-8726: ValueSource.asDoubleValuesSource() could leak a reference to
   IndexSearcher (Alan Woodward, Yury Pakhomov)
 
+* LUCENE-8719: FixedShingleFilter can miss shingles at the end of a token stream if
+  there are multiple paths with different lengths. (Alan Woodward)
+
 Improvements
 
 * LUCENE-8673: Use radix partitioning when merging dimensional points instead
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
index 8f7eb95..77be29b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -92,63 +92,67 @@ public final class FixedShingleFilter extends GraphTokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
 
-    int shinglePosInc;
-    if (incrementGraph() == false) {
-      if (incrementBaseToken() == false) {
-        return false;
-      }
-      // starting a shingle at a new base position, use base position increment
-      shinglePosInc = incAtt.getPositionIncrement();
-    }
-    else {
-      // starting a new shingle at the same base with a different graph, use a 0
-      // position increment
-      shinglePosInc = 0;
-    }
+    int shinglePosInc, startOffset, endOffset;
 
-    final int startOffset = offsetAtt.startOffset();
-    int endOffset = offsetAtt.endOffset();
-    this.buffer.setEmpty();
-    this.buffer.append(termAtt);
-
-    // build the shingle by iterating over the current graph, adding
-    // filler tokens if we encounter gaps
-    for (int i = 1; i < shingleSize; i++) {
-      if (incrementGraphToken() == false) {
-        // we've reached the end of the token stream, check for trailing
-        // positions and add fillers if necessary
-        int trailingPositions = getTrailingPositions();
-        if (i + trailingPositions < shingleSize) {
-          // not enough trailing positions to make a full shingle
+    outer: while (true) {
+      if (incrementGraph() == false) {
+        if (incrementBaseToken() == false) {
           return false;
         }
-        while (i < shingleSize) {
-          this.buffer.append(tokenSeparator).append(fillerToken);
-          i++;
-        }
-        break;
+        // starting a shingle at a new base position, use base position increment
+        shinglePosInc = incAtt.getPositionIncrement();
+      } else {
+        // starting a new shingle at the same base with a different graph, use a 0
+        // position increment
+        shinglePosInc = 0;
       }
-      int posInc = incAtt.getPositionIncrement();
-      if (posInc > 1) {
-        // if we have a posInc > 1, we need to fill in the gaps
-        if (i + posInc > shingleSize) {
-          // if the posInc is greater than the shingle size, we need to add fillers
-          // up to the shingle size but no further
+
+      startOffset = offsetAtt.startOffset();
+      endOffset = offsetAtt.endOffset();
+      this.buffer.setEmpty();
+      this.buffer.append(termAtt);
+
+      // build the shingle by iterating over the current graph, adding
+      // filler tokens if we encounter gaps
+      for (int i = 1; i < shingleSize; i++) {
+        if (incrementGraphToken() == false) {
+          // we've reached the end of the token stream, check for trailing
+          // positions and add fillers if necessary
+          int trailingPositions = getTrailingPositions();
+          if (i + trailingPositions < shingleSize) {
+            // not enough trailing positions to make a full shingle
+            // start again at a different graph
+            continue outer;
+          }
           while (i < shingleSize) {
             this.buffer.append(tokenSeparator).append(fillerToken);
             i++;
           }
           break;
         }
-        // otherwise just add them in as far as we need
-        while (posInc > 1) {
-          this.buffer.append(tokenSeparator).append(fillerToken);
-          posInc--;
-          i++;
+        int posInc = incAtt.getPositionIncrement();
+        if (posInc > 1) {
+          // if we have a posInc > 1, we need to fill in the gaps
+          if (i + posInc > shingleSize) {
+            // if the posInc is greater than the shingle size, we need to add fillers
+            // up to the shingle size but no further
+            while (i < shingleSize) {
+              this.buffer.append(tokenSeparator).append(fillerToken);
+              i++;
+            }
+            break;
+          }
+          // otherwise just add them in as far as we need
+          while (posInc > 1) {
+            this.buffer.append(tokenSeparator).append(fillerToken);
+            posInc--;
+            i++;
+          }
         }
+        this.buffer.append(tokenSeparator).append(termAtt);
+        endOffset = offsetAtt.endOffset();
       }
-      this.buffer.append(tokenSeparator).append(termAtt);
-      endOffset = offsetAtt.endOffset();
+      break;
     }
     clearAttributes();
     this.offsetAtt.setOffset(startOffset, endOffset);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
index 85c7dc6..3c7ffe0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -199,6 +199,23 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
           new int[] {    1,        0,      0,       0,       1,        0,     });
   }
 
+  public void testTrailingGraphsOfDifferingLengths() throws IOException {
+
+    // a b:3/c d e f
+    TokenStream ts = new CannedTokenStream(
+        new Token("a", 0, 1),
+        new Token("b", 1, 2, 3, 3),
+        new Token("c", 0, 2, 3),
+        new Token("d", 2, 3),
+        new Token("e", 2, 3),
+        new Token("f", 4, 5)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[]{ "a b f", "a c d", "c d e", "d e f"});
+
+  }
+
   public void testParameterLimits() {
     IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
       new FixedShingleFilter(new CannedTokenStream(), 1);