Posted to commits@lucene.apache.org by ro...@apache.org on 2018/12/04 10:02:58 UTC
[1/5] lucene-solr:branch_7x: LUCENE-8586: Fix infinite loop in Intervals.or()
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7_6 d30d6b89f -> 14bc64d51
refs/heads/branch_7x c2486254e -> 6ecd93c69
refs/heads/master 6c1116111 -> c78429a55
LUCENE-8586: Fix infinite loop in Intervals.or()
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6ecd93c6
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6ecd93c6
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6ecd93c6
Branch: refs/heads/branch_7x
Commit: 6ecd93c6939de8ca29ee2543bcfb3f5c8051357a
Parents: c248625
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 16:04:54 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:36 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../intervals/DisjunctionIntervalsSource.java | 47 +++++++++++++++++++-
.../lucene/search/intervals/TestIntervals.java | 14 ++++++
3 files changed, 62 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6ecd93c6/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 32f3251..1845686 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -92,6 +92,9 @@ Bug fixes
* LUCENE-8556: Use latitude and longitude instead of encoding values to check if triangle is ear
when using morton optimisation. (Ignacio Vera)
+* LUCENE-8586: Intervals.or() could get stuck in an infinite loop on certain indexes
+ (Alan Woodward)
+
New Features
* LUCENE-8496: Selective indexing - modify BKDReader/BKDWriter to allow users
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6ecd93c6/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
index 79c1bcf..c54f18b 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalsSource.java
@@ -146,7 +146,7 @@ class DisjunctionIntervalsSource extends IntervalsSource {
@Override
public int nextInterval() throws IOException {
- if (current == EMPTY) {
+ if (current == EMPTY || current == EXHAUSTED) {
if (intervalQueue.size() > 0) {
current = intervalQueue.top();
}
@@ -160,7 +160,7 @@ class DisjunctionIntervalsSource extends IntervalsSource {
}
}
if (intervalQueue.size() == 0) {
- current = EMPTY;
+ current = EXHAUSTED;
return NO_MORE_INTERVALS;
}
current = intervalQueue.top();
@@ -239,4 +239,47 @@ class DisjunctionIntervalsSource extends IntervalsSource {
}
};
+ private static final IntervalIterator EXHAUSTED = new IntervalIterator() {
+
+ @Override
+ public int docID() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long cost() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int start() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public int end() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public int nextInterval() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public float matchCost() {
+ return 0;
+ }
+ };
+
}
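The fix hinges on separating two states that the old code folded into the single EMPTY sentinel: "not yet positioned on this document" and "all intervals on this document consumed". Resetting current to EMPTY after the queue drained meant a later nextInterval() call could re-enter the priming branch, which is how Intervals.or() could get stuck in an infinite loop on certain indexes. A compact sketch of the two-sentinel pattern (the Iter interface and names here are illustrative, not the Lucene API):

interface Iter {
  int NO_MORE = Integer.MAX_VALUE;

  int nextValue();

  int start();
}

final class Sentinels {

  // "Not positioned yet": a caller seeing this must prime from the queue.
  static final Iter EMPTY = new Iter() {
    @Override public int nextValue() { return NO_MORE; }
    @Override public int start() { return -1; }
  };

  // "Drained on this document": pinned to NO_MORE so it can never be
  // mistaken for an unpositioned iterator and trigger re-priming.
  static final Iter EXHAUSTED = new Iter() {
    @Override public int nextValue() { return NO_MORE; }
    @Override public int start() { return NO_MORE; }
  };

  private Sentinels() {}
}

In the patch itself, EXHAUSTED additionally throws UnsupportedOperationException from docID(), nextDoc(), advance() and cost(), so any accidental use of it as a live document-level iterator fails fast instead of looping.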
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6ecd93c6/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
index d7754be..6002b3a 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/intervals/TestIntervals.java
@@ -56,6 +56,8 @@ public class TestIntervals extends LuceneTestCase {
"Porridge is great"
};
+ // 0 1 2 3 4 5 6 7 8 9
+ // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
private static String field2_docs[] = {
"In Xanadu did Kubla Khan a stately pleasure dome decree",
"Where Alph the sacred river ran through caverns measureless to man",
@@ -260,6 +262,18 @@ public class TestIntervals extends LuceneTestCase {
assertFalse(mi.next());
}
+ public void testCombinationDisjunction() throws IOException {
+ IntervalsSource source = Intervals.ordered(
+ Intervals.or(Intervals.term("alph"), Intervals.term("sacred")),
+ Intervals.term("measureless")
+ );
+ checkIntervals(source, "field2", 1, new int[][]{
+ {},
+ { 3, 8 },
+ {}, {}, {}, {}
+ });
+ }
+
public void testNesting() throws IOException {
IntervalsSource source = Intervals.unordered(
Intervals.term("pease"),
[2/5] lucene-solr:master: LUCENE-8586: Fix infinite loop in Intervals.or()
Posted by ro...@apache.org.
LUCENE-8586: Fix infinite loop in Intervals.or()
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c78429a5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c78429a5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c78429a5
Branch: refs/heads/master
Commit: c78429a554d28611dacd90c388e6c34039b228d1
Parents: 75a053d
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 16:04:54 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:42 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../intervals/DisjunctionIntervalsSource.java | 47 +++++++++++++++++++-
.../lucene/search/intervals/TestIntervals.java | 14 ++++++
3 files changed, 62 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
[5/5] lucene-solr:branch_7_6: LUCENE-8586: Fix infinite loop in Intervals.or()
Posted by ro...@apache.org.
LUCENE-8586: Fix infinite loop in Intervals.or()
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/14bc64d5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/14bc64d5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/14bc64d5
Branch: refs/heads/branch_7_6
Commit: 14bc64d517bfa012fa00fc17da1d7173acf92c6b
Parents: d30d6b8
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 16:04:54 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:51:45 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../intervals/DisjunctionIntervalsSource.java | 47 +++++++++++++++++++-
.../lucene/search/intervals/TestIntervals.java | 14 ++++++
3 files changed, 62 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
[3/5] lucene-solr:master: LUCENE-8564: Add GraphTokenFilter
Posted by ro...@apache.org.
LUCENE-8564: Add GraphTokenFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f5867a14
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f5867a14
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f5867a14
Branch: refs/heads/master
Commit: f5867a1413189675cf69ea88412237e32429786e
Parents: 6c11161
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 12:17:58 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:42 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
.../analysis/shingle/FixedShingleFilter.java | 269 ++++--------------
.../shingle/FixedShingleFilterTest.java | 19 +-
.../lucene/analysis/GraphTokenFilter.java | 284 +++++++++++++++++++
.../lucene/analysis/TestGraphTokenFilter.java | 236 +++++++++++++++
.../lucene/analysis/TestGraphTokenizers.java | 2 +-
.../java/org/apache/lucene/analysis/Token.java | 7 +
7 files changed, 591 insertions(+), 230 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 349d64d..fc609a5 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -155,6 +155,10 @@ New Features
* LUCENE-8216: Added a new BM25FQuery in sandbox to blend statistics across several fields
using the BM25F formula. (Adrien Grand, Jim Ferenczi)
+* LUCENE-8564: GraphTokenFilter is an abstract class useful for token filters that need
+ to read ahead in the token stream and take into account graph structures. This
+ also changes FixedShingleFilter to extend GraphTokenFilter (Alan Woodward)
+
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
index cf82363..8f7eb95 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -18,16 +18,14 @@
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
-import java.util.ArrayDeque;
-import java.util.Deque;
-import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.GraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
/**
* A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
@@ -47,27 +45,20 @@ import org.apache.lucene.util.AttributeSource;
*
* @lucene.experimental
*/
-public final class FixedShingleFilter extends TokenFilter {
+public final class FixedShingleFilter extends GraphTokenFilter {
- private final Deque<Token> tokenPool = new ArrayDeque<>();
-
- private static final int MAX_SHINGLE_STACK_SIZE = 1000;
private static final int MAX_SHINGLE_SIZE = 4;
private final int shingleSize;
private final String tokenSeparator;
-
- private final Token gapToken = new Token(new AttributeSource());
- private final Token endToken = new Token(new AttributeSource());
+ private final String fillerToken;
private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private Token[] currentShingleTokens;
- private int currentShingleStackSize;
- private boolean inputStreamExhausted = false;
+ private final CharTermAttribute buffer = new CharTermAttributeImpl();
/**
* Creates a FixedShingleFilter over an input token stream
@@ -89,228 +80,82 @@ public final class FixedShingleFilter extends TokenFilter {
*/
public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
super(input);
+
if (shingleSize <= 1 || shingleSize > MAX_SHINGLE_SIZE) {
throw new IllegalArgumentException("Shingle size must be between 2 and " + MAX_SHINGLE_SIZE + ", got " + shingleSize);
}
this.shingleSize = shingleSize;
this.tokenSeparator = tokenSeparator;
- this.gapToken.termAtt.setEmpty().append(fillerToken);
- this.currentShingleTokens = new Token[shingleSize];
+ this.fillerToken = fillerToken;
}
@Override
public boolean incrementToken() throws IOException {
- int posInc = 0;
- if (nextShingle() == false) {
- Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
- if (nextRoot == endToken)
- return false;
- recycleToken(currentShingleTokens[0]);
- if (resetShingleRoot(nextRoot) == false) {
+
+ int shinglePosInc;
+ if (incrementGraph() == false) {
+ if (incrementBaseToken() == false) {
return false;
}
- posInc = currentShingleTokens[0].posInc();
+ // starting a shingle at a new base position, use base position increment
+ shinglePosInc = incAtt.getPositionIncrement();
}
- clearAttributes();
- incAtt.setPositionIncrement(posInc);
- offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
- termAtt.setEmpty();
- termAtt.append(currentShingleTokens[0].term());
- typeAtt.setType("shingle");
- for (int i = 1; i < shingleSize; i++) {
- termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
- }
- return true;
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- this.tokenPool.clear();
- this.currentShingleTokens[0] = null;
- this.inputStreamExhausted = false;
- this.currentShingleStackSize = 0;
- }
-
- @Override
- public void end() throws IOException {
- if (inputStreamExhausted == false) {
- finishInnerStream();
+ else {
+ // starting a new shingle at the same base with a different graph, use a 0
+ // position increment
+ shinglePosInc = 0;
}
- clearAttributes();
- this.offsetAtt.setOffset(0, endToken.endOffset());
- this.incAtt.setPositionIncrement(endToken.posInc());
- }
-
- private void finishInnerStream() throws IOException {
- input.end();
- inputStreamExhausted = true;
- // check for gaps at the end of the tokenstream
- endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
- OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
- endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
- }
- private Token lastTokenInShingle() {
- int lastTokenIndex = shingleSize - 1;
- while (currentShingleTokens[lastTokenIndex] == gapToken) {
- lastTokenIndex--;
- }
- return currentShingleTokens[lastTokenIndex];
- }
+ final int startOffset = offsetAtt.startOffset();
+ int endOffset = offsetAtt.endOffset();
+ this.buffer.setEmpty();
+ this.buffer.append(termAtt);
- private boolean resetShingleRoot(Token token) throws IOException {
- this.currentShingleTokens[0] = token;
+ // build the shingle by iterating over the current graph, adding
+ // filler tokens if we encounter gaps
for (int i = 1; i < shingleSize; i++) {
- Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
- if (current == endToken) {
- if (endToken.posInc() + i >= shingleSize) {
- // end tokens are a special case, because their posIncs are always
- // due to stopwords. Therefore, we can happily append gap tokens
- // to the end of the current shingle
- for (int j = i; j < shingleSize; j++) {
- this.currentShingleTokens[i] = gapToken;
- i++;
- }
- return true;
+ if (incrementGraphToken() == false) {
+ // we've reached the end of the token stream, check for trailing
+ // positions and add fillers if necessary
+ int trailingPositions = getTrailingPositions();
+ if (i + trailingPositions < shingleSize) {
+ // not enough trailing positions to make a full shingle
+ return false;
}
- return false;
- }
- if (current.posInc() > 1) {
- // insert gaps into the shingle list
- for (int j = 1; j < current.posInc(); j++) {
- this.currentShingleTokens[i] = gapToken;
+ while (i < shingleSize) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
i++;
- if (i >= shingleSize)
- return true;
}
+ break;
}
- this.currentShingleTokens[i] = current;
- }
- return true;
- }
-
- private boolean nextShingle() throws IOException {
- return currentShingleTokens[0] != null && advanceStack();
- }
-
- // check if the next token in the tokenstream is at the same position as this one
- private boolean lastInStack(Token token) throws IOException {
- Token next = nextTokenInStream(token);
- return next == endToken || next.posInc() != 0;
- }
-
- private boolean advanceStack() throws IOException {
- for (int i = shingleSize - 1; i >= 1; i--) {
- if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
- currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
- for (int j = i + 1; j < shingleSize; j++) {
- currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
+ int posInc = incAtt.getPositionIncrement();
+ if (posInc > 1) {
+ // if we have a posInc > 1, we need to fill in the gaps
+ if (i + posInc > shingleSize) {
+ // if the posInc is greater than the shingle size, we need to add fillers
+ // up to the shingle size but no further
+ while (i < shingleSize) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
+ i++;
+ }
+ break;
}
- if (currentShingleStackSize++ > MAX_SHINGLE_STACK_SIZE) {
- throw new IllegalStateException("Too many shingles (> " + MAX_SHINGLE_STACK_SIZE + ") at term [" + currentShingleTokens[0].term() + "]");
+ // otherwise just add them in as far as we need
+ while (posInc > 1) {
+ this.buffer.append(tokenSeparator).append(fillerToken);
+ posInc--;
+ i++;
}
- return true;
}
+ this.buffer.append(tokenSeparator).append(termAtt);
+ endOffset = offsetAtt.endOffset();
}
- currentShingleStackSize = 0;
- return false;
- }
-
- private Token newToken() {
- Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
- token.reset(this);
- return token;
- }
-
- private void recycleToken(Token token) {
- if (token == null)
- return;
- token.nextToken = null;
- tokenPool.add(token);
- }
-
- // for testing
- int instantiatedTokenCount() {
- int tokenCount = tokenPool.size() + 1;
- if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
- return tokenCount;
- for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
- tokenCount++;
- }
- return tokenCount;
- }
-
- private Token nextTokenInGraph(Token token) throws IOException {
- do {
- token = nextTokenInStream(token);
- if (token == endToken) {
- return endToken;
- }
- } while (token.posInc() == 0);
- return token;
- }
-
- private Token nextTokenInStream(Token token) throws IOException {
- if (token != null && token.nextToken != null) {
- return token.nextToken;
- }
- if (input.incrementToken() == false) {
- finishInnerStream();
- if (token == null) {
- return endToken;
- } else {
- token.nextToken = endToken;
- return endToken;
- }
- }
- if (token == null) {
- return newToken();
- }
- token.nextToken = newToken();
- return token.nextToken;
- }
-
- private static class Token {
- final AttributeSource attSource;
- final PositionIncrementAttribute posIncAtt;
- final CharTermAttribute termAtt;
- final OffsetAttribute offsetAtt;
-
- Token nextToken;
-
- Token(AttributeSource attSource) {
- this.attSource = attSource;
- this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
- this.termAtt = attSource.addAttribute(CharTermAttribute.class);
- this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
- }
-
- int posInc() {
- return this.posIncAtt.getPositionIncrement();
- }
-
- CharSequence term() {
- return this.termAtt;
- }
-
- int startOffset() {
- return this.offsetAtt.startOffset();
- }
-
- int endOffset() {
- return this.offsetAtt.endOffset();
- }
-
- void reset(AttributeSource attSource) {
- attSource.copyTo(this.attSource);
- this.nextToken = null;
- }
-
- @Override
- public String toString() {
- return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
- }
+ clearAttributes();
+ this.offsetAtt.setOffset(startOffset, endOffset);
+ this.incAtt.setPositionIncrement(shinglePosInc);
+ this.termAtt.setEmpty().append(buffer);
+ this.typeAtt.setType("shingle");
+ return true;
}
}
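For reference, a minimal usage sketch of the rewritten filter via the four-argument constructor above; the tokenizer choice and sample text are ours, not part of the commit:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.FixedShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("the quick brown fox"));
    // Two-word shingles, joined with a space; "_" fills position gaps.
    TokenStream ts = new FixedShingleFilter(tok, 2, " ", "_");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // "the quick", "quick brown", "brown fox"
    }
    ts.end();
    ts.close();
  }
}

Shingle sizes outside the range 2..MAX_SHINGLE_SIZE (4) are rejected by the constructor check above.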
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
index f5031b3..85c7dc6 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -163,6 +163,8 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
public void testIncomingGraphs() throws IOException {
+ // b/a c b/a d
+
TokenStream ts = new CannedTokenStream(
new Token("b", 0, 1),
new Token("a", 0, 0, 1),
@@ -208,21 +210,4 @@ public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
assertEquals("Shingle size must be between 2 and 4, got 5", e2.getMessage());
}
- public void testShingleCountLimits() {
-
- Token[] tokens = new Token[5000];
- tokens[0] = new Token("term", 1, 0, 1);
- tokens[1] = new Token("term1", 1, 2, 3);
- for (int i = 2; i < 5000; i++) {
- tokens[i] = new Token("term" + i, 0, 2, 3);
- }
-
- Exception e = expectThrows(IllegalStateException.class, () -> {
- TokenStream ts = new FixedShingleFilter(new CannedTokenStream(tokens), 2);
- ts.reset();
- while (ts.incrementToken()) {}
- });
- assertEquals("Too many shingles (> 1000) at term [term]", e.getMessage());
- }
-
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java
new file mode 100644
index 0000000..9c1e02e
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/analysis/GraphTokenFilter.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * An abstract TokenFilter that exposes its input stream as a graph
+ *
+ * Call {@link #incrementBaseToken()} to move the root of the graph to the next
+ * position in the TokenStream, {@link #incrementGraphToken()} to move along
+ * the current graph, and {@link #incrementGraph()} to reset to the next graph
+ * based at the current root.
+ *
+ * For example, given the stream 'a b/c:2 d e', then with the base token at
+ * 'a', incrementGraphToken() will produce the stream 'a b d e', and after
+ * calling incrementGraph() it will produce the stream 'a c e'.
+ */
+public abstract class GraphTokenFilter extends TokenFilter {
+
+ private final Deque<Token> tokenPool = new ArrayDeque<>();
+ private final List<Token> currentGraph = new ArrayList<>();
+
+ /**
+ * The maximum permitted number of routes through a graph
+ */
+ public static final int MAX_GRAPH_STACK_SIZE = 1000;
+
+ /**
+ * The maximum permitted read-ahead in the token stream
+ */
+ public static final int MAX_TOKEN_CACHE_SIZE = 100;
+
+ private Token baseToken;
+ private int graphDepth;
+ private int graphPos;
+ private int trailingPositions = -1;
+ private int finalOffsets = -1;
+
+ private int stackSize;
+ private int cacheSize;
+
+ private final PositionIncrementAttribute posIncAtt;
+ private final OffsetAttribute offsetAtt;
+
+ /**
+ * Create a new GraphTokenFilter
+ */
+ public GraphTokenFilter(TokenStream input) {
+ super(input);
+ this.posIncAtt = input.addAttribute(PositionIncrementAttribute.class);
+ this.offsetAtt = input.addAttribute(OffsetAttribute.class);
+ }
+
+ /**
+ * Move the root of the graph to the next token in the wrapped TokenStream
+ *
+ * @return {@code false} if the underlying stream is exhausted
+ */
+ protected final boolean incrementBaseToken() throws IOException {
+ stackSize = 0;
+ graphDepth = 0;
+ graphPos = 0;
+ Token oldBase = baseToken;
+ baseToken = nextTokenInStream(baseToken);
+ if (baseToken == null) {
+ return false;
+ }
+ currentGraph.clear();
+ currentGraph.add(baseToken);
+ baseToken.attSource.copyTo(this);
+ recycleToken(oldBase);
+ return true;
+ }
+
+ /**
+ * Move to the next token in the current route through the graph
+ *
+ * @return {@code false} if there are no more tokens in the current graph
+ */
+ protected final boolean incrementGraphToken() throws IOException {
+ if (graphPos < graphDepth) {
+ graphPos++;
+ currentGraph.get(graphPos).attSource.copyTo(this);
+ return true;
+ }
+ Token token = nextTokenInGraph(currentGraph.get(graphDepth));
+ if (token == null) {
+ return false;
+ }
+ graphDepth++;
+ graphPos++;
+ currentGraph.add(graphDepth, token);
+ token.attSource.copyTo(this);
+ return true;
+ }
+
+ /**
+ * Reset to the root token again, and move down the next route through the graph
+ *
+ * @return {@code false} if there are no more routes through the graph
+ */
+ protected final boolean incrementGraph() throws IOException {
+ if (baseToken == null) {
+ return false;
+ }
+ graphPos = 0;
+ for (int i = graphDepth; i >= 1; i--) {
+ if (lastInStack(currentGraph.get(i)) == false) {
+ currentGraph.set(i, nextTokenInStream(currentGraph.get(i)));
+ for (int j = i + 1; j < graphDepth; j++) {
+ currentGraph.set(j, nextTokenInGraph(currentGraph.get(j)));
+ }
+ if (stackSize++ > MAX_GRAPH_STACK_SIZE) {
+ throw new IllegalStateException("Too many graph paths (> " + MAX_GRAPH_STACK_SIZE + ")");
+ }
+ currentGraph.get(0).attSource.copyTo(this);
+ graphDepth = i;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Return the number of trailing positions at the end of the graph
+ *
+ * NB this should only be called after {@link #incrementGraphToken()} has returned {@code false}
+ */
+ public int getTrailingPositions() {
+ return trailingPositions;
+ }
+
+ @Override
+ public void end() throws IOException {
+ if (trailingPositions == -1) {
+ input.end();
+ trailingPositions = posIncAtt.getPositionIncrement();
+ finalOffsets = offsetAtt.endOffset();
+ }
+ else {
+ endAttributes();
+ this.posIncAtt.setPositionIncrement(trailingPositions);
+ this.offsetAtt.setOffset(finalOffsets, finalOffsets);
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ // new attributes can be added between reset() calls, so we can't reuse
+ // token objects from a previous run
+ tokenPool.clear();
+ cacheSize = 0;
+ graphDepth = 0;
+ trailingPositions = -1;
+ finalOffsets = -1;
+ baseToken = null;
+ }
+
+ int cachedTokenCount() {
+ return cacheSize;
+ }
+
+ private Token newToken() {
+ if (tokenPool.size() == 0) {
+ cacheSize++;
+ if (cacheSize > MAX_TOKEN_CACHE_SIZE) {
+ throw new IllegalStateException("Too many cached tokens (> " + MAX_TOKEN_CACHE_SIZE + ")");
+ }
+ return new Token(this.cloneAttributes());
+ }
+ Token token = tokenPool.removeFirst();
+ token.reset(input);
+ return token;
+ }
+
+ private void recycleToken(Token token) {
+ if (token == null)
+ return;
+ token.nextToken = null;
+ tokenPool.add(token);
+ }
+
+ private Token nextTokenInGraph(Token token) throws IOException {
+ int remaining = token.length();
+ do {
+ token = nextTokenInStream(token);
+ if (token == null) {
+ return null;
+ }
+ remaining -= token.posInc();
+ } while (remaining > 0);
+ return token;
+ }
+
+ // check if the next token in the tokenstream is at the same position as this one
+ private boolean lastInStack(Token token) throws IOException {
+ Token next = nextTokenInStream(token);
+ return next == null || next.posInc() != 0;
+ }
+
+ private Token nextTokenInStream(Token token) throws IOException {
+ if (token != null && token.nextToken != null) {
+ return token.nextToken;
+ }
+ if (this.trailingPositions != -1) {
+ // already hit the end
+ return null;
+ }
+ if (input.incrementToken() == false) {
+ input.end();
+ trailingPositions = posIncAtt.getPositionIncrement();
+ finalOffsets = offsetAtt.endOffset();
+ return null;
+ }
+ if (token == null) {
+ return newToken();
+ }
+ token.nextToken = newToken();
+ return token.nextToken;
+ }
+
+ private static class Token {
+
+ final AttributeSource attSource;
+ final PositionIncrementAttribute posIncAtt;
+ final PositionLengthAttribute lengthAtt;
+ Token nextToken;
+
+ Token(AttributeSource attSource) {
+ this.attSource = attSource;
+ this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
+ boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
+ this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
+ }
+
+ int posInc() {
+ return this.posIncAtt.getPositionIncrement();
+ }
+
+ int length() {
+ if (this.lengthAtt == null) {
+ return 1;
+ }
+ return this.lengthAtt.getPositionLength();
+ }
+
+ void reset(AttributeSource attSource) {
+ attSource.copyTo(this.attSource);
+ this.nextToken = null;
+ }
+
+ @Override
+ public String toString() {
+ return attSource.toString();
+ }
+ }
+
+}
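To make the three-method contract in the javadoc concrete, here is a minimal subclass sketch (the class name is ours; the test file below does essentially the same thing with its TestFilter):

import java.io.IOException;

import org.apache.lucene.analysis.GraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;

public final class PeekingFilter extends GraphTokenFilter {

  public PeekingFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    // Move the graph root to the next stream position; the filter's
    // attributes now reflect the new base token.
    if (incrementBaseToken() == false) {
      return false;
    }
    // A real filter could now call incrementGraphToken() to read ahead
    // along the current path, and incrementGraph() to restart at the base
    // and follow the next path, as FixedShingleFilter does above.
    return true;
  }
}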
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java
new file mode 100644
index 0000000..d3a476f
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenFilter.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+public class TestGraphTokenFilter extends BaseTokenStreamTestCase {
+
+ static class TestFilter extends GraphTokenFilter {
+
+ public TestFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ return incrementBaseToken();
+ }
+ }
+
+ public void testGraphTokenStream() throws IOException {
+
+ TestGraphTokenizers.GraphTokenizer tok = new TestGraphTokenizers.GraphTokenizer();
+ GraphTokenFilter graph = new TestFilter(tok);
+
+ CharTermAttribute termAtt = graph.addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = graph.addAttribute(PositionIncrementAttribute.class);
+
+ tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
+ tok.reset();
+
+ assertFalse(graph.incrementGraph());
+ assertEquals(0, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("a", termAtt.toString());
+ assertEquals(1, posIncAtt.getPositionIncrement());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("b", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("a", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("c", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(5, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("b", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("b", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("f", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(6, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("c", termAtt.toString());
+ assertEquals(0, posIncAtt.getPositionIncrement());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("d", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(6, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("g", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("h", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("d", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("f", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("j", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ //tok.setReader(new StringReader("a b/c d e/f:3 g/h i j k"));
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("g", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("i", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("j", termAtt.toString());
+ assertTrue(graph.incrementGraph());
+ assertEquals("e", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("h", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("f", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("j", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("k", termAtt.toString());
+ assertFalse(graph.incrementGraphToken());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("g", termAtt.toString());
+ assertTrue(graph.incrementGraphToken());
+ assertEquals("i", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("h", termAtt.toString());
+ assertFalse(graph.incrementGraph());
+ assertEquals(8, graph.cachedTokenCount());
+
+ assertTrue(graph.incrementBaseToken());
+ assertTrue(graph.incrementBaseToken());
+ assertTrue(graph.incrementBaseToken());
+ assertEquals("k", termAtt.toString());
+ assertFalse(graph.incrementGraphToken());
+ assertEquals(0, graph.getTrailingPositions());
+ assertFalse(graph.incrementGraph());
+ assertFalse(graph.incrementBaseToken());
+ assertEquals(8, graph.cachedTokenCount());
+
+ }
+
+ public void testTrailingPositions() throws IOException {
+
+ // a/b:2 c _
+ CannedTokenStream cts = new CannedTokenStream(1, 5,
+ new Token("a", 0, 1),
+ new Token("b", 0, 0, 1, 2),
+ new Token("c", 1, 2, 3)
+ );
+
+ GraphTokenFilter gts = new TestFilter(cts);
+ assertFalse(gts.incrementGraph());
+ assertTrue(gts.incrementBaseToken());
+ assertTrue(gts.incrementGraphToken());
+ assertFalse(gts.incrementGraphToken());
+ assertEquals(1, gts.getTrailingPositions());
+ assertFalse(gts.incrementGraph());
+ assertTrue(gts.incrementBaseToken());
+ assertFalse(gts.incrementGraphToken());
+ assertEquals(1, gts.getTrailingPositions());
+ assertFalse(gts.incrementGraph());
+ }
+
+ public void testMaximumGraphCacheSize() throws IOException {
+
+ Token[] tokens = new Token[GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5];
+ for (int i = 0; i < GraphTokenFilter.MAX_TOKEN_CACHE_SIZE + 5; i++) {
+ tokens[i] = new Token("a", 1, i * 2, i * 2 + 1);
+ }
+
+ GraphTokenFilter gts = new TestFilter(new CannedTokenStream(tokens));
+ Exception e = expectThrows(IllegalStateException.class, () -> {
+ gts.reset();
+ gts.incrementBaseToken();
+ while (true) {
+ gts.incrementGraphToken();
+ }
+ });
+ assertEquals("Too many cached tokens (> 100)", e.getMessage());
+
+ gts.reset();
+ // after reset, the cache should be cleared and so we can read ahead once more
+ gts.incrementBaseToken();
+ gts.incrementGraphToken();
+
+ }
+
+ public void testGraphPathCountLimits() {
+
+ Token[] tokens = new Token[50];
+ tokens[0] = new Token("term", 1, 0, 1);
+ tokens[1] = new Token("term1", 1, 2, 3);
+ for (int i = 2; i < 50; i++) {
+ tokens[i] = new Token("term" + i, i % 2, 2, 3);
+ }
+
+ Exception e = expectThrows(IllegalStateException.class, () -> {
+ GraphTokenFilter graph = new TestFilter(new CannedTokenStream(tokens));
+ graph.reset();
+ graph.incrementBaseToken();
+ for (int i = 0; i < 10; i++) {
+ graph.incrementGraphToken();
+ }
+ while (graph.incrementGraph()) {
+ for (int i = 0; i < 10; i++) {
+ graph.incrementGraphToken();
+ }
+ }
+ });
+ assertEquals("Too many graph paths (> 1000)", e.getMessage());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
index 7e98662..8d3b8e1 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
@@ -56,7 +56,7 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
// you cannot turn on MockCharFilter when random
// testing...
- private static class GraphTokenizer extends Tokenizer {
+ public static final class GraphTokenizer extends Tokenizer {
private List<Token> tokens;
private int upto;
private int inputLength;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f5867a14/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
index 9994175..18b6b8b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
@@ -82,6 +82,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setPositionIncrement(posInc);
}
+ public Token(CharSequence text, int posInc, int start, int end, int posLength) {
+ append(text);
+ setOffset(start, end);
+ setPositionIncrement(posInc);
+ setPositionLength(posLength);
+ }
+
/**
* {@inheritDoc}
* @see FlagsAttribute
[4/5] lucene-solr:master: LUCENE-8509: WordDelimiterGraphFilter no longer adjusts offsets by default
Posted by ro...@apache.org.
LUCENE-8509: WordDelimiterGraphFilter no longer adjusts offsets by default
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/75a053dd
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/75a053dd
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/75a053dd
Branch: refs/heads/master
Commit: 75a053dd696d6e632755e613380450f22c78c91b
Parents: f5867a1
Author: Alan Woodward <ro...@apache.org>
Authored: Mon Dec 3 13:36:21 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Tue Dec 4 09:47:42 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 ++
.../miscellaneous/WordDelimiterGraphFilter.java | 17 ++++----
.../WordDelimiterGraphFilterFactory.java | 5 ++-
.../TestWordDelimiterGraphFilter.java | 42 +++++++++++---------
4 files changed, 41 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fc609a5..0a41d70 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -142,6 +142,10 @@ Changes in Runtime Behavior
anymore. This doesn't affect ordering as this is a constant factor which is
the same for every document. (Luca Cavanna via Adrien Grand)
+* LUCENE-8509: WordDelimiterGraphFilter will no longer set the offsets of internal
+ tokens by default, preventing a number of bugs when the filter is chained with
+ tokenfilters that change the length of their tokens (Alan Woodward)
+
New Features
* LUCENE-8340: LongPoint#newDistanceQuery may be used to boost scores based on
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index a438213..00ace5b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -191,6 +191,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
// used for concatenating runs of similar typed subwords (word,number)
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+ private final boolean adjustInternalOffsets;
+
// number of subwords last output by concat.
private int lastConcatCount;
@@ -206,10 +208,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private int savedEndOffset;
private AttributeSource.State savedState;
private int lastStartOffset;
-
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- private boolean hasIllegalOffsets;
+ private boolean adjustingOffsets;
private int wordPos;
@@ -217,11 +216,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
* Creates a new WordDelimiterGraphFilter
*
* @param in TokenStream to be filtered
+ * @param adjustInternalOffsets if the offsets of partial terms should be adjusted
* @param charTypeTable table containing character types
* @param configurationFlags Flags configuring the filter
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+ public WordDelimiterGraphFilter(TokenStream in, boolean adjustInternalOffsets, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
if ((configurationFlags &
~(GENERATE_WORD_PARTS |
@@ -240,6 +240,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(
charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
+ this.adjustInternalOffsets = adjustInternalOffsets;
}
/**
@@ -251,7 +252,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
* @param protWords If not null is the set of tokens to protect from being delimited
*/
public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
- this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+ this(in, false, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
/** Iterates all words parts and concatenations, buffering up the term parts we should return. */
@@ -261,7 +262,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
// if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming
// offsets. this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc:
- hasIllegalOffsets = (savedEndOffset - savedStartOffset != savedTermLength);
+ adjustingOffsets = adjustInternalOffsets && savedEndOffset - savedStartOffset == savedTermLength;
bufferedLen = 0;
lastConcatCount = 0;
@@ -391,7 +392,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
int startOffset;
int endOffset;
- if (hasIllegalOffsets) {
+ if (adjustingOffsets == false) {
startOffset = savedStartOffset;
endOffset = savedEndOffset;
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
index 613aedc..4666c7d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
@@ -53,12 +53,14 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.*;
public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final String PROTECTED_TOKENS = "protected";
public static final String TYPES = "types";
+ public static final String OFFSETS = "adjustOffsets";
private final String wordFiles;
private final String types;
private final int flags;
byte[] typeTable = null;
private CharArraySet protectedWords = null;
+ private boolean adjustOffsets = false;
/** Creates a new WordDelimiterGraphFilterFactory */
public WordDelimiterGraphFilterFactory(Map<String, String> args) {
@@ -94,6 +96,7 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implemen
wordFiles = get(args, PROTECTED_TOKENS);
types = get(args, TYPES);
this.flags = flags;
+ this.adjustOffsets = getBoolean(args, OFFSETS, true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -117,7 +120,7 @@ public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implemen
@Override
public TokenFilter create(TokenStream input) {
- return new WordDelimiterGraphFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
flags, protectedWords);
}
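The factory exposes the same switch as an adjustOffsets argument, defaulting to true so existing configurations keep their current behavior. A hypothetical programmatic use (the surrounding setup is illustrative; depending on the Lucene version a luceneMatchVersion entry may also be expected in the args map):

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;

    public class WdgfFactorySketch {
      public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        params.put("adjustOffsets", "false");  // the new OFFSETS key; true when absent
        params.put("generateWordParts", "1");  // pre-existing flag argument

        // The factory consumes the keys it recognises and rejects anything left over.
        WordDelimiterGraphFilterFactory factory = new WordDelimiterGraphFilterFactory(params);
        TokenStream ts = factory.create(new WhitespaceTokenizer());
      }
    }

In a Solr schema the same key would appear as a filter attribute, e.g. adjustOffsets="false" on solr.WordDelimiterGraphFilterFactory.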
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/75a053dd/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 65d3b02..e3f3f65 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -64,7 +64,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
// the correct offsets.
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foobar", "foo", "bar" },
@@ -72,7 +73,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
new int[] { 12, 8, 12 });
// with illegal offsets:
- wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foobar", "foo", "bar" },
new int[] { 5, 5, 5 },
@@ -81,7 +82,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -91,7 +93,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
// illegal offsets:
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
@@ -101,7 +104,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
new int[] { 8 },
@@ -110,7 +114,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)),
+ true, DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf,
new String[] { "foobar", "foo", "bar"},
@@ -120,7 +125,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void doSplit(final String input, String... output) throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input),
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input), false,
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
assertTokenStreamContents(wdf, output);
@@ -182,7 +187,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
// the correct offsets.
Token token = new Token("foo-bar", 5, 12);
token.setType("mytype");
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), flags, null);
assertTokenStreamContents(wdf,
new String[] {"foobar", "foo", "bar"},
@@ -235,7 +240,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
- tokenizer,
+ tokenizer, true, DEFAULT_WORD_DELIM_TABLE,
flags, protWords));
}
};
@@ -272,7 +277,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
- new LargePosIncTokenFilter(tokenizer),
+ new LargePosIncTokenFilter(tokenizer), true, DEFAULT_WORD_DELIM_TABLE,
flags, protWords));
}
};
@@ -317,7 +322,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
- return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, true, DEFAULT_WORD_DELIM_TABLE, flags, protWords));
}
};
@@ -350,8 +355,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
"abc-def klm-nop kpop",
new String[] {"abc", "def", "klm-nop", "kpop"},
- new int[]{0, 4, 8, 16},
- new int[]{3, 7, 15, 20},
+ new int[]{0, 0, 8, 16},
+ new int[]{7, 7, 15, 20},
null,
new int[]{1, 1, 1, 1},
null,
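The new expectations follow directly from leaving internal offsets alone: keywordTestAnalyzer (not shown in this hunk) presumably now builds the filter without offset adjustment, so both parts of "abc-def" report the whole parent span. Worked out against the input "abc-def klm-nop kpop":

    // "abc-def" occupies offsets 0..7, "klm-nop" 8..15, "kpop" 16..20.
    //                        start  end
    // "abc"      part:         0 ..  7   (parent span; internal offsets not adjusted)
    // "def"      part:         0 ..  7
    // "klm-nop"  keyword:      8 .. 15   (IGNORE_KEYWORDS: left intact)
    // "kpop"     token:       16 .. 20

The "abc-def-123-456" expectations in the later hunk change for the same reason: every part and concatenation inherits the full 0..15 span of its parent token.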
@@ -384,7 +389,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
}
};
@@ -414,8 +419,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "abc-def-123-456",
new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
- new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
- new int[] { 15, 15, 7, 3, 7, 15, 11, 15 },
+ new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+ new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
null,
new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
null,
@@ -954,7 +959,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
}
public void testEmptyString() throws Exception {
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)), DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)),
+ GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
wdf.reset();
assertTrue(wdf.incrementToken());
assertFalse(wdf.incrementToken());
@@ -967,7 +973,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
new Token("foo-bar", 0, 7));
CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
- WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
assertGraphStrings(wdf,
"foo17-bar foo bar",
"foo17-bar foo-bar",