Posted to commits@lucene.apache.org by is...@apache.org on 2017/01/19 01:45:04 UTC

[08/27] lucene-solr:jira/solr-5944: LUCENE-7619: add WordDelimiterGraphFilter (replacing WordDelimiterFilter) to produce a correct token stream graph when splitting words
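
For context, WordDelimiterGraphFilter emits a true token graph (catenated or preserved tokens carry a position length spanning their parts), and such a graph cannot be indexed directly because the index does not record position length. A minimal index-time wiring sketch, not part of this commit (the WhitespaceTokenizer, the class name, and the exact flag combination below are illustrative assumptions), follows the filter with FlattenGraphFilter, the same pairing the new tests in org.apache.lucene.analysis.core exercise:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.FlattenGraphFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

    public class WdgfIndexAnalyzerSketch {
      // Hypothetical helper (not in this commit): an index-time analyzer that
      // splits words, keeps the catenated form, then flattens the graph.
      public static Analyzer newIndexAnalyzer() {
        return new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                      | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                      | WordDelimiterGraphFilter.CATENATE_WORDS;
            // WDGF output is a graph: "PowerShot" becomes Power/Shot plus a
            // catenated "PowerShot" token whose position length spans both parts.
            TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags, null);
            // FlattenGraphFilter squashes position lengths so the stream can be indexed.
            ts = new FlattenGraphFilter(ts);
            return new TokenStreamComponents(tokenizer, ts);
          }
        };
      }
    }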

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
new file mode 100644
index 0000000..c69bcca
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.core;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
+  
+  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+    final Token t = new Token(term, startOffset, endOffset);
+    t.setPositionIncrement(posInc);
+    t.setPositionLength(posLength);
+    return t;
+  }
+
+  public void testSimpleMock() throws Exception {
+    Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+          TokenStream ts = new FlattenGraphFilter(tokenizer);
+          return new TokenStreamComponents(tokenizer, ts);
+        }
+      };
+
+    assertAnalyzesTo(a, "wtf happened",
+                     new String[] {"wtf", "happened"},
+                     new int[]    {    0,          4},
+                     new int[]    {    3,         12},
+                     null,
+                     new int[]    {    1,          1},
+                     new int[]    {    1,          1},
+                     true);
+  }
+
+  // Make sure graph is unchanged if it's already flat
+  public void testAlreadyFlatten() throws Exception {
+    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+        token("wtf", 1, 1, 0, 3),
+        token("what", 0, 1, 0, 3),
+        token("wow", 0, 1, 0, 3),
+        token("the", 1, 1, 0, 3),
+        token("that's", 0, 1, 0, 3),
+        token("fudge", 1, 1, 0, 3),
+        token("funny", 0, 1, 0, 3),
+        token("happened", 1, 1, 4, 12)
+      });
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+    assertTokenStreamContents(out,
+                              new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
+                              new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+                              new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+                              new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+                              new int[] {1, 1, 1, 1, 1, 1, 1, 1},
+                              12);
+  }
+
+  public void testWTF1() throws Exception {
+
+    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
+    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+        token("wtf", 1, 5, 0, 3),
+        token("what", 0, 1, 0, 3),
+        token("wow", 0, 3, 0, 3),
+        token("the", 1, 1, 0, 3),
+        token("fudge", 1, 3, 0, 3),
+        token("that's", 1, 1, 0, 3),
+        token("funny", 1, 1, 0, 3),
+        token("happened", 1, 1, 4, 12)
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+    assertTokenStreamContents(out,
+                              new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
+                              new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+                              new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+                              new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+                              new int[] {3, 1, 1, 1, 1, 1, 1, 1},
+                              12);
+    
+  }
+
+  /** Same as testWTF1 except the "wtf" token comes out later */
+  public void testWTF2() throws Exception {
+
+    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
+    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+        token("what", 1, 1, 0, 3),
+        token("wow", 0, 3, 0, 3),
+        token("wtf", 0, 5, 0, 3),
+        token("the", 1, 1, 0, 3),
+        token("fudge", 1, 3, 0, 3),
+        token("that's", 1, 1, 0, 3),
+        token("funny", 1, 1, 0, 3),
+        token("happened", 1, 1, 4, 12)
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+    assertTokenStreamContents(out,
+                              new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"},
+                              new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+                              new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+                              new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+                              new int[] {1, 1, 3, 1, 1, 1, 1, 1},
+                              12);
+    
+  }
+
+  public void testNonGreedySynonyms() throws Exception {
+    // This is just "hypothetical" for Lucene today, because SynFilter is
+    // greedy: when two syn rules match on overlapping tokens, only one
+    // (greedily) wins.  This test pretends all syn matches could match:
+
+    TokenStream in = new CannedTokenStream(0, 20, new Token[] {
+        token("wizard", 1, 1, 0, 6),
+        token("wizard_of_oz", 0, 3, 0, 12),
+        token("of", 1, 1, 7, 9),
+        token("oz", 1, 1, 10, 12),
+        token("oz_screams", 0, 2, 10, 20),
+        token("screams", 1, 1, 13, 20),
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    // ... but on output, it's flattened to wizard/wizard_of_oz of oz/oz_screams screams:
+    assertTokenStreamContents(out,
+                              new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
+                              new int[] {0, 0, 7, 10, 10, 13},
+                              new int[] {6, 12, 9, 12, 20, 20},
+                              new int[] {1, 0, 1, 1, 0, 1},
+                              new int[] {1, 3, 1, 1, 2, 1},
+                              20);
+    
+  }
+
+  public void testNonGraph() throws Exception {
+    TokenStream in = new CannedTokenStream(0, 22, new Token[] {
+        token("hello", 1, 1, 0, 5),
+        token("pseudo", 1, 1, 6, 12),
+        token("world", 1, 1, 13, 18),
+        token("fun", 1, 1, 19, 22),
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    // ... and the output is unchanged, since the input is already a flat, non-graph chain:
+    assertTokenStreamContents(out,
+                              new String[] {"hello", "pseudo", "world", "fun"},
+                              new int[] {0, 6, 13, 19},
+                              new int[] {5, 12, 18, 22},
+                              new int[] {1, 1, 1, 1},
+                              new int[] {1, 1, 1, 1},
+                              22);
+  }
+
+  public void testSimpleHole() throws Exception {
+    TokenStream in = new CannedTokenStream(0, 13, new Token[] {
+        token("hello", 1, 1, 0, 5),
+        token("hole", 2, 1, 6, 10),
+        token("fun", 1, 1, 11, 13),
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    // ... and the output is unchanged, preserving the hole (posInc=2) before "hole":
+    assertTokenStreamContents(out,
+                              new String[] {"hello", "hole", "fun"},
+                              new int[] {0, 6, 11},
+                              new int[] {5, 10, 13},
+                              new int[] {1, 2, 1},
+                              new int[] {1, 1, 1},
+                              13);
+  }
+
+  public void testHoleUnderSyn() throws Exception {
+    // Tests a StopFilter after SynFilter where a stopword in a syn is removed
+    //
+    //   wizard of oz -> woz syn, but then "of" becomes a hole
+
+    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+        token("wizard", 1, 1, 0, 6),
+        token("woz", 0, 3, 0, 12),
+        token("oz", 2, 1, 10, 12),
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    assertTokenStreamContents(out,
+                              new String[] {"wizard", "woz", "oz"},
+                              new int[] {0, 0, 10},
+                              new int[] {6, 12, 12},
+                              new int[] {1, 0, 2},
+                              new int[] {1, 3, 1},
+                              12);
+  }
+
+  public void testStrangelyNumberedNodes() throws Exception {
+
+    // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
+    TokenStream in = new CannedTokenStream(0, 27, new Token[] {
+        token("dog", 1, 3, 0, 5),
+        token("puppy", 0, 3, 0, 5),
+        token("flies", 3, 1, 6, 11),
+      });
+
+    TokenStream out = new FlattenGraphFilter(in);
+
+    assertTokenStreamContents(out,
+                              new String[] {"dog", "puppy", "flies"},
+                              new int[] {0, 0, 6},
+                              new int[] {5, 5, 11},
+                              new int[] {1, 0, 1},
+                              new int[] {1, 1, 1},
+                              27);
+  }
+
+  public void testTwoLongParallelPaths() throws Exception {
+
+    // "a a a a a a" in parallel with "b b b b b b"
+    TokenStream in = new CannedTokenStream(0, 11, new Token[] {
+        token("a", 1, 1, 0, 1),
+        token("b", 0, 2, 0, 1),
+        token("a", 1, 2, 2, 3),
+        token("b", 1, 2, 2, 3),
+        token("a", 1, 2, 4, 5),
+        token("b", 1, 2, 4, 5),
+        token("a", 1, 2, 6, 7),
+        token("b", 1, 2, 6, 7),
+        token("a", 1, 2, 8, 9),
+        token("b", 1, 2, 8, 9),
+        token("a", 1, 2, 10, 11),
+        token("b", 1, 2, 10, 11),
+      });
+
+
+    TokenStream out = new FlattenGraphFilter(in);
+    
+    // ... becomes flattened to a single path with overlapping a/b token between each node:
+    assertTokenStreamContents(out,
+                              new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"},
+                              new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10},
+                              new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11},
+                              new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+                              new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+                              11);
+    
+  }
+
+  // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 7f35298..7f0481f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -446,4 +446,73 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
       a.close();
     }
   }
+
+  /*
+  public void testToDot() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE;
+    String text = "PowerSystem2000-5-Shot's";
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    //StringWriter sw = new StringWriter();
+    // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw));
+    PrintWriter pw = new PrintWriter("/x/tmp/before.dot");
+    TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw);
+    toDot.toDot();
+    pw.close();
+    System.out.println("TEST DONE");
+    //System.out.println("DOT:\n" + sw.toString());
+  }
+  */
+
+  public void testOnlyNumbers() throws Exception {
+    int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
+    Analyzer a = new Analyzer() {
+        
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+      }
+    };
+
+    assertAnalyzesTo(a, "7-586", 
+                     new String[] {},
+                     new int[] {},
+                     new int[] {},
+                     null,
+                     new int[] {},
+                     null,
+                     false);
+  }
+
+  public void testNumberPunct() throws Exception {
+    int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
+    Analyzer a = new Analyzer() {
+        
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+      }
+    };
+
+    assertAnalyzesTo(a, "6-", 
+                     new String[] {"6"},
+                     new int[] {0},
+                     new int[] {1},
+                     null,
+                     new int[] {1},
+                     null,
+                     false);
+  }
+
+  private Analyzer getAnalyzer(final int flags) {
+    return new Analyzer() {
+        
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+      }
+    };
+  }
 }
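
The commented-out testToDot above dumps the token graph to a hard-coded path under /x/tmp. A self-contained sketch of the same idea for the new graph filter (hypothetical class name; it reuses the test-framework helpers already used in these diffs: CannedTokenStream, Token, and TokenStreamToDot) captures the Graphviz DOT output in memory instead:

    import java.io.PrintWriter;
    import java.io.StringWriter;

    import org.apache.lucene.analysis.CannedTokenStream;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.TokenStreamToDot;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;

    public class DumpWdgfGraphSketch {
      public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.CATENATE_ALL
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
        String text = "PowerSystem2000-5-Shot's";
        TokenStream wdgf = new WordDelimiterGraphFilter(
            new CannedTokenStream(new Token(text, 0, text.length())),
            WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        // TokenStreamToDot consumes the stream and writes a DOT description of the token graph.
        new TokenStreamToDot(text, wdgf, pw).toDot();
        pw.flush();
        System.out.println(sw);  // paste into Graphviz to visualize the graph
      }
    }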

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
new file mode 100644
index 0000000..2daf886
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -0,0 +1,897 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.TestUtil;
+
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
+
+/**
+ * New WordDelimiterGraphFilter tests... most of the tests are in ConvertedLegacyTest
+ * TODO: should explicitly test things like protWords and not rely on
+ * the factory tests in Solr.
+ */
+public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
+
+  public void testOffsets() throws IOException {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    // test that subwords and catenated subwords have
+    // the correct offsets.
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+    assertTokenStreamContents(wdf, 
+                              new String[] { "foobar", "foo", "bar" },
+                              new int[] { 5, 5, 9 }, 
+                              new int[] { 12, 8, 12 });
+
+    // with illegal offsets:
+    wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    assertTokenStreamContents(wdf,
+                              new String[] { "foobar", "foo", "bar" },
+                              new int[] { 5, 5, 5 },
+                              new int[] { 6, 6, 6 });
+  }
+  
+  public void testOffsetChange() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 7 },
+        new int[] { 15 });
+  }
+  
+  public void testOffsetChange2() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    // illegal offsets:
+    assertTokenStreamContents(wdf,
+                              new String[] { "übelkeit" },
+                              new int[] { 7 },
+                              new int[] { 17 });
+  }
+  
+  public void testOffsetChange3() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    assertTokenStreamContents(wdf,
+                              new String[] { "übelkeit" },
+                              new int[] { 8 },
+                              new int[] { 16 });
+  }
+  
+  public void testOffsetChange4() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    
+    assertTokenStreamContents(wdf,
+        new String[] { "foobar", "foo", "bar"},
+        new int[] { 8, 8, 12 },
+        new int[] { 15, 11, 15 });
+  }
+
+  public void doSplit(final String input, String... output) throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input),
+        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
+    
+    assertTokenStreamContents(wdf, output);
+  }
+
+  public void testSplits() throws Exception {
+    doSplit("basic-split","basic","split");
+    doSplit("camelCase","camel","Case");
+
+    // non-space marking symbol shouldn't cause split
+    // this is an example in Thai    
+    doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
+    // possessive followed by delimiter
+    doSplit("test's'", "test");
+
+    // some russian upper and lowercase
+    doSplit("\u0420\u043e\u0431\u0435\u0440\u0442", "\u0420\u043e\u0431\u0435\u0440\u0442");
+    // now cause a split (russian camelCase)
+    doSplit("\u0420\u043e\u0431\u0415\u0440\u0442", "\u0420\u043e\u0431", "\u0415\u0440\u0442");
+
+    // a composed titlecase character, don't split
+    doSplit("a\u01c5ungla", "a\u01c5ungla");
+    
+    // a modifier letter, don't split
+    doSplit("\u0633\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0644\u0627\u0645", "\u0633\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0644\u0627\u0645");
+    
+    // enclosing mark, don't split
+    doSplit("test\u20dd", "test\u20dd");
+    
+    // combining spacing mark (the virama), don't split
+    doSplit("\u0939\u093f\u0928\u094d\u0926\u0940", "\u0939\u093f\u0928\u094d\u0926\u0940");
+    
+    // don't split non-ascii digits
+    doSplit("\u0661\u0662\u0663\u0664", "\u0661\u0662\u0663\u0664");
+    
+    // don't split supplementaries into unpaired surrogates
+    doSplit("\uD840\uDC00\uD840\uDC00", "\uD840\uDC00\uD840\uDC00");
+  }
+  
+  public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
+    flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input), flags, null);
+
+    assertTokenStreamContents(wdf, output);
+  }
+  
+  /*
+   * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters. 
+   */
+  public void testPossessives() throws Exception {
+    doSplitPossessive(1, "ra's", "ra");
+    doSplitPossessive(0, "ra's", "ra", "s");
+  }
+  
+  /*
+   * Set a large position increment gap of 10 if the token is "largegap" or "/"
+   */
+  private final class LargePosIncTokenFilter extends TokenFilter {
+    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+    
+    protected LargePosIncTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/"))
+          posIncAtt.setPositionIncrement(10);
+        return true;
+      } else {
+        return false;
+      }
+    }  
+  }
+  
+  public void testPositionIncrements() throws Exception {
+    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
+    
+    /* analyzer that uses whitespace + wdf */
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
+            tokenizer,
+            flags, protWords));
+      }
+    };
+
+    /* in this case, works as expected. */
+    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+        new int[] { 0, 9 },
+        new int[] { 6, 13 },
+        null,
+        new int[] { 1, 2 },
+        null,
+        false);
+
+    /* only in this case, posInc of 2 ?! */
+    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" },
+        new int[] { 0, 9, 9, 12 },
+        new int[] { 6, 13, 12, 13 },
+        null,                     
+        new int[] { 1, 2, 0, 1 },
+        null,
+        false);
+    
+    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+        new int[] { 0, 9, 15 },
+        new int[] { 6, 14, 19 },
+        null,
+        new int[] { 1, 2, 1 },
+        null,
+        false);
+    
+    /* analyzer that will consume tokens with large position increments */
+    Analyzer a2 = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
+            new LargePosIncTokenFilter(tokenizer),
+            flags, protWords));
+      }
+    };
+    
+    /* increment of "largegap" is preserved */
+    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
+        new int[] { 0, 7, 16 },
+        new int[] { 6, 15, 20 },
+        null,
+        new int[] { 1, 10, 1 },
+        null,
+        false);
+    
+    /* the "/" had a position increment of 10, where did it go?!?!! */
+    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+        new int[] { 0, 9 },
+        new int[] { 6, 13 },
+        null,
+        new int[] { 1, 11 },
+        null,
+        false);
+    
+    /* in this case, the increment of 10 from the "/" is carried over */
+    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" },
+        new int[] { 0, 9, 9, 12 },
+        new int[] { 6, 13, 12, 13 },
+        null,
+        new int[] { 1, 11, 0, 1 },
+        null,
+        false);
+    
+    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+        new int[] { 0, 9, 15 },
+        new int[] { 6, 14, 19 },
+        null,
+        new int[] { 1, 11, 1 },
+        null,
+        false);
+
+    Analyzer a3 = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
+      }
+    };
+
+    assertAnalyzesTo(a3, "lucene.solr", 
+        new String[] { "lucenesolr", "lucene", "solr" },
+        new int[] { 0, 0, 7 },
+        new int[] { 11, 6, 11 },
+        null,
+        new int[] { 1, 0, 1 },
+        null,
+        false);
+
+    /* the stopword should add a gap here */
+    assertAnalyzesTo(a3, "the lucene.solr", 
+        new String[] { "lucenesolr", "lucene", "solr" }, 
+        new int[] { 4, 4, 11 }, 
+        new int[] { 15, 10, 15 },
+        null,
+        new int[] { 2, 0, 1 },
+        null,
+        false);
+
+    IOUtils.close(a, a2, a3);
+  }
+  
+  /** concat numbers + words + all */
+  public void testLotsOfConcatenating() throws Exception {
+    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;    
+
+    /* analyzer that uses whitespace + wdf */
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
+      }
+    };
+    
+    assertAnalyzesTo(a, "abc-def-123-456", 
+        new String[] { "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" }, 
+        new int[] { 0, 0, 0, 4, 8, 8, 12 }, 
+        new int[] { 15, 7, 3, 7, 15, 11, 15 },
+        null,
+        new int[] { 1, 0, 0, 1, 1, 0, 1 },
+        null,
+        false);
+    a.close();
+  }
+  
+  /** concat numbers + words + all + preserve original */
+  public void testLotsOfConcatenating2() throws Exception {
+    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;    
+
+    /* analyzer that uses whitespace + wdf */
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
+      }
+    };
+    
+    assertAnalyzesTo(a, "abc-def-123-456", 
+                     new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" }, 
+                     new int[] { 0, 0, 0, 0, 4, 8, 8, 12 }, 
+                     new int[] { 15, 15, 7, 3, 7, 15, 11, 15 },
+                     null,
+                     new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
+                     null,
+                     false);
+    a.close();
+  }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    int numIterations = atLeast(5);
+    for (int i = 0; i < numIterations; i++) {
+      final int flags = random().nextInt(512);
+      final CharArraySet protectedWords;
+      if (random().nextBoolean()) {
+        protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+      } else {
+        protectedWords = null;
+      }
+      
+      Analyzer a = new Analyzer() {
+        
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
+        }
+      };
+      // TODO: properly support positionLengthAttribute
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20, false, false);
+      a.close();
+    }
+  }
+  
+  /** blast some enormous random strings through the analyzer */
+  public void testRandomHugeStrings() throws Exception {
+    int numIterations = atLeast(5);
+    for (int i = 0; i < numIterations; i++) {
+      final int flags = random().nextInt(512);
+      final CharArraySet protectedWords;
+      if (random().nextBoolean()) {
+        protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+      } else {
+        protectedWords = null;
+      }
+      
+      Analyzer a = new Analyzer() {
+        
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+          TokenStream wdgf = new WordDelimiterGraphFilter(tokenizer, flags, protectedWords);
+          return new TokenStreamComponents(tokenizer, wdgf);
+        }
+      };
+      // TODO: properly support positionLengthAttribute
+      checkRandomData(random(), a, 20*RANDOM_MULTIPLIER, 8192, false, false);
+      a.close();
+    }
+  }
+  
+  public void testEmptyTerm() throws IOException {
+    Random random = random();
+    for (int i = 0; i < 512; i++) {
+      final int flags = i;
+      final CharArraySet protectedWords;
+      if (random.nextBoolean()) {
+        protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+      } else {
+        protectedWords = null;
+      }
+    
+      Analyzer a = new Analyzer() { 
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new KeywordTokenizer();
+          return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
+        }
+      };
+      // depending upon options, this thing may or may not preserve the empty term
+      checkAnalysisConsistency(random, a, random.nextBoolean(), "");
+      a.close();
+    }
+  }
+
+  private Analyzer getAnalyzer(int flags) {
+    return getAnalyzer(flags, null);
+  }
+  
+  private Analyzer getAnalyzer(int flags, CharArraySet protectedWords) {
+    return new Analyzer() { 
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
+      }
+    };
+  }
+
+  private static boolean has(int flags, int flag) {
+    return (flags & flag) != 0;
+  }
+
+  private static boolean isEnglishPossessive(String text, int pos) {
+    if (pos > 2) {
+      if ((text.charAt(pos-1) == 's' || text.charAt(pos-1) == 'S') &&
+          (pos == text.length() || text.charAt(pos) != '-')) {
+        text = text.substring(0, text.length()-2);
+      }
+    }
+    return true;
+  }
+
+  private static class WordPart {
+    final String part;
+    final int startOffset;
+    final int endOffset;
+    final int type;
+
+    public WordPart(String text, int startOffset, int endOffset) {
+      this.part = text.substring(startOffset, endOffset);
+      this.startOffset = startOffset;
+      this.endOffset = endOffset;
+      this.type = toType(part.charAt(0));
+    }
+
+    @Override
+    public String toString() {
+      return "WordPart(" + part + " " + startOffset + "-" + endOffset + ")";
+    }
+  }
+
+  private static final int NUMBER = 0;
+  private static final int LETTER = 1;
+  private static final int DELIM = 2;
+
+  private static int toType(char ch) {
+    if (Character.isDigit(ch)) {
+      // numbers
+      return NUMBER;
+    } else if (Character.isLetter(ch)) {
+      // letters
+      return LETTER;
+    } else {
+      // delimiter
+      return DELIM;
+    }
+  }
+
+  /** Does (hopefully) the same thing as WordDelimiterGraphFilter, according to the flags, but more slowly, returning all possible path strings. */
+  private Set<String> slowWDF(String text, int flags) {
+
+    // first make word parts:
+    List<WordPart> wordParts = new ArrayList<>();
+    int lastCH = -1;
+    int wordPartStart = 0;
+    boolean inToken = false;
+
+    for(int i=0;i<text.length();i++) {
+      char ch = text.charAt(i);
+      if (toType(ch) == DELIM) {
+        // delimiter
+        if (inToken) {
+          // end current token
+          wordParts.add(new WordPart(text, wordPartStart, i));
+          inToken = false;
+        }
+
+        // strip english possessive at the end of this token?:
+        if (has(flags, STEM_ENGLISH_POSSESSIVE) &&
+            ch == '\'' && i > 0 &&
+            i < text.length()-1 &&
+            (text.charAt(i+1) == 's' || text.charAt(i+1) == 'S') &&
+            toType(text.charAt(i-1)) == LETTER &&
+            (i+2 == text.length() || toType(text.charAt(i+2)) == DELIM)) {
+          i += 2;
+        }
+    
+      } else if (inToken == false) {
+        // start new token
+        inToken = true;
+        wordPartStart = i;
+      } else {
+        boolean newToken = false;
+        if (Character.isLetter(lastCH)) {
+          if (Character.isLetter(ch)) {
+            if (has(flags, SPLIT_ON_CASE_CHANGE) && Character.isLowerCase(lastCH) && Character.isLowerCase(ch) == false) {
+              // start new token on lower -> UPPER case change (but not vice versa!)
+              newToken = true;
+            }
+          } else if (has(flags, SPLIT_ON_NUMERICS) && Character.isDigit(ch)) {
+            // start new token on letter -> number change
+            newToken = true;
+          }
+        } else {
+          assert Character.isDigit(lastCH);
+          if (Character.isLetter(ch) && has(flags, SPLIT_ON_NUMERICS) ) {
+            // start new token on number -> letter change
+            newToken = true;
+          }
+        }
+        if (newToken) {
+          wordParts.add(new WordPart(text, wordPartStart, i));
+          wordPartStart = i;
+        }
+      }
+      lastCH = ch;
+    }
+
+    if (inToken) {
+      // add last token
+      wordParts.add(new WordPart(text, wordPartStart, text.length()));
+    }
+    
+    Set<String> paths = new HashSet<>();
+    if (wordParts.isEmpty() == false) {
+      enumerate(flags, 0, text, wordParts, paths, new StringBuilder());
+    }
+
+    if (has(flags, PRESERVE_ORIGINAL)) {
+      paths.add(text);
+    }
+
+    if (has(flags, CATENATE_ALL) && wordParts.isEmpty() == false) {
+      StringBuilder b = new StringBuilder();
+      for(WordPart wordPart : wordParts) {
+        b.append(wordPart.part);
+      }
+      paths.add(b.toString());
+    }
+    
+    return paths;
+  }
+
+  private void add(StringBuilder path, String part) {
+    if (path.length() != 0) {
+      path.append(' ');
+    }
+    path.append(part);
+  }
+
+  private void add(StringBuilder path, List<WordPart> wordParts, int from, int to) {
+    if (path.length() != 0) {
+      path.append(' ');
+    }
+    // no spaces:
+    for(int i=from;i<to;i++) {
+      path.append(wordParts.get(i).part);
+    }
+  }
+
+  private void addWithSpaces(StringBuilder path, List<WordPart> wordParts, int from, int to) {
+    for(int i=from;i<to;i++) {
+      add(path, wordParts.get(i).part);
+    }
+  }
+
+  /** Finds the end (exclusive) of the series of part with the same type */
+  private int endOfRun(List<WordPart> wordParts, int start) {
+    int upto = start+1;
+    while(upto < wordParts.size() && wordParts.get(upto).type == wordParts.get(start).type) {
+      upto++;
+    }
+    return upto;
+  }
+
+  /** Recursively enumerates all paths through the word parts */
+  private void enumerate(int flags, int upto, String text, List<WordPart> wordParts, Set<String> paths, StringBuilder path) {
+    if (upto == wordParts.size()) {
+      if (path.length() > 0) {
+        paths.add(path.toString());
+      }
+    } else {
+      int savLength = path.length();
+      int end = endOfRun(wordParts, upto);
+
+      if (wordParts.get(upto).type == NUMBER) {
+        // always output single word, optionally surrounded by delims:
+        if (has(flags, GENERATE_NUMBER_PARTS) || wordParts.size() == 1) {
+          addWithSpaces(path, wordParts, upto, end);
+          if (has(flags, CATENATE_NUMBERS)) {
+            // recurse first with the parts
+            enumerate(flags, end, text, wordParts, paths, path);
+            path.setLength(savLength);
+            // .. and second with the concat
+            add(path, wordParts, upto, end);
+          }
+        } else if (has(flags, CATENATE_NUMBERS)) {
+          add(path, wordParts, upto, end);
+        }
+        enumerate(flags, end, text, wordParts, paths, path);
+        path.setLength(savLength);
+      } else {
+        assert wordParts.get(upto).type == LETTER;
+        // always output single word, optionally surrounded by delims:
+        if (has(flags, GENERATE_WORD_PARTS) || wordParts.size() == 1) {
+          addWithSpaces(path, wordParts, upto, end);
+          if (has(flags, CATENATE_WORDS)) {
+            // recurse first with the parts
+            enumerate(flags, end, text, wordParts, paths, path);
+            path.setLength(savLength);
+            // .. and second with the concat
+            add(path, wordParts, upto, end);
+          }
+        } else if (has(flags, CATENATE_WORDS)) {
+          add(path, wordParts, upto, end);
+        }
+        enumerate(flags, end, text, wordParts, paths, path);
+        path.setLength(savLength);
+      }
+    }
+  }
+
+  public void testBasicGraphSplits() throws Exception {
+    assertGraphStrings(getAnalyzer(0),
+                       "PowerShotPlus",
+                       "PowerShotPlus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS),
+                       "PowerShotPlus",
+                       "PowerShotPlus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+                       "PowerShotPlus",
+                       "Power Shot Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL),
+                       "PowerShotPlus",
+                       "PowerShotPlus",
+                       "Power Shot Plus");
+
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS),
+                       "Power-Shot-Plus",
+                       "Power Shot Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+                       "Power-Shot-Plus",
+                       "Power Shot Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL),
+                       "Power-Shot-Plus",
+                       "Power-Shot-Plus",
+                       "Power Shot Plus");
+
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+                       "PowerShotPlus",
+                       "Power Shot Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+                       "PowerShot1000Plus",
+                       "Power Shot1000Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+                       "Power-Shot-Plus",
+                       "Power Shot Plus");
+
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS),
+                       "PowerShotPlus",
+                       "Power Shot Plus",
+                       "PowerShotPlus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS),
+                       "PowerShot1000Plus",
+                       "Power Shot1000Plus",
+                       "PowerShot1000Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS | CATENATE_NUMBERS),
+                       "Power-Shot-1000-17-Plus",
+                       "Power Shot 1000 17 Plus",
+                       "Power Shot 100017 Plus",
+                       "PowerShot 1000 17 Plus",
+                       "PowerShot 100017 Plus");
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS | CATENATE_NUMBERS | PRESERVE_ORIGINAL),
+                       "Power-Shot-1000-17-Plus",
+                       "Power-Shot-1000-17-Plus",
+                       "Power Shot 1000 17 Plus",
+                       "Power Shot 100017 Plus",
+                       "PowerShot 1000 17 Plus",
+                       "PowerShot 100017 Plus");
+  }
+
+  /*
+  public void testToDot() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE;
+    String text = "PowerSystem2000-5-Shot's";
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null);
+    //StringWriter sw = new StringWriter();
+    // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw));
+    PrintWriter pw = new PrintWriter("/tmp/foo2.dot");
+    TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw);
+    toDot.toDot();
+    pw.close();
+    //System.out.println("DOT:\n" + sw.toString());
+  }
+  */
+
+  private String randomWDFText() {
+    StringBuilder b = new StringBuilder();
+    int length = TestUtil.nextInt(random(), 1, 50);
+    for(int i=0;i<length;i++) {
+      int surpriseMe = random().nextInt(37);
+      int lower = -1;
+      int upper = -1;
+      if (surpriseMe < 10) {
+        // lowercase letter
+        lower = 'a';
+        upper = 'z';
+      } else if (surpriseMe < 20) {
+        // uppercase letter
+        lower = 'A';
+        upper = 'Z';
+      } else if (surpriseMe < 30) {
+        // digit
+        lower = '0';
+        upper = '9';
+      } else if (surpriseMe < 35) {
+        // punct
+        lower = '-';
+        upper = '-';
+      } else {
+        b.append("'s");
+      }
+
+      if (lower != -1) {
+        b.append((char) TestUtil.nextInt(random(), lower, upper));
+      }
+    }
+
+    return b.toString();
+  }
+
+  public void testInvalidFlag() throws Exception {
+    expectThrows(IllegalArgumentException.class,
+                 () -> {
+                   new WordDelimiterGraphFilter(new CannedTokenStream(), 1 << 31, null);
+                 });
+  }
+
+  public void testRandomPaths() throws Exception {
+    int iters = atLeast(100);
+    for(int iter=0;iter<iters;iter++) {
+      String text = randomWDFText();
+      if (VERBOSE) {
+        System.out.println("\nTEST: text=" + text + " len=" + text.length());
+      }
+
+      int flags = 0;
+      if (random().nextBoolean()) {
+        flags |= GENERATE_WORD_PARTS;
+      }
+      if (random().nextBoolean()) {
+        flags |= GENERATE_NUMBER_PARTS;
+      }
+      if (random().nextBoolean()) {
+        flags |= CATENATE_WORDS;
+      }
+      if (random().nextBoolean()) {
+        flags |= CATENATE_NUMBERS;
+      }
+      if (random().nextBoolean()) {
+        flags |= CATENATE_ALL;
+      }
+      if (random().nextBoolean()) {
+        flags |= PRESERVE_ORIGINAL;
+      }
+      if (random().nextBoolean()) {
+        flags |= SPLIT_ON_CASE_CHANGE;
+      }
+      if (random().nextBoolean()) {
+        flags |= SPLIT_ON_NUMERICS;
+      }
+      if (random().nextBoolean()) {
+        flags |= STEM_ENGLISH_POSSESSIVE;
+      }
+
+      verify(text, flags);
+    }
+  }
+
+  /** Runs normal and slow WDGF and compares results */
+  private void verify(String text, int flags) throws IOException {
+
+    Set<String> expected = slowWDF(text, flags);
+    if (VERBOSE) {
+      for(String path : expected) {
+        System.out.println("  " + path);
+      }
+    }
+
+    Set<String> actual = getGraphStrings(getAnalyzer(flags), text);
+    if (actual.equals(expected) == false) {
+      StringBuilder b = new StringBuilder();
+      b.append("\n\nFAIL: text=");
+      b.append(text);
+      b.append(" flags=");
+      b.append(WordDelimiterGraphFilter.flagsToString(flags));
+      b.append('\n');
+      b.append("  expected paths:\n");
+      for (String s : expected) {
+        b.append("    ");
+        b.append(s);
+        if (actual.contains(s) == false) {
+          b.append(" [missing!]");
+        }
+        b.append('\n');
+      }
+
+      b.append("  actual paths:\n");
+      for (String s : actual) {
+        b.append("    ");
+        b.append(s);
+        if (expected.contains(s) == false) {
+          b.append(" [unexpected!]");
+        }
+        b.append('\n');
+      }
+
+      fail(b.toString());
+    }
+  }
+
+  public void testOnlyNumbers() throws Exception {
+    // no token should be produced
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS), "7-586");
+  }
+
+  public void testNoCatenate() throws Exception {
+    // no token should be produced
+    assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS), "a-b-c-9-d", "a b c 9 d");
+  }
+
+  public void testCuriousCase1() throws Exception {
+    verify("u-0L-4836-ip4Gw--13--q7--L07E1", CATENATE_WORDS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE);
+  }
+
+  public void testCuriousCase2() throws Exception {
+    verify("u-l-p", CATENATE_ALL);
+  }
+
+  public void testOriginalPosLength() throws Exception {
+    verify("Foo-Bar-Baz", CATENATE_WORDS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL);
+  }
+
+  public void testCuriousCase3() throws Exception {
+    verify("cQzk4-GL0izl0mKM-J8--4m-'s", GENERATE_NUMBER_PARTS | CATENATE_NUMBERS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS);
+  }
+
+  public void testEmptyString() throws Exception {
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)), DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
+    wdf.reset();
+    assertTrue(wdf.incrementToken());
+    assertFalse(wdf.incrementToken());
+    wdf.end();
+    wdf.close();
+  }
+
+  public void testProtectedWords() throws Exception {
+    TokenStream tokens = new CannedTokenStream(new Token("foo17-bar", 0, 9),
+                                               new Token("foo-bar", 0, 7));
+
+    CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
+    assertGraphStrings(wdf,
+                       "foo17-bar foo bar",
+                       "foo17-bar foo-bar",
+                       "foo17-bar foobar");
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java
deleted file mode 100644
index d61fa96..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.synonym;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CannedTokenStream;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-
-public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
-  
-  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
-    final Token t = new Token(term, startOffset, endOffset);
-    t.setPositionIncrement(posInc);
-    t.setPositionLength(posLength);
-    return t;
-  }
-
-  public void testSimpleMock() throws Exception {
-    Analyzer a = new Analyzer() {
-        @Override
-        protected TokenStreamComponents createComponents(String fieldName) {
-          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
-          TokenStream ts = new FlattenGraphFilter(tokenizer);
-          return new TokenStreamComponents(tokenizer, ts);
-        }
-      };
-
-    assertAnalyzesTo(a, "wtf happened",
-                     new String[] {"wtf", "happened"},
-                     new int[]    {    0,          4},
-                     new int[]    {    3,         12},
-                     null,
-                     new int[]    {    1,          1},
-                     new int[]    {    1,          1},
-                     true);
-  }
-
-  // Make sure graph is unchanged if it's already flat
-  public void testAlreadyFlatten() throws Exception {
-    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
-        token("wtf", 1, 1, 0, 3),
-        token("what", 0, 1, 0, 3),
-        token("wow", 0, 1, 0, 3),
-        token("the", 1, 1, 0, 3),
-        token("that's", 0, 1, 0, 3),
-        token("fudge", 1, 1, 0, 3),
-        token("funny", 0, 1, 0, 3),
-        token("happened", 1, 1, 4, 12)
-      });
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
-    assertTokenStreamContents(out,
-                              new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
-                              new int[] {0, 0, 0, 0, 0, 0, 0, 4},
-                              new int[] {3, 3, 3, 3, 3, 3, 3, 12},
-                              new int[] {1, 0, 0, 1, 0, 1, 0, 1},
-                              new int[] {1, 1, 1, 1, 1, 1, 1, 1},
-                              12);
-  }
-
-  public void testWTF1() throws Exception {
-
-    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
-    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
-        token("wtf", 1, 5, 0, 3),
-        token("what", 0, 1, 0, 3),
-        token("wow", 0, 3, 0, 3),
-        token("the", 1, 1, 0, 3),
-        token("fudge", 1, 3, 0, 3),
-        token("that's", 1, 1, 0, 3),
-        token("funny", 1, 1, 0, 3),
-        token("happened", 1, 1, 4, 12)
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
-    assertTokenStreamContents(out,
-                              new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
-                              new int[] {0, 0, 0, 0, 0, 0, 0, 4},
-                              new int[] {3, 3, 3, 3, 3, 3, 3, 12},
-                              new int[] {1, 0, 0, 1, 0, 1, 0, 1},
-                              new int[] {3, 1, 1, 1, 1, 1, 1, 1},
-                              12);
-    
-  }
-
-  /** Same as testWTF1 except the "wtf" token comes out later */
-  public void testWTF2() throws Exception {
-
-    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
-    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
-        token("what", 1, 1, 0, 3),
-        token("wow", 0, 3, 0, 3),
-        token("wtf", 0, 5, 0, 3),
-        token("the", 1, 1, 0, 3),
-        token("fudge", 1, 3, 0, 3),
-        token("that's", 1, 1, 0, 3),
-        token("funny", 1, 1, 0, 3),
-        token("happened", 1, 1, 4, 12)
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
-    assertTokenStreamContents(out,
-                              new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"},
-                              new int[] {0, 0, 0, 0, 0, 0, 0, 4},
-                              new int[] {3, 3, 3, 3, 3, 3, 3, 12},
-                              new int[] {1, 0, 0, 1, 0, 1, 0, 1},
-                              new int[] {1, 1, 3, 1, 1, 1, 1, 1},
-                              12);
-    
-  }
-
-  public void testNonGreedySynonyms() throws Exception {
-    // This is just "hypothetical" for Lucene today, because SynFilter is
-    // greedy: when two syn rules match on overlapping tokens, only one
-    // (greedily) wins.  This test pretends all syn matches could match:
-
-    TokenStream in = new CannedTokenStream(0, 20, new Token[] {
-        token("wizard", 1, 1, 0, 6),
-        token("wizard_of_oz", 0, 3, 0, 12),
-        token("of", 1, 1, 7, 9),
-        token("oz", 1, 1, 10, 12),
-        token("oz_screams", 0, 2, 10, 20),
-        token("screams", 1, 1, 13, 20),
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    // ... but on output, it's flattened to wizard/wizard_of_oz of oz/oz_screams screams:
-    assertTokenStreamContents(out,
-                              new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
-                              new int[] {0, 0, 7, 10, 10, 13},
-                              new int[] {6, 12, 9, 12, 20, 20},
-                              new int[] {1, 0, 1, 1, 0, 1},
-                              new int[] {1, 3, 1, 1, 2, 1},
-                              20);
-    
-  }
-
-  public void testNonGraph() throws Exception {
-    TokenStream in = new CannedTokenStream(0, 22, new Token[] {
-        token("hello", 1, 1, 0, 5),
-        token("pseudo", 1, 1, 6, 12),
-        token("world", 1, 1, 13, 18),
-        token("fun", 1, 1, 19, 22),
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    // ... and the output is identical, since the input was never a graph:
-    assertTokenStreamContents(out,
-                              new String[] {"hello", "pseudo", "world", "fun"},
-                              new int[] {0, 6, 13, 19},
-                              new int[] {5, 12, 18, 22},
-                              new int[] {1, 1, 1, 1},
-                              new int[] {1, 1, 1, 1},
-                              22);
-  }
-
-  public void testSimpleHole() throws Exception {
-    TokenStream in = new CannedTokenStream(0, 13, new Token[] {
-        token("hello", 1, 1, 0, 5),
-        token("hole", 2, 1, 6, 10),
-        token("fun", 1, 1, 11, 13),
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    // ... and the position hole (posInc=2) before "hole" is preserved on output:
-    assertTokenStreamContents(out,
-                              new String[] {"hello", "hole", "fun"},
-                              new int[] {0, 6, 11},
-                              new int[] {5, 10, 13},
-                              new int[] {1, 2, 1},
-                              new int[] {1, 1, 1},
-                              13);
-  }
-
-  public void testHoleUnderSyn() throws Exception {
-    // Tests a StopFilter after SynFilter where a stopword in a syn is removed
-    //
-    //   wizard of oz -> woz syn, but then "of" becomes a hole
-
-    TokenStream in = new CannedTokenStream(0, 12, new Token[] {
-        token("wizard", 1, 1, 0, 6),
-        token("woz", 0, 3, 0, 12),
-        token("oz", 2, 1, 10, 12),
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    assertTokenStreamContents(out,
-                              new String[] {"wizard", "woz", "oz"},
-                              new int[] {0, 0, 10},
-                              new int[] {6, 12, 12},
-                              new int[] {1, 0, 2},
-                              new int[] {1, 3, 1},
-                              12);
-  }
-
-  public void testStrangelyNumberedNodes() throws Exception {
-
-    // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
-    TokenStream in = new CannedTokenStream(0, 27, new Token[] {
-        token("dog", 1, 3, 0, 5),
-        token("puppy", 0, 3, 0, 5),
-        token("flies", 3, 1, 6, 11),
-      });
-
-    TokenStream out = new FlattenGraphFilter(in);
-
-    assertTokenStreamContents(out,
-                              new String[] {"dog", "puppy", "flies"},
-                              new int[] {0, 0, 6},
-                              new int[] {5, 5, 11},
-                              new int[] {1, 0, 1},
-                              new int[] {1, 1, 1},
-                              27);
-  }
-
-  public void testTwoLongParallelPaths() throws Exception {
-
-    // "a a a a a a" in parallel with "b b b b b b"
-    TokenStream in = new CannedTokenStream(0, 11, new Token[] {
-        token("a", 1, 1, 0, 1),
-        token("b", 0, 2, 0, 1),
-        token("a", 1, 2, 2, 3),
-        token("b", 1, 2, 2, 3),
-        token("a", 1, 2, 4, 5),
-        token("b", 1, 2, 4, 5),
-        token("a", 1, 2, 6, 7),
-        token("b", 1, 2, 6, 7),
-        token("a", 1, 2, 8, 9),
-        token("b", 1, 2, 8, 9),
-        token("a", 1, 2, 10, 11),
-        token("b", 1, 2, 10, 11),
-      });
-
-
-    TokenStream out = new FlattenGraphFilter(in);
-    
-    // ... becomes flattened to a single path with overlapping a/b token between each node:
-    assertTokenStreamContents(out,
-                              new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"},
-                              new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10},
-                              new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11},
-                              new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
-                              new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
-                              11);
-    
-  }
-
-  // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
-}
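For context, a minimal sketch of where FlattenGraphFilter typically sits in an index-time analysis chain: a graph-producing filter such as SynonymGraphFilter runs first, and the flattener then squashes the graph, since the indexer ignores position length. The StandardTokenizer and the already-built SynonymMap below are only assumptions, not part of this change:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.FlattenGraphFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;

    static Analyzer newIndexTimeAnalyzer(SynonymMap synonyms) {
      // "synonyms" is a hypothetical, already-built SynonymMap.
      return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
          Tokenizer tokenizer = new StandardTokenizer();
          TokenStream ts = new SynonymGraphFilter(tokenizer, synonyms, true);
          // Index-time only: flatten the graph so the indexer sees a linear token stream.
          ts = new FlattenGraphFilter(ts);
          return new TokenStreamComponents(tokenizer, ts);
        }
      };
    }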

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
index edf2d2a..e00a165 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
@@ -17,14 +17,22 @@
 
 package org.apache.lucene.analysis.synonym;
 
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -35,7 +43,6 @@ import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.IOUtils;
@@ -49,15 +56,6 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
 import org.apache.lucene.util.automaton.Transition;
 import org.apache.lucene.util.fst.Util;
 
-import java.io.IOException;
-import java.io.StringReader;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
 public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
 
   /** Set as a side effect by {@link #getAnalyzer} and {@link #getFlattenAnalyzer}. */
@@ -1832,7 +1830,7 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
                      new int[]      {1,        1,   0,        0,     0,        1,   0,        0,   1,    0,         1,    1,         1},
                      new int[]      {1,        1,   1,        1,     4,        3,   1,        1,   2,    1,         1,    1,         1});
     
-    assertAllStrings(analyzer, "the usa is wealthy", new String[] {
+    assertGraphStrings(analyzer, "the usa is wealthy", new String[] {
         "the usa is wealthy",
         "the united states is wealthy",
         "the u s a is wealthy",
@@ -1924,33 +1922,4 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
         new int[]{1, 1, 0, 1, 1});
     a.close();
   }
-
-  /**
-   * Helper method to validate all strings that can be generated from a token stream.
-   * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all
-   * and only the given valid strings.
-   * @param analyzer analyzer containing the SynonymFilter under test.
-   * @param text text to be analyzed.
-   * @param expectedStrings all expected finite strings.
-   */
-  public void assertAllStrings(Analyzer analyzer, String text, String[] expectedStrings) throws IOException {
-    TokenStream tokenStream = analyzer.tokenStream("dummy", text);
-    try {
-      Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
-      Set<IntsRef> finiteStrings = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
-
-      assertEquals("Invalid resulting strings count. Expected " + expectedStrings.length + " was " + finiteStrings.size(),
-          expectedStrings.length, finiteStrings.size());
-
-      Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
-
-      BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
-      for (IntsRef ir: finiteStrings) {
-        String s = Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' ');
-        assertTrue("Unexpected string found: " + s, expectedStringsSet.contains(s));
-      }
-    } finally {
-      tokenStream.close();
-    }
-  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
index 64bac66..0675abe 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@@ -39,6 +39,7 @@ import org.apache.lucene.util.automaton.Automaton;
 public class TokenStreamToAutomaton {
 
   private boolean preservePositionIncrements;
+  private boolean finalOffsetGapAsHole;
   private boolean unicodeArcs;
 
   /** Sole constructor. */
@@ -51,6 +52,11 @@ public class TokenStreamToAutomaton {
     this.preservePositionIncrements = enablePositionIncrements;
   }
 
+  /** If true, any final offset gaps will result in adding a position hole. */
+  public void setFinalOffsetGapAsHole(boolean finalOffsetGapAsHole) {
+    this.finalOffsetGapAsHole = finalOffsetGapAsHole;
+  }
+
   /** Whether to make transition labels Unicode code points instead of UTF8 bytes, 
    *  <code>false</code> by default */
   public void setUnicodeArcs(boolean unicodeArcs) {
@@ -118,7 +124,7 @@ public class TokenStreamToAutomaton {
     int maxOffset = 0;
     while (in.incrementToken()) {
       int posInc = posIncAtt.getPositionIncrement();
-      if (!preservePositionIncrements && posInc > 1) {
+      if (preservePositionIncrements == false && posInc > 1) {
         posInc = 1;
       }
       assert pos > -1 || posInc > 0;
@@ -201,10 +207,35 @@ public class TokenStreamToAutomaton {
     }
 
     in.end();
+
     int endState = -1;
-    if (offsetAtt.endOffset() > maxOffset) {
+
+    int endPosInc = posIncAtt.getPositionIncrement();
+
+    if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
+      endPosInc = 1;
+    }
+    
+    if (endPosInc > 0) {
+      // there were hole(s) after the last token
       endState = builder.createState();
-      builder.setAccept(endState, true);
+
+      // add trailing holes now:
+      int lastState = endState;
+      while (true) {
+        int state1 = builder.createState();
+        builder.addTransition(lastState, state1, HOLE);
+        endPosInc--;
+        if (endPosInc == 0) {
+          builder.setAccept(state1, true);
+          break;
+        }
+        int state2 = builder.createState();
+        builder.addTransition(state1, state2, POS_SEP);
+        lastState = state2;
+      }
+    } else {
+      endState = -1;
     }
 
     pos++;
@@ -219,7 +250,7 @@ public class TokenStreamToAutomaton {
       }
       pos++;
     }
-
+    
     return builder.finish();
   }
 

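For context, a minimal sketch of how the new setFinalOffsetGapAsHole option might be used when turning a token stream into an automaton, mirroring what AnalyzingSuggester.getTokenStreamToAutomaton() does later in this patch; the analyzer, field name and helper method are only assumptions:

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.TokenStreamToAutomaton;
    import org.apache.lucene.util.automaton.Automaton;

    static Automaton toAutomatonWithTrailingHole(Analyzer analyzer, String text) throws IOException {
      TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
      // Treat a final offset gap (e.g. a trailing stopword that was removed)
      // as an explicit position hole in the resulting automaton.
      tsta.setFinalOffsetGapAsHole(true);
      try (TokenStream ts = analyzer.tokenStream("field", text)) {
        return tsta.toAutomaton(ts);
      }
    }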
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
index cdc5d42..166d6b2 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
@@ -43,7 +43,7 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
     // OffsetAtt
 
     if (startOffset < 0 || endOffset < startOffset) {
-      throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+      throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
           + "startOffset=" + startOffset + ",endOffset=" + endOffset);
     }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
index c89a374..ad1e232 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
@@ -107,7 +107,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
   @Override
   public void setOffset(int startOffset, int endOffset) {
     if (startOffset < 0 || endOffset < startOffset) {
-      throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+      throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
           + "startOffset=" + startOffset + ",endOffset=" + endOffset);
     }
     this.startOffset = startOffset;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
index 4d63d6f..e89fec1 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
@@ -30,8 +30,7 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
   @Override
   public void setPositionIncrement(int positionIncrement) {
     if (positionIncrement < 0) {
-      throw new IllegalArgumentException
-        ("Increment must be zero or greater: got " + positionIncrement);
+      throw new IllegalArgumentException("Position increment must be zero or greater; got " + positionIncrement);
     }
     this.positionIncrement = positionIncrement;
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
index 9bfdb49..d019a2b 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
@@ -30,8 +30,7 @@ public class PositionLengthAttributeImpl extends AttributeImpl implements Positi
   @Override
   public void setPositionLength(int positionLength) {
     if (positionLength < 1) {
-      throw new IllegalArgumentException
-        ("Position length must be 1 or greater: got " + positionLength);
+      throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength);
     }
     this.positionLength = positionLength;
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
index 8899dd1..7e98662 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
@@ -21,16 +21,22 @@ import java.io.PrintWriter;
 import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Random;
+import java.util.Set;
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.fst.Util;
 
 import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
 
@@ -565,7 +571,13 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
     assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
   }
 
-  // TODO: testEndsWithHole... but we need posInc to set in TS.end()
+  public void testEndsWithHole() throws Exception {
+    final TokenStream ts = new CannedTokenStream(1, 0,
+                                                 new Token[] {
+                                                   token("abc", 2, 1),
+                                                 });
+    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc"), SEP_A, HOLE_A), ts);
+  }
 
   public void testSynHangingOverEnd() throws Exception {
     final TokenStream ts = new CannedTokenStream(
@@ -576,14 +588,47 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
     assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
   }
 
+  /** Returns all paths accepted by the automaton, one string per path, with POS_SEP rendered as a space. */
+  private Set<String> toPathStrings(Automaton a) {
+    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
+    Set<String> paths = new HashSet<>();
+    for (IntsRef ir: AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
+      paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
+    }
+    return paths;
+  }
+
   private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
     assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
   }
 
   private void assertSameLanguage(Automaton expected, Automaton actual) {
-    assertTrue(Operations.sameLanguage(
-      Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
-      Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
+    Automaton expectedDet = Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES);
+    Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES);
+    if (Operations.sameLanguage(expectedDet, actualDet) == false) {
+      Set<String> expectedPaths = toPathStrings(expectedDet);
+      Set<String> actualPaths = toPathStrings(actualDet);
+      StringBuilder b = new StringBuilder();
+      b.append("expected:\n");
+      for(String path : expectedPaths) {
+        b.append("  ");
+        b.append(path);
+        if (actualPaths.contains(path) == false) {
+          b.append(" [missing!]");
+        }
+        b.append('\n');
+      }
+      b.append("actual:\n");
+      for(String path : actualPaths) {
+        b.append("  ");
+        b.append(path);
+        if (expectedPaths.contains(path) == false) {
+          b.append(" [unexpected!]");
+        }
+        b.append('\n');
+      }
+      fail("accepted language is different:\n\n" + b.toString());
+    }
   }
 
   public void testTokenStreamGraphWithHoles() throws Exception {

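When the two automata accept different languages, the rewritten assertSameLanguage above now dumps both path sets instead of failing opaquely. An illustrative failure (the paths here are made up) would look like:

    accepted language is different:

    expected:
      wizard of oz [missing!]
      woz
    actual:
      woz
      wizard oz [unexpected!]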
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index 19982a5..9c6a624 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -332,6 +332,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
   TokenStreamToAutomaton getTokenStreamToAutomaton() {
     final TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
     tsta.setPreservePositionIncrements(preservePositionIncrements);
+    tsta.setFinalOffsetGapAsHole(true);
     return tsta;
   }
   
@@ -865,7 +866,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
     // Turn tokenstream into automaton:
     Automaton automaton = null;
     try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
-        automaton = getTokenStreamToAutomaton().toAutomaton(ts);
+      automaton = getTokenStreamToAutomaton().toAutomaton(ts);
     }
 
     automaton = replaceSep(automaton);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 924756e..070eab2 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -41,11 +41,16 @@ import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeReflector;
+import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LineFileDocs;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.fst.Util;
 
 /** 
  * Base class for all Lucene unit tests that use TokenStreams. 
@@ -166,6 +171,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     final Map<Integer,Integer> posToStartOffset = new HashMap<>();
     final Map<Integer,Integer> posToEndOffset = new HashMap<>();
 
+    // TODO: would be nice to be able to assert silly duplicated tokens are not created, but a number of cases do this "legitimately": LUCENE-7622
+
     ts.reset();
     int pos = -1;
     int lastStartOffset = 0;
@@ -182,7 +189,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
       assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
-      
+
       assertEquals("term "+i, output[i], termAtt.toString());
       if (startOffsets != null) {
         assertEquals("startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset());
@@ -261,12 +268,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         }
       }
       if (posLengthAtt != null) {
-        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+        assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
       }
     }
 
     if (ts.incrementToken()) {
-      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt);
+      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + ts.getAttribute(CharTermAttribute.class));
     }
 
     // repeat our extra safety checks for end()
@@ -977,4 +984,105 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   public static AttributeFactory newAttributeFactory() {
     return newAttributeFactory(random());
   }
+
+  private static String toString(Set<String> strings) {
+    List<String> stringsList = new ArrayList<>(strings);
+    Collections.sort(stringsList);
+    StringBuilder b = new StringBuilder();
+    for(String s : stringsList) {
+      b.append("  ");
+      b.append(s);
+      b.append('\n');
+    }
+    return b.toString();
+  }
+
+  /**
+   * Enumerates all accepted strings in the token graph created by the analyzer on the provided text, and then
+   * asserts that they are exactly the expected strings.
+   * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all
+   * and only the given valid strings.
+   * @param analyzer analyzer producing the token graph under test.
+   * @param text text to be analyzed.
+   * @param expectedStrings all expected finite strings.
+   */
+  public static void assertGraphStrings(Analyzer analyzer, String text, String... expectedStrings) throws IOException {
+    checkAnalysisConsistency(random(), analyzer, true, text, true);
+    try (TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
+      assertGraphStrings(tokenStream, expectedStrings);
+    }
+  }
+
+  /**
+   * Enumerates all accepted strings in the token graph created by the already initialized {@link TokenStream}.
+   */
+  public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException {
+    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
+    Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
+
+    Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
+
+    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
+    Set<String> actualStrings = new HashSet<>();
+    for (IntsRef ir: actualStringPaths) {
+      actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
+    }
+    for (String s : actualStrings) {
+      assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), expectedStringsSet.contains(s));
+    }
+    for (String s : expectedStrings) {
+      assertTrue("Analyzer did not create expected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), actualStrings.contains(s));
+    }
+  }
+
+  /** Returns all paths accepted by the token stream graph produced by analyzing text with the provided analyzer.  The tokens'
+   *  {@link CharTermAttribute} values are concatenated and separated with a space. */
+  public static Set<String> getGraphStrings(Analyzer analyzer, String text) throws IOException {
+    try(TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
+      return getGraphStrings(tokenStream);
+    }
+  }
+
+  /** Returns all paths accepted by the token stream graph produced by the already initialized {@link TokenStream}. */
+  public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
+    Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
+    Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
+    BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
+    Set<String> paths = new HashSet<>();
+    for (IntsRef ir: actualStringPaths) {
+      paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
+    }
+    return paths;
+  }
+
+  /** Returns a {@code String} summary of the tokens this analyzer produces on this text */
+  public static String toString(Analyzer analyzer, String text) throws IOException {
+    try(TokenStream ts = analyzer.tokenStream("field", text)) {
+      StringBuilder b = new StringBuilder();
+      CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+      PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+      PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+      OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+      assertNotNull(offsetAtt);
+      ts.reset();
+      int pos = -1;
+      while (ts.incrementToken()) {
+        pos += posIncAtt.getPositionIncrement();
+        b.append(termAtt);
+        b.append(" at pos=");
+        b.append(pos);
+        if (posLengthAtt != null) {
+          b.append(" to pos=");
+          b.append(pos + posLengthAtt.getPositionLength());
+        }
+        b.append(" offsets=");
+        b.append(offsetAtt.startOffset());
+        b.append('-');
+        b.append(offsetAtt.endOffset());
+        b.append('\n');
+      }
+      ts.end();
+      return b.toString();
+    }
+  }
 }
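For context, a minimal sketch of how a test extending BaseTokenStreamTestCase might call the new helpers; the analyzer here is hypothetical and is assumed to map the single token "wtf" onto the multi-token synonym "what the fudge":

    // Assert the token graph accepts exactly these two paths:
    assertGraphStrings(analyzer, "wtf happened",
                       "wtf happened",
                       "what the fudge happened");

    // Or inspect the accepted paths directly:
    Set<String> paths = getGraphStrings(analyzer, "wtf happened");
    assertTrue(paths.contains("what the fudge happened"));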