You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2014/01/30 00:34:49 UTC

svn commit: r1562639 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/shingle/ analysis/common/src/test/org/apache/lucene/analysis/shingle/

Author: sarowe
Date: Wed Jan 29 23:34:48 2014
New Revision: 1562639

URL: http://svn.apache.org/r1562639
Log:
LUCENE-5353: ShingleFilter's filler token should be configurable

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Jan 29 23:34:48 2014
@@ -119,6 +119,9 @@ New Features
   encode term metadata, and all dictionary implementations can now plug in any 
   PostingsBaseFormat. (Han Jiang, Mike McCandless)
 
+* LUCENE-5353: ShingleFilter's filler token should be configurable.
+  (Ahmet Arslan, Simon Willnauer, Steve Rowe)
+
 Build
 
 * LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Wed Jan 29 23:34:48 2014
@@ -36,6 +36,7 @@ public final class ShingleAnalyzerWrappe
   private final String tokenSeparator;
   private final boolean outputUnigrams;
   private final boolean outputUnigramsIfNoShingles;
+  private final String fillerToken;
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
     this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
@@ -46,7 +47,8 @@ public final class ShingleAnalyzerWrappe
   }
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
-    this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
+    this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+         true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
   }
 
   /**
@@ -63,6 +65,7 @@ public final class ShingleAnalyzerWrappe
    *        minShingleSize tokens in the input stream)?
    *        Note that if outputUnigrams==true, then unigrams are always output,
    *        regardless of whether any shingles are available.
+   * @param fillerToken filler token to use when positionIncrement is more than 1
    */
   public ShingleAnalyzerWrapper(
       Analyzer delegate,
@@ -70,7 +73,8 @@ public final class ShingleAnalyzerWrappe
       int maxShingleSize,
       String tokenSeparator,
       boolean outputUnigrams,
-      boolean outputUnigramsIfNoShingles) {
+      boolean outputUnigramsIfNoShingles,
+      String fillerToken) {
     super(delegate.getReuseStrategy());
     this.delegate = delegate;
 
@@ -91,6 +95,7 @@ public final class ShingleAnalyzerWrappe
     this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
     this.outputUnigrams = outputUnigrams;
     this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+    this.fillerToken = fillerToken;
   }
 
   /**
@@ -137,6 +142,10 @@ public final class ShingleAnalyzerWrappe
     return outputUnigramsIfNoShingles;
   }
 
+  public String getFillerToken() {
+    return fillerToken;
+  }
+
   @Override
   public final Analyzer getWrappedAnalyzer(String fieldName) {
     return delegate;
@@ -150,6 +159,7 @@ public final class ShingleAnalyzerWrappe
     filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
     filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+    filter.setFillerToken(fillerToken);
     return new TokenStreamComponents(components.getTokenizer(), filter);
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Wed Jan 29 23:34:48 2014
@@ -47,7 +47,7 @@ public final class ShingleFilter extends
   /**
    * filler token for when positionIncrement is more than 1
    */
-  public static final char[] FILLER_TOKEN = { '_' };
+  public static final String DEFAULT_FILLER_TOKEN = "_";
 
   /**
    * default maximum shingle size is 2.
@@ -67,7 +67,7 @@ public final class ShingleFilter extends
   /**
    * The default string to use when joining adjacent tokens to form a shingle
    */
-  public static final String TOKEN_SEPARATOR = " ";
+  public static final String DEFAULT_TOKEN_SEPARATOR = " ";
 
   /**
    * The sequence of input stream tokens (or filler tokens, if necessary)
@@ -95,7 +95,13 @@ public final class ShingleFilter extends
   /**
    * The string to use when joining adjacent tokens to form a shingle
    */
-  private String tokenSeparator = TOKEN_SEPARATOR;
+  private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
+
+  /**
+   * The string to insert for each position at which there is no token
+   * (i.e., when position increment is greater than one).
+   */
+  private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
 
   /**
    * By default, we output unigrams (individual tokens) as well as shingles
@@ -284,6 +290,16 @@ public final class ShingleFilter extends
     this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
   }
 
+  /**
+   * Sets the string to insert for each position at which there is no token
+   * (i.e., when position increment is greater than one).
+   *
+   * @param fillerToken string to insert at each position where there is no token (null is treated as empty)
+   */
+  public void setFillerToken(String fillerToken) {
+    this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
+  }
+
   @Override
   public boolean incrementToken() throws IOException {
     boolean tokenAvailable = false;
@@ -341,7 +357,7 @@ public final class ShingleFilter extends
   /**
    * <p>Get the next token from the input stream.
    * <p>If the next token has <code>positionIncrement > 1</code>,
-   * <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
+   * <code>positionIncrement - 1</code> {@link #fillerToken}s are
    * inserted first.
    * @param target Where to put the new token; if null, a new instance is created.
    * @return On success, the populated token; null otherwise
@@ -359,7 +375,7 @@ public final class ShingleFilter extends
       // A filler token occupies no space
       newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), 
                                     newTarget.offsetAtt.startOffset());
-      newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+      newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
       newTarget.isFiller = true;
       --numFillerTokensToInsert;
     } else if (isNextInputStreamToken) {
@@ -390,7 +406,7 @@ public final class ShingleFilter extends
           isNextInputStreamToken = true;
           // A filler token occupies no space
           newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-          newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+          newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
           newTarget.isFiller = true;
           --numFillerTokensToInsert;
         } else {

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java Wed Jan 29 23:34:48 2014
@@ -29,7 +29,7 @@ import java.util.Map;
  *   &lt;analyzer&gt;
  *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
  *     &lt;filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/&gt;
+ *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  */
@@ -39,6 +39,7 @@ public class ShingleFilterFactory extend
   private final boolean outputUnigrams;
   private final boolean outputUnigramsIfNoShingles;
   private final String tokenSeparator;
+  private final String fillerToken;
 
   /** Creates a new ShingleFilterFactory */
   public ShingleFilterFactory(Map<String, String> args) {
@@ -57,7 +58,8 @@ public class ShingleFilterFactory extend
     }
     outputUnigrams = getBoolean(args, "outputUnigrams", true);
     outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
-    tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
+    tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
+    fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -69,6 +71,7 @@ public class ShingleFilterFactory extend
     r.setOutputUnigrams(outputUnigrams);
     r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     r.setTokenSeparator(tokenSeparator);
+    r.setFillerToken(fillerToken);
     return r;
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Wed Jan 29 23:34:48 2014
@@ -21,9 +21,13 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
@@ -169,7 +173,8 @@ public class ShingleAnalyzerWrapperTest 
                           new int[] { 1,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1 });
 
     analyzer = new ShingleAnalyzerWrapper(
-        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
+        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                           new String[] { "please divide this",   "please divide this sentence", 
                                          "divide this sentence", "divide this sentence into", 
@@ -195,7 +200,8 @@ public class ShingleAnalyzerWrapperTest 
                           new int[] { 1,  0,  1,  0,  1,  0,  1,  0,  1,  1 });
 
     analyzer = new ShingleAnalyzerWrapper(
-        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
+        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                           new String[] { "please divide this", 
                                          "divide this sentence", 
@@ -211,7 +217,8 @@ public class ShingleAnalyzerWrapperTest 
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", true, false);
+        "", true, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please", "pleasedivide", 
                                          "divide", "divideinto", 
@@ -225,7 +232,8 @@ public class ShingleAnalyzerWrapperTest 
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, false);
+        "", false, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "pleasedivide", 
                                          "divideinto", 
@@ -240,7 +248,8 @@ public class ShingleAnalyzerWrapperTest 
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        null, true, false);
+        null, true, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please", "pleasedivide", 
                                          "divide", "divideinto", 
@@ -254,7 +263,8 @@ public class ShingleAnalyzerWrapperTest 
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, false);
+        "", false, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "pleasedivide", 
                                          "divideinto", 
@@ -263,12 +273,14 @@ public class ShingleAnalyzerWrapperTest 
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
   }
+
   public void testAltTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "<SEP>", true, false);
+        "<SEP>", true, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please", "please<SEP>divide", 
                                          "divide", "divide<SEP>into", 
@@ -282,7 +294,8 @@ public class ShingleAnalyzerWrapperTest 
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "<SEP>", false, false);
+        "<SEP>", false, false,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please divide into shingles",
                           new String[] { "please<SEP>divide", 
                                          "divide<SEP>into", 
@@ -291,13 +304,64 @@ public class ShingleAnalyzerWrapperTest 
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
   }
-  
+
+  public void testAltFillerToken() throws Exception {
+    Analyzer delegate = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
+        return new TokenStreamComponents(tokenizer, filter);
+      }
+    };
+
+    ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
+        delegate,
+        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+        true, false, "--");
+    assertAnalyzesTo(analyzer, "please divide into shingles",
+                     new String[] { "please", "please divide",
+                                    "divide", "divide --",
+                                    "-- shingles", "shingles" },
+                     new int[] { 0,  0,  7,  7, 19, 19 },
+                     new int[] { 6, 13, 13, 19, 27, 27 },
+                     new int[] { 1,  0,  1,  0,  1,  1 });
+
+    analyzer = new ShingleAnalyzerWrapper(
+        delegate,
+        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+        false, false, null);
+    assertAnalyzesTo(analyzer, "please divide into shingles",
+                     new String[] { "please divide", "divide ", " shingles" },
+                     new int[] {  0,  7, 19 },
+                     new int[] { 13, 19, 27 },
+                     new int[] {  1,  1,  1 });
+
+    analyzer = new ShingleAnalyzerWrapper(
+        delegate,
+        ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+        ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+        ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+        false, false, "");
+    assertAnalyzesTo(analyzer, "please divide into shingles",
+                     new String[] { "please divide", "divide ", " shingles" },
+                     new int[] {  0,  7, 19 },
+                     new int[] { 13, 19, 27 },
+                     new int[] {  1,  1,  1 });
+  }
+
   public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
     ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
         ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
         ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
-        "", false, true);
+        "", false, true,
+        ShingleFilter.DEFAULT_FILLER_TOKEN);
     assertAnalyzesTo(analyzer, "please",
                           new String[] { "please" },
                           new int[] { 0 },

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Wed Jan 29 23:34:48 2014
@@ -1196,4 +1196,52 @@ public class ShingleFilterTest extends B
                               new int[] {1, 0, 0, 1, 0, 0},
                               20);
   }
+
+  public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
+    // Analyzing "purple wizard of the", where "of" and "the" are removed as
+    // stopwords, leaving two trailing holes:
+    Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
+    ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken("--");
+
+    assertTokenStreamContents(filter,
+        new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
+        new int[]{0, 0, 0, 7, 7, 7},
+        new int[]{6, 13, 20, 13, 20, 20},
+        new int[]{1, 0, 0, 1, 0, 0},
+        20);
+
+     filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken("");
+
+    assertTokenStreamContents(filter,
+        new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  "},
+        new int[]{0, 0, 0, 7, 7, 7},
+        new int[]{6, 13, 20, 13, 20, 20},
+        new int[]{1, 0, 0, 1, 0, 0},
+        20);
+
+
+    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken(null);
+
+    assertTokenStreamContents(filter,
+        new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard  "},
+        new int[] {0, 0, 0, 7, 7, 7},
+        new int[] {6, 13, 20, 13, 20, 20},
+        new int[] {1, 0, 0, 1, 0, 0},
+        20);
+
+
+    filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+    filter.setFillerToken(null);
+    filter.setTokenSeparator(null);
+
+    assertTokenStreamContents(filter,
+        new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
+        new int[] {0, 0, 0, 7, 7, 7},
+        new int[] {6, 13, 20, 13, 20, 20},
+        new int[] {1, 0, 0, 1, 0, 0},
+        20);
+  }
 }