You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2014/01/30 00:34:49 UTC
svn commit: r1562639 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/shingle/
analysis/common/src/test/org/apache/lucene/analysis/shingle/
Author: sarowe
Date: Wed Jan 29 23:34:48 2014
New Revision: 1562639
URL: http://svn.apache.org/r1562639
Log:
LUCENE-5353: ShingleFilter's filler token should be configurable
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Jan 29 23:34:48 2014
@@ -119,6 +119,9 @@ New Features
encode term metadata, and all dictionary implementations can now plug in any
PostingsBaseFormat. (Han Jiang, Mike McCandless)
+* LUCENE-5353: ShingleFilter's filler token should be configurable.
+ (Ahmet Arslan, Simon Willnauer, Steve Rowe)
+
Build
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Wed Jan 29 23:34:48 2014
@@ -36,6 +36,7 @@ public final class ShingleAnalyzerWrappe
private final String tokenSeparator;
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
+ private final String fillerToken;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
@@ -46,7 +47,8 @@ public final class ShingleAnalyzerWrappe
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
- this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.TOKEN_SEPARATOR, true, false);
+ this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ true, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
/**
@@ -63,6 +65,7 @@ public final class ShingleAnalyzerWrappe
* minShingleSize tokens in the input stream)?
* Note that if outputUnigrams==true, then unigrams are always output,
* regardless of whether any shingles are available.
+ * @param fillerToken filler token to use when positionIncrement is more than 1
*/
public ShingleAnalyzerWrapper(
Analyzer delegate,
@@ -70,7 +73,8 @@ public final class ShingleAnalyzerWrappe
int maxShingleSize,
String tokenSeparator,
boolean outputUnigrams,
- boolean outputUnigramsIfNoShingles) {
+ boolean outputUnigramsIfNoShingles,
+ String fillerToken) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
@@ -91,6 +95,7 @@ public final class ShingleAnalyzerWrappe
this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
this.outputUnigrams = outputUnigrams;
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+ this.fillerToken = fillerToken;
}
/**
@@ -137,6 +142,10 @@ public final class ShingleAnalyzerWrappe
return outputUnigramsIfNoShingles;
}
+ public String getFillerToken() {
+ return fillerToken;
+ }
+
@Override
public final Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
@@ -150,6 +159,7 @@ public final class ShingleAnalyzerWrappe
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+ filter.setFillerToken(fillerToken);
return new TokenStreamComponents(components.getTokenizer(), filter);
}
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Wed Jan 29 23:34:48 2014
@@ -47,7 +47,7 @@ public final class ShingleFilter extends
/**
* filler token for when positionIncrement is more than 1
*/
- public static final char[] FILLER_TOKEN = { '_' };
+ public static final String DEFAULT_FILLER_TOKEN = "_";
/**
* default maximum shingle size is 2.
@@ -67,7 +67,7 @@ public final class ShingleFilter extends
/**
* The default string to use when joining adjacent tokens to form a shingle
*/
- public static final String TOKEN_SEPARATOR = " ";
+ public static final String DEFAULT_TOKEN_SEPARATOR = " ";
/**
* The sequence of input stream tokens (or filler tokens, if necessary)
@@ -95,7 +95,13 @@ public final class ShingleFilter extends
/**
* The string to use when joining adjacent tokens to form a shingle
*/
- private String tokenSeparator = TOKEN_SEPARATOR;
+ private String tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
+
+ /**
+ * The string to insert for each position at which there is no token
+ * (i.e., when position increment is greater than one).
+ */
+ private char[] fillerToken = DEFAULT_FILLER_TOKEN.toCharArray();
/**
* By default, we output unigrams (individual tokens) as well as shingles
@@ -284,6 +290,16 @@ public final class ShingleFilter extends
this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
}
+ /**
+ * Sets the string to insert for each position at which there is no token
+ * (i.e., when position increment is greater than one).
+ *
+ * @param fillerToken string to insert at each position where there is no token
+ */
+ public void setFillerToken(String fillerToken) {
+ this.fillerToken = null == fillerToken ? new char[0] : fillerToken.toCharArray();
+ }
+
@Override
public boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
@@ -341,7 +357,7 @@ public final class ShingleFilter extends
/**
* <p>Get the next token from the input stream.
* <p>If the next token has <code>positionIncrement > 1</code>,
- * <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
+ * <code>positionIncrement - 1</code> {@link #fillerToken}s are
* inserted first.
* @param target Where to put the new token; if null, a new instance is created.
* @return On success, the populated token; null otherwise
@@ -359,7 +375,7 @@ public final class ShingleFilter extends
// A filler token occupies no space
newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
newTarget.offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
@@ -390,7 +406,7 @@ public final class ShingleFilter extends
isNextInputStreamToken = true;
// A filler token occupies no space
newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
newTarget.isFiller = true;
--numFillerTokensToInsert;
} else {
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java Wed Jan 29 23:34:48 2014
@@ -29,7 +29,7 @@ import java.util.Map;
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
* <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
+ * outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
* </analyzer>
* </fieldType></pre>
*/
@@ -39,6 +39,7 @@ public class ShingleFilterFactory extend
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private final String tokenSeparator;
+ private final String fillerToken;
/** Creates a new ShingleFilterFactory */
public ShingleFilterFactory(Map<String, String> args) {
@@ -57,7 +58,8 @@ public class ShingleFilterFactory extend
}
outputUnigrams = getBoolean(args, "outputUnigrams", true);
outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
- tokenSeparator = get(args, "tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
+ tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
+ fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -69,6 +71,7 @@ public class ShingleFilterFactory extend
r.setOutputUnigrams(outputUnigrams);
r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
r.setTokenSeparator(tokenSeparator);
+ r.setFillerToken(fillerToken);
return r;
}
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Wed Jan 29 23:34:48 2014
@@ -21,9 +21,13 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
@@ -169,7 +173,8 @@ public class ShingleAnalyzerWrapperTest
new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
- new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4, ShingleFilter.TOKEN_SEPARATOR, false, false);
+ new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 4,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this", "please divide this sentence",
"divide this sentence", "divide this sentence into",
@@ -195,7 +200,8 @@ public class ShingleAnalyzerWrapperTest
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 1 });
analyzer = new ShingleAnalyzerWrapper(
- new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3, ShingleFilter.TOKEN_SEPARATOR, false, false);
+ new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
new String[] { "please divide this",
"divide this sentence",
@@ -211,7 +217,8 @@ public class ShingleAnalyzerWrapperTest
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", true, false);
+ "", true, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -225,7 +232,8 @@ public class ShingleAnalyzerWrapperTest
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, false);
+ "", false, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -240,7 +248,8 @@ public class ShingleAnalyzerWrapperTest
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- null, true, false);
+ null, true, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "pleasedivide",
"divide", "divideinto",
@@ -254,7 +263,8 @@ public class ShingleAnalyzerWrapperTest
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, false);
+ "", false, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "pleasedivide",
"divideinto",
@@ -263,12 +273,14 @@ public class ShingleAnalyzerWrapperTest
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
+
public void testAltTokenSeparator() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "<SEP>", true, false);
+ "<SEP>", true, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please", "please<SEP>divide",
"divide", "divide<SEP>into",
@@ -282,7 +294,8 @@ public class ShingleAnalyzerWrapperTest
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "<SEP>", false, false);
+ "<SEP>", false, false,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please divide into shingles",
new String[] { "please<SEP>divide",
"divide<SEP>into",
@@ -291,13 +304,64 @@ public class ShingleAnalyzerWrapperTest
new int[] { 13, 18, 27 },
new int[] { 1, 1, 1 });
}
-
+
+ public void testAltFillerToken() throws Exception {
+ Analyzer delegate = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "into");
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
+ delegate,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ true, false, "--");
+ assertAnalyzesTo(analyzer, "please divide into shingles",
+ new String[] { "please", "please divide",
+ "divide", "divide --",
+ "-- shingles", "shingles" },
+ new int[] { 0, 0, 7, 7, 19, 19 },
+ new int[] { 6, 13, 13, 19, 27, 27 },
+ new int[] { 1, 0, 1, 0, 1, 1 });
+
+ analyzer = new ShingleAnalyzerWrapper(
+ delegate,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ false, false, null);
+ assertAnalyzesTo(analyzer, "please divide into shingles",
+ new String[] { "please divide", "divide ", " shingles" },
+ new int[] { 0, 7, 19 },
+ new int[] { 13, 19, 27 },
+ new int[] { 1, 1, 1 });
+
+ analyzer = new ShingleAnalyzerWrapper(
+ delegate,
+ ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
+ ShingleFilter.DEFAULT_TOKEN_SEPARATOR,
+ false, false, "");
+ assertAnalyzesTo(analyzer, "please divide into shingles",
+ new String[] { "please divide", "divide ", " shingles" },
+ new int[] { 0, 7, 19 },
+ new int[] { 13, 19, 27 },
+ new int[] { 1, 1, 1 });
+ }
+
public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false),
ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE,
ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
- "", false, true);
+ "", false, true,
+ ShingleFilter.DEFAULT_FILLER_TOKEN);
assertAnalyzesTo(analyzer, "please",
new String[] { "please" },
new int[] { 0 },
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1562639&r1=1562638&r2=1562639&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Wed Jan 29 23:34:48 2014
@@ -1196,4 +1196,52 @@ public class ShingleFilterTest extends B
new int[] {1, 0, 0, 1, 0, 0},
20);
}
+
+ public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
+ // Analyzing "purple wizard of the", where "of" and "the" are removed as
+ // stopwords, leaving two trailing holes:
+ Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken("--");
+
+ assertTokenStreamContents(filter,
+ new String[]{"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"},
+ new int[]{0, 0, 0, 7, 7, 7},
+ new int[]{6, 13, 20, 13, 20, 20},
+ new int[]{1, 0, 0, 1, 0, 0},
+ 20);
+
+ filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken("");
+
+ assertTokenStreamContents(filter,
+ new String[]{"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
+ new int[]{0, 0, 0, 7, 7, 7},
+ new int[]{6, 13, 20, 13, 20, 20},
+ new int[]{1, 0, 0, 1, 0, 0},
+ 20);
+
+
+ filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken(null);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
+ new int[] {0, 0, 0, 7, 7, 7},
+ new int[] {6, 13, 20, 13, 20, 20},
+ new int[] {1, 0, 0, 1, 0, 0},
+ 20);
+
+
+ filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+ filter.setFillerToken(null);
+ filter.setTokenSeparator(null);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
+ new int[] {0, 0, 0, 7, 7, 7},
+ new int[] {6, 13, 20, 13, 20, 20},
+ new int[] {1, 0, 0, 1, 0, 0},
+ 20);
+ }
}