You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2018/03/21 15:07:05 UTC
[1/2] lucene-solr:master: LUCENE-8202: Add FixedShingleFilter
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x 2539578cb -> 230a77ce3
refs/heads/master d4e69c5cd -> fac84c01c
LUCENE-8202: Add FixedShingleFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/fac84c01
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/fac84c01
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/fac84c01
Branch: refs/heads/master
Commit: fac84c01c84b3693a8c1251ae77f349c38497e06
Parents: d4e69c5
Author: Alan Woodward <ro...@apache.org>
Authored: Wed Mar 21 10:35:28 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Wed Mar 21 13:45:03 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../analysis/shingle/FixedShingleFilter.java | 294 +++++++++++++++++++
.../shingle/FixedShingleFilterFactory.java | 52 ++++
...ache.lucene.analysis.util.TokenFilterFactory | 1 +
.../shingle/FixedShingleFilterTest.java | 200 +++++++++++++
.../analysis/BaseTokenStreamTestCase.java | 4 +
.../java/org/apache/lucene/analysis/Token.java | 7 +
7 files changed, 561 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index dd2a57c..c2cbc07 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -106,6 +106,9 @@ New Features
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
static relevance signals into the final score. (Adrien Grand, Robert Muir)
+* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
+ Ferenczi)
+
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
new file mode 100644
index 0000000..a223cd8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
+ * In other words, it creates combinations of tokens as a single token.
+ *
+ * Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
+ * fixed size, and never emits unigrams, even at the end of a TokenStream. In
+ * addition, if the filter encounters stacked tokens (eg synonyms), then it will
+ * output stacked shingles
+ *
+ * For example, the sentence "please divide this sentence into shingles"
+ * might be tokenized into shingles "please divide", "divide this",
+ * "this sentence", "sentence into", and "into shingles".
+ *
+ * This filter handles position increments > 1 by inserting filler tokens
+ * (tokens with termtext "_").
+ *
+ * @lucene.experimental
+ */
+public final class FixedShingleFilter extends TokenFilter {
+
+ private final Deque<Token> tokenPool = new ArrayDeque<>();
+
+ private final int shingleSize;
+ private final String tokenSeparator;
+ private final Token gapToken = new Token(new AttributeSource());
+ private final Token endToken = new Token(new AttributeSource());
+
+ private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ private Token[] currentShingleTokens;
+ private boolean inputStreamExhausted = false;
+
+ public FixedShingleFilter(TokenStream input, int shingleSize) {
+ this(input, shingleSize, " ", "_");
+ }
+
+ public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
+ super(input);
+ this.shingleSize = shingleSize;
+ this.tokenSeparator = tokenSeparator;
+
+ this.gapToken.termAtt.setEmpty().append(fillerToken);
+
+ this.currentShingleTokens = new Token[shingleSize];
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ int posInc = 0;
+ if (nextShingle() == false) {
+ Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
+ if (nextRoot == endToken)
+ return false;
+ recycleToken(currentShingleTokens[0]);
+ if (resetShingleRoot(nextRoot) == false) {
+ return false;
+ }
+ posInc = currentShingleTokens[0].posInc();
+ }
+ clearAttributes();
+ incAtt.setPositionIncrement(posInc);
+ offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
+ termAtt.setEmpty();
+ termAtt.append(currentShingleTokens[0].term());
+ typeAtt.setType("shingle");
+ posLenAtt.setPositionLength(shingleSize);
+ for (int i = 1; i < shingleSize; i++) {
+ termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
+ }
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.tokenPool.clear();
+ this.currentShingleTokens[0] = null;
+ this.inputStreamExhausted = false;
+ }
+
+ @Override
+ public void end() throws IOException {
+ if (inputStreamExhausted == false) {
+ finishInnerStream();
+ }
+ clearAttributes();
+ this.offsetAtt.setOffset(0, endToken.endOffset());
+ }
+
+ private void finishInnerStream() throws IOException {
+ input.end();
+ inputStreamExhausted = true;
+ // check for gaps at the end of the tokenstream
+ endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
+ OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
+ endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
+ }
+
+ private Token lastTokenInShingle() {
+ int lastTokenIndex = shingleSize - 1;
+ while (currentShingleTokens[lastTokenIndex] == gapToken) {
+ lastTokenIndex--;
+ }
+ return currentShingleTokens[lastTokenIndex];
+ }
+
+ private boolean resetShingleRoot(Token token) throws IOException {
+ this.currentShingleTokens[0] = token;
+ for (int i = 1; i < shingleSize; i++) {
+ Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
+ if (current == endToken) {
+ if (endToken.posInc() + i >= shingleSize) {
+ // end tokens are a special case, because their posIncs are always
+ // due to stopwords. Therefore, we can happily append gap tokens
+ // to the end of the current shingle
+ for (int j = i; j < shingleSize; j++) {
+ this.currentShingleTokens[i] = gapToken;
+ i++;
+ }
+ return true;
+ }
+ return false;
+ }
+ if (current.posInc() > 1) {
+ // insert gaps into the shingle list
+ for (int j = 1; j < current.posInc(); j++) {
+ this.currentShingleTokens[i] = gapToken;
+ i++;
+ if (i >= shingleSize)
+ return true;
+ }
+ }
+ this.currentShingleTokens[i] = current;
+ }
+ return true;
+ }
+
+ private boolean nextShingle() throws IOException {
+ return currentShingleTokens[0] != null && advanceStack();
+ }
+
+ // check if the next token in the tokenstream is at the same position as this one
+ private boolean lastInStack(Token token) throws IOException {
+ Token next = nextTokenInStream(token);
+ return next == endToken || next.posInc() != 0;
+ }
+
+ private boolean advanceStack() throws IOException {
+ for (int i = shingleSize - 1; i >= 1; i--) {
+ if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
+ currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
+ for (int j = i + 1; j < shingleSize; j++) {
+ currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private Token newToken() {
+ Token token = tokenPool.size() == 0 ? new Token(this.cloneAttributes()) : tokenPool.removeFirst();
+ token.reset(this);
+ return token;
+ }
+
+ private void recycleToken(Token token) {
+ if (token == null)
+ return;
+ token.nextToken = null;
+ tokenPool.add(token);
+ }
+
+ // for testing
+ int instantiatedTokenCount() {
+ int tokenCount = tokenPool.size() + 1;
+ if (currentShingleTokens[0] == endToken || currentShingleTokens[0] == null)
+ return tokenCount;
+ for (Token t = currentShingleTokens[0]; t != endToken && t != null; t = t.nextToken) {
+ tokenCount++;
+ }
+ return tokenCount;
+ }
+
+ private Token nextTokenInGraph(Token token) throws IOException {
+ do {
+ token = nextTokenInStream(token);
+ if (token == endToken) {
+ return endToken;
+ }
+ } while (token.posInc() == 0);
+ return token;
+ }
+
+ private Token nextTokenInStream(Token token) throws IOException {
+ if (token != null && token.nextToken != null) {
+ return token.nextToken;
+ }
+ if (input.incrementToken() == false) {
+ finishInnerStream();
+ if (token == null) {
+ return endToken;
+ }
+ else {
+ token.nextToken = endToken;
+ return endToken;
+ }
+ }
+ if (token == null) {
+ return newToken();
+ }
+ token.nextToken = newToken();
+ return token.nextToken;
+ }
+
+ private static class Token {
+ final AttributeSource attSource;
+ final PositionIncrementAttribute posIncAtt;
+ final CharTermAttribute termAtt;
+ final OffsetAttribute offsetAtt;
+
+ Token nextToken;
+
+ Token(AttributeSource attSource) {
+ this.attSource = attSource;
+ this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
+ this.termAtt = attSource.addAttribute(CharTermAttribute.class);
+ this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
+ }
+
+ int posInc() {
+ return this.posIncAtt.getPositionIncrement();
+ }
+
+ CharSequence term() {
+ return this.termAtt;
+ }
+
+ int startOffset() {
+ return this.offsetAtt.startOffset();
+ }
+
+ int endOffset() {
+ return this.offsetAtt.endOffset();
+ }
+
+ void reset(AttributeSource attSource) {
+ attSource.copyTo(this.attSource);
+ this.nextToken = null;
+ }
+
+ @Override
+ public String toString() {
+ return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java
new file mode 100644
index 0000000..de824b8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link FixedShingleFilter}.
+ *
+ * <p>Recognised parameters:
+ * <ul>
+ *   <li>{@code shingleSize} - how many tokens are combined into each shingle (default: 2)</li>
+ *   <li>{@code tokenSeparator} - the text used to join tokens inside a shingle (default: a single space)</li>
+ *   <li>{@code fillerToken} - the text added in place of stop words (default: {@code _})</li>
+ * </ul>
+ */
+public class FixedShingleFilterFactory extends TokenFilterFactory {
+
+  private final int shingleSize;
+  private final String tokenSeparator;
+  private final String fillerToken;
+
+  /**
+   * Reads the three shingle settings from {@code args}, using the documented default for
+   * any setting that is absent.
+   */
+  public FixedShingleFilterFactory(Map<String, String> args) {
+    super(args);
+    shingleSize = getInt(args, "shingleSize", 2);
+    tokenSeparator = get(args, "tokenSeparator", " ");
+    fillerToken = get(args, "fillerToken", "_");
+  }
+
+  /** Wraps {@code input} in a {@link FixedShingleFilter} built from this factory's settings. */
+  @Override
+  public TokenStream create(TokenStream input) {
+    final FixedShingleFilter shingles =
+        new FixedShingleFilter(input, shingleSize, tokenSeparator, fillerToken);
+    return shingles;
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 6dcc81c..b46de18 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
+org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
new file mode 100644
index 0000000..d7d2825
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/** Tests for {@link FixedShingleFilter} driven by hand-built {@link CannedTokenStream} inputs. */
+public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
+
+  /** Six single-position tokens yield five bigrams with increment 1 and position length 2. */
+  public void testBiGramFilter() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("please", 0, 6),
+        new Token("divide", 7, 13),
+        new Token("this", 14, 18),
+        new Token("sentence", 19, 27),
+        new Token("into", 28, 32),
+        new Token("shingles", 33, 41)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 2),
+        new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
+        new int[]{0, 7, 14, 19, 28,},
+        new int[]{13, 18, 27, 32, 41,},
+        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",},
+        new int[]{1, 1, 1, 1, 1,},
+        new int[]{2, 2, 2, 2, 2});
+
+  }
+
+  /** The configured separator is used between terms instead of a space. */
+  public void testBiGramFilterWithAltSeparator() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("please", 0, 6),
+        new Token("divide", 7, 13),
+        new Token("this", 14, 18),
+        new Token("sentence", 19, 27),
+        new Token("into", 28, 32),
+        new Token("shingles", 33, 41)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 2, "<SEP>", "_"),
+        new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
+        new int[]{0, 7, 14, 19, 28},
+        new int[]{13, 18, 27, 32, 41},
+        new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
+        new int[]{1, 1, 1, 1, 1});
+
+  }
+
+  public void testTriGramFilter() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("please", 0, 6),
+        new Token("divide", 7, 13),
+        new Token("this", 14, 18),
+        new Token("sentence", 19, 27),
+        new Token("into", 28, 32),
+        new Token("shingles", 33, 41)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
+  }
+
+  /** A stream shorter than the shingle size produces no output at all. */
+  public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
+
+    // Follow the full consumer workflow (reset, incrementToken, end, close) so the stream
+    // is always finished and released, even if the assertion fails.
+    try (TokenStream ts = new FixedShingleFilter(new CannedTokenStream(
+        new Token("please", 0, 6),
+        new Token("divide", 7, 13)
+    ), 3)) {
+      ts.reset();
+      assertFalse(ts.incrementToken());
+      ts.end();
+    }
+
+  }
+
+  /** Position increments of 2 in the input show up as single filler terms inside the shingles. */
+  public void testWithStopwords() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("please", 0, 6),
+        new Token("divide", 7, 13),
+        new Token("sentence", 2, 19, 27),
+        new Token("shingles", 2, 33, 41)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
+        new int[]{0, 7, 19,},
+        new int[]{13, 27, 41,},
+        new String[]{"shingle", "shingle", "shingle",},
+        new int[]{1, 1, 2,});
+
+  }
+
+  public void testConsecutiveStopwords() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("b", 2, 2, 3),
+        new Token("c", 4, 5),
+        new Token("d", 6, 7),
+        new Token("b", 3, 12, 13),
+        new Token("c", 14, 15)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 4),
+        new String[]{"b c d _", "c d _ _", "d _ _ b"},
+        new int[]{2, 4, 6,},
+        new int[]{7, 7, 13,},
+        new int[]{2, 1, 1,});
+  }
+
+  /** Uses the CannedTokenStream constructor whose first two arguments are 1 and 7, ahead of the tokens. */
+  public void testTrailingStopwords() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(1, 7,
+        new Token("b", 0, 1),
+        new Token("c", 2, 3),
+        new Token("d", 4, 5)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[] { "b c d", "c d _" },
+        new int[] { 0, 2, },
+        new int[] { 5, 5, },
+        new int[] { 1, 1, });
+
+
+  }
+
+  public void testMultipleTrailingStopwords() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(2, 9,
+        new Token("b", 0, 1),
+        new Token("c", 2, 3),
+        new Token("d", 4, 5)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[] { "b c d", "c d _", "d _ _" },
+        new int[] { 0, 2, 4 },
+        new int[] { 5, 5, 5 },
+        new int[] { 1, 1, 1 });
+  }
+
+  /** Tokens stacked at one position (increment 0) produce stacked bigrams. */
+  public void testIncomingGraphs() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("b", 0, 1),
+        new Token("a", 0, 0, 1),
+        new Token("c", 2, 3),
+        new Token("b", 4, 5),
+        new Token("a", 0, 4, 5),
+        new Token("d", 6, 7)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 2),
+        new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
+        new int[] { 0, 0, 2, 2, 4, 4 },
+        new int[] { 3, 3, 5, 5, 7, 7 },
+        new int[] { 1, 0, 1, 0, 1, 0 });
+  }
+
+  /** Trigrams crossing two stacked positions enumerate every combination of the alternatives. */
+  public void testShinglesSpanningGraphs() throws IOException {
+
+    TokenStream ts = new CannedTokenStream(
+        new Token("b", 0, 1),
+        new Token("a", 0, 0, 1),
+        new Token("c", 2, 3),
+        new Token("b", 4, 5),
+        new Token("a", 0, 4, 5),
+        new Token("d", 6, 7)
+    );
+
+    assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+        new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
+        new int[] { 0, 0, 0, 0, 2, 2, },
+        new int[] { 5, 5, 5, 5, 7, 7, },
+        new int[] { 1, 0, 0, 0, 1, 0, });
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 3e1e375..8c0a295 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
+ /**
+  * Checks a token stream against expected terms, offsets, types, position increments and
+  * position lengths by delegating to the eight-argument overload, passing {@code null}
+  * for that overload's last argument.
+  */
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
+   assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
+ }
+
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fac84c01/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
index 04b1df8..9994175 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
@@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setOffset(start, end);
}
+ /**
+  * Constructs a Token with the given term text, position increment, start and end offsets.
+  *
+  * @param text   the term text, appended to this token's term
+  * @param posInc the position increment, passed to {@code setPositionIncrement}
+  * @param start  the start offset, passed to {@code setOffset}
+  * @param end    the end offset, passed to {@code setOffset}
+  */
+ public Token(CharSequence text, int posInc, int start, int end) {
+ append(text);
+ // NOTE(review): offsets are applied before the increment, so if both setters validate
+ // their arguments an invalid offset pair would be reported first - confirm against the setters.
+ setOffset(start, end);
+ setPositionIncrement(posInc);
+ }
+
/**
* {@inheritDoc}
* @see FlagsAttribute
[2/2] lucene-solr:branch_7x: LUCENE-8202: Add FixedShingleFilter
Posted by ro...@apache.org.
LUCENE-8202: Add FixedShingleFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/230a77ce
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/230a77ce
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/230a77ce
Branch: refs/heads/branch_7x
Commit: 230a77ce38ebe6294a06aebf23d85b68223b6ec2
Parents: 2539578
Author: Alan Woodward <ro...@apache.org>
Authored: Wed Mar 21 10:35:28 2018 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Wed Mar 21 13:45:24 2018 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../analysis/shingle/FixedShingleFilter.java | 294 +++++++++++++++++++
.../shingle/FixedShingleFilterFactory.java | 52 ++++
...ache.lucene.analysis.util.TokenFilterFactory | 1 +
.../shingle/FixedShingleFilterTest.java | 200 +++++++++++++
.../analysis/BaseTokenStreamTestCase.java | 4 +
.../java/org/apache/lucene/analysis/Token.java | 7 +
7 files changed, 561 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 048b00d..d9ad6c9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -16,6 +16,9 @@ New Features
* LUCENE-8197: A new FeatureField makes it easy and efficient to integrate
static relevance signals into the final score. (Adrien Grand, Robert Muir)
+* LUCENE-8202: Add a FixedShingleFilter (Alan Woodward, Adrien Grand, Jim
+ Ferenczi)
+
Other
* LUCENE-8214: Improve selection of testPoint for GeoComplexPolygon.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
new file mode 100644
index 0000000..a223cd8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilter.java
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * A FixedShingleFilter constructs shingles (token n-grams) from a token stream.
+ * In other words, it creates combinations of tokens as a single token.
+ *
+ * Unlike the {@link ShingleFilter}, FixedShingleFilter only emits shingles of a
+ * fixed size, and never emits unigrams, even at the end of a TokenStream. In
+ * addition, if the filter encounters stacked tokens (e.g. synonyms), then it will
+ * output stacked shingles.
+ *
+ * For example, the sentence "please divide this sentence into shingles"
+ * might be tokenized into shingles "please divide", "divide this",
+ * "this sentence", "sentence into", and "into shingles".
+ *
+ * This filter handles position increments &gt; 1 by inserting filler tokens
+ * (by default, tokens with termtext "_").
+ *
+ * @lucene.experimental
+ */
+public final class FixedShingleFilter extends TokenFilter {
+
+ private final Deque<Token> tokenPool = new ArrayDeque<>();
+
+ private final int shingleSize;
+ private final String tokenSeparator;
+ private final Token gapToken = new Token(new AttributeSource());
+ private final Token endToken = new Token(new AttributeSource());
+
+ private final PositionIncrementAttribute incAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ private Token[] currentShingleTokens;
+ private boolean inputStreamExhausted = false;
+
+ /**
+ * Creates a FixedShingleFilter that joins tokens with a single space and
+ * uses "_" as the filler token.
+ *
+ * @param input the TokenStream to consume
+ * @param shingleSize the number of tokens combined into each shingle
+ */
+ public FixedShingleFilter(TokenStream input, int shingleSize) {
+ this(input, shingleSize, " ", "_");
+ }
+
+ /**
+  * Creates a FixedShingleFilter.
+  *
+  * @param input the TokenStream to consume
+  * @param shingleSize the number of tokens combined into each shingle
+  * @param tokenSeparator the text placed between the tokens of a shingle
+  * @param fillerToken the term text used for the gap token that stands in
+  *                    for skipped positions
+  * @throws IllegalArgumentException if shingleSize is less than 1
+  */
+ public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) {
+   super(input);
+   // Fail fast with a clear message: a non-positive size would otherwise surface
+   // as NegativeArraySizeException below or as ArrayIndexOutOfBoundsException
+   // on the first access to currentShingleTokens[0].
+   if (shingleSize < 1) {
+     throw new IllegalArgumentException("shingleSize must be at least 1, got " + shingleSize);
+   }
+   this.shingleSize = shingleSize;
+   this.tokenSeparator = tokenSeparator;
+
+   this.gapToken.termAtt.setEmpty().append(fillerToken);
+
+   this.currentShingleTokens = new Token[shingleSize];
+ }
+
+ /**
+ * Emits the next shingle, if there is one.
+ *
+ * The filter first tries to build another shingle on the current root by
+ * advancing through stacked tokens (nextShingle()); shingles produced that
+ * way keep a position increment of 0. Otherwise the root moves to the next
+ * token in the stream and the shingle takes that root's position increment.
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+ int posInc = 0;
+ if (nextShingle() == false) {
+ // no further combination for the current root: move the root forward
+ Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
+ if (nextRoot == endToken)
+ return false;
+ recycleToken(currentShingleTokens[0]);
+ if (resetShingleRoot(nextRoot) == false) {
+ // the stream ended before a complete shingle could be filled
+ return false;
+ }
+ posInc = currentShingleTokens[0].posInc();
+ }
+ clearAttributes();
+ incAtt.setPositionIncrement(posInc);
+ // the end offset comes from the last non-gap token of the shingle
+ offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
+ termAtt.setEmpty();
+ termAtt.append(currentShingleTokens[0].term());
+ typeAtt.setType("shingle");
+ posLenAtt.setPositionLength(shingleSize);
+ for (int i = 1; i < shingleSize; i++) {
+ termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
+ }
+ return true;
+ }
+
+ /**
+ * Resets the wrapped stream and discards buffered state: the token pool is
+ * emptied, the shingle root is cleared and the exhausted flag is lowered.
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.tokenPool.clear();
+ // a null root makes nextShingle() return false, so the next incrementToken() reads a fresh root
+ this.currentShingleTokens[0] = null;
+ this.inputStreamExhausted = false;
+ }
+
+ /**
+  * Ends the stream and publishes its final state.
+  *
+  * If the wrapped stream has not been ended yet (for example because the
+  * consumer stopped early), it is ended here first. The final end offset and
+  * the final position increment are then taken from endToken, which captured
+  * them in finishInnerStream().
+  */
+ @Override
+ public void end() throws IOException {
+   if (inputStreamExhausted == false) {
+     finishInnerStream();
+   }
+   clearAttributes();
+   this.offsetAtt.setOffset(0, endToken.endOffset());
+   // clearAttributes() leaves the position increment at its default of 1, which
+   // would report a trailing hole that does not exist (or hide the real number
+   // of trailing holes); restore the value recorded from the wrapped stream.
+   this.incAtt.setPositionIncrement(endToken.posInc());
+ }
+
+ /**
+ * Ends the wrapped stream, marks it as exhausted, and records its final
+ * state on endToken: the position increment read from this filter's
+ * attribute and the offsets read from the input's OffsetAttribute.
+ */
+ private void finishInnerStream() throws IOException {
+ input.end();
+ inputStreamExhausted = true;
+ // check for gaps at the end of the tokenstream
+ // NOTE(review): presumably incAtt is shared with the input, so after input.end() it holds the trailing increment - confirm
+ endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
+ OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
+ endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
+ }
+
+ /**
+  * Returns the last token of the current shingle that is not the gap token,
+  * scanning backwards from the final shingle position.
+  */
+ private Token lastTokenInShingle() {
+   for (int idx = shingleSize - 1; ; idx--) {
+     Token candidate = currentShingleTokens[idx];
+     if (candidate != gapToken) {
+       return candidate;
+     }
+   }
+ }
+
+ /**
+ * Rebuilds the current shingle starting from the given root token.
+ *
+ * Positions 1 to shingleSize - 1 are filled with the following tokens that
+ * start a new position (see nextTokenInGraph), inserting gapToken for every
+ * position skipped by a position increment greater than 1.
+ *
+ * @param token the new root of the shingle
+ * @return true if a complete shingle was assembled, false if the stream ended
+ * before the shingle could be filled
+ */
+ private boolean resetShingleRoot(Token token) throws IOException {
+ this.currentShingleTokens[0] = token;
+ for (int i = 1; i < shingleSize; i++) {
+ Token current = nextTokenInGraph(this.currentShingleTokens[i - 1]);
+ if (current == endToken) {
+ // the shingle can only be completed if the trailing increment covers all remaining slots
+ if (endToken.posInc() + i >= shingleSize) {
+ // end tokens are a special case, because their posIncs are always
+ // due to stopwords. Therefore, we can happily append gap tokens
+ // to the end of the current shingle
+ for (int j = i; j < shingleSize; j++) {
+ this.currentShingleTokens[i] = gapToken;
+ i++;
+ }
+ return true;
+ }
+ return false;
+ }
+ if (current.posInc() > 1) {
+ // insert gaps into the shingle list
+ for (int j = 1; j < current.posInc(); j++) {
+ this.currentShingleTokens[i] = gapToken;
+ i++;
+ // the gaps alone filled the shingle; current is not part of it
+ if (i >= shingleSize)
+ return true;
+ }
+ }
+ this.currentShingleTokens[i] = current;
+ }
+ return true;
+ }
+
+ /**
+ * Tries to produce another shingle for the current root.
+ *
+ * @return false if there is no root yet or advanceStack() found no further
+ * combination of stacked tokens
+ */
+ private boolean nextShingle() throws IOException {
+ return currentShingleTokens[0] != null && advanceStack();
+ }
+
+ // Returns true if the given token is the last one at its position, i.e. the
+ // next token in the tokenstream is the end token or has a non-zero position
+ // increment; returns false if another token is stacked at the same position.
+ private boolean lastInStack(Token token) throws IOException {
+ Token next = nextTokenInStream(token);
+ return next == endToken || next.posInc() != 0;
+ }
+
+ /**
+ * Moves the current shingle to its next combination of stacked tokens.
+ *
+ * Shingle positions are examined from the last one down to position 1 (the
+ * root is never changed here). The first non-gap token that has another
+ * token stacked at its position is replaced by that token, and every later
+ * shingle position is refilled via nextTokenInGraph.
+ *
+ * @return true if a position was advanced, false if no position has a
+ * further stacked token
+ */
+ private boolean advanceStack() throws IOException {
+ for (int i = shingleSize - 1; i >= 1; i--) {
+ if (currentShingleTokens[i] != gapToken && lastInStack(currentShingleTokens[i]) == false) {
+ currentShingleTokens[i] = nextTokenInStream(currentShingleTokens[i]);
+ // positions after i restart from the first token that follows the new token
+ for (int j = i + 1; j < shingleSize; j++) {
+ currentShingleTokens[j] = nextTokenInGraph(currentShingleTokens[j - 1]);
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+  * Returns a Token holding a copy of this filter's current attribute state,
+  * taken from the pool when one is available and freshly allocated otherwise.
+  */
+ private Token newToken() {
+   Token fresh;
+   if (tokenPool.isEmpty()) {
+     fresh = new Token(this.cloneAttributes());
+   } else {
+     fresh = tokenPool.removeFirst();
+   }
+   fresh.reset(this);
+   return fresh;
+ }
+
+ /**
+  * Hands a token back to the pool after unlinking it from its successor.
+  * A null token is ignored.
+  */
+ private void recycleToken(Token token) {
+   if (token != null) {
+     token.nextToken = null;
+     tokenPool.add(token);
+   }
+ }
+
+ // for testing: pooled tokens, plus one, plus every token reachable from the
+ // current root through nextToken links (stopping at the end token)
+ int instantiatedTokenCount() {
+   int count = tokenPool.size() + 1;
+   Token cursor = currentShingleTokens[0];
+   while (cursor != null && cursor != endToken) {
+     count++;
+     cursor = cursor.nextToken;
+   }
+   return count;
+ }
+
+ /**
+  * Returns the first token after the given one that has a non-zero position
+  * increment, or endToken if the stream ends first.
+  */
+ private Token nextTokenInGraph(Token token) throws IOException {
+   Token candidate = token;
+   while (true) {
+     candidate = nextTokenInStream(candidate);
+     if (candidate == endToken || candidate.posInc() != 0) {
+       return candidate;
+     }
+   }
+ }
+
+ /**
+  * Returns the token that directly follows the given one in the stream.
+  *
+  * An already linked successor is reused. Otherwise the input is advanced:
+  * on success a new Token is created, on exhaustion the inner stream is
+  * finished and endToken is used. When the given token is not null, the
+  * result is linked to it as its successor.
+  */
+ private Token nextTokenInStream(Token token) throws IOException {
+   if (token != null && token.nextToken != null) {
+     return token.nextToken;
+   }
+   Token successor;
+   if (input.incrementToken()) {
+     successor = newToken();
+   } else {
+     finishInnerStream();
+     successor = endToken;
+   }
+   if (token != null) {
+     token.nextToken = successor;
+   }
+   return successor;
+ }
+
+ private static class Token {
+ final AttributeSource attSource;
+ final PositionIncrementAttribute posIncAtt;
+ final CharTermAttribute termAtt;
+ final OffsetAttribute offsetAtt;
+
+ Token nextToken;
+
+ Token(AttributeSource attSource) {
+ this.attSource = attSource;
+ this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
+ this.termAtt = attSource.addAttribute(CharTermAttribute.class);
+ this.offsetAtt = attSource.addAttribute(OffsetAttribute.class);
+ }
+
+ int posInc() {
+ return this.posIncAtt.getPositionIncrement();
+ }
+
+ CharSequence term() {
+ return this.termAtt;
+ }
+
+ int startOffset() {
+ return this.offsetAtt.startOffset();
+ }
+
+ int endOffset() {
+ return this.offsetAtt.endOffset();
+ }
+
+ void reset(AttributeSource attSource) {
+ attSource.copyTo(this.attSource);
+ this.nextToken = null;
+ }
+
+ @Override
+ public String toString() {
+ return term() + "(" + startOffset() + "," + endOffset() + ") " + posInc();
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java
new file mode 100644
index 0000000..de824b8
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/FixedShingleFilterFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link FixedShingleFilter}
+ *
+ * Parameters are:
+ * <ul>
+ * <li>shingleSize - how many tokens should be combined into each shingle (default: 2)
+ * <li>tokenSeparator - how tokens should be joined together in the shingle (default: space)
+ * <li>fillerToken - what should be added in place of stop words (default: _ )
+ * </ul>
+ */
+public class FixedShingleFilterFactory extends TokenFilterFactory {
+
+  private final int shingleSize;
+  private final String tokenSeparator;
+  private final String fillerToken;
+
+  /**
+   * Creates a new FixedShingleFilterFactory.
+   *
+   * @param args the factory arguments; recognised keys are shingleSize,
+   *             tokenSeparator and fillerToken
+   * @throws IllegalArgumentException if any argument is left over after the
+   *                                  recognised ones have been read
+   */
+  public FixedShingleFilterFactory(Map<String, String> args) {
+    super(args);
+    this.shingleSize = getInt(args, "shingleSize", 2);
+    this.tokenSeparator = get(args, "tokenSeparator", " ");
+    this.fillerToken = get(args, "fillerToken", "_");
+    // Reject leftovers so that a misspelt parameter name is reported instead
+    // of silently falling back to the default value.
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new FixedShingleFilter(input, shingleSize, tokenSeparator, fillerToken);
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 6dcc81c..b46de18 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -101,6 +101,7 @@ org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
+org.apache.lucene.analysis.shingle.FixedShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
new file mode 100644
index 0000000..d7d2825
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/FixedShingleFilterTest.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.shingle;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+public class FixedShingleFilterTest extends BaseTokenStreamTestCase {
+
+ // Six plain tokens with shingle size 2: checks terms, offsets, type,
+ // position increments and position lengths of the five bigrams.
+ public void testBiGramFilter() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("please", 0, 6),
+ new Token("divide", 7, 13),
+ new Token("this", 14, 18),
+ new Token("sentence", 19, 27),
+ new Token("into", 28, 32),
+ new Token("shingles", 33, 41)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 2),
+ new String[]{"please divide", "divide this", "this sentence", "sentence into", "into shingles"},
+ new int[]{0, 7, 14, 19, 28,},
+ new int[]{13, 18, 27, 32, 41,},
+ new String[]{"shingle", "shingle", "shingle", "shingle", "shingle",},
+ new int[]{1, 1, 1, 1, 1,},
+ new int[]{2, 2, 2, 2, 2});
+
+ }
+
+ // Same input as testBiGramFilter, but the tokens of each shingle are joined
+ // with the custom separator "<SEP>" instead of a space.
+ public void testBiGramFilterWithAltSeparator() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("please", 0, 6),
+ new Token("divide", 7, 13),
+ new Token("this", 14, 18),
+ new Token("sentence", 19, 27),
+ new Token("into", 28, 32),
+ new Token("shingles", 33, 41)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 2, "<SEP>", "_"),
+ new String[]{"please<SEP>divide", "divide<SEP>this", "this<SEP>sentence", "sentence<SEP>into", "into<SEP>shingles"},
+ new int[]{0, 7, 14, 19, 28},
+ new int[]{13, 18, 27, 32, 41},
+ new String[]{"shingle", "shingle", "shingle", "shingle", "shingle"},
+ new int[]{1, 1, 1, 1, 1});
+
+ }
+
+ // Six plain tokens with shingle size 3: only the four trigram terms are checked.
+ public void testTriGramFilter() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("please", 0, 6),
+ new Token("divide", 7, 13),
+ new Token("this", 14, 18),
+ new Token("sentence", 19, 27),
+ new Token("into", 28, 32),
+ new Token("shingles", 33, 41)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+ new String[]{"please divide this", "divide this sentence", "this sentence into", "sentence into shingles"});
+ }
+
+ // Two tokens cannot fill a shingle of size 3, so the filter must emit nothing.
+ public void testShingleSizeGreaterThanTokenstreamLength() throws IOException {
+
+   // Follow the full consumer workflow (reset, incrementToken, end, close)
+   // so that the stream is ended and released even though it yields no tokens.
+   try (TokenStream ts = new FixedShingleFilter(new CannedTokenStream(
+       new Token("please", 0, 6),
+       new Token("divide", 7, 13)
+   ), 3)) {
+     ts.reset();
+     assertFalse(ts.incrementToken());
+     ts.end();
+   }
+
+ }
+
+ // "sentence" and "shingles" carry a position increment of 2, so each skipped
+ // position is expected to show up as the filler "_" inside the trigrams.
+ public void testWithStopwords() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("please", 0, 6),
+ new Token("divide", 7, 13),
+ new Token("sentence", 2, 19, 27),
+ new Token("shingles", 2, 33, 41)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+ new String[]{"please divide _", "divide _ sentence", "sentence _ shingles"},
+ new int[]{0, 7, 19,},
+ new int[]{13, 27, 41,},
+ new String[]{"shingle", "shingle", "shingle",},
+ new int[]{1, 1, 2,});
+
+ }
+
+ // The second "b" has a position increment of 3, i.e. two skipped positions in
+ // a row; with shingle size 4 they are expected as two consecutive fillers.
+ public void testConsecutiveStopwords() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("b", 2, 2, 3),
+ new Token("c", 4, 5),
+ new Token("d", 6, 7),
+ new Token("b", 3, 12, 13),
+ new Token("c", 14, 15)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 4),
+ new String[]{"b c d _", "c d _ _", "d _ _ b"},
+ new int[]{2, 4, 6,},
+ new int[]{7, 7, 13,},
+ new int[]{2, 1, 1,});
+ }
+
+ // The canned stream is built with leading arguments (1, 7); the expected
+ // output ends with "c d _", i.e. one filler appended after the last token.
+ public void testTrailingStopwords() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(1, 7,
+ new Token("b", 0, 1),
+ new Token("c", 2, 3),
+ new Token("d", 4, 5)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+ new String[] { "b c d", "c d _" },
+ new int[] { 0, 2, },
+ new int[] { 5, 5, },
+ new int[] { 1, 1, });
+
+
+ }
+
+ // The canned stream is built with leading arguments (2, 9); the expected
+ // output ends with "c d _" and "d _ _", i.e. up to two trailing fillers.
+ public void testMultipleTrailingStopwords() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(2, 9,
+ new Token("b", 0, 1),
+ new Token("c", 2, 3),
+ new Token("d", 4, 5)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+ new String[] { "b c d", "c d _", "d _ _" },
+ new int[] { 0, 2, 4 },
+ new int[] { 5, 5, 5 },
+ new int[] { 1, 1, 1 });
+ }
+
+ // "a" is stacked on "b" (position increment 0) at two positions; with shingle
+ // size 2 every stacked alternative is expected to yield its own bigram, the
+ // alternatives sharing a position via a position increment of 0.
+ public void testIncomingGraphs() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("b", 0, 1),
+ new Token("a", 0, 0, 1),
+ new Token("c", 2, 3),
+ new Token("b", 4, 5),
+ new Token("a", 0, 4, 5),
+ new Token("d", 6, 7)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 2),
+ new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
+ new int[] { 0, 0, 2, 2, 4, 4 },
+ new int[] { 3, 3, 5, 5, 7, 7 },
+ new int[] { 1, 0, 1, 0, 1, 0 });
+ }
+
+ // Same stacked input as testIncomingGraphs, but with shingle size 3, so a
+ // single shingle can span two stacked positions and all four combinations
+ // for the first root position are expected.
+ public void testShinglesSpanningGraphs() throws IOException {
+
+ TokenStream ts = new CannedTokenStream(
+ new Token("b", 0, 1),
+ new Token("a", 0, 0, 1),
+ new Token("c", 2, 3),
+ new Token("b", 4, 5),
+ new Token("a", 0, 4, 5),
+ new Token("d", 6, 7)
+ );
+
+ assertTokenStreamContents(new FixedShingleFilter(ts, 3),
+ new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
+ new int[] { 0, 0, 0, 0, 2, 2, },
+ new int[] { 5, 5, 5, 5, 7, 7, },
+ new int[] { 1, 0, 0, 0, 1, 0, });
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 3e1e375..8c0a295 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -341,6 +341,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
+ /** Overload that additionally takes posLengths and passes null for the remaining trailing argument. */
+ public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int[] posLengths) throws IOException {
+ assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, null);
+ }
+
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/230a77ce/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
index 04b1df8..9994175 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/Token.java
@@ -75,6 +75,13 @@ public class Token extends PackedTokenAttributeImpl implements FlagsAttribute, P
setOffset(start, end);
}
+ /**
+ * Constructs a Token with the given term text, position increment, start and end offsets
+ *
+ * @param text the term text, appended to this token
+ * @param posInc the position increment to set
+ * @param start the start offset
+ * @param end the end offset
+ */
+ public Token(CharSequence text, int posInc, int start, int end) {
+ append(text);
+ setOffset(start, end);
+ setPositionIncrement(posInc);
+ }
+
/**
* {@inheritDoc}
* @see FlagsAttribute