You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/09/17 18:26:37 UTC
svn commit: r1524117 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/shingle/
analysis/common/src/test/org/apache/lucene/analysis/shingle/
test-framework/src/java/org/apache/lucene/analysis/
Author: mikemccand
Date: Tue Sep 17 16:26:36 2013
New Revision: 1524117
URL: http://svn.apache.org/r1524117
Log:
LUCENE-5180: ShingleFilter creates shingles from trailing holes
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Sep 17 16:26:36 2013
@@ -38,6 +38,10 @@ New Features
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
(Ryo Onodera via Koji Sekiguchi)
+* LUCENE-5180: ShingleFilter now creates shingles with trailing holes,
+ for example if a StopFilter had removed the last token. (Mike
+ McCandless)
+
Optimizations
* LUCENE-4848: Use Java 7 NIO2-FileChannel instead of RandomAccessFile
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Tue Sep 17 16:26:36 2013
@@ -147,6 +147,12 @@ public final class ShingleFilter extends
* true if no shingles have been output yet (for outputUnigramsIfNoShingles).
*/
boolean noShingleOutput = true;
+
+ /**
+ * Holds the State after input.end() was called, so we can
+ * restore it in our end() impl.
+ */
+ private State endState;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -279,7 +285,7 @@ public final class ShingleFilter extends
}
@Override
- public final boolean incrementToken() throws IOException {
+ public boolean incrementToken() throws IOException {
boolean tokenAvailable = false;
int builtGramSize = 0;
if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
@@ -364,39 +370,63 @@ public final class ShingleFilter extends
}
isNextInputStreamToken = false;
newTarget.isFiller = false;
- } else if (!exhausted && input.incrementToken()) {
- if (null == target) {
- newTarget = new InputWindowToken(cloneAttributes());
- } else {
- this.copyTo(target.attSource);
- }
- if (posIncrAtt.getPositionIncrement() > 1) {
- // Each output shingle must contain at least one input token,
- // so no more than (maxShingleSize - 1) filler tokens will be inserted.
- numFillerTokensToInsert
- = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
- // Save the current token as the next input stream token
- if (null == nextInputStreamToken) {
- nextInputStreamToken = cloneAttributes();
+ } else if (!exhausted) {
+ if (input.incrementToken()) {
+ if (null == target) {
+ newTarget = new InputWindowToken(cloneAttributes());
+ } else {
+ this.copyTo(target.attSource);
+ }
+ if (posIncrAtt.getPositionIncrement() > 1) {
+ // Each output shingle must contain at least one input token,
+ // so no more than (maxShingleSize - 1) filler tokens will be inserted.
+ numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
+ // Save the current token as the next input stream token
+ if (null == nextInputStreamToken) {
+ nextInputStreamToken = cloneAttributes();
+ } else {
+ this.copyTo(nextInputStreamToken);
+ }
+ isNextInputStreamToken = true;
+ // A filler token occupies no space
+ newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+ newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ newTarget.isFiller = true;
+ --numFillerTokensToInsert;
} else {
- this.copyTo(nextInputStreamToken);
+ newTarget.isFiller = false;
}
- isNextInputStreamToken = true;
- // A filler token occupies no space
- newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
- newTarget.termAtt.copyBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
- newTarget.isFiller = true;
- --numFillerTokensToInsert;
} else {
- newTarget.isFiller = false;
+ exhausted = true;
+ input.end();
+ endState = captureState();
+ numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
+ if (numFillerTokensToInsert > 0) {
+ nextInputStreamToken = new AttributeSource(getAttributeFactory());
+ nextInputStreamToken.addAttribute(CharTermAttribute.class);
+ OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
+ newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
+ // Recurse/loop just once:
+ return getNextToken(target);
+ } else {
+ newTarget = null;
+ }
}
} else {
newTarget = null;
- exhausted = true;
}
return newTarget;
}
+ @Override
+ public void end() throws IOException {
+ if (!exhausted) {
+ super.end();
+ } else {
+ restoreState(endState);
+ }
+ }
+
/**
* <p>Fills {@link #inputWindow} with input stream tokens, if available,
* shifting to the right if the window was previously full.
@@ -445,6 +475,7 @@ public final class ShingleFilter extends
isOutputHere = false;
noShingleOutput = true;
exhausted = false;
+ endState = null;
if (outputUnigramsIfNoShingles && ! outputUnigrams) {
// Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
gramSize.minValue = minShingleSize;
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Tue Sep 17 16:26:36 2013
@@ -24,6 +24,7 @@ import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@@ -34,41 +35,6 @@ import org.apache.lucene.analysis.tokena
public class ShingleFilterTest extends BaseTokenStreamTestCase {
- public class TestTokenStream extends TokenStream {
-
- protected int index = 0;
- protected Token[] testToken;
-
- private CharTermAttribute termAtt;
- private OffsetAttribute offsetAtt;
- private PositionIncrementAttribute posIncrAtt;
- private TypeAttribute typeAtt;
-
- public TestTokenStream(Token[] testToken) {
- super();
- this.testToken = testToken;
- this.termAtt = addAttribute(CharTermAttribute.class);
- this.offsetAtt = addAttribute(OffsetAttribute.class);
- this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- this.typeAtt = addAttribute(TypeAttribute.class);
- }
-
- @Override
- public final boolean incrementToken() {
- clearAttributes();
- if (index < testToken.length) {
- Token t = testToken[index++];
- termAtt.copyBuffer(t.buffer(), 0, t.length());
- offsetAtt.setOffset(t.startOffset(), t.endOffset());
- posIncrAtt.setPositionIncrement(t.getPositionIncrement());
- typeAtt.setType(TypeAttribute.DEFAULT_TYPE);
- return true;
- } else {
- return false;
- }
- }
- }
-
public static final Token[] TEST_TOKEN = new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
@@ -1066,7 +1032,7 @@ public class ShingleFilterTest extends B
boolean outputUnigrams)
throws IOException {
- ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
@@ -1076,7 +1042,7 @@ public class ShingleFilterTest extends B
String[] types, boolean outputUnigrams)
throws IOException {
ShingleFilter filter
- = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
@@ -1087,7 +1053,7 @@ public class ShingleFilterTest extends B
boolean outputUnigramsIfNoShingles)
throws IOException {
ShingleFilter filter
- = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
@@ -1098,7 +1064,7 @@ public class ShingleFilterTest extends B
String[] types, boolean outputUnigrams)
throws IOException {
ShingleFilter filter
- = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+ = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
@@ -1170,4 +1136,63 @@ public class ShingleFilterTest extends B
};
checkOneTermReuse(a, "", "");
}
+
+ public void testTrailingHole1() throws IOException {
+ // Analyzing "wizard of", where of is removed as a
+ // stopword leaving a trailing hole:
+ Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)};
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
+
+ assertTokenStreamContents(filter,
+ new String[] {"wizard", "wizard _"},
+ new int[] {0, 0},
+ new int[] {6, 9},
+ new int[] {1, 0},
+ 9);
+ }
+
+ public void testTrailingHole2() throws IOException {
+ // Analyzing "purple wizard of", where of is removed as a
+ // stopword leaving a trailing hole:
+ Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+ createToken("wizard", 7, 13)};
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purple wizard", "wizard", "wizard _"},
+ new int[] {0, 0, 7, 7},
+ new int[] {6, 13, 13, 16},
+ new int[] {1, 0, 1, 0},
+ 16);
+ }
+
+ public void testTwoTrailingHoles() throws IOException {
+ // Analyzing "purple wizard of the", where of and the are removed as
+ // stopwords, leaving two trailing holes:
+ Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+ createToken("wizard", 7, 13)};
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purple wizard", "wizard", "wizard _"},
+ new int[] {0, 0, 7, 7},
+ new int[] {6, 13, 13, 20},
+ new int[] {1, 0, 1, 0},
+ 20);
+ }
+
+ public void testTwoTrailingHolesTriShingle() throws IOException {
+ // Analyzing "purple wizard of the", where of and the are removed as
+ // stopwords, leaving two trailing holes:
+ Token[] inputTokens = new Token[] {createToken("purple", 0, 6),
+ createToken("wizard", 7, 13)};
+ ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
+
+ assertTokenStreamContents(filter,
+ new String[] {"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"},
+ new int[] {0, 0, 0, 7, 7, 7},
+ new int[] {6, 13, 20, 13, 20, 20},
+ new int[] {1, 0, 0, 1, 0, 0},
+ 20);
+ }
}
Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java?rev=1524117&r1=1524116&r2=1524117&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java Tue Sep 17 16:26:36 2013
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
@@ -34,9 +36,28 @@ public final class CannedTokenStream ext
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
-
+ private final int finalOffset;
+ private final int finalPosInc;
+
public CannedTokenStream(Token... tokens) {
this.tokens = tokens;
+ finalOffset = 0;
+ finalPosInc = 0;
+ }
+
+ /** If you want trailing holes, pass a non-zero
+ * finalPosInc. */
+ public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
+ this.tokens = tokens;
+ this.finalOffset = finalOffset;
+ this.finalPosInc = finalPosInc;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ posIncrAtt.setPositionIncrement(finalPosInc);
+ offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override