You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by bu...@apache.org on 2009/08/03 06:33:10 UTC
svn commit: r800195 - in /lucene/java/trunk/contrib: ./
analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/
analyzers/common/src/java/org/apache/lucene/analysis/shingle/
analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous...
Author: buschmi
Date: Mon Aug 3 04:33:10 2009
New Revision: 800195
URL: http://svn.apache.org/viewvc?rev=800195&view=rev
Log:
LUCENE-1775: Change remaining contrib TokenFilters (shingle, prefix-suffix) to use the new TokenStream API.
Modified:
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Mon Aug 3 04:33:10 2009
@@ -18,6 +18,12 @@
you are interested in locally and access them on each call to the method that used to pass a new
Token. Look at the included updated impls for examples. (Mark Miller)
+ 2. LUCENE-1460: Change contrib TokenStreams/Filters to use the new
+ TokenStream API. (Robert Muir, Michael Busch)
+
+ 3. LUCENE-1775: Change remaining TokenFilters (shingle, prefix-suffix) to
+ use the new TokenStream API. (Robert Muir, Michael Busch)
+
Bug fixes
1. LUCENE-1423: InstantiatedTermEnum#skipTo(Term) throws ArrayIndexOutOfBounds on empty index.
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java Mon Aug 3 04:33:10 2009
@@ -24,13 +24,16 @@
/**
* Links two PrefixAwareTokenFilter
- * @deprecated
+ * <p/>
+ * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+ * the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
private PrefixAwareTokenFilter suffix;
public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
+ super(suffix);
prefix = new PrefixAwareTokenFilter(prefix, input) {
public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
@@ -56,11 +59,21 @@
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- return suffix.next(reusableToken);
+ public final boolean incrementToken() throws IOException {
+ return suffix.incrementToken();
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
}
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
public void reset() throws IOException {
suffix.reset();
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java Mon Aug 3 04:33:10 2009
@@ -19,6 +19,12 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import java.io.IOException;
@@ -29,28 +35,58 @@
* to be used when updating the token values in the second stream based on that token.
*
* The default implementation adds last prefix token end offset to the suffix token start and end offsets.
- * @deprecated
+ * <p/>
+ * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+ * the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public class PrefixAwareTokenFilter extends TokenStream {
private TokenStream prefix;
private TokenStream suffix;
+
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private PayloadAttribute payloadAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
+ private FlagsAttribute flagsAtt;
+
+ private TermAttribute p_termAtt;
+ private PositionIncrementAttribute p_posIncrAtt;
+ private PayloadAttribute p_payloadAtt;
+ private OffsetAttribute p_offsetAtt;
+ private TypeAttribute p_typeAtt;
+ private FlagsAttribute p_flagsAtt;
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
+ super(suffix);
this.suffix = suffix;
this.prefix = prefix;
prefixExhausted = false;
+
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+
+ p_termAtt = (TermAttribute) prefix.addAttribute(TermAttribute.class);
+ p_posIncrAtt = (PositionIncrementAttribute) prefix.addAttribute(PositionIncrementAttribute.class);
+ p_payloadAtt = (PayloadAttribute) prefix.addAttribute(PayloadAttribute.class);
+ p_offsetAtt = (OffsetAttribute) prefix.addAttribute(OffsetAttribute.class);
+ p_typeAtt = (TypeAttribute) prefix.addAttribute(TypeAttribute.class);
+ p_flagsAtt = (FlagsAttribute) prefix.addAttribute(FlagsAttribute.class);
}
private Token previousPrefixToken = new Token();
+ private Token reusableToken = new Token();
private boolean prefixExhausted;
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
-
+ public final boolean incrementToken() throws IOException {
if (!prefixExhausted) {
- Token nextToken = prefix.next(reusableToken);
+ Token nextToken = getNextPrefixInputToken(reusableToken);
if (nextToken == null) {
prefixExhausted = true;
} else {
@@ -60,16 +96,63 @@
if (p != null) {
previousPrefixToken.setPayload((Payload) p.clone());
}
- return nextToken;
+ setCurrentToken(nextToken);
+ return true;
}
}
- Token nextToken = suffix.next(reusableToken);
+ Token nextToken = getNextSuffixInputToken(reusableToken);
if (nextToken == null) {
- return null;
+ return false;
}
- return updateSuffixToken(nextToken, previousPrefixToken);
+ nextToken = updateSuffixToken(nextToken, previousPrefixToken);
+ setCurrentToken(nextToken);
+ return true;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
+
+ private void setCurrentToken(Token token) {
+ if (token == null) return;
+ termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+ posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+ flagsAtt.setFlags(token.getFlags());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ typeAtt.setType(token.type());
+ payloadAtt.setPayload(token.getPayload());
+ }
+
+ private Token getNextPrefixInputToken(Token token) throws IOException {
+ if (!prefix.incrementToken()) return null;
+ token.setTermBuffer(p_termAtt.termBuffer(), 0, p_termAtt.termLength());
+ token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
+ token.setFlags(p_flagsAtt.getFlags());
+ token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
+ token.setType(p_typeAtt.type());
+ token.setPayload(p_payloadAtt.getPayload());
+ return token;
+ }
+
+ private Token getNextSuffixInputToken(Token token) throws IOException {
+ if (!suffix.incrementToken()) return null;
+ token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+ token.setPositionIncrement(posIncrAtt.getPositionIncrement());
+ token.setFlags(flagsAtt.getFlags());
+ token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
+ token.setType(typeAtt.type());
+ token.setPayload(payloadAtt.getPayload());
+ return token;
}
/**
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Mon Aug 3 04:33:10 2009
@@ -18,12 +18,17 @@
*/
import java.io.IOException;
-import java.util.LinkedList;
import java.util.Iterator;
+import java.util.LinkedList;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
/**
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
@@ -39,8 +44,6 @@
public class ShingleFilter extends TokenFilter {
private LinkedList shingleBuf = new LinkedList();
- private LinkedList outputBuf = new LinkedList();
- private LinkedList tokenBuf = new LinkedList();
private StringBuffer[] shingles;
private String tokenType = "shingle";
@@ -81,6 +84,11 @@
public ShingleFilter(TokenStream input, int maxShingleSize) {
super(input);
setMaxShingleSize(maxShingleSize);
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+
}
/**
@@ -148,23 +156,69 @@
shingles[i].setLength(0);
}
}
+
+ private AttributeSource.State nextToken;
+ private int shingleBufferPosition;
+ private int[] endOffsets;
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- if (outputBuf.isEmpty()) {
- fillOutputBuf(reusableToken);
- }
- Token nextToken = null;
- if ( ! outputBuf.isEmpty())
- {
- nextToken = (Token)outputBuf.remove(0);
+ public final boolean incrementToken() throws IOException {
+ while (true) {
+ if (nextToken == null) {
+ if (!fillShingleBuffer()) {
+ return false;
+ }
+ }
+
+ nextToken = (AttributeSource.State) shingleBuf.getFirst();
+
+ if (shingleBufferPosition == 0 && (! shingleBuf.isEmpty()) && outputUnigrams) {
+ restoreState(nextToken);
+ posIncrAtt.setPositionIncrement(1);
+ shingleBufferPosition++;
+ return true;
+ }
+
+ if (shingleBufferPosition < shingleBuf.size()) {
+ restoreState(nextToken);
+ typeAtt.setType(tokenType);
+ offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
+ StringBuffer buf = shingles[shingleBufferPosition];
+ int termLength = buf.length();
+ char[] termBuffer = termAtt.termBuffer();
+ if (termBuffer.length < termLength)
+ termBuffer = termAtt.resizeTermBuffer(termLength);
+ buf.getChars(0, termLength, termBuffer, 0);
+ termAtt.setTermLength(termLength);
+ if ((! outputUnigrams) && shingleBufferPosition == 1) {
+ posIncrAtt.setPositionIncrement(1);
+ } else {
+ posIncrAtt.setPositionIncrement(0);
+ }
+ shingleBufferPosition++;
+ if (shingleBufferPosition == shingleBuf.size()) {
+ nextToken = null;
+ shingleBufferPosition = 0;
+ }
+ return true;
+ } else {
+ nextToken = null;
+ shingleBufferPosition = 0;
+ }
}
- return nextToken;
}
-
+
+ private int numFillerTokensToInsert;
+ private AttributeSource.State currentToken;
+ private boolean hasCurrentToken;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private TypeAttribute typeAtt;
+
/**
* Get the next token from the input stream and push it on the token buffer.
* If we encounter a token with position increment > 1, we put filler tokens
@@ -174,41 +228,53 @@
* @return the next token, or null if at end of input stream
* @throws IOException if the input stream has a problem
*/
- private Token getNextToken(final Token reusableToken) throws IOException {
- if (tokenBuf.isEmpty()) {
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
- for (int i = 1; i < nextToken.getPositionIncrement(); i++) {
- Token fillerToken = (Token) nextToken.clone();
- // A filler token occupies no space
- fillerToken.setEndOffset(fillerToken.startOffset());
- fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
- tokenBuf.add(fillerToken);
- }
- tokenBuf.add(nextToken.clone());
- return getNextToken(nextToken);
- } else {
- return null;
- }
- } else {
- return (Token)tokenBuf.remove(0);
+ private boolean getNextToken() throws IOException {
+
+ while (true) {
+ if (numFillerTokensToInsert > 0) {
+ if (currentToken == null) {
+ currentToken = captureState();
+ } else {
+ restoreState(currentToken);
+ }
+ numFillerTokensToInsert--;
+ // A filler token occupies no space
+ offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+ termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+ return true;
+ }
+
+ if (hasCurrentToken) {
+ if (currentToken != null) {
+ restoreState(currentToken);
+ currentToken = null;
+ }
+ hasCurrentToken = false;
+ return true;
+ }
+
+ if (!input.incrementToken()) return false;
+ hasCurrentToken = true;
+
+ if (posIncrAtt.getPositionIncrement() > 1) {
+ numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
+ }
}
- }
+ }
/**
* Fill the output buffer with new shingles.
*
* @throws IOException if there's a problem getting the next token
*/
- private void fillOutputBuf(Token token) throws IOException {
+ private boolean fillShingleBuffer() throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
- token = getNextToken(token);
- if (token != null) {
- shingleBuf.add(token.clone());
+ if (getNextToken()) {
+ shingleBuf.add(captureState());
if (shingleBuf.size() > maxShingleSize)
{
shingleBuf.remove(0);
@@ -219,69 +285,55 @@
}
} while (shingleBuf.size() < maxShingleSize);
+ if (shingleBuf.isEmpty()) {
+ return false;
+ }
+
/*
* If no new token could be added to the shingle buffer, we have reached
* the end of the input stream and have to discard the least recent token.
*/
if (! addedToken) {
- if (shingleBuf.isEmpty()) {
- return;
- } else {
- shingleBuf.remove(0);
- }
+ shingleBuf.remove(0);
+ }
+
+ if (shingleBuf.isEmpty()) {
+ return false;
}
clearShingles();
- int[] endOffsets = new int[shingleBuf.size()];
+ endOffsets = new int[shingleBuf.size()];
for (int i = 0; i < endOffsets.length; i++) {
endOffsets[i] = 0;
}
int i = 0;
- Token shingle = null;
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
- shingle = (Token) it.next();
+ restoreState((AttributeSource.State) it.next());
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
- shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
+ shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength());
}
- endOffsets[i] = shingle.endOffset();
+ endOffsets[i] = offsetAtt.endOffset();
i++;
}
+
+ return true;
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
- if ((! shingleBuf.isEmpty()) && outputUnigrams) {
- Token unigram = (Token) shingleBuf.getFirst();
- unigram.setPositionIncrement(1);
- outputBuf.add(unigram);
- }
-
- /*
- * Push new tokens to the output buffer.
- */
- if (!shingleBuf.isEmpty()) {
- Token firstShingle = (Token) shingleBuf.get(0);
- shingle = (Token) firstShingle.clone();
- shingle.setType(tokenType);
- }
- for (int j = 1; j < shingleBuf.size(); j++) {
- shingle.setEndOffset(endOffsets[j]);
- StringBuffer buf = shingles[j];
- int termLength = buf.length();
- char[] termBuffer = shingle.termBuffer();
- if (termBuffer.length < termLength)
- termBuffer = shingle.resizeTermBuffer(termLength);
- buf.getChars(0, termLength, termBuffer, 0);
- shingle.setTermLength(termLength);
- if ((! outputUnigrams) && j == 1) {
- shingle.setPositionIncrement(1);
- } else {
- shingle.setPositionIncrement(0);
- }
- outputBuf.add(shingle.clone());
- }
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java Mon Aug 3 04:33:10 2009
@@ -30,6 +30,12 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
@@ -104,6 +110,9 @@
* <p>The filter also has basic support for calculating weights for the shingles
* based on the weights of the tokens from the input stream, output shingle size, etc.
* See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
+ * <p/>
+ * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
+ * the ones located in org.apache.lucene.analysis.tokenattributes.
*/
public class ShingleMatrixFilter extends TokenStream {
@@ -183,7 +192,21 @@
private TokenStream input;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private PayloadAttribute payloadAtt;
+ private OffsetAttribute offsetAtt;
+ private TypeAttribute typeAtt;
+ private FlagsAttribute flagsAtt;
+
+ private TermAttribute in_termAtt;
+ private PositionIncrementAttribute in_posIncrAtt;
+ private PayloadAttribute in_payloadAtt;
+ private OffsetAttribute in_offsetAtt;
+ private TypeAttribute in_typeAtt;
+ private FlagsAttribute in_flagsAtt;
+
/**
* Creates a shingle filter based on a user defined matrix.
*
@@ -205,8 +228,22 @@
this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
this.settingsCodec = settingsCodec;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+
// set the input to be an empty token stream, we already have the data.
this.input = new EmptyTokenStream();
+
+ in_termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
+ in_posIncrAtt = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+ in_payloadAtt = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+ in_offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
+ in_typeAtt = (TypeAttribute) input.addAttribute(TypeAttribute.class);
+ in_flagsAtt = (FlagsAttribute) input.addAttribute(FlagsAttribute.class);
}
/**
@@ -273,6 +310,19 @@
this.spacerCharacter = spacerCharacter;
this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
this.settingsCodec = settingsCodec;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+
+ in_termAtt = (TermAttribute) input.addAttribute(TermAttribute.class);
+ in_posIncrAtt = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+ in_payloadAtt = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+ in_offsetAtt = (OffsetAttribute) input.addAttribute(OffsetAttribute.class);
+ in_typeAtt = (TypeAttribute) input.addAttribute(TypeAttribute.class);
+ in_flagsAtt = (FlagsAttribute) input.addAttribute(FlagsAttribute.class);
}
// internal filter instance variables
@@ -302,10 +352,10 @@
}
private Matrix matrix;
-
-
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+
+ private Token reusableToken = new Token();
+
+ public final boolean incrementToken() throws IOException {
if (matrix == null) {
matrix = new Matrix();
// fill matrix with maximumShingleSize columns
@@ -321,9 +371,39 @@
do {
token = produceNextToken(reusableToken);
} while (token == request_next_token);
+ if (token == null) return false;
+
+ termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+ posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+ flagsAtt.setFlags(token.getFlags());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ typeAtt.setType(token.type());
+ payloadAtt.setPayload(token.getPayload());
+ return true;
+ }
+
+ private Token getNextInputToken(Token token) throws IOException {
+ if (!input.incrementToken()) return null;
+ token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
+ token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
+ token.setFlags(in_flagsAtt.getFlags());
+ token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
+ token.setType(in_typeAtt.type());
+ token.setPayload(in_payloadAtt.getPayload());
return token;
}
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
+ }
private static final Token request_next_token = new Token();
@@ -573,7 +653,7 @@
token = readColumnBuf;
readColumnBuf = null;
} else {
- token = input.next(new Token());
+ token = getNextInputToken(new Token());
}
if (token == null) {
@@ -585,7 +665,7 @@
currentReaderRow.getTokens().add(token);
TokenPositioner tokenPositioner;
- while ((readColumnBuf = input.next(new Token())) != null
+ while ((readColumnBuf = getNextInputToken(new Token())) != null
&& (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {
if (tokenPositioner == TokenPositioner.sameRow) {
@@ -599,7 +679,7 @@
}
if (readColumnBuf == null) {
- readColumnBuf = input.next(new Token());
+ readColumnBuf = getNextInputToken(new Token());
if (readColumnBuf == null) {
currentReaderColumn.setLast(true);
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java Mon Aug 3 04:33:10 2009
@@ -21,6 +21,8 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.StringReader;
@@ -34,22 +36,22 @@
new WhitespaceTokenizer(new StringReader("hello world")),
new SingleTokenTokenStream(createToken("$", 0, 0)));
- Token token = new Token();
- assertNext(ts, token, "^", 0, 0);
- assertNext(ts, token, "hello", 0, 5);
- assertNext(ts, token, "world", 6, 11);
- assertNext(ts, token, "$", 11, 11);
- assertNull(ts.next(token));
+ assertNext(ts, "^", 0, 0);
+ assertNext(ts, "hello", 0, 5);
+ assertNext(ts, "world", 6, 11);
+ assertNext(ts, "$", 11, 11);
+ assertFalse(ts.incrementToken());
}
- private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(text, nextToken.term());
- assertEquals(startOffset, nextToken.startOffset());
- assertEquals(endOffset, nextToken.endOffset());
- return nextToken;
+ private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+
+ assertTrue(ts.incrementToken());
+ assertEquals(text, termAtt.term());
+ assertEquals(startOffset, offsetAtt.startOffset());
+ assertEquals(endOffset, offsetAtt.endOffset());
}
private static Token createToken(String term, int start, int offset)
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java Mon Aug 3 04:33:10 2009
@@ -21,6 +21,8 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
import java.io.StringReader;
@@ -34,32 +36,31 @@
ts = new PrefixAwareTokenFilter(
new SingleTokenTokenStream(createToken("a", 0, 1)),
new SingleTokenTokenStream(createToken("b", 0, 1)));
- final Token reusableToken = new Token();
- assertNext(ts, reusableToken, "a", 0, 1);
- assertNext(ts, reusableToken, "b", 1, 2);
- assertNull(ts.next(reusableToken));
-
+ assertNext(ts, "a", 0, 1);
+ assertNext(ts, "b", 1, 2);
+ assertFalse(ts.incrementToken());
// prefix and suffix using 2x prefix
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
- assertNext(ts, reusableToken, "^", 0, 0);
- assertNext(ts, reusableToken, "hello", 0, 5);
- assertNext(ts, reusableToken, "world", 6, 11);
- assertNext(ts, reusableToken, "$", 11, 11);
- assertNull(ts.next(reusableToken));
+ assertNext(ts, "^", 0, 0);
+ assertNext(ts, "hello", 0, 5);
+ assertNext(ts, "world", 6, 11);
+ assertNext(ts, "$", 11, 11);
+ assertFalse(ts.incrementToken());
}
- private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(text, nextToken.term());
- assertEquals(startOffset, nextToken.startOffset());
- assertEquals(endOffset, nextToken.endOffset());
- return nextToken;
+ private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+
+ assertTrue(ts.incrementToken());
+ assertEquals(text, termAtt.term());
+ assertEquals(startOffset, offsetAtt.startOffset());
+ assertEquals(endOffset, offsetAtt.endOffset());
}
private static Token createToken(String term, int start, int offset)
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Mon Aug 3 04:33:10 2009
@@ -23,6 +23,8 @@
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
@@ -157,10 +159,13 @@
TokenStream ts = analyzer.tokenStream("content",
new StringReader("this sentence"));
int j = -1;
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- j += nextToken.getPositionIncrement();
- String termText = nextToken.term();
+
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ while (ts.incrementToken()) {
+ j += posIncrAtt.getPositionIncrement();
+ String termText = termAtt.term();
q.add(new Term("content", termText), j);
}
@@ -182,9 +187,11 @@
TokenStream ts = analyzer.tokenStream("content",
new StringReader("test sentence"));
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- String termText = nextToken.term();
+
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ while (ts.incrementToken()) {
+ String termText = termAtt.term();
q.add(new TermQuery(new Term("content", termText)),
BooleanClause.Occur.SHOULD);
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Mon Aug 3 04:33:10 2009
@@ -22,6 +22,11 @@
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttributeImpl;
public class ShingleFilterTest extends TestCase {
@@ -29,18 +34,31 @@
protected int index = 0;
protected Token[] testToken;
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private TypeAttribute typeAtt;
public TestTokenStream(Token[] testToken) {
super();
this.testToken = testToken;
+ this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public final boolean incrementToken() throws IOException {
if (index < testToken.length) {
- return testToken[index++];
+ Token t = testToken[index++];
+ termAtt.setTermBuffer(t.termBuffer(), 0, t.termLength());
+ offsetAtt.setOffset(t.startOffset(), t.endOffset());
+ posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+ typeAtt.setType(TypeAttributeImpl.DEFAULT_TYPE);
+ return true;
} else {
- return null;
+ return false;
}
}
}
@@ -163,25 +181,29 @@
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
}
-
+
protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
int[] positionIncrements, String[] types)
throws IOException {
TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+ TermAttribute termAtt = (TermAttribute) filter.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) filter.addAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) filter.addAttribute(PositionIncrementAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) filter.addAttribute(TypeAttribute.class);
+
int i = 0;
- final Token reusableToken = new Token();
- for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
- String termText = nextToken.term();
+ while (filter.incrementToken()) {
+ String termText = termAtt.term();
String goldText = tokensToCompare[i].term();
assertEquals("Wrong termText", goldText, termText);
assertEquals("Wrong startOffset for token \"" + termText + "\"",
- tokensToCompare[i].startOffset(), nextToken.startOffset());
+ tokensToCompare[i].startOffset(), offsetAtt.startOffset());
assertEquals("Wrong endOffset for token \"" + termText + "\"",
- tokensToCompare[i].endOffset(), nextToken.endOffset());
+ tokensToCompare[i].endOffset(), offsetAtt.endOffset());
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
- positionIncrements[i], nextToken.getPositionIncrement());
- assertEquals("Wrong type for token \"" + termText + "\"", types[i], nextToken.type());
+ positionIncrements[i], posIncrAtt.getPositionIncrement());
+ assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
i++;
}
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java?rev=800195&r1=800194&r2=800195&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java Mon Aug 3 04:33:10 2009
@@ -17,21 +17,28 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+
import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
-import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.LinkedList;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
public class TestShingleMatrixFilter extends TestCase {
@@ -43,7 +50,7 @@
TokenStream ts;
ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
- assertNull(ts.next(new Token()));
+ assertFalse(ts.incrementToken());
TokenListStream tls;
LinkedList tokens;
@@ -66,20 +73,20 @@
Token reusableToken = new Token();
- assertNext(ts, reusableToken, "please", 0, 6);
- assertNext(ts, reusableToken, "please divide", 0, 13);
- assertNext(ts, reusableToken, "divide", 7, 13);
- assertNext(ts, reusableToken, "divide this", 7, 18);
- assertNext(ts, reusableToken, "this", 14, 18);
- assertNext(ts, reusableToken, "this sentence", 14, 27);
- assertNext(ts, reusableToken, "sentence", 19, 27);
- assertNext(ts, reusableToken, "sentence into", 19, 32);
- assertNext(ts, reusableToken, "into", 28, 32);
- assertNext(ts, reusableToken, "into shingles", 28, 39);
- assertNext(ts, reusableToken, "shingles", 33, 39);
+ assertNext(ts, "please", 0, 6);
+ assertNext(ts, "please divide", 0, 13);
+ assertNext(ts, "divide", 7, 13);
+ assertNext(ts, "divide this", 7, 18);
+ assertNext(ts, "this", 14, 18);
+ assertNext(ts, "this sentence", 14, 27);
+ assertNext(ts, "sentence", 19, 27);
+ assertNext(ts, "sentence into", 19, 32);
+ assertNext(ts, "into", 28, 32);
+ assertNext(ts, "into shingles", 28, 39);
+ assertNext(ts, "shingles", 33, 39);
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
}
@@ -92,7 +99,7 @@
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
TokenStream ts;
- TokenListStream tls;
+ TokenStream tls;
LinkedList tokens;
// test a plain old token stream with synonyms translated to rows.
@@ -111,25 +118,25 @@
ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
final Token reusableToken = new Token();
- assertNext(ts, reusableToken, "hello_world");
- assertNext(ts, reusableToken, "greetings_world");
- assertNext(ts, reusableToken, "hello_earth");
- assertNext(ts, reusableToken, "greetings_earth");
- assertNext(ts, reusableToken, "hello_tellus");
- assertNext(ts, reusableToken, "greetings_tellus");
- assertNull(ts.next(reusableToken));
+ assertNext(ts, "hello_world");
+ assertNext(ts, "greetings_world");
+ assertNext(ts, "hello_earth");
+ assertNext(ts, "greetings_earth");
+ assertNext(ts, "hello_tellus");
+ assertNext(ts, "greetings_tellus");
+ assertFalse(ts.incrementToken());
// bi-grams with no spacer character, start offset, end offset
tls.reset();
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
- assertNext(ts, reusableToken, "helloworld", 0, 10);
- assertNext(ts, reusableToken, "greetingsworld", 0, 10);
- assertNext(ts, reusableToken, "helloearth", 0, 10);
- assertNext(ts, reusableToken, "greetingsearth", 0, 10);
- assertNext(ts, reusableToken, "hellotellus", 0, 10);
- assertNext(ts, reusableToken, "greetingstellus", 0, 10);
- assertNull(ts.next(reusableToken));
+ assertNext(ts, "helloworld", 0, 10);
+ assertNext(ts, "greetingsworld", 0, 10);
+ assertNext(ts, "helloearth", 0, 10);
+ assertNext(ts, "greetingsearth", 0, 10);
+ assertNext(ts, "hellotellus", 0, 10);
+ assertNext(ts, "greetingstellus", 0, 10);
+ assertFalse(ts.incrementToken());
// add ^_prefix_and_suffix_$
@@ -148,7 +155,7 @@
tls = new TokenListStream(tokens);
ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
- tls = new TokenListStream(ts);
+ tls = new CachingTokenFilter(ts);
// bi-grams, position increment, weight, start offset, end offset
@@ -159,18 +166,18 @@
// token.clear();
// }
- assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
- assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
- assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
- assertNull(ts.next(reusableToken));
+ assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+ assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+ assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+ assertFalse(ts.incrementToken());
// test unlimited size and allow single boundary token as shingle
tls.reset();
@@ -182,44 +189,44 @@
// token.clear();
// }
- assertNext(ts, reusableToken, "^", 1, 10.0f, 0, 0);
- assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
- assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
- assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
- assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "$", 1, 7.071068f, 10, 10);
- assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
- assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
- assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
- assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
- assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "^", 1, 10.0f, 0, 0);
+ assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+ assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "hello", 1, 1.0f, 0, 4);
+ assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "world", 1, 1.0f, 5, 10);
+ assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "$", 1, 7.071068f, 10, 10);
+ assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+ assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "greetings", 1, 1.0f, 0, 4);
+ assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "earth", 1, 1.0f, 5, 10);
+ assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "tellus", 1, 1.0f, 5, 10);
+ assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
// test unlimited size but don't allow single boundary token as shingle
@@ -230,43 +237,43 @@
// token.clear();
// }
- assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
- assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
- assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
- assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
- assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
- assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
- assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
- assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
- assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
- assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
- assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
- assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+ assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "hello", 1, 1.0f, 0, 4);
+ assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "world", 1, 1.0f, 5, 10);
+ assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+ assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "greetings", 1, 1.0f, 0, 4);
+ assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "earth", 1, 1.0f, 5, 10);
+ assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+ assertNext(ts, "tellus", 1, 1.0f, 5, 10);
+ assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+ assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+ assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+ assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
System.currentTimeMillis();
@@ -301,20 +308,20 @@
// shingle, position increment, weight, start offset, end offset
- assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "greetings_and", 1, 1.4142135f, 0, 4);
- assertNext(ts, reusableToken, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
- assertNext(ts, reusableToken, "and_salutations", 1, 1.4142135f, 0, 4);
- assertNext(ts, reusableToken, "and_salutations_world", 1, 1.7320508f, 0, 10);
- assertNext(ts, reusableToken, "salutations_world", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "and_salutations_earth", 1, 1.7320508f, 0, 10);
- assertNext(ts, reusableToken, "salutations_earth", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
- assertNext(ts, reusableToken, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
- assertNext(ts, reusableToken, "salutations_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
+ assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
+ assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
+ assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
+ assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
+ assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+ assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
+ assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
System.currentTimeMillis();
@@ -361,47 +368,47 @@
// }
final Token reusableToken = new Token();
- assertNext(ts, reusableToken, "no_surprise", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "no_surprise_to", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "no_surprise_to_see", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "surprise_to", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "surprise_to_see", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "surprise_to_see_england", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "to_see", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "to_see_england", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "to_see_england_manager", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "see_england", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "see_england_manager", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "see_england_manager_svennis", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "england_manager", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "england_manager_svennis", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "england_manager_svennis_in", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "manager_svennis", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "manager_svennis_in", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "manager_svennis_in_the", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "svennis_in", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "svennis_in_the", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "svennis_in_the_croud", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "in_the", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "in_the_croud", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "the_croud", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "see_england_manager_sven", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "england_manager_sven", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "england_manager_sven_göran", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "manager_sven", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "manager_sven_göran", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "sven_göran", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "göran_eriksson", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
- assertNext(ts, reusableToken, "eriksson_in", 1, 1.4142135f, 0, 0);
- assertNext(ts, reusableToken, "eriksson_in_the", 1, 1.7320508f, 0, 0);
- assertNext(ts, reusableToken, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
+ assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
+ assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
+ assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
+ assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
+ assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
+ assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
+ assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
+ assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
+ assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
+ assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
+ assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
+ assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
+ assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
+ assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
+ assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
- assertNull(ts.next(reusableToken));
+ assertFalse(ts.incrementToken());
}
@@ -445,40 +452,46 @@
// assert-methods start here
- private Token assertNext(TokenStream ts, final Token reusableToken, String text) throws IOException {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(text, nextToken.term());
- return nextToken;
- }
-
- private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost) throws IOException {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(text, nextToken.term());
- assertEquals(positionIncrement, nextToken.getPositionIncrement());
- assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0);
- return nextToken;
- }
-
- private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(text, nextToken.term());
- assertEquals(positionIncrement, nextToken.getPositionIncrement());
- assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0);
- assertEquals(startOffset, nextToken.startOffset());
- assertEquals(endOffset, nextToken.endOffset());
- return nextToken;
- }
-
- private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
- Token nextToken = ts.next(reusableToken);
- assertNotNull(nextToken);
- assertEquals(text, nextToken.term());
- assertEquals(startOffset, nextToken.startOffset());
- assertEquals(endOffset, nextToken.endOffset());
- return nextToken;
+ private void assertNext(TokenStream ts, String text) throws IOException {
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
+ assertTrue(ts.incrementToken());
+ assertEquals(text, termAtt.term());
+ }
+
+ private void assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+ PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
+
+ assertTrue(ts.incrementToken());
+ assertEquals(text, termAtt.term());
+ assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
+ assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
+ }
+
+ private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
+ PayloadAttribute payloadAtt = (PayloadAttribute) ts.addAttribute(PayloadAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+
+ assertTrue(ts.incrementToken());
+ assertEquals(text, termAtt.term());
+ assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
+ assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
+ assertEquals(startOffset, offsetAtt.startOffset());
+ assertEquals(endOffset, offsetAtt.endOffset());
+ }
+
+ private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+ TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
+
+ assertTrue(ts.incrementToken());
+ assertEquals(text, termAtt.term());
+ assertEquals(startOffset, offsetAtt.startOffset());
+ assertEquals(endOffset, offsetAtt.endOffset());
}
private static Token createToken(String term, int start, int offset)
@@ -492,31 +505,41 @@
public static class TokenListStream extends TokenStream {
private Collection tokens;
-
- public TokenListStream(TokenStream ts) throws IOException {
- tokens = new ArrayList();
- final Token reusableToken = new Token();
- for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
- tokens.add((Token) nextToken.clone());
- }
- }
-
+ TermAttribute termAtt;
+ PositionIncrementAttribute posIncrAtt;
+ PayloadAttribute payloadAtt;
+ OffsetAttribute offsetAtt;
+ TypeAttribute typeAtt;
+ FlagsAttribute flagsAtt;
+
public TokenListStream(Collection tokens) {
this.tokens = tokens;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}
private Iterator iterator;
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public boolean incrementToken() throws IOException {
if (iterator == null) {
iterator = tokens.iterator();
}
if (!iterator.hasNext()) {
- return null;
+ return false;
}
- Token nextToken = (Token) iterator.next();
- return (Token) nextToken.clone();
+ Token prototype = (Token) iterator.next();
+ termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
+ posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
+ flagsAtt.setFlags(prototype.getFlags());
+ offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
+ typeAtt.setType(prototype.type());
+ payloadAtt.setPayload(prototype.getPayload());
+
+ return true;
}