Posted to java-commits@lucene.apache.org by bu...@apache.org on 2009/07/24 23:45:50 UTC
svn commit: r797665 [1/3] - in /lucene/java/trunk: ./
src/java/org/apache/lucene/analysis/
src/java/org/apache/lucene/analysis/standard/
src/java/org/apache/lucene/analysis/tokenattributes/
src/java/org/apache/lucene/index/ src/java/org/apache/lucene/q...
Author: buschmi
Date: Fri Jul 24 21:45:48 2009
New Revision: 797665
URL: http://svn.apache.org/viewvc?rev=797665&view=rev
Log:
LUCENE-1693: Various improvements to the new TokenStream API.
Added:
lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTokenStreamBWComp.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/util/TestAttributeSource.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerField.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverterPerThread.java
lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java
lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj
lucene/java/trunk/src/java/org/apache/lucene/search/QueryTermVector.java
lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java
lucene/java/trunk/src/java/org/apache/lucene/util/AttributeSource.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestASCIIFoldingFilter.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestNumericTokenStream.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeTokenFilter.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestDocumentWriter.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiFieldQueryParser.java
lucene/java/trunk/src/test/org/apache/lucene/util/LuceneTestCase.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Jul 24 21:45:48 2009
@@ -64,6 +64,22 @@
process. It is not recommended to implement it, but rather extend
Searcher. (Shai Erera via Mike McCandless)
+ 4. LUCENE-1422, LUCENE-1693: The new TokenStream API (see below) using
+ Attributes has some backwards-compatibility breaks in rare cases.
+ We did our best to make the transition as easy as possible. You should
+ not have problems if your tokenizers still implement next(Token) or
+ next(); the calls are automatically wrapped. The indexer and query parser
+ use the new API via incrementToken() calls. All core TokenStreams
+ are implemented using the new API. You can mix old and new API
+ style TokenFilters/TokenStreams. Problems only occur when you have done
+ the following:
+ You have overridden next(Token) or next() in one of the non-abstract core
+ TokenStreams/-Filters. These classes should normally be final, but some
+ of them are not. In this case, next(Token)/next() would never be called.
+ To fail early with a hard compile/runtime error, the next(Token)/next()
+ methods in these TokenStreams/-Filters were made final.
+ (Michael Busch, Uwe Schindler)
+
Changes in runtime behavior
1. LUCENE-1424: QueryParser now by default uses constant score query
@@ -156,14 +172,16 @@
and deprecate FSDirectory.getDirectory(). FSDirectory instances
are not required to be singletons per path. (yonik)
-4. LUCENE-1422: New TokenStream API that uses a new class called
+4. LUCENE-1422, LUCENE-1693: New TokenStream API that uses a new class called
AttributeSource instead of the now deprecated Token class. All attributes
that the Token class had have been moved into separate classes:
TermAttribute, OffsetAttribute, PositionIncrementAttribute,
PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
is much more flexible; it allows to combine the Attributes arbitrarily
and also to define custom Attributes. The new API has the same performance
- as the old next(Token) approach. (Michael Busch)
+ as the old next(Token) approach.
+ For conformance with this new API, Tee-/SinkTokenizer was deprecated
+ and replaced by a new TeeSinkTokenFilter. (Michael Busch, Uwe Schindler)
5. LUCENE-1467: Add nextDoc() and next(int) methods to OpenBitSetIterator.
These methods can be used to avoid additional calls to doc().
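The CHANGES entries above describe the migration path and the backwards-compatibility layer. As a rough, hedged sketch (the MixedApiExample and LegacyNoOpFilter classes below are illustrative only, not part of this commit), an old-style filter that only overrides next(Token) can still be driven through the new incrementToken() API, because TokenStream wraps the calls internally:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class MixedApiExample {

  /** Hypothetical old-API filter: it only overrides next(Token). */
  static class LegacyNoOpFilter extends TokenFilter {
    LegacyNoOpFilter(TokenStream input) {
      super(input);
    }
    public Token next(final Token reusableToken) throws IOException {
      return input.next(reusableToken); // old-style delegation
    }
  }

  public static void main(String[] args) throws IOException {
    // WhitespaceTokenizer already implements the new API; the filter does not.
    TokenStream stream = new LegacyNoOpFilter(
        new WhitespaceTokenizer(new StringReader("mixing old and new api")));
    // The consumer uses the new attribute-based API; the calls are wrapped automatically.
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(termAtt.term());
    }
    stream.close();
  }
}

The reverse also holds: a stream implementing only incrementToken() can still be consumed through next(Token)/next(), which is what keeps existing old-API consumers working.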
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java Fri Jul 24 21:45:48 2009
@@ -1,5 +1,8 @@
package org.apache.lucene.analysis;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.ArrayUtil;
/**
@@ -53,24 +56,21 @@
* accents from Latin1 characters. For example, 'à' will be replaced by
* 'a'.
*/
-public class ASCIIFoldingFilter extends TokenFilter {
+public final class ASCIIFoldingFilter extends TokenFilter {
public ASCIIFoldingFilter(TokenStream input)
{
super(input);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private char[] output = new char[512];
private int outputPos;
+ private TermAttribute termAtt;
- public Token next(Token result)
- throws java.io.IOException
- {
- result = input.next(result);
-
- if (result != null)
- {
- final char[] buffer = result.termBuffer();
- final int length = result.termLength();
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ final char[] buffer = termAtt.termBuffer();
+ final int length = termAtt.termLength();
// If no characters actually require rewriting then we
// just return token as-is:
@@ -79,13 +79,13 @@
if (c >= '\u0080')
{
foldToASCII(buffer, length);
- result.setTermBuffer(output, 0, outputPos);
+ termAtt.setTermBuffer(output, 0, outputPos);
break;
}
}
- return result;
+ return true;
} else {
- return null;
+ return false;
}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CachingTokenFilter.java Fri Jul 24 21:45:48 2009
@@ -25,24 +25,35 @@
import org.apache.lucene.util.AttributeSource;
/**
- * This class can be used if the Tokens of a TokenStream
+ * This class can be used if the token attributes of a TokenStream
* are intended to be consumed more than once. It caches
- * all Tokens locally in a List.
+ * all token attribute states locally in a List.
*
- * CachingTokenFilter implements the optional method
+ * <P>CachingTokenFilter implements the optional method
* {@link TokenStream#reset()}, which repositions the
* stream to the first Token.
- *
*/
public class CachingTokenFilter extends TokenFilter {
- private List cache;
- private Iterator iterator;
+ private List cache = null;
+ private Iterator iterator = null;
public CachingTokenFilter(TokenStream input) {
super(input);
}
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws IOException {
+ return super.next();
+ }
- public boolean incrementToken() throws IOException {
+ public final boolean incrementToken() throws IOException {
if (cache == null) {
// fill cache lazily
cache = new LinkedList();
@@ -51,34 +62,14 @@
}
if (!iterator.hasNext()) {
- // the cache is exhausted, return null
+ // the cache is exhausted, return false
return false;
}
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
- AttributeSource state = (AttributeSource) iterator.next();
- state.restoreState(this);
+ restoreState((AttributeSource.State) iterator.next());
return true;
}
-
- /** @deprecated */
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- if (cache == null) {
- // fill cache lazily
- cache = new LinkedList();
- fillCache(reusableToken);
- iterator = cache.iterator();
- }
-
- if (!iterator.hasNext()) {
- // the cache is exhausted, return null
- return null;
- }
- // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
- Token nextToken = (Token) iterator.next();
- return (Token) nextToken.clone();
- }
-
+
public void reset() throws IOException {
if(cache != null) {
iterator = cache.iterator();
@@ -90,12 +81,5 @@
cache.add(captureState());
}
}
-
- /** @deprecated */
- private void fillCache(final Token reusableToken) throws IOException {
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
- cache.add(nextToken.clone());
- }
- }
}
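As a usage note for the rewritten CachingTokenFilter above: the first pass fills the cache by capturing attribute states, reset() rewinds to the first cached state, and the second pass replays those states. A minimal sketch (the CachingExample class name is illustrative only):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class CachingExample {
  public static void main(String[] args) throws IOException {
    TokenStream source = new WhitespaceTokenizer(new StringReader("one two three"));
    CachingTokenFilter cached = new CachingTokenFilter(source);
    TermAttribute termAtt = (TermAttribute) cached.addAttribute(TermAttribute.class);

    while (cached.incrementToken()) {      // first pass: fills the cache lazily
      System.out.println("pass 1: " + termAtt.term());
    }
    cached.reset();                        // reposition to the first cached state
    while (cached.incrementToken()) {      // second pass: replays captured states
      System.out.println("pass 2: " + termAtt.term());
    }
    cached.close();
  }
}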
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Jul 24 21:45:48 2009
@@ -94,49 +94,16 @@
return true;
}
- /** @deprecated */
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- reusableToken.clear();
- int length = 0;
- int start = bufferIndex;
- char[] buffer = reusableToken.termBuffer();
- while (true) {
-
- if (bufferIndex >= dataLen) {
- offset += dataLen;
- dataLen = input.read(ioBuffer);
- if (dataLen == -1) {
- if (length > 0)
- break;
- else
- return null;
- }
- bufferIndex = 0;
- }
-
- final char c = ioBuffer[bufferIndex++];
-
- if (isTokenChar(c)) { // if it's a token char
-
- if (length == 0) // start of token
- start = offset + bufferIndex - 1;
- else if (length == buffer.length)
- buffer = reusableToken.resizeTermBuffer(1+length);
-
- buffer[length++] = normalize(c); // buffer it, normalized
-
- if (length == MAX_WORD_LEN) // buffer overflow!
- break;
-
- } else if (length > 0) // at non-Letter w/ chars
- break; // return 'em
- }
+ return super.next(reusableToken);
+ }
- reusableToken.setTermLength(length);
- reusableToken.setStartOffset(input.correctOffset(start));
- reusableToken.setEndOffset(input.correctOffset(start+length));
- return reusableToken;
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws IOException {
+ return super.next();
}
public void reset(Reader input) throws IOException {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java Fri Jul 24 21:45:48 2009
@@ -57,27 +57,17 @@
} else
return false;
}
-
- /** @deprecated */
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
public final Token next(final Token reusableToken) throws java.io.IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
- final char[] buffer = nextToken.termBuffer();
- final int length = nextToken.termLength();
- // If no characters actually require rewriting then we
- // just return token as-is:
- for(int i=0;i<length;i++) {
- final char c = buffer[i];
- if (c >= '\u00c0' && c <= '\uFB06') {
- removeAccents(buffer, length);
- nextToken.setTermBuffer(output, 0, outputPos);
- break;
- }
- }
- return nextToken;
- } else
- return null;
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws java.io.IOException {
+ return super.next();
}
/**
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Fri Jul 24 21:45:48 2009
@@ -45,7 +45,7 @@
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
- public boolean incrementToken() throws IOException {
+ public final boolean incrementToken() throws IOException {
if (!done) {
done = true;
int upto = 0;
@@ -65,28 +65,16 @@
return false;
}
- /** @deprecated */
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- if (!done) {
- done = true;
- int upto = 0;
- reusableToken.clear();
- char[] buffer = reusableToken.termBuffer();
- while (true) {
- final int length = input.read(buffer, upto, buffer.length-upto);
- if (length == -1) break;
- upto += length;
- if (upto == buffer.length)
- buffer = reusableToken.resizeTermBuffer(1+buffer.length);
- }
- reusableToken.setTermLength(upto);
- reusableToken.setStartOffset(input.correctOffset(0));
- reusableToken.setEndOffset(input.correctOffset(upto));
-
- return reusableToken;
- }
- return null;
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws IOException {
+ return super.next();
}
public void reset(Reader input) throws IOException {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LengthFilter.java Fri Jul 24 21:45:48 2009
@@ -61,24 +61,4 @@
// reached EOS -- return null
return false;
}
-
- /**
- * Returns the next input Token whose term() is the right len
- * @deprecated
- */
- public final Token next(final Token reusableToken) throws IOException
- {
- assert reusableToken != null;
- // return the first non-stop word found
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken))
- {
- int len = nextToken.termLength();
- if (len >= min && len <= max) {
- return nextToken;
- }
- // note: else we ignore it but should we index each part of it?
- }
- // reached EOS -- return null
- return null;
- }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java Fri Jul 24 21:45:48 2009
@@ -46,20 +46,4 @@
} else
return false;
}
-
- /** @deprecated */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken != null) {
-
- final char[] buffer = nextToken.termBuffer();
- final int length = nextToken.termLength();
- for(int i=0;i<length;i++)
- buffer[i] = Character.toLowerCase(buffer[i]);
-
- return nextToken;
- } else
- return null;
- }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java Fri Jul 24 21:45:48 2009
@@ -206,40 +206,6 @@
shift += precisionStep;
return true;
}
-
- // @Override
- /** @deprecated Will be removed in Lucene 3.0 */
- public Token next(final Token reusableToken) {
- assert reusableToken != null;
- if (valSize == 0)
- throw new IllegalStateException("call set???Value() before usage");
- if (shift >= valSize)
- return null;
-
- reusableToken.clear();
-
- final char[] buffer;
- switch (valSize) {
- case 64:
- buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
- reusableToken.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer));
- break;
-
- case 32:
- buffer = reusableToken.resizeTermBuffer(NumericUtils.BUF_SIZE_INT);
- reusableToken.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer));
- break;
-
- default:
- // should not happen
- throw new IllegalArgumentException("valSize must be 32 or 64");
- }
-
- reusableToken.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
- reusableToken.setPositionIncrement((shift == 0) ? 1 : 0);
- shift += precisionStep;
- return reusableToken;
- }
// @Override
public String toString() {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/PorterStemFilter.java Fri Jul 24 21:45:48 2009
@@ -57,16 +57,4 @@
termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return true;
}
-
- /** @deprecated */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
- if (nextToken == null)
- return null;
-
- if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength()))
- nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
- return nextToken;
- }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/SinkTokenizer.java Fri Jul 24 21:45:48 2009
@@ -22,19 +22,21 @@
import java.util.Iterator;
import java.util.List;
-import org.apache.lucene.util.AttributeSource;
-
/**
* A SinkTokenizer can be used to cache Tokens for use in an Analyzer
- *
+ * <p/>
+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API.
+ * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers
+ * the same functionality.
* @see TeeTokenFilter
+ * @deprecated Use {@link TeeSinkTokenFilter} instead
*
**/
public class SinkTokenizer extends Tokenizer {
protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
protected Iterator/*<Token>*/ iter;
-
+
public SinkTokenizer(List/*<Token>*/ input) {
this.lst = input;
if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
@@ -64,29 +66,9 @@
}
/**
- * Increments this stream to the next token out of the list of cached tokens
- * @throws IOException
- */
- public boolean incrementToken() throws IOException {
- if (iter == null) iter = lst.iterator();
- // Since this TokenStream can be reset we have to maintain the tokens as immutable
- if (iter.hasNext()) {
- AttributeSource state = (AttributeSource) iter.next();
- state.restoreState(this);
- return true;
- }
- return false;
- }
-
- public void add(AttributeSource source) throws IOException {
- lst.add(source);
- }
-
- /**
* Returns the next token out of the list of cached tokens
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
* @throws IOException
- * @deprecated
*/
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/StopFilter.java Fri Jul 24 21:45:48 2009
@@ -235,27 +235,6 @@
}
/**
- * Returns the next input Token whose term() is not a stop word.
- * @deprecated
- */
- public final Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- // return the first non-stop word found
- int skippedPositions = 0;
- for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
- if (!stopWords.contains(nextToken.termBuffer(), 0, nextToken.termLength())) {
- if (enablePositionIncrements) {
- nextToken.setPositionIncrement(nextToken.getPositionIncrement() + skippedPositions);
- }
- return nextToken;
- }
- skippedPositions += nextToken.getPositionIncrement();
- }
- // reached EOS -- return null
- return null;
- }
-
- /**
* @see #setEnablePositionIncrementsDefault(boolean).
* @deprecated Please specify this when you create the StopFilter
*/
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java?rev=797665&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java Fri Jul 24 21:45:48 2009
@@ -0,0 +1,206 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.lang.ref.WeakReference;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Collections;
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * This TokenFilter provides the ability to set aside attribute states
+ * that have already been analyzed. This is useful in situations where multiple fields share
+ * many common analysis steps and then go their separate ways.
+ * <p/>
+ * It is also useful for doing things like entity extraction or proper noun analysis as
+ * part of the analysis workflow and saving off those tokens for use in another field.
+ *
+ * <pre>
+TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
+TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+
+TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
+source2.addSinkTokenStream(sink1);
+source2.addSinkTokenStream(sink2);
+
+TokenStream final1 = new LowerCaseFilter(source1);
+TokenStream final2 = source2;
+TokenStream final3 = new EntityDetect(sink1);
+TokenStream final4 = new URLDetect(sink2);
+
+d.add(new Field("f1", final1));
+d.add(new Field("f2", final2));
+d.add(new Field("f3", final3));
+d.add(new Field("f4", final4));
+ * </pre>
+ * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
+ * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
+ * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ * It is important that tees are consumed before sinks (in the above example, the field names must be
+ * less than the sink's field names). If you are not sure which stream is consumed first, you can simply
+ * add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
+ * This TokenFilter is exhausted after that. To use this approach, change
+ * the example above to:
+ * <pre>
+...
+TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
+TokenStream final2 = source2.newSinkTokenStream();
+sink1.consumeAllTokens();
+sink2.consumeAllTokens();
+...
+ * </pre>
+ * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
+ * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
+ */
+public final class TeeSinkTokenFilter extends TokenFilter {
+ private final List sinks = new LinkedList();
+
+ /**
+ * Instantiates a new TeeSinkTokenFilter.
+ */
+ public TeeSinkTokenFilter(TokenStream input) {
+ super(input);
+ }
+
+ /**
+ * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
+ */
+ public SinkTokenStream newSinkTokenStream() {
+ return newSinkTokenStream(ACCEPT_ALL_FILTER);
+ }
+
+ /**
+ * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
+ * that pass the supplied filter.
+ * @see SinkFilter
+ */
+ public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
+ SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
+ this.sinks.add(new WeakReference(sink));
+ return sink;
+ }
+
+ /**
+ * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
+ * to this one. The supplied stream will also receive all consumed tokens.
+ * This method can be used to pass tokens from two different tees to one sink.
+ */
+ public void addSinkTokenStream(final SinkTokenStream sink) {
+ // check that sink has correct factory
+ if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
+ throw new IllegalArgumentException("The supplied sink is not compatible with this tee");
+ }
+ // add any missing attribute impls to the existing sink
+ for (Iterator it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
+ sink.addAttributeImpl((AttributeImpl) it.next());
+ }
+ this.sinks.add(new WeakReference(sink));
+ }
+
+ /**
+ * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
+ * when it is itself consumed. To make sure that all tokens from the input
+ * stream are passed to the sinks, you can call this method.
+ * This instance is exhausted after that, but all sinks are immediately available.
+ */
+ public void consumeAllTokens() throws IOException {
+ while (incrementToken());
+ }
+
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ // capture state lazily - maybe no SinkFilter accepts this state
+ AttributeSource.State state = null;
+ for (Iterator it = sinks.iterator(); it.hasNext(); ) {
+ final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get();
+ if (sink != null) {
+ if (sink.accept(this)) {
+ if (state == null) {
+ state = this.captureState();
+ }
+ sink.addState(state);
+ }
+ }
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * A filter that decides which of the tee's captured token states are passed on to a {@link SinkTokenStream}.
+ */
+ public static interface SinkFilter {
+ boolean accept(AttributeSource source);
+ }
+
+ public static final class SinkTokenStream extends TokenStream {
+ private final List cachedStates = new LinkedList();
+ private Iterator it = null;
+ private SinkFilter filter;
+
+ private SinkTokenStream(AttributeSource source, SinkFilter filter) {
+ super(source);
+ this.filter = filter;
+ }
+
+ private boolean accept(AttributeSource source) {
+ return filter.accept(source);
+ }
+
+ private void addState(AttributeSource.State state) {
+ if (it != null) {
+ throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
+ }
+ cachedStates.add(state);
+ }
+
+ public final boolean incrementToken() throws IOException {
+ // lazy init the iterator
+ if (it == null) {
+ it = cachedStates.iterator();
+ }
+
+ if (!it.hasNext()) {
+ return false;
+ }
+
+ AttributeSource.State state = (AttributeSource.State) it.next();
+ restoreState(state);
+ return true;
+ }
+
+ public final void reset() {
+ it = cachedStates.iterator();
+ }
+ }
+
+ private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
+ public boolean accept(AttributeSource source) {
+ return true;
+ }
+ };
+
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
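To complement the javadoc examples in the new TeeSinkTokenFilter class above (which reference the nonexistent EntityDetect/URLDetect streams), here is a minimal, self-contained sketch of one tee feeding one sink; the TeeSinkExample class name is illustrative only. The tee is drained with consumeAllTokens() before the sink is read, matching the ordering constraint described in the javadoc:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TeeSinkExample {
  public static void main(String[] args) throws IOException {
    TeeSinkTokenFilter tee =
        new TeeSinkTokenFilter(new WhitespaceTokenizer(new StringReader("alpha beta gamma")));
    TeeSinkTokenFilter.SinkTokenStream sink = tee.newSinkTokenStream();

    // Drain the tee first so every captured state is available in the sink.
    tee.consumeAllTokens();

    // Replay the captured tokens from the sink.
    TermAttribute termAtt = (TermAttribute) sink.addAttribute(TermAttribute.class);
    while (sink.incrementToken()) {
      System.out.println(termAtt.term());
    }
    tee.close();
  }
}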
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TeeTokenFilter.java Fri Jul 24 21:45:48 2009
@@ -18,7 +18,6 @@
package org.apache.lucene.analysis;
import java.io.IOException;
-import java.util.Iterator;
/**
@@ -30,8 +29,8 @@
* part of the analysis workflow and saving off those tokens for use in another field.
*
* <pre>
-SinkTokenizer sink1 = new SinkTokenizer(null);
-SinkTokenizer sink2 = new SinkTokenizer(null);
+SinkTokenizer sink1 = new SinkTokenizer();
+SinkTokenizer sink2 = new SinkTokenizer();
TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
@@ -46,14 +45,22 @@
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
* </pre>
- * In this example, sink1 and sink2 will both get tokens from both reader1 and reader2 after whitespace tokenizer
- and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
- Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
+ * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
+ * <code>reader1</code> and <code>reader2</code> after the whitespace tokenizer,
+ * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ * It is important that tees are consumed before sinks (in the above example, the field names must be
+ * less than the sink's field names).
+ * Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene
<p/>
*
- * See http://issues.apache.org/jira/browse/LUCENE-1058
+ * See <a href="http://issues.apache.org/jira/browse/LUCENE-1058">LUCENE-1058</a>.
+ * <p/>
+ * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API.
+ * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers
+ * the same functionality.
+
* @see SinkTokenizer
- *
+ * @deprecated Use {@link TeeSinkTokenFilter} instead
**/
public class TeeTokenFilter extends TokenFilter {
SinkTokenizer sink;
@@ -61,21 +68,8 @@
public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
super(input);
this.sink = sink;
- Iterator it = getAttributesIterator();
- while (it.hasNext()) {
- sink.addAttribute(it.next().getClass());
- }
}
- public boolean incrementToken() throws IOException {
- if (input.incrementToken()) {
- sink.add(captureState());
- return true;
- }
- return false;
- }
-
- /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Fri Jul 24 21:45:48 2009
@@ -17,14 +17,19 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
/**
- This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
- See Javadocs in {@link TokenStream} for further details.
- <p>
A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -44,11 +49,13 @@
{@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.
<br><br>
- <p><font color="#FF0000">
- WARNING: The status of the <b>Payloads</b> feature is experimental.
- The APIs introduced here might change in the future and will not be
- supported anymore in such a case.</font>
-
+
+ <p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
+ that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
+ Even though it is no longer necessary to use Token, with the new TokenStream API it can
+ be used as a convenience class that implements all {@link Attribute}s, which is especially useful
+ for easily switching from the old to the new TokenStream API.
+
<br><br>
<p><b>NOTE:</b> As of 2.3, Token stores the term text
@@ -118,10 +125,10 @@
</p>
@see org.apache.lucene.index.Payload
- @deprecated A new TokenStream API was introduced with Lucene 2.9.
- See javadocs in {@link TokenStream} for further details.
*/
-public class Token implements Cloneable {
+public class Token extends AttributeImpl
+ implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
+ FlagsAttribute, OffsetAttribute, PayloadAttribute {
public static final String DEFAULT_TYPE = "word";
@@ -134,7 +141,7 @@
/**
* Characters for the term text.
* @deprecated This will be made private. Instead, use:
- * {@link termBuffer()},
+ * {@link #termBuffer()},
* {@link #setTermBuffer(char[], int, int)},
* {@link #setTermBuffer(String)}, or
* {@link #setTermBuffer(String, int, int)}
@@ -144,28 +151,28 @@
/**
* Length of term text in the buffer.
* @deprecated This will be made private. Instead, use:
- * {@link termLength()}, or @{link setTermLength(int)}.
+ * {@link #termLength()}, or @{link setTermLength(int)}.
*/
int termLength;
/**
* Start in source text.
* @deprecated This will be made private. Instead, use:
- * {@link startOffset()}, or @{link setStartOffset(int)}.
+ * {@link #startOffset()}, or @{link setStartOffset(int)}.
*/
int startOffset;
/**
* End in source text.
* @deprecated This will be made private. Instead, use:
- * {@link endOffset()}, or @{link setEndOffset(int)}.
+ * {@link #endOffset()}, or @{link setEndOffset(int)}.
*/
int endOffset;
/**
* The lexical type of the token.
* @deprecated This will be made private. Instead, use:
- * {@link type()}, or @{link setType(String)}.
+ * {@link #type()}, or @{link setType(String)}.
*/
String type = DEFAULT_TYPE;
@@ -173,13 +180,13 @@
/**
* @deprecated This will be made private. Instead, use:
- * {@link getPayload()}, or @{link setPayload(Payload)}.
+ * {@link #getPayload()}, or @{link setPayload(Payload)}.
*/
Payload payload;
/**
* @deprecated This will be made private. Instead, use:
- * {@link getPositionIncrement()}, or @{link setPositionIncrement(String)}.
+ * {@link #getPositionIncrement()}, or @{link setPositionIncrement(String)}.
*/
int positionIncrement = 1;
@@ -561,6 +568,13 @@
public void setEndOffset(int offset) {
this.endOffset = offset;
}
+
+ /** Set the starting and ending offset.
+ @see #startOffset() and #endOffset()*/
+ public void setOffset(int startOffset, int endOffset) {
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ }
/** Returns this Token's lexical type. Defaults to "word". */
public final String type() {
@@ -640,19 +654,15 @@
}
public Object clone() {
- try {
- Token t = (Token)super.clone();
- // Do a deep clone
- if (termBuffer != null) {
- t.termBuffer = (char[]) termBuffer.clone();
- }
- if (payload != null) {
- t.setPayload((Payload) payload.clone());
- }
- return t;
- } catch (CloneNotSupportedException e) {
- throw new RuntimeException(e); // shouldn't happen
+ Token t = (Token)super.clone();
+ // Do a deep clone
+ if (termBuffer != null) {
+ t.termBuffer = (char[]) termBuffer.clone();
+ }
+ if (payload != null) {
+ t.setPayload((Payload) payload.clone());
}
+ return t;
}
/** Makes a clone, but replaces the term buffer &
@@ -862,4 +872,9 @@
type = prototype.type;
payload = prototype.payload;
}
+
+ public void copyTo(AttributeImpl target) {
+ Token to = (Token) target;
+ to.reinit(this);
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenFilter.java Fri Jul 24 21:45:48 2009
@@ -42,7 +42,7 @@
super(input);
this.input = input;
}
-
+
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
@@ -50,20 +50,6 @@
/** Reset the filter as well as the input TokenStream. */
public void reset() throws IOException {
- super.reset();
input.reset();
}
-
- public boolean useNewAPI() {
- return input.useNewAPI();
- }
-
- /**
- * Sets whether or not to use the new TokenStream API. Settings this
- * will apply to this Filter and all TokenStream/Filters upstream.
- */
- public void setUseNewAPI(boolean use) {
- input.setUseNewAPI(use);
- }
-
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenStream.java Fri Jul 24 21:45:48 2009
@@ -18,10 +18,15 @@
*/
import java.io.IOException;
-import java.util.Iterator;
-import org.apache.lucene.index.Payload;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/** A TokenStream enumerates the sequence of tokens, either from
@@ -36,13 +41,13 @@
</ul>
A new TokenStream API is introduced with Lucene 2.9. Since
2.9 Token is deprecated and the preferred way to store
- the information of a token is to use {@link Attribute}s.
+ the information of a token is to use {@link AttributeImpl}s.
<p>
For that reason TokenStream extends {@link AttributeSource}
- now. Note that only one instance per {@link Attribute} is
+ now. Note that only one instance per {@link AttributeImpl} is
created and reused for every token. This approach reduces
object creations and allows local caching of references to
- the {@link Attribute}s. See {@link #incrementToken()} for further details.
+ the {@link AttributeImpl}s. See {@link #incrementToken()} for further details.
<p>
<b>The workflow of the new TokenStream API is as follows:</b>
<ol>
@@ -60,19 +65,8 @@
<p>
Sometimes it is desirable to capture a current state of a
TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
- {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
- {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
- <p>
- <b>NOTE:</b> In order to enable the new API the method
- {@link #useNewAPI()} has to be called with useNewAPI=true.
- Otherwise the deprecated method {@link #next(Token)} will
- be used by Lucene consumers (indexer and queryparser) to
- consume the tokens. {@link #next(Token)} will be removed
- in Lucene 3.0.
- <p>
- NOTE: To use the old API subclasses must override {@link #next(Token)}.
- It's also OK to instead override {@link #next()} but that
- method is slower compared to {@link #next(Token)}.
+ {@link TeeSinkTokenFilter}). For this use case
+ {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} can be used.
* <p><font color="#FF0000">
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
@@ -80,110 +74,203 @@
*/
public abstract class TokenStream extends AttributeSource {
- private static boolean useNewAPIDefault = false;
- private boolean useNewAPI = useNewAPIDefault;
+
+ /** @deprecated Remove this when old API is removed! */
+ private static final AttributeFactory DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
+ = new TokenWrapperAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+
+ /** @deprecated Remove this when old API is removed! */
+ private static final Class[] METHOD_NO_PARAMS = new Class[0];
+
+ /** @deprecated Remove this when old API is removed! */
+ private static final Class[] METHOD_TOKEN_PARAM = new Class[]{Token.class};
+ /** @deprecated Remove this when old API is removed! */
+ private final TokenWrapper tokenWrapper;
+
+ /** @deprecated Remove this when old API is removed! */
+ private static boolean onlyUseNewAPI = false;
+
+ /** @deprecated Remove this when old API is removed! */
+ private final boolean
+ hasIncrementToken = isMethodOverridden("incrementToken", METHOD_NO_PARAMS),
+ hasReusableNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_TOKEN_PARAM),
+ hasNext = onlyUseNewAPI ? false : isMethodOverridden("next", METHOD_NO_PARAMS);
+
+ /** @deprecated Remove this when old API is removed! */
+ private boolean isMethodOverridden(String name, Class[] params) {
+ try {
+ return this.getClass().getMethod(name, params).getDeclaringClass() != TokenStream.class;
+ } catch (NoSuchMethodException e) {
+ // should not happen
+ throw new RuntimeException(e);
+ }
+ }
+
+ /** @deprecated Remove this when old API is removed! */
+ private static final class TokenWrapperAttributeFactory extends AttributeFactory {
+ private final AttributeFactory delegate;
+
+ private TokenWrapperAttributeFactory(AttributeFactory delegate) {
+ this.delegate = delegate;
+ }
+
+ public AttributeImpl createAttributeInstance(Class attClass) {
+ return attClass.isAssignableFrom(TokenWrapper.class)
+ ? new TokenWrapper()
+ : delegate.createAttributeInstance(attClass);
+ }
+
+ // this is needed for TeeSinkTokenStream's check for compatibility of AttributeSource,
+ // so two TokenStreams using old API have the same AttributeFactory wrapped by this one.
+ public boolean equals(Object other) {
+ if (this == other) return true;
+ if (other instanceof TokenWrapperAttributeFactory) {
+ final TokenWrapperAttributeFactory af = (TokenWrapperAttributeFactory) other;
+ return this.delegate.equals(af.delegate);
+ }
+ return false;
+ }
+ }
+
+ /**
+ * A TokenStream using the default attribute factory.
+ */
protected TokenStream() {
- super();
+ super(onlyUseNewAPI
+ ? AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY
+ : TokenStream.DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY
+ );
+ tokenWrapper = initTokenWrapper(null);
+ check();
}
+ /**
+ * A TokenStream that uses the same attributes as the supplied one.
+ */
protected TokenStream(AttributeSource input) {
super(input);
+ tokenWrapper = initTokenWrapper(input);
+ check();
}
-
+
/**
- * Returns whether or not the new TokenStream APIs are used
- * by default.
- * (see {@link #incrementToken()}, {@link AttributeSource}).
+ * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
*/
- public static boolean useNewAPIDefault() {
- return useNewAPIDefault;
+ protected TokenStream(AttributeFactory factory) {
+ super(onlyUseNewAPI
+ ? factory
+ : new TokenWrapperAttributeFactory(factory)
+ );
+ tokenWrapper = initTokenWrapper(null);
+ check();
}
+ /** @deprecated Remove this when old API is removed! */
+ private TokenWrapper initTokenWrapper(AttributeSource input) {
+ if (onlyUseNewAPI) {
+ // no wrapper needed
+ return null;
+ } else {
+ // if possible get the wrapper from the filter's input stream
+ if (input instanceof TokenStream && ((TokenStream) input).tokenWrapper != null) {
+ return ((TokenStream) input).tokenWrapper;
+ }
+ // check that all attributes are implemented by the same TokenWrapper instance
+ final AttributeImpl att = addAttribute(TermAttribute.class);
+ if (att instanceof TokenWrapper &&
+ addAttribute(TypeAttribute.class) == att &&
+ addAttribute(PositionIncrementAttribute.class) == att &&
+ addAttribute(FlagsAttribute.class) == att &&
+ addAttribute(OffsetAttribute.class) == att &&
+ addAttribute(PayloadAttribute.class) == att
+ ) {
+ return (TokenWrapper) att;
+ } else {
+ throw new UnsupportedOperationException(
+ "If onlyUseNewAPI is disabled, all basic Attributes must be implemented by the internal class "+
+ "TokenWrapper. Please make sure, that all TokenStreams/TokenFilters in this chain have been "+
+ "instantiated with this flag disabled and do not add any custom instances for the basic Attributes!"
+ );
+ }
+ }
+ }
+
+ /** @deprecated Remove this when old API is removed! */
+ private void check() {
+ if (onlyUseNewAPI && !hasIncrementToken) {
+ throw new UnsupportedOperationException(getClass().getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI.");
+ }
+
+ // a TokenStream subclass must at least implement one of the methods!
+ if (!(hasIncrementToken || hasNext || hasReusableNext)) {
+ throw new UnsupportedOperationException(getClass().getName()+" does not implement any of incrementToken(), next(Token), next().");
+ }
+ }
+
/**
- * Use this API to enable or disable the new TokenStream API.
- * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
- * (see {@link #incrementToken()}, {@link AttributeSource}).
- * <p>
- * If set to true, the indexer will call {@link #incrementToken()}
- * to consume Tokens from this stream.
- * <p>
- * If set to false, the indexer will call {@link #next(Token)}
- * instead.
+ * For extra performance you can globally enable the new {@link #incrementToken}
+ * API using {@link Attribute}s. There will be a small, but in most cases negligible, performance
+ * increase by enabling this, but it only works if <b>all</b> TokenStreams and -Filters
+ * use the new API and implement {@link #incrementToken}. This setting can only be enabled
+ * globally.
+ * <P>This setting only affects TokenStreams instantiated after this call. All TokenStreams
+ * already created use the other setting.
+ * <P>All core analyzers are compatible with this setting; if your own
+ * TokenStreams/-Filters are also compatible, you can enable this.
+ * <P>When enabled, tokenization may throw {@link UnsupportedOperationException}s
+ * if the whole tokenizer chain is not compatible.
+ * <P>The default is <code>false</code>, so the fallback to the old API is available.
+ * @deprecated This setting will be <code>true</code> by default in Lucene 3.0,
+ * when {@link #incrementToken} is abstract and must always be implemented.
*/
- public static void setUseNewAPIDefault(boolean use) {
- useNewAPIDefault = use;
+ public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) {
+ TokenStream.onlyUseNewAPI = onlyUseNewAPI;
}
- /**
- * Returns whether or not the new TokenStream APIs are used
- * for this stream.
- * (see {@link #incrementToken()}, {@link AttributeSource}).
+ /** Returns if only the new API is used.
+ * @see #setOnlyUseNewAPI
+ * @deprecated This setting will be <code>true</code> by default in Lucene 3.0,
+ * when {@link #incrementToken} is abstract and must always be implemented.
*/
- public boolean useNewAPI() {
- return useNewAPI;
+ public static boolean getOnlyUseNewAPI() {
+ return onlyUseNewAPI;
}
-
+
/**
- * Use this API to enable or disable the new TokenStream API
- * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
- * (see {@link #incrementToken()}, {@link AttributeSource}).
+ * Consumers (e. g. the indexer) use this method to advance the stream
+ * to the next token. Implementing classes must implement this method
+ * and update the appropriate {@link AttributeImpl}s with content of the
+ * next token.
* <p>
- * If set to true, the indexer will call {@link #incrementToken()}
- * to consume Tokens from this stream.
+ * This method is called for every token of a document, so an efficient
+ * implementation is crucial for good performance. To avoid calls to
+ * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+ * downcasts, references to all {@link AttributeImpl}s that this stream uses
+ * should be retrieved during instantiation.
* <p>
- * If set to false, the indexer will call {@link #next(Token)}
- * instead.
- * <p>
- * <b>NOTE: All streams and filters in one chain must use the
- * same API. </b>
- */
- public void setUseNewAPI(boolean use) {
- useNewAPI = use;
- }
-
- /**
- * Consumers (e. g. the indexer) use this method to advance the stream
- * to the next token. Implementing classes must implement this method
- * and update the appropriate {@link Attribute}s with content of the
- * next token.
- * <p>
- * This method is called for every token of a document, so an efficient
- * implementation is crucial for good performance. To avoid calls to
- * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
- * downcasts, references to all {@link Attribute}s that this stream uses
- * should be retrieved during instantiation.
- * <p>
- * To make sure that filters and consumers know which attributes are available
+ * To make sure that filters and consumers know which attributes are available
* the attributes must be added during instantiation. Filters and
* consumers are not required to check for availability of attributes in {@link #incrementToken()}.
- *
- * @return false for end of stream; true otherwise
- *
- * <p>
- * <b>Note that this method will be defined abstract in Lucene 3.0.</b>
- */
- public boolean incrementToken() throws IOException {
- // subclasses must implement this method; will be made abstract in Lucene 3.0
- return false;
- }
-
- /** Returns the next token in the stream, or null at EOS.
- * @deprecated The returned Token is a "full private copy" (not
- * re-used across calls to next()) but will be slower
- * than calling {@link #next(Token)} instead.. */
- public Token next() throws IOException {
- final Token reusableToken = new Token();
- Token nextToken = next(reusableToken);
-
- if (nextToken != null) {
- Payload p = nextToken.getPayload();
- if (p != null) {
- nextToken.setPayload((Payload) p.clone());
- }
+ *
+ * @return false for end of stream; true otherwise
+ *
+ * <p>
+ * <b>Note that this method will be defined abstract in Lucene 3.0.</b>
+ */
+ public boolean incrementToken() throws IOException {
+ assert !onlyUseNewAPI && tokenWrapper != null;
+
+ final Token token;
+ if (hasReusableNext) {
+ token = next(tokenWrapper.delegate);
+ } else {
+ assert hasNext;
+ token = next();
}
-
- return nextToken;
+ if (token == null) return false;
+ tokenWrapper.delegate = token;
+ return true;
}
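
A minimal sketch (not part of this patch) of a consumer driving a stream through this method, with attribute references fetched once up front as the javadoc above recommends; the Analyzer, the text, the field name and the imports (TermAttribute, OffsetAttribute, StringReader) are assumed:

  void consume(Analyzer analyzer, String text) throws IOException {
    // Sketch only: "content" and the printed format are arbitrary.
    TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(termAtt.term() + " " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
    stream.close();
  }
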
/** Returns the next token in the stream, or null at EOS.
@@ -215,12 +302,46 @@
* good idea to assert that it is not null.)
* @return next token in the stream or null if end-of-stream was hit
* @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
- * APIs should be used instead. See also {@link #useNewAPI()}.
+ * APIs should be used instead.
*/
public Token next(final Token reusableToken) throws IOException {
- // We don't actually use inputToken, but still add this assert
assert reusableToken != null;
- return next();
+
+ if (onlyUseNewAPI)
+ throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");
+
+ if (hasIncrementToken) {
+ tokenWrapper.delegate = reusableToken;
+ return incrementToken() ? tokenWrapper.delegate : null;
+ } else {
+ assert hasNext;
+ final Token token = next();
+ if (token == null) return null;
+ tokenWrapper.delegate = token;
+ return token;
+ }
+ }
+
+ /** Returns the next token in the stream, or null at EOS.
+ * @deprecated The returned Token is a "full private copy" (not
+ * re-used across calls to next()) but will be slower
+ * than calling {@link #next(Token)} or using the new
+ * {@link #incrementToken()} method with the new
+ * {@link AttributeSource} API.
+ */
+ public Token next() throws IOException {
+ if (onlyUseNewAPI)
+ throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API.");
+
+ if (hasIncrementToken) {
+ return incrementToken() ? ((Token) tokenWrapper.delegate.clone()) : null;
+ } else {
+ assert hasReusableNext;
+ final Token token = next(tokenWrapper.delegate);
+ if (token == null) return null;
+ tokenWrapper.delegate = token;
+ return (Token) token.clone();
+ }
}
/** Resets this stream to the beginning. This is an
@@ -240,24 +361,4 @@
/** Releases resources associated with this stream. */
public void close() throws IOException {}
- public String toString() {
- StringBuffer sb = new StringBuffer();
- sb.append('(');
-
- if (hasAttributes()) {
- // TODO Java 1.5
- //Iterator<Attribute> it = attributes.values().iterator();
- Iterator it = getAttributesIterator();
- if (it.hasNext()) {
- sb.append(it.next().toString());
- }
- while (it.hasNext()) {
- sb.append(',');
- sb.append(it.next().toString());
- }
- }
- sb.append(')');
- return sb.toString();
- }
-
}
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java?rev=797665&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java Fri Jul 24 21:45:48 2009
@@ -0,0 +1,163 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * This class wraps a Token and supplies a single attribute instance
+ * where the delegate token can be replaced.
+ * @deprecated Will be removed, when old TokenStream API is removed.
+ */
+final class TokenWrapper extends AttributeImpl
+ implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute,
+ FlagsAttribute, OffsetAttribute, PayloadAttribute {
+
+ Token delegate;
+
+ TokenWrapper() {
+ this(new Token());
+ }
+
+ TokenWrapper(Token delegate) {
+ this.delegate = delegate;
+ }
+
+ // TermAttribute:
+
+ public String term() {
+ return delegate.term();
+ }
+
+ public void setTermBuffer(char[] buffer, int offset, int length) {
+ delegate.setTermBuffer(buffer, offset, length);
+ }
+
+ public void setTermBuffer(String buffer) {
+ delegate.setTermBuffer(buffer);
+ }
+
+ public void setTermBuffer(String buffer, int offset, int length) {
+ delegate.setTermBuffer(buffer, offset, length);
+ }
+
+ public char[] termBuffer() {
+ return delegate.termBuffer();
+ }
+
+ public char[] resizeTermBuffer(int newSize) {
+ return delegate.resizeTermBuffer(newSize);
+ }
+
+ public int termLength() {
+ return delegate.termLength();
+ }
+
+ public void setTermLength(int length) {
+ delegate.setTermLength(length);
+ }
+
+ // TypeAttribute:
+
+ public String type() {
+ return delegate.type();
+ }
+
+ public void setType(String type) {
+ delegate.setType(type);
+ }
+
+ public void setPositionIncrement(int positionIncrement) {
+ delegate.setPositionIncrement(positionIncrement);
+ }
+
+ public int getPositionIncrement() {
+ return delegate.getPositionIncrement();
+ }
+
+ // FlagsAttribute
+
+ public int getFlags() {
+ return delegate.getFlags();
+ }
+
+ public void setFlags(int flags) {
+ delegate.setFlags(flags);
+ }
+
+ // OffsetAttribute
+
+ public int startOffset() {
+ return delegate.startOffset();
+ }
+
+ public void setOffset(int startOffset, int endOffset) {
+ delegate.setOffset(startOffset, endOffset);
+ }
+
+ public int endOffset() {
+ return delegate.endOffset();
+ }
+
+ // PayloadAttribute
+ public Payload getPayload() {
+ return delegate.getPayload();
+ }
+
+ public void setPayload(Payload payload) {
+ delegate.setPayload(payload);
+ }
+
+ // TokenAttribute
+
+ public void clear() {
+ delegate.clear();
+ }
+
+ // AttributeImpl
+
+ public String toString() {
+ return delegate.toString();
+ }
+
+ public int hashCode() {
+ return delegate.hashCode();
+ }
+
+ public boolean equals(Object other) {
+ if (other instanceof TokenWrapper) {
+ return ((TokenWrapper) other).delegate.equals(this.delegate);
+ }
+ return false;
+ }
+
+ public Object clone() {
+ return new TokenWrapper((Token) delegate.clone());
+ }
+
+ public void copyTo(AttributeImpl target) {
+ ((TokenWrapper) target).delegate = (Token) this.delegate.clone();
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Fri Jul 24 21:45:48 2009
@@ -24,17 +24,10 @@
<p>
This is an abstract class.
<p>
- <b>NOTE:</b> In order to enable the new API the method
- {@link #useNewAPI()} has to be called with useNewAPI=true.
- Otherwise the deprecated method {@link #next(Token)} will
- be used by Lucene consumers (indexer and queryparser) to
- consume the tokens. {@link #next(Token)} will be removed
- in Lucene 3.0.
- <p>
NOTE: To use the old API subclasses must override {@link #next(Token)}.
It's also OK to instead override {@link #next()} but that
method is slower compared to {@link #next(Token)}.
- <p>
+ <p>
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
* <p><font color="#FF0000">
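
A minimal sketch (not part of this patch) of what the two notes above amount to for an old-API tokenizer: next(Token) is overridden and clear() is called on the reusable token before it is filled. The method would live in a hypothetical Tokenizer subclass; offsets are omitted for brevity.

  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    reusableToken.clear();                        // required, see the note above
    int c = input.read();                         // "input" is inherited from Tokenizer
    while (c != -1 && Character.isWhitespace((char) c)) {
      c = input.read();                           // skip leading whitespace
    }
    if (c == -1) return null;                     // end of stream
    char[] buffer = reusableToken.termBuffer();
    int length = 0;
    while (c != -1 && !Character.isWhitespace((char) c)) {
      if (length == buffer.length) {
        buffer = reusableToken.resizeTermBuffer(length + 1);
      }
      buffer[length++] = (char) c;
      c = input.read();
    }
    reusableToken.setTermLength(length);
    return reusableToken;
  }
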
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html Fri Jul 24 21:45:48 2009
@@ -442,57 +442,73 @@
in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup or downcasting
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
+
<h4>Adding a custom Attribute</h4>
Now we're going to implement our own custom Attribute for part-of-speech tagging and call it consequently
-<code>PartOfSpeechAttribute</code>:
+<code>PartOfSpeechAttribute</code>. First we need to define the interface of the new Attribute:
<pre>
- public static enum PartOfSpeech {
- Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
+ public interface PartOfSpeechAttribute extends Attribute {
+ public static enum PartOfSpeech {
+ Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
+ }
+
+ public void setPartOfSpeech(PartOfSpeech pos);
+
+ public PartOfSpeech getPartOfSpeech();
}
+</pre>
+
+Now we also need to write the implementing class. The name of that class matters: by default, Lucene
+looks for a class whose name is the name of the Attribute interface with the suffix 'Impl'. In this example, we
+therefore call the implementing class <code>PartOfSpeechAttributeImpl</code>. <br/>
+This is the usual behavior. However, there is also an expert API that allows changing this naming convention:
+{@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument
+and returns an actual instance; you can implement your own factory if you need different behavior
+(a sketch of such a factory appears after the implementation class below). <br/><br/>
+
+Now here is the actual class that implements our new Attribute. Notice that the class has to extend
+{@link org.apache.lucene.util.AttributeImpl}:
+
+<pre>
+public final class PartOfSpeechAttributeImpl extends AttributeImpl
+ implements PartOfSpeechAttribute{
- public static final class PartOfSpeechAttribute extends Attribute {
-
- private PartOfSpeech pos = PartOfSpeech.Unknown;
-
- public void setPartOfSpeech(PartOfSpeech pos) {
- this.pos = pos;
- }
-
- public PartOfSpeech getPartOfSpeech() {
- return pos;
- }
+ private PartOfSpeech pos = PartOfSpeech.Unknown;
+
+ public void setPartOfSpeech(PartOfSpeech pos) {
+ this.pos = pos;
+ }
+
+ public PartOfSpeech getPartOfSpeech() {
+ return pos;
+ }
- public void clear() {
- pos = PartOfSpeech.Unknown;
- }
+ public void clear() {
+ pos = PartOfSpeech.Unknown;
+ }
- public void copyTo(Attribute target) {
- ((PartOfSpeechAttribute) target).pos = pos;
- }
+ public void copyTo(AttributeImpl target) {
+ ((PartOfSpeechAttributeImpl) target).pos = pos;
+ }
- public boolean equals(Object other) {
- if (other == this) {
- return true;
- }
-
- if (other instanceof PartOfSpeechAttribute) {
- return pos == ((PartOfSpeechAttribute) other).pos;
- }
-
- return false;
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
}
-
- public int hashCode() {
- return pos.ordinal();
+
+ if (other instanceof PartOfSpeechAttributeImpl) {
+ return pos == ((PartOfSpeechAttributeImpl) other).pos;
}
+
+ return false;
+ }
- public String toString() {
- return "PartOfSpeech=" + pos;
- }
+ public int hashCode() {
+ return pos.ordinal();
}
+}
</pre>
-This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
-new <code>Attribute</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode(), toString()</code>.
+This is a simple Attribute implementation that has only a single variable storing the part-of-speech of a token. It extends the
+new <code>AttributeImpl</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode()</code>.
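
As promised above, here is only a sketch of a custom factory, assuming the expert
{@link org.apache.lucene.util.AttributeSource.AttributeFactory} API with a <code>createAttributeInstance(Class)</code>
method and a <code>DEFAULT_ATTRIBUTE_FACTORY</code> constant; it maps our interface to its implementation
explicitly and delegates everything else to the default naming convention:
<pre>
public final class PartOfSpeechAttributeFactory extends AttributeSource.AttributeFactory {
  // Sketch only: explicit mapping for our attribute, default behavior otherwise.
  public AttributeImpl createAttributeInstance(Class attClass) {
    if (attClass == PartOfSpeechAttribute.class) {
      return new PartOfSpeechAttributeImpl();
    }
    return AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(attClass);
  }
}
</pre>
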
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
<pre>
@@ -523,7 +539,9 @@
}
</pre>
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
-stores references in instance variables. Now we need to add the filter to the chain:
+stores references in instance variables. Notice how you only need to pass in the interface of the new
+Attribute; instantiating the correct implementing class is taken care of automatically.
+Now we need to add the filter to the chain:
<pre>
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream stream = new WhitespaceTokenizer(reader);
@@ -582,7 +600,8 @@
as nouns if not the first word of a sentence (we know, this is still not a correct behavior, but hey, it's a good exercise).
As a small hint, this is how the new Attribute class could begin:
<pre>
- public class FirstTokenOfSentenceAttribute extends Attribute {
+ public class FirstTokenOfSentenceAttributeImpl extends AttributeImpl
+ implements FirstTokenOfSentenceAttribute {
private boolean firstToken;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java Fri Jul 24 21:45:48 2009
@@ -73,39 +73,4 @@
return true;
}
-
- /** Returns the next token in the stream, or null at EOS.
- * <p>Removes <tt>'s</tt> from the end of words.
- * <p>Removes dots from acronyms.
- * @deprecated
- */
- public final Token next(final Token reusableToken) throws java.io.IOException {
- assert reusableToken != null;
- Token nextToken = input.next(reusableToken);
-
- if (nextToken == null)
- return null;
-
- char[] buffer = nextToken.termBuffer();
- final int bufferLength = nextToken.termLength();
- final String type = nextToken.type();
-
- if (type == APOSTROPHE_TYPE && // remove 's
- bufferLength >= 2 &&
- buffer[bufferLength-2] == '\'' &&
- (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
- // Strip last 2 characters off
- nextToken.setTermLength(bufferLength - 2);
- } else if (type == ACRONYM_TYPE) { // remove dots
- int upto = 0;
- for(int i=0;i<bufferLength;i++) {
- char c = buffer[i];
- if (c != '.')
- buffer[upto++] = c;
- }
- nextToken.setTermLength(upto);
- }
-
- return nextToken;
- }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Jul 24 21:45:48 2009
@@ -147,7 +147,7 @@
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
- public boolean incrementToken() throws IOException {
+ public final boolean incrementToken() throws IOException {
int posIncr = 1;
while(true) {
@@ -183,66 +183,33 @@
posIncr++;
}
}
-
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next(final Token reusableToken) throws IOException {
+ return super.next(reusableToken);
+ }
+
+ /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+ * not be overridden. Delegates to the backwards compatibility layer. */
+ public final Token next() throws IOException {
+ return super.next();
+ }
+
/*
* (non-Javadoc)
*
- * @see org.apache.lucene.analysis.TokenStream#next()
+ * @see org.apache.lucene.analysis.TokenStream#reset()
*/
- /** @deprecated */
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- int posIncr = 1;
-
- while(true) {
- int tokenType = scanner.getNextToken();
-
- if (tokenType == StandardTokenizerImpl.YYEOF) {
- return null;
- }
-
- if (scanner.yylength() <= maxTokenLength) {
- reusableToken.clear();
- reusableToken.setPositionIncrement(posIncr);
- scanner.getText(reusableToken);
- final int start = scanner.yychar();
- reusableToken.setStartOffset(input.correctOffset(start));
- reusableToken.setEndOffset(input.correctOffset(start+reusableToken.termLength()));
- // This 'if' should be removed in the next release. For now, it converts
- // invalid acronyms to HOST. When removed, only the 'else' part should
- // remain.
- if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
- if (replaceInvalidAcronym) {
- reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
- reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
- } else {
- reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
- }
- } else {
- reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
- }
- return reusableToken;
- } else
- // When we skip a too-long term, we still increment the
- // position increment
- posIncr++;
- }
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#reset()
- */
- public void reset() throws IOException {
- super.reset();
- scanner.yyreset(input);
- }
+ public void reset() throws IOException {
+ super.reset();
+ scanner.yyreset(input);
+ }
- public void reset(Reader reader) throws IOException {
- setInput(reader);
- reset();
- }
+ public void reset(Reader reader) throws IOException {
+ setInput(reader);
+ reset();
+ }
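
A minimal sketch (not part of this patch) of reusing one tokenizer instance across several texts via reset(Reader); the String array "docs", the constructor choice and the TermAttribute/StringReader imports are assumptions:

  // Sketch only: one StandardTokenizer instance, reset per document.
  StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(""));
  TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
  for (int i = 0; i < docs.length; i++) {
    tokenizer.reset(new StringReader(docs[i]));
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.term());
    }
  }
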
/**
* Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java?rev=797665&r1=797664&r2=797665&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java Fri Jul 24 21:45:48 2009
@@ -17,8 +17,6 @@
* limitations under the License.
*/
-import java.io.Serializable;
-
import org.apache.lucene.util.Attribute;
/**
@@ -31,9 +29,7 @@
* We will make our best efforts to keep the APIs backwards-compatible.</font>
*/
-public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
- private int flags = 0;
-
+public interface FlagsAttribute extends Attribute {
/**
* EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
* <p/>
@@ -44,43 +40,10 @@
*
* @return The bits
*/
- public int getFlags() {
- return flags;
- }
+ public int getFlags();
/**
* @see #getFlags()
*/
- public void setFlags(int flags) {
- this.flags = flags;
- }
-
- public void clear() {
- flags = 0;
- }
-
- public String toString() {
- return "flags=" + flags;
- }
-
- public boolean equals(Object other) {
- if (this == other) {
- return true;
- }
-
- if (other instanceof FlagsAttribute) {
- return ((FlagsAttribute) other).flags == flags;
- }
-
- return false;
- }
-
- public int hashCode() {
- return flags;
- }
-
- public void copyTo(Attribute target) {
- FlagsAttribute t = (FlagsAttribute) target;
- t.setFlags(flags);
- }
+ public void setFlags(int flags);
}
Added: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java?rev=797665&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java Fri Jul 24 21:45:48 2009
@@ -0,0 +1,82 @@
+package org.apache.lucene.analysis.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * This attribute can be used to pass different flags down the tokenizer chain,
+ * e.g. from one TokenFilter to another.
+ *
+ * <p><font color="#FF0000">
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.</font>
+
+ */
+public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable {
+ private int flags = 0;
+
+ /**
+ * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
+ * <p/>
+ *
+ * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
+ * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
+ *
+ *
+ * @return The bits
+ */
+ public int getFlags() {
+ return flags;
+ }
+
+ /**
+ * @see #getFlags()
+ */
+ public void setFlags(int flags) {
+ this.flags = flags;
+ }
+
+ public void clear() {
+ flags = 0;
+ }
+
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ }
+
+ if (other instanceof FlagsAttributeImpl) {
+ return ((FlagsAttributeImpl) other).flags == flags;
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ return flags;
+ }
+
+ public void copyTo(AttributeImpl target) {
+ FlagsAttribute t = (FlagsAttribute) target;
+ t.setFlags(flags);
+ }
+}
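
A minimal sketch (not part of this patch) of how this attribute could be used to pass information down the chain: a hypothetical filter marks unusually long terms with a flag bit that a later stage can inspect. The filter name, the flag value and the length condition are made up for illustration.

package org.apache.lucene.analysis.tokenattributes;  // sketch only: placement is arbitrary

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public final class MarkLongTokensFilter extends TokenFilter {
  public static final int LONG_TOKEN_FLAG = 1;         // arbitrary bit for illustration
  private final TermAttribute termAtt;
  private final FlagsAttribute flagsAtt;

  public MarkLongTokensFilter(TokenStream input) {
    super(input);
    // fetch attribute references once, during instantiation
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
  }

  public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    if (termAtt.termLength() > 10) {                    // arbitrary condition
      flagsAtt.setFlags(flagsAtt.getFlags() | LONG_TOKEN_FLAG);
    }
    return true;
  }
}

A downstream filter or consumer would fetch the same FlagsAttribute from the chain and test, for example, (flagsAtt.getFlags() & MarkLongTokensFilter.LONG_TOKEN_FLAG) != 0.
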
Propchange: lucene/java/trunk/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java
------------------------------------------------------------------------------
svn:eol-style = native