You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by us...@apache.org on 2009/08/15 00:01:43 UTC
svn commit: r804392 - in /lucene/java/trunk: ./
contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneou...
Author: uschindler
Date: Fri Aug 14 22:01:42 2009
New Revision: 804392
URL: http://svn.apache.org/viewvc?rev=804392&view=rev
Log:
LUCENE-1801: All Tokenizers/TokenStreams that are a source of tokens now call AttributeSource.clearAttributes() first. Made Token.clear() consistent with AttributeImpl (clear everything)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java
lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Aug 14 22:01:42 2009
@@ -171,6 +171,13 @@
reusableTokenStream. This is now fixed, such that if
reusableTokenStream is invoked on such a subclass, that method
will forcefully fallback to tokenStream. (Mike McCandless)
+
+12. LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear
+ startOffset, endOffset and type. This should normally affect no
+ Tokenizer chains, as Tokenizers normally always set these three values.
+ This change was made to conform to the new AttributeImpl.clear() and
+ to make AttributeSource.clearAttributes() work identically for Token (one
+ AttributeImpl for all attributes) and for the 6 separate AttributeImpls.
+ (Uwe Schindler, Michael Busch)
API Changes
@@ -468,6 +475,10 @@
22. LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(),
although it does allow it in set(Object). Fix get() to not assert the object
is not null. (Shai Erera via Mike McCandless)
+
+23. LUCENE-1801: Changed all Tokenizers or TokenStreams (in core/contrib)
+ that are the source of Tokens to always call
+ AttributeSource.clearAttributes() first. (Uwe Schindler)
New features
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Fri Aug 14 22:01:42 2009
@@ -123,6 +123,7 @@
*
*/
public boolean incrementToken() throws IOException {
+ clearAttributes();
/** how many character(s) has been stored in buffer */
while(true) { // loop until we find a non-empty token
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java Fri Aug 14 22:01:42 2009
@@ -96,6 +96,7 @@
}
public boolean incrementToken() throws IOException {
+ clearAttributes();
length = 0;
start = offset;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java Fri Aug 14 22:01:42 2009
@@ -64,6 +64,7 @@
Token clone = (Token) singleToken.clone();
+ clearAttributes();
termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
flagsAtt.setFlags(clone.getFlags());
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Fri Aug 14 22:01:42 2009
@@ -123,6 +123,7 @@
/** Returns the next token in the stream, or null at EOS. */
public final boolean incrementToken() throws IOException {
+ clearAttributes();
// if we are just starting, read the whole input
if (!started) {
started = true;
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Aug 14 22:01:42 2009
@@ -72,6 +72,7 @@
/** Returns the next token in the stream, or null at EOS. */
public final boolean incrementToken() throws IOException {
+ clearAttributes();
if (!started) {
started = true;
gramSize = minGram;
Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Fri Aug 14 22:01:42 2009
@@ -54,6 +54,7 @@
}
public boolean incrementToken() throws IOException {
+ clearAttributes();
buffer.setLength(0);
int ci;
char ch, pch;
Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java Fri Aug 14 22:01:42 2009
@@ -343,7 +343,7 @@
public final boolean incrementToken() {
if (matcher == null) return false;
-
+ clearAttributes();
while (true) { // loop takes care of leading and trailing boundary cases
int start = pos;
int end;
@@ -401,6 +401,7 @@
}
public boolean incrementToken() {
+ clearAttributes();
// cache loop instance vars (performance)
String s = str;
int len = s.length();
Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Fri Aug 14 22:01:42 2009
@@ -184,6 +184,7 @@
restoreState(state);
return true;
}
+ clearAttributes();
int tokenType = scanner.getNextToken();
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Aug 14 22:01:42 2009
@@ -53,9 +53,9 @@
}
public final boolean incrementToken() throws IOException {
+ clearAttributes();
int length = 0;
int start = bufferIndex;
- termAtt.clear();
char[] buffer = termAtt.termBuffer();
while (true) {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Fri Aug 14 22:01:42 2009
@@ -49,6 +49,7 @@
public final boolean incrementToken() throws IOException {
if (!done) {
+ clearAttributes();
done = true;
int upto = 0;
char[] buffer = termAtt.termBuffer();
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java Fri Aug 14 22:01:42 2009
@@ -184,6 +184,7 @@
if (shift >= valSize)
return false;
+ clearAttributes();
final char[] buffer;
switch (valSize) {
case 64:
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Fri Aug 14 22:01:42 2009
@@ -117,7 +117,7 @@
</ul>
A few things to note:
<ul>
- <li>clear() initializes most of the fields to default values, but not startOffset, endOffset and type.</li>
+ <li>clear() initializes all of the fields to default values. This differs from Lucene 2.4, where startOffset, endOffset and type were not reset, but the change should affect no one.</li>
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
<li>The startOffset and endOffset represent the start and offset in the source text. So be careful in adjusting them.</li>
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
@@ -622,9 +622,9 @@
return sb.toString();
}
- /** Resets the term text, payload, flags, and positionIncrement to default.
- * Other fields such as startOffset, endOffset and the token type are
- * not reset since they are normally overwritten by the tokenizer. */
+ /** Resets the term text, payload, flags, positionIncrement,
+ * startOffset, endOffset and token type to their defaults.
+ */
public void clear() {
payload = null;
// Leave termBuffer to allow re-use
@@ -632,8 +632,8 @@
termText = null;
positionIncrement = 1;
flags = 0;
- // startOffset = endOffset = 0;
- // type = DEFAULT_TYPE;
+ startOffset = endOffset = 0;
+ type = DEFAULT_TYPE;
}
public Object clone() {
@@ -715,6 +715,8 @@
payload = null;
positionIncrement = 1;
flags = 0;
+ startOffset = endOffset = 0;
+ type = DEFAULT_TYPE;
}
/** Shorthand for calling {@link #clear},
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java Fri Aug 14 22:01:42 2009
@@ -122,6 +122,7 @@
}
// PayloadAttribute
+
public Payload getPayload() {
return delegate.getPayload();
}
@@ -130,14 +131,12 @@
delegate.setPayload(payload);
}
- // TokenAttribute
-
+ // AttributeImpl
+
public void clear() {
delegate.clear();
}
- // AttributeImpl
-
public String toString() {
return delegate.toString();
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Fri Aug 14 22:01:42 2009
@@ -26,12 +26,16 @@
<p>
This is an abstract class.
<p>
- NOTE: To use the old API subclasses must override {@link #next(Token)}.
- It's also OK to instead override {@link #next()} but that
- method is slower compared to {@link #next(Token)}.
+ NOTE: subclasses must override
+ {@link #incrementToken()} if the new TokenStream API is used
+ and {@link #next(Token)} or {@link #next()} if the old
+ TokenStream API is used.
<p>
- NOTE: subclasses overriding {@link #next(Token)} must
- call {@link Token#clear()}.
+ NOTE: Subclasses overriding {@link #incrementToken()} must
+ call {@link AttributeSource#clearAttributes()} before
+ setting attributes.
+ Subclasses overriding {@link #next(Token)} must call
+ {@link Token#clear()} before setting Token attributes.
*/
public abstract class Tokenizer extends TokenStream {
@@ -85,6 +89,9 @@
this.input = CharReader.get(input);
}
+ /** Expert: Reset the tokenizer to a new CharStream. Typically, an
+ * analyzer (in its reusableTokenStream method) will use
+ * this to re-use a previously created tokenizer. */
public void reset(CharStream input) throws IOException {
this.input = input;
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Aug 14 22:01:42 2009
@@ -148,6 +148,7 @@
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public final boolean incrementToken() throws IOException {
+ clearAttributes();
int posIncr = 1;
while(true) {
Modified: lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java Fri Aug 14 22:01:42 2009
@@ -21,5 +21,4 @@
* Base interface for attributes.
*/
public interface Attribute {
- public void clear();
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java Fri Aug 14 22:01:42 2009
@@ -30,8 +30,9 @@
*/
public abstract class AttributeImpl implements Cloneable, Serializable {
/**
- * Clears the values in this Attribute and resets it to its
- * default value.
+ * Clears the values in this AttributeImpl and resets it to its
+ * default value. If this implementation implements more than one Attribute interface
+ * it clears all.
*/
public abstract void clear();