You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by us...@apache.org on 2009/08/15 00:01:43 UTC

svn commit: r804392 - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneou...

Author: uschindler
Date: Fri Aug 14 22:01:42 2009
New Revision: 804392

URL: http://svn.apache.org/viewvc?rev=804392&view=rev
Log:
LUCENE-1801: All Tokenizers/TokenStreams that are a source of tokens now call AttributeSource.clearAttributes() first. Made Token.clear() consistent with AttributeImpl (clears everything).

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
    lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java
    lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Aug 14 22:01:42 2009
@@ -171,6 +171,13 @@
     reusableTokenStream.  This is now fixed, such that if
     reusableTokenStream is invoked on such a subclass, that method
     will forcefully fallback to tokenStream.  (Mike McCandless)
+    
+12. LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear
+    startOffset, endOffset and type. This should normally affect no
+    Tokenizer chains, as Tokenizers normally always set these three values.
+    This change was made to conform to the new AttributeImpl.clear() and
+    AttributeSource.clearAttributes(), so they work identically for Token (the
+    all-in-one AttributeImpl) and the 6 separate AttributeImpls. (Uwe Schindler, Michael Busch)
 
 API Changes
 
@@ -468,6 +475,10 @@
 22. LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(), 
     although it does allow it in set(Object). Fix get() to not assert the object
     is not null. (Shai Erera via Mike McCandless)
+    
+23. LUCENE-1801: Changed all Tokenizers or TokenStreams (in core/contrib)
+    that are the source of Tokens to always call
+    AttributeSource.clearAttributes() first. (Uwe Schindler)
 
 New features
 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Fri Aug 14 22:01:42 2009
@@ -123,6 +123,7 @@
      *
      */
     public boolean incrementToken() throws IOException {
+        clearAttributes();
         /** how many character(s) has been stored in buffer */
 
         while(true) { // loop until we find a non-empty token

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java Fri Aug 14 22:01:42 2009
@@ -96,6 +96,7 @@
     }
 
     public boolean incrementToken() throws IOException {
+        clearAttributes();
 
         length = 0;
         start = offset;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java Fri Aug 14 22:01:42 2009
@@ -64,6 +64,7 @@
     
     Token clone = (Token) singleToken.clone();
     
+    clearAttributes();
     termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
     offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
     flagsAtt.setFlags(clone.getFlags());

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Fri Aug 14 22:01:42 2009
@@ -123,6 +123,7 @@
 
   /** Returns the next token in the stream, or null at EOS. */
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     // if we are just starting, read the whole input
     if (!started) {
       started = true;

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Aug 14 22:01:42 2009
@@ -72,6 +72,7 @@
 
   /** Returns the next token in the stream, or null at EOS. */
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     if (!started) {
       started = true;
       gramSize = minGram;

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Fri Aug 14 22:01:42 2009
@@ -54,6 +54,7 @@
   }
 
   public boolean incrementToken() throws IOException {
+    clearAttributes();
     buffer.setLength(0);
     int ci;
     char ch, pch;

Modified: lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java (original)
+++ lucene/java/trunk/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java Fri Aug 14 22:01:42 2009
@@ -343,7 +343,7 @@
 
     public final boolean incrementToken() {
       if (matcher == null) return false;
-      
+      clearAttributes();
       while (true) { // loop takes care of leading and trailing boundary cases
         int start = pos;
         int end;
@@ -401,6 +401,7 @@
     }
 
     public boolean incrementToken() {
+      clearAttributes();
       // cache loop instance vars (performance)
       String s = str;
       int len = s.length();

Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Fri Aug 14 22:01:42 2009
@@ -184,6 +184,7 @@
       restoreState(state);
       return true;
     }
+    clearAttributes();
     int tokenType = scanner.getNextToken();
 
     if (tokenType == WikipediaTokenizerImpl.YYEOF) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/CharTokenizer.java Fri Aug 14 22:01:42 2009
@@ -53,9 +53,9 @@
   }
 
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     int length = 0;
     int start = bufferIndex;
-    termAtt.clear();
     char[] buffer = termAtt.termBuffer();
     while (true) {
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/KeywordTokenizer.java Fri Aug 14 22:01:42 2009
@@ -49,6 +49,7 @@
   
   public final boolean incrementToken() throws IOException {
     if (!done) {
+      clearAttributes();
       done = true;
       int upto = 0;
       char[] buffer = termAtt.termBuffer();

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/NumericTokenStream.java Fri Aug 14 22:01:42 2009
@@ -184,6 +184,7 @@
     if (shift >= valSize)
       return false;
 
+    clearAttributes();
     final char[] buffer;
     switch (valSize) {
       case 64:

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Token.java Fri Aug 14 22:01:42 2009
@@ -117,7 +117,7 @@
   </ul>
   A few things to note:
   <ul>
-  <li>clear() initializes most of the fields to default values, but not startOffset, endOffset and type.</li>
+  <li>clear() initializes all of the fields to default values. This differs from Lucene 2.4, but should affect no one.</li>
   <li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
   <li>The startOffset and endOffset represent the start and offset in the source text. So be careful in adjusting them.</li>
   <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
@@ -622,9 +622,9 @@
     return sb.toString();
   }
 
-  /** Resets the term text, payload, flags, and positionIncrement to default.
-   * Other fields such as startOffset, endOffset and the token type are
-   * not reset since they are normally overwritten by the tokenizer. */
+  /** Resets the term text, payload, flags, positionIncrement,
+   * startOffset, endOffset and token type to default.
+   */
   public void clear() {
     payload = null;
     // Leave termBuffer to allow re-use
@@ -632,8 +632,8 @@
     termText = null;
     positionIncrement = 1;
     flags = 0;
-    // startOffset = endOffset = 0;
-    // type = DEFAULT_TYPE;
+    startOffset = endOffset = 0;
+    type = DEFAULT_TYPE;
   }
 
   public Object clone() {
@@ -715,6 +715,8 @@
     payload = null;
     positionIncrement = 1;
     flags = 0;
+    startOffset = endOffset = 0;
+    type = DEFAULT_TYPE;
   }
 
   /** Shorthand for calling {@link #clear},

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/TokenWrapper.java Fri Aug 14 22:01:42 2009
@@ -122,6 +122,7 @@
   }
   
   // PayloadAttribute
+  
   public Payload getPayload() {
     return delegate.getPayload();
   }
@@ -130,14 +131,12 @@
     delegate.setPayload(payload);
   }
   
-  // TokenAttribute
-  
+  // AttributeImpl
+
   public void clear() {
     delegate.clear();
   }
 
-  // AttributeImpl
-
   public String toString() {
     return delegate.toString();
   }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/Tokenizer.java Fri Aug 14 22:01:42 2009
@@ -26,12 +26,16 @@
   <p>
   This is an abstract class.
   <p>
-  NOTE: To use the old API subclasses must override {@link #next(Token)}.
-  It's also OK to instead override {@link #next()} but that
-  method is slower compared to {@link #next(Token)}.
+  NOTE: subclasses must override 
+  {@link #incrementToken()} if the new TokenStream API is used
+  and {@link #next(Token)} or {@link #next()} if the old
+  TokenStream API is used.
   <p>
-  NOTE: subclasses overriding {@link #next(Token)} must  
-  call {@link Token#clear()}.
+  NOTE: Subclasses overriding {@link #incrementToken()} must
+  call {@link AttributeSource#clearAttributes()} before
+  setting attributes.
+  Subclasses overriding {@link #next(Token)} must call
+  {@link Token#clear()} before setting Token attributes. 
  */
 
 public abstract class Tokenizer extends TokenStream {
@@ -85,6 +89,9 @@
     this.input = CharReader.get(input);
   }
 
+  /** Expert: Reset the tokenizer to a new CharStream.  Typically, an
+   *  analyzer (in its reusableTokenStream method) will use
+   *  this to re-use a previously created tokenizer. */
   public void reset(CharStream input) throws IOException {
     this.input = input;
   }

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Fri Aug 14 22:01:42 2009
@@ -148,6 +148,7 @@
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
   public final boolean incrementToken() throws IOException {
+    clearAttributes();
     int posIncr = 1;
 
     while(true) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/Attribute.java Fri Aug 14 22:01:42 2009
@@ -21,5 +21,4 @@
  * Base interface for attributes.
  */
 public interface Attribute {
-  public void clear();
 }

Modified: lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java?rev=804392&r1=804391&r2=804392&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/AttributeImpl.java Fri Aug 14 22:01:42 2009
@@ -30,8 +30,9 @@
  */
 public abstract class AttributeImpl implements Cloneable, Serializable {  
   /**
-   * Clears the values in this Attribute and resets it to its 
-   * default value.
+   * Clears the values in this AttributeImpl and resets it to its 
+   * default value. If this implementation implements more than one Attribute interface
+   * it clears all.
    */
   public abstract void clear();