Posted to commits@lucenenet.apache.org by ar...@apache.org on 2008/06/25 04:53:12 UTC
svn commit: r671406 [3/3] - in
/incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis: ./ Standard/
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Token.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Token.cs Tue Jun 24 19:53:11 2008
@@ -17,150 +17,435 @@
using System;
+using Payload = Lucene.Net.Index.Payload;
+
namespace Lucene.Net.Analysis
{
- /// <summary>A Token is an occurrence of a term from the text of a field. It consists of
- /// a term's text, the start and end offset of the term in the text of the field,
- /// and a type string.
- /// The start and end offsets permit applications to re-associate a token with
- /// its source text, e.g., to display highlighted query terms in a document
- /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
- /// display, etc.
- /// The type is an interned string, assigned by a lexical analyzer
- /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
- /// belongs to. For example an end of sentence marker token might be implemented
- /// with type "eos". The default token type is "word".
- /// </summary>
-
+ /// <summary>A Token is an occurrence of a term from the text of a field. It consists of
+ /// a term's text, the start and end offset of the term in the text of the field,
+ /// and a type string.
+ /// <p>
+ /// The start and end offsets permit applications to re-associate a token with
+ /// its source text, e.g., to display highlighted query terms in a document
+ /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+ /// display, etc.
+ /// <p>
+ /// The type is an interned string, assigned by a lexical analyzer
+ /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
+ /// belongs to. For example an end of sentence marker token might be implemented
+ /// with type "eos". The default token type is "word".
+ /// <p>
+ /// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+ /// length byte array. Use {@link TermPositions#GetPayloadLength()} and
+ /// {@link TermPositions#GetPayload(byte[], int)} to retrieve the payloads from the index.
+ /// </summary>
+ /// <summary><br><br>
+ /// <p><font color="#FF0000">
+ /// WARNING: The status of the <b>Payloads</b> feature is experimental.
+ /// The APIs introduced here might change in the future and will not be
+ /// supported anymore in such a case.</font>
+ /// <br><br>
+ /// <p><b>NOTE:</b> As of 2.3, Token stores the term text
+ /// internally as a malleable char[] termBuffer instead of
+ /// String termText. The indexing code and core tokenizers
+ /// have been changed to re-use a single Token instance, changing
+ /// its buffer and other fields in-place as the Token is
+ /// processed. This provides substantially better indexing
+ /// performance as it saves the GC cost of new'ing a Token and
+ /// String for every term. The APIs that accept String
+ /// termText are still available but a warning about the
+ /// associated performance cost has been added (below). The
+ /// {@link #TermText()} method has been deprecated.</p>
+ /// </summary>
+ /// <summary><p>Tokenizers and filters should try to re-use a Token
+ /// instance when possible for best performance, by
+ /// implementing the {@link TokenStream#Next(Token)} API.
+ /// Failing that, to create a new Token you should first use
+ /// one of the constructors that start with null text. Then
+ /// you should call either {@link #TermBuffer()} or {@link
+ /// #ResizeTermBuffer(int)} to retrieve the Token's
+ /// termBuffer. Fill in the characters of your term into this
+ /// buffer, and finally call {@link #SetTermLength(int)} to
+ /// set the length of the term text. See <a target="_top"
+ /// href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
+ /// for details.</p>
+ /// </summary>
+ /// <seealso cref="Lucene.Net.Index.Payload">
+ /// </seealso>
public class Token : System.ICloneable
{
- internal System.String termText; // the text of the term
- internal int startOffset; // start in source text
- internal int endOffset; // end in source text
- internal System.String type = "word"; // lexical type
-
- private int positionIncrement = 1;
-
- /// <summary>Constructs a Token with the given term text, and start & end offsets.
- /// The type defaults to "word."
- /// </summary>
- public Token(System.String text, int start, int end)
- {
- termText = text;
- startOffset = start;
- endOffset = end;
- }
-
- /// <summary>Constructs a Token with the given text, start and end offsets, & type. </summary>
- public Token(System.String text, int start, int end, System.String typ)
- {
- termText = text;
- startOffset = start;
- endOffset = end;
- type = typ;
- }
-
- /// <summary>Set the position increment. This determines the position of this token
- /// relative to the previous Token in a {@link TokenStream}, used in phrase
- /// searching.
- ///
- /// <p>The default value is one.
- ///
- /// <p>Some common uses for this are:<ul>
- ///
- /// <li>Set it to zero to put multiple terms in the same position. This is
- /// useful if, e.g., a word has multiple stems. Searches for phrases
- /// including either stem will match. In this case, all but the first stem's
- /// increment should be set to zero: the increment of the first instance
- /// should be one. Repeating a token with an increment of zero can also be
- /// used to boost the scores of matches on that token.
- ///
- /// <li>Set it to values greater than one to inhibit exact phrase matches.
- /// If, for example, one does not want phrases to match across removed stop
- /// words, then one could build a stop word filter that removes stop words and
- /// also sets the increment to the number of stop words removed before each
- /// non-stop word. Then exact phrase queries will only match when the terms
- /// occur with no intervening stop words.
- ///
- /// </ul>
- /// </summary>
- /// <seealso cref="Lucene.Net.index.TermPositions">
- /// </seealso>
- public void SetPositionIncrement(int positionIncrement)
- {
- if (positionIncrement < 0)
- throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
- this.positionIncrement = positionIncrement;
- }
-
- /// <summary>Returns the position increment of this Token.</summary>
- /// <seealso cref="setPositionIncrement">
- /// </seealso>
- public int GetPositionIncrement()
- {
- return positionIncrement;
- }
-
- /// <summary>Sets the Token's term text. </summary>
- public virtual void SetTermText(System.String text)
- {
- termText = text;
- }
-
- /// <summary>Returns the Token's term text. </summary>
- public System.String TermText()
- {
- return termText;
- }
-
- /// <summary>Returns this Token's starting offset, the position of the first character
- /// corresponding to this token in the source text.
- /// Note that the difference between endOffset() and startOffset() may not be
- /// equal to termText.length(), as the term text may have been altered by a
- /// stemmer or some other filter.
- /// </summary>
- public int StartOffset()
- {
- return startOffset;
- }
-
- /// <summary>Returns this Token's ending offset, one greater than the position of the
- /// last character corresponding to this token in the source text.
- /// </summary>
- public int EndOffset()
- {
- return endOffset;
- }
-
- /// <summary>Returns this Token's lexical type. Defaults to "word". </summary>
- public System.String Type()
- {
- return type;
- }
-
- public override System.String ToString()
- {
- System.Text.StringBuilder sb = new System.Text.StringBuilder();
- sb.Append("(" + termText + "," + startOffset + "," + endOffset);
- if (!type.Equals("word"))
- sb.Append(",type=" + type);
- if (positionIncrement != 1)
- sb.Append(",posIncr=" + positionIncrement);
- sb.Append(")");
- return sb.ToString();
- }
-
- public virtual System.Object Clone()
- {
- try
- {
- return base.MemberwiseClone();
- }
- catch (System.Exception e)
- {
- throw new System.SystemException("", e); // shouldn't happen since we implement Cloneable
- }
- }
- }
+
+ public const System.String DEFAULT_TYPE = "word";
+ private static int MIN_BUFFER_SIZE = 10;
+
+ /// <deprecated>: we will remove this when we remove the
+ /// deprecated APIs
+ /// </deprecated>
+ private System.String termText;
+
+ internal char[] termBuffer; // characters for the term text
+ internal int termLength; // length of term text in buffer
+
+ internal int startOffset; // start in source text
+ internal int endOffset; // end in source text
+ internal System.String type = DEFAULT_TYPE; // lexical type
+
+ internal Payload payload;
+
+ internal int positionIncrement = 1;
+
+ /// <summary>Constructs a Token with null text. </summary>
+ public Token()
+ {
+ }
+
+ /// <summary>Constructs a Token with null text and start & end
+ /// offsets.
+ /// </summary>
+ /// <param name="start">start offset
+ /// </param>
+ /// <param name="end">end offset
+ /// </param>
+ public Token(int start, int end)
+ {
+ startOffset = start;
+ endOffset = end;
+ }
+
+ /// <summary>Constructs a Token with null text and start & end
+ /// offsets plus the Token type.
+ /// </summary>
+ /// <param name="start">start offset
+ /// </param>
+ /// <param name="end">end offset
+ /// </param>
+ /// <param name="typ">token type
+ /// </param>
+ public Token(int start, int end, System.String typ)
+ {
+ startOffset = start;
+ endOffset = end;
+ type = typ;
+ }
+
+ /// <summary>Constructs a Token with the given term text, and start
+ /// & end offsets. The type defaults to "word."
+ /// <b>NOTE:</b> for better indexing speed you should
+ /// instead use the char[] termBuffer methods to set the
+ /// term text.
+ /// </summary>
+ /// <param name="text">term text
+ /// </param>
+ /// <param name="start">start offset
+ /// </param>
+ /// <param name="end">end offset
+ /// </param>
+ public Token(System.String text, int start, int end)
+ {
+ termText = text;
+ startOffset = start;
+ endOffset = end;
+ }
+
+ /// <summary>Constructs a Token with the given text, start and end
+ /// offsets, & type. <b>NOTE:</b> for better indexing
+ /// speed you should instead use the char[] termBuffer
+ /// methods to set the term text.
+ /// </summary>
+ /// <param name="text">term text
+ /// </param>
+ /// <param name="start">start offset
+ /// </param>
+ /// <param name="end">end offset
+ /// </param>
+ /// <param name="typ">token type
+ /// </param>
+ public Token(System.String text, int start, int end, System.String typ)
+ {
+ termText = text;
+ startOffset = start;
+ endOffset = end;
+ type = typ;
+ }
+
+ /// <summary>Set the position increment. This determines the position of this token
+ /// relative to the previous Token in a {@link TokenStream}, used in phrase
+ /// searching.
+ ///
+ /// <p>The default value is one.
+ ///
+ /// <p>Some common uses for this are:<ul>
+ ///
+ /// <li>Set it to zero to put multiple terms in the same position. This is
+ /// useful if, e.g., a word has multiple stems. Searches for phrases
+ /// including either stem will match. In this case, all but the first stem's
+ /// increment should be set to zero: the increment of the first instance
+ /// should be one. Repeating a token with an increment of zero can also be
+ /// used to boost the scores of matches on that token.
+ ///
+ /// <li>Set it to values greater than one to inhibit exact phrase matches.
+ /// If, for example, one does not want phrases to match across removed stop
+ /// words, then one could build a stop word filter that removes stop words and
+ /// also sets the increment to the number of stop words removed before each
+ /// non-stop word. Then exact phrase queries will only match when the terms
+ /// occur with no intervening stop words.
+ ///
+ /// </ul>
+ /// </summary>
+ /// <seealso cref="Lucene.Net.Index.TermPositions">
+ /// </seealso>
+ public virtual void SetPositionIncrement(int positionIncrement)
+ {
+ if (positionIncrement < 0)
+ throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
+ this.positionIncrement = positionIncrement;
+ }
+
+ /// <summary>Returns the position increment of this Token.</summary>
+ /// <seealso cref="setPositionIncrement">
+ /// </seealso>
+ public virtual int GetPositionIncrement()
+ {
+ return positionIncrement;
+ }
+
+ /// <summary>Sets the Token's term text. <b>NOTE:</b> for better
+ /// indexing speed you should instead use the char[]
+ /// termBuffer methods to set the term text.
+ /// </summary>
+ public virtual void SetTermText(System.String text)
+ {
+ termText = text;
+ termBuffer = null;
+ }
+
+ /// <summary>Returns the Token's term text.
+ ///
+ /// </summary>
+ /// <deprecated> Use {@link #TermBuffer()} and {@link
+ /// #TermLength()} instead.
+ /// </deprecated>
+ public System.String TermText()
+ {
+ if (termText == null && termBuffer != null)
+ termText = new System.String(termBuffer, 0, termLength);
+ return termText;
+ }
+
+ /// <summary>Copies the contents of buffer, starting at offset for
+ /// length characters, into the termBuffer
+ /// array. <b>NOTE:</b> for better indexing speed you
+ /// should instead retrieve the termBuffer, using {@link
+ /// #TermBuffer()} or {@link #ResizeTermBuffer(int)}, and
+ /// fill it in directly to set the term text. This saves
+ /// an extra copy.
+ /// </summary>
+ public void SetTermBuffer(char[] buffer, int offset, int length)
+ {
+ ResizeTermBuffer(length);
+ Array.Copy(buffer, offset, termBuffer, 0, length);
+ termLength = length;
+ }
+
+ /// <summary>Returns the internal termBuffer character array which
+ /// you can then directly alter. If the array is too
+ /// small for your token, use {@link
+ /// #ResizeTermBuffer(int)} to increase it. After
+ /// altering the buffer be sure to call {@link
+ /// #SetTermLength(int)} to record the number of valid
+ /// characters that were placed into the termBuffer.
+ /// </summary>
+ public char[] TermBuffer()
+ {
+ InitTermBuffer();
+ return termBuffer;
+ }
+
+ /// <summary>Grows the termBuffer to at least size newSize.</summary>
+ /// <param name="newSize">minimum size of the new termBuffer
+ /// </param>
+ /// <returns> newly created termBuffer with length >= newSize
+ /// </returns>
+ public virtual char[] ResizeTermBuffer(int newSize)
+ {
+ InitTermBuffer();
+ if (newSize > termBuffer.Length)
+ {
+ int size = termBuffer.Length;
+ while (size < newSize)
+ size *= 2;
+ char[] newBuffer = new char[size];
+ Array.Copy(termBuffer, 0, newBuffer, 0, termBuffer.Length);
+ termBuffer = newBuffer;
+ }
+ return termBuffer;
+ }
+
+ // TODO: once we remove the deprecated termText() method
+ // and switch entirely to char[] termBuffer we don't need
+ // to use this method anymore
+ private void InitTermBuffer()
+ {
+ if (termBuffer == null)
+ {
+ if (termText == null)
+ {
+ termBuffer = new char[MIN_BUFFER_SIZE];
+ termLength = 0;
+ }
+ else
+ {
+ int length = termText.Length;
+ if (length < MIN_BUFFER_SIZE)
+ length = MIN_BUFFER_SIZE;
+ termBuffer = new char[length];
+ termLength = termText.Length;
+
+ int offset = 0;
+ while (offset < termText.Length)
+ {
+ termBuffer[offset] = (char) termText[offset];
+ offset++;
+ }
+
+ termText = null;
+ }
+ }
+ else if (termText != null)
+ termText = null;
+ }
+
+ /// <summary>Return number of valid characters (length of the term)
+ /// in the termBuffer array.
+ /// </summary>
+ public int TermLength()
+ {
+ InitTermBuffer();
+ return termLength;
+ }
+
+ /// <summary>Set number of valid characters (length of the term) in
+ /// the termBuffer array.
+ /// </summary>
+ public void SetTermLength(int length)
+ {
+ InitTermBuffer();
+ termLength = length;
+ }
+
+ /// <summary>Returns this Token's starting offset, the position of the first character
+ /// corresponding to this token in the source text.
+ /// Note that the difference between endOffset() and startOffset() may not be
+ /// equal to termText.length(), as the term text may have been altered by a
+ /// stemmer or some other filter.
+ /// </summary>
+ public int StartOffset()
+ {
+ return startOffset;
+ }
+
+ /// <summary>Set the starting offset.</summary>
+ /// <seealso cref="StartOffset()">
+ /// </seealso>
+ public virtual void SetStartOffset(int offset)
+ {
+ this.startOffset = offset;
+ }
+
+ /// <summary>Returns this Token's ending offset, one greater than the position of the
+ /// last character corresponding to this token in the source text.
+ /// </summary>
+ public int EndOffset()
+ {
+ return endOffset;
+ }
+
+ /// <summary>Set the ending offset.</summary>
+ /// <seealso cref="EndOffset()">
+ /// </seealso>
+ public virtual void SetEndOffset(int offset)
+ {
+ this.endOffset = offset;
+ }
+
+ /// <summary>Returns this Token's lexical type. Defaults to "word". </summary>
+ public System.String Type()
+ {
+ return type;
+ }
+
+ /// <summary>Set the lexical type.</summary>
+ /// <seealso cref="Type()">
+ /// </seealso>
+ public void SetType(System.String type)
+ {
+ this.type = type;
+ }
+
+ /// <summary> Returns this Token's payload.</summary>
+ public virtual Payload GetPayload()
+ {
+ return this.payload;
+ }
+
+ /// <summary> Sets this Token's payload.</summary>
+ public virtual void SetPayload(Payload payload)
+ {
+ this.payload = payload;
+ }
+
+ public override System.String ToString()
+ {
+ System.Text.StringBuilder sb = new System.Text.StringBuilder();
+ sb.Append('(');
+ InitTermBuffer();
+ if (termBuffer == null)
+ sb.Append("null");
+ else
+ sb.Append(termBuffer, 0, termLength);
+ sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
+ if (!type.Equals("word"))
+ sb.Append(",type=").Append(type);
+ if (positionIncrement != 1)
+ sb.Append(",posIncr=").Append(positionIncrement);
+ sb.Append(')');
+ return sb.ToString();
+ }
+
+ /// <summary>Resets the term text, payload, and positionIncrement to default.
+ /// Other fields such as startOffset, endOffset and the token type are
+ /// not reset since they are normally overwritten by the tokenizer.
+ /// </summary>
+ public virtual void Clear()
+ {
+ payload = null;
+ // Leave termBuffer to allow re-use
+ termLength = 0;
+ termText = null;
+ positionIncrement = 1;
+ // startOffset = endOffset = 0;
+ // type = DEFAULT_TYPE;
+ }
+
+ public virtual System.Object Clone()
+ {
+ try
+ {
+ Token t = (Token) base.MemberwiseClone();
+ if (termBuffer != null)
+ {
+ t.termBuffer = null;
+ t.SetTermBuffer(termBuffer, 0, termLength);
+ }
+ if (payload != null)
+ {
+ t.SetPayload((Payload) payload.Clone());
+ }
+ return t;
+ }
+ catch (System.Exception e)
+ {
+ throw new System.SystemException("", e); // shouldn't happen
+ }
+ }
+ }
}
\ No newline at end of file
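
As an illustration of the re-use pattern described in the Token docs above, a minimal sketch of a producer built on the new char[] API might look like the following. The SingleTermStream class and its term value are hypothetical, not part of this commit:

// Hypothetical sketch (not from this commit): a producer that fills a
// re-used Token via ResizeTermBuffer()/SetTermLength(), per LUCENE-969.
public class SingleTermStream : TokenStream
{
    private bool done = false;
    private readonly System.String term = "example"; // hypothetical term

    public override Token Next(Token result)
    {
        if (done)
            return null;
        done = true;
        result.Clear(); // producers must clear the re-used Token first
        char[] buffer = result.ResizeTermBuffer(term.Length);
        term.CopyTo(0, buffer, 0, term.Length); // fill characters directly
        result.SetTermLength(term.Length);      // record the valid length
        result.SetStartOffset(0);
        result.SetEndOffset(term.Length);
        // result.SetPositionIncrement(0) would stack this term on the
        // previous position, e.g. for the multiple-stems case described
        // in the SetPositionIncrement docs.
        return result;
    }
}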
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TokenFilter.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenFilter.cs Tue Jun 24 19:53:11 2008
@@ -23,8 +23,9 @@
/// <summary>A TokenFilter is a TokenStream whose input is another token stream.
/// <p>
/// This is an abstract class.
- /// </summary>
-
+ /// NOTE: subclasses must override at least one of {@link
+ /// #Next()} or {@link #Next(Token)}.
+ /// </summary>
public abstract class TokenFilter : TokenStream
{
/// <summary>The source of tokens for this filter. </summary>
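
To illustrate the new NOTE, a minimal sketch of a filter that overrides Next(Token) might look like this; LowerCaseSketchFilter is a hypothetical name and the class is not part of this commit:

// Hypothetical sketch (not from this commit): a TokenFilter satisfying the
// contract by overriding Next(Token) and re-using the caller's Token.
public class LowerCaseSketchFilter : TokenFilter
{
    public LowerCaseSketchFilter(TokenStream input) : base(input)
    {
    }

    public override Token Next(Token result)
    {
        Token t = input.Next(result); // delegate to the wrapped stream
        if (t == null)
            return null;
        char[] buffer = t.TermBuffer();
        for (int i = 0; i < t.TermLength(); i++)
            buffer[i] = System.Char.ToLower(buffer[i]); // edit in place
        return t;
    }
}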
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/TokenStream.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/TokenStream.cs Tue Jun 24 19:53:11 2008
@@ -17,29 +17,91 @@
using System;
+using Payload = Lucene.Net.Index.Payload;
+
namespace Lucene.Net.Analysis
{
- /// <summary>A TokenStream enumerates the sequence of tokens, either from
- /// fields of a document or from query text.
- /// <p>
- /// This is an abstract class. Concrete subclasses are:
- /// <ul>
- /// <li>{@link Tokenizer}, a TokenStream
- /// whose input is a Reader; and
- /// <li>{@link TokenFilter}, a TokenStream
- /// whose input is another TokenStream.
- /// </ul>
- /// </summary>
+ /// <summary>A TokenStream enumerates the sequence of tokens, either from
+ /// fields of a document or from query text.
+ /// <p>
+ /// This is an abstract class. Concrete subclasses are:
+ /// <ul>
+ /// <li>{@link Tokenizer}, a TokenStream
+ /// whose input is a Reader; and
+ /// <li>{@link TokenFilter}, a TokenStream
+ /// whose input is another TokenStream.
+ /// </ul>
+ /// NOTE: subclasses must override at least one of {@link
+ /// #Next()} or {@link #Next(Token)}.
+ /// </summary>
- public abstract class TokenStream
- {
- /// <summary>Returns the next token in the stream, or null at EOS. </summary>
- public abstract Token Next();
+ public abstract class TokenStream
+ {
+
+ /// <summary>Returns the next token in the stream, or null at EOS.
+ /// The returned Token is a "full private copy" (not
+ /// re-used across calls to next()) but will be slower
+ /// than calling {@link #Next(Token)} instead.
+ /// </summary>
+ public virtual Token Next()
+ {
+ Token result = Next(new Token());
+
+ if (result != null)
+ {
+ Payload p = result.GetPayload();
+ if (p != null)
+ {
+ result.SetPayload((Payload) p.Clone());
+ }
+ }
+
+ return result;
+ }
+
+ /// <summary>Returns the next token in the stream, or null at EOS.
+ /// When possible, the input Token should be used as the
+ /// returned Token (this gives fastest tokenization
+ /// performance), but this is not required and a new Token
+ /// may be returned. Callers may re-use a single Token
+ /// instance for successive calls to this method.
+ /// <p>
+ /// This implicitly defines a "contract" between
+ /// consumers (callers of this method) and
+ /// producers (implementations of this method
+ /// that are the source for tokens):
+ /// <ul>
+ /// <li>A consumer must fully consume the previously
+ /// returned Token before calling this method again.</li>
+ /// <li>A producer must call {@link Token#Clear()}
+ /// before setting the fields in it & returning it</li>
+ /// </ul>
+ /// Note that a {@link TokenFilter} is considered a consumer.
+ /// </summary>
+ /// <param name="result">a Token that may or may not be used to return
+ /// </param>
+ /// <returns> next token in the stream or null if end-of-stream was hit
+ /// </returns>
+ public virtual Token Next(Token result)
+ {
+ return Next();
+ }
+
+ /// <summary>Resets this stream to the beginning. This is an
+ /// optional operation, so subclasses may or may not
+ /// implement this method. Reset() is not needed for
+ /// the standard indexing process. However, if the Tokens
+ /// of a TokenStream are intended to be consumed more than
+ /// once, it is necessary to implement Reset().
+ /// </summary>
+ public virtual void Reset()
+ {
+ }
- /// <summary>Releases resources associated with this stream. </summary>
- public virtual void Close()
- {
- }
- }
+ /// <summary>Releases resources associated with this stream. </summary>
+ public virtual void Close()
+ {
+ }
+ }
}
\ No newline at end of file
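
A minimal sketch of a consumer honoring this contract might look like the following; the DrainStream helper and its enclosing class are hypothetical, not part of this commit:

// Hypothetical sketch (not from this commit): a consumer that re-uses one
// Token instance for successive Next(Token) calls, per the contract above.
public class TokenStreamConsumerDemo
{
    public static void DrainStream(TokenStream stream)
    {
        Token reusable = new Token();
        for (Token t = stream.Next(reusable); t != null; t = stream.Next(reusable))
        {
            // Fully consume t before the next call; the producer may
            // overwrite its buffer and fields on the following call.
            System.Console.WriteLine(t.ToString());
        }
        stream.Close();
    }
}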
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/Tokenizer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/Tokenizer.cs Tue Jun 24 19:53:11 2008
@@ -20,10 +20,16 @@
namespace Lucene.Net.Analysis
{
- /// <summary>A Tokenizer is a TokenStream whose input is a Reader.
- /// <p>
- /// This is an abstract class.
- /// </summary>
+ /// <summary>A Tokenizer is a TokenStream whose input is a Reader.
+ /// <p>
+ /// This is an abstract class.
+ /// <p>
+ /// NOTE: subclasses must override at least one of {@link
+ /// #Next()} or {@link #Next(Token)}.
+ /// <p>
+ /// NOTE: subclasses overriding {@link #Next(Token)} must
+ /// call {@link Token#Clear()}.
+ /// </summary>
public abstract class Tokenizer : TokenStream
{
@@ -49,5 +55,14 @@
input.Close();
}
}
+
+ /// <summary>Expert: Reset the tokenizer to a new reader. Typically, an
+ /// analyzer (in its ReusableTokenStream method) will use
+ /// this to re-use a previously created tokenizer.
+ /// </summary>
+ public virtual void Reset(System.IO.TextReader input)
+ {
+ this.input = input;
+ }
}
}
\ No newline at end of file
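
Putting the two NOTEs and the new Reset(reader) together, a minimal hypothetical Tokenizer subclass (SingleCharTokenizer, not part of this commit) might look like:

// Hypothetical sketch (not from this commit): a Tokenizer that overrides
// Next(Token), calls Token.Clear() as required, and supports Reset(reader).
public class SingleCharTokenizer : Tokenizer
{
    private int offset = 0;

    public SingleCharTokenizer(System.IO.TextReader input) : base(input)
    {
    }

    public override Token Next(Token result)
    {
        int c = input.Read();
        if (c == -1)
            return null;
        result.Clear(); // required when overriding Next(Token)
        result.ResizeTermBuffer(1)[0] = (char) c;
        result.SetTermLength(1);
        result.SetStartOffset(offset);
        result.SetEndOffset(++offset);
        return result;
    }

    public override void Reset(System.IO.TextReader input)
    {
        base.Reset(input); // swap in the new reader
        offset = 0;
    }
}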
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WhitespaceAnalyzer.cs Tue Jun 24 19:53:11 2008
@@ -28,5 +28,18 @@
{
return new WhitespaceTokenizer(reader);
}
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ Tokenizer tokenizer = (Tokenizer) GetPreviousTokenStream();
+ if (tokenizer == null)
+ {
+ tokenizer = new WhitespaceTokenizer(reader);
+ SetPreviousTokenStream(tokenizer);
+ }
+ else
+ tokenizer.Reset(reader);
+ return tokenizer;
+ }
}
}
\ No newline at end of file
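
For illustration, a caller-side sketch of the re-use this enables; the field name, input strings, and enclosing class are hypothetical, and this assumes the Analyzer base-class changes from earlier in this commit series:

// Hypothetical sketch (not from this commit): requesting the re-usable
// stream twice; the second call typically Reset()s the same Tokenizer.
public class ReusableStreamDemo
{
    public static void AnalyzeTwoValues()
    {
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
        TokenStream first = analyzer.ReusableTokenStream("body",
            new System.IO.StringReader("hello world"));
        // ... fully consume 'first' before requesting another stream ...
        TokenStream second = analyzer.ReusableTokenStream("body",
            new System.IO.StringReader("second value"));
        // 'first' and 'second' may be the same WhitespaceTokenizer instance.
    }
}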
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Analysis/WordlistLoader.cs?rev=671406&r1=671405&r2=671406&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Analysis/WordlistLoader.cs Tue Jun 24 19:53:11 2008
@@ -20,13 +20,12 @@
namespace Lucene.Net.Analysis
{
- /// <summary> Loader for text files that represent a list of stopwords.
- ///
- /// </summary>
- /// <author> Gerhard Schwarz
- /// </author>
- /// <version> $Id: WordlistLoader.java 192989 2005-06-22 19:59:03Z dnaber $
- /// </version>
+ /// <summary> Loader for text files that represent a list of stopwords.
+ ///
+ ///
+ /// </summary>
+ /// <version> $Id: WordlistLoader.java 564236 2007-08-09 15:21:19Z gsingers $
+ /// </version>
public class WordlistLoader
{