You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2013/04/03 19:40:08 UTC
[25/51] [partial] Mass convert mixed tabs to spaces
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/core/Analysis/StopAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/core/Analysis/StopAnalyzer.cs b/src/core/Analysis/StopAnalyzer.cs
index aabe197..96a673d 100644
--- a/src/core/Analysis/StopAnalyzer.cs
+++ b/src/core/Analysis/StopAnalyzer.cs
@@ -20,57 +20,57 @@ using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis
{
-
- /// <summary> Filters <see cref="LetterTokenizer" /> with <see cref="LowerCaseFilter" /> and
- /// <see cref="StopFilter" />.
- ///
- /// <a name="version"/>
- /// <p/>
- /// You must specify the required <see cref="Version" /> compatibility when creating
- /// StopAnalyzer:
- /// <list type="bullet">
- /// <item>As of 2.9, position increments are preserved</item>
- /// </list>
- /// </summary>
-
- public sealed class StopAnalyzer:Analyzer
- {
- private readonly ISet<string> stopWords;
- private readonly bool enablePositionIncrements;
+
+ /// <summary> Filters <see cref="LetterTokenizer" /> with <see cref="LowerCaseFilter" /> and
+ /// <see cref="StopFilter" />.
+ ///
+ /// <a name="version"/>
+ /// <p/>
+ /// You must specify the required <see cref="Version" /> compatibility when creating
+ /// StopAnalyzer:
+ /// <list type="bullet">
+ /// <item>As of 2.9, position increments are preserved</item>
+ /// </list>
+ /// </summary>
+
+ public sealed class StopAnalyzer:Analyzer
+ {
+ private readonly ISet<string> stopWords;
+ private readonly bool enablePositionIncrements;
- /// <summary>An unmodifiable set containing some common English words that are not usually useful
- /// for searching.
- /// </summary>
+ /// <summary>An unmodifiable set containing some common English words that are not usually useful
+ /// for searching.
+ /// </summary>
public static ISet<string> ENGLISH_STOP_WORDS_SET;
-
- /// <summary> Builds an analyzer which removes words in ENGLISH_STOP_WORDS.</summary>
- public StopAnalyzer(Version matchVersion)
- {
- stopWords = ENGLISH_STOP_WORDS_SET;
- enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
- }
+
+ /// <summary> Builds an analyzer which removes words in ENGLISH_STOP_WORDS_SET.</summary>
+ public StopAnalyzer(Version matchVersion)
+ {
+ stopWords = ENGLISH_STOP_WORDS_SET;
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
- /// <summary>Builds an analyzer with the stop words from the given set.</summary>
- public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
- {
- this.stopWords = stopWords;
- enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
- }
-
- /// <summary> Builds an analyzer with the stop words from the given file.
- ///
- /// </summary>
- /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)">
- /// </seealso>
- /// <param name="matchVersion">See <a href="#version">above</a>
- /// </param>
- /// <param name="stopwordsFile">File to load stop words from
- /// </param>
- public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
- {
- stopWords = WordlistLoader.GetWordSet(stopwordsFile);
- enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
- }
+ /// <summary>Builds an analyzer with the stop words from the given set.</summary>
+ public StopAnalyzer(Version matchVersion, ISet<string> stopWords)
+ {
+ this.stopWords = stopWords;
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
+
+ /// <summary> Builds an analyzer with the stop words from the given file.
+ ///
+ /// </summary>
+ /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)">
+ /// </seealso>
+ /// <param name="matchVersion">See <a href="#version">above</a>
+ /// </param>
+ /// <param name="stopwordsFile">File to load stop words from
+ /// </param>
+ public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile)
+ {
+ stopWords = WordlistLoader.GetWordSet(stopwordsFile);
+ enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
+ }
/// <summary>Builds an analyzer with the stop words from the given reader. </summary>
/// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)">
@@ -86,56 +86,56 @@ namespace Lucene.Net.Analysis
}
/// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
- public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
- {
- return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
- }
-
- /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary>
- private class SavedStreams
- {
- public SavedStreams(StopAnalyzer enclosingInstance)
- {
- InitBlock(enclosingInstance);
- }
- private void InitBlock(StopAnalyzer enclosingInstance)
- {
- this.enclosingInstance = enclosingInstance;
- }
- private StopAnalyzer enclosingInstance;
- public StopAnalyzer Enclosing_Instance
- {
- get
- {
- return enclosingInstance;
- }
-
- }
- internal Tokenizer source;
- internal TokenStream result;
- }
-
- public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
- {
- var streams = (SavedStreams) PreviousTokenStream;
- if (streams == null)
- {
- streams = new SavedStreams(this) {source = new LowerCaseTokenizer(reader)};
- streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
- PreviousTokenStream = streams;
- }
- else
- streams.source.Reset(reader);
- return streams.result;
- }
- static StopAnalyzer()
- {
- {
- var stopWords = new System.String[]{"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
- var stopSet = new CharArraySet(stopWords.Length, false);
- stopSet.AddAll(stopWords);
- ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet);
- }
- }
- }
+ public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
+ }
+
+ /// <summary>Holds the tokenizer and the StopFilter-wrapped stream saved by ReusableTokenStream for reuse. </summary>
+ private class SavedStreams
+ {
+ public SavedStreams(StopAnalyzer enclosingInstance)
+ {
+ InitBlock(enclosingInstance);
+ }
+ private void InitBlock(StopAnalyzer enclosingInstance)
+ {
+ this.enclosingInstance = enclosingInstance;
+ }
+ private StopAnalyzer enclosingInstance;
+ public StopAnalyzer Enclosing_Instance
+ {
+ get
+ {
+ return enclosingInstance;
+ }
+
+ }
+ internal Tokenizer source;
+ internal TokenStream result;
+ }
+
+ public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
+ {
+ var streams = (SavedStreams) PreviousTokenStream;
+ if (streams == null)
+ {
+ streams = new SavedStreams(this) {source = new LowerCaseTokenizer(reader)};
+ streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
+ PreviousTokenStream = streams;
+ }
+ else
+ streams.source.Reset(reader);
+ return streams.result;
+ }
+ static StopAnalyzer()
+ {
+ {
+ var stopWords = new System.String[]{"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
+ var stopSet = new CharArraySet(stopWords.Length, false);
+ stopSet.AddAll(stopWords);
+ ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet);
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/core/Analysis/StopFilter.cs
----------------------------------------------------------------------
diff --git a/src/core/Analysis/StopFilter.cs b/src/core/Analysis/StopFilter.cs
index 81b7dd0..722faaf 100644
--- a/src/core/Analysis/StopFilter.cs
+++ b/src/core/Analysis/StopFilter.cs
@@ -24,155 +24,155 @@ using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis
{
-
- /// <summary> Removes stop words from a token stream.</summary>
-
- public sealed class StopFilter:TokenFilter
- {
- private readonly CharArraySet stopWords;
- private bool enablePositionIncrements = false;
-
- private readonly ITermAttribute termAtt;
- private readonly IPositionIncrementAttribute posIncrAtt;
-
- /// <summary> Construct a token stream filtering the given input.
- /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
- /// <c>makeStopSet()</c> was used to construct the set) it will be directly used
- /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
- /// directly controls case sensitivity.
- /// <p/>
- /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
- /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
- /// used to specify the case sensitivity of that set.
- /// </summary>
- /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
- /// <param name="input">Input TokenStream</param>
- /// <param name="stopWords">A Set of strings or strings or char[] or any other ToString()-able set representing the stopwords</param>
+
+ /// <summary> Removes stop words from a token stream.</summary>
+
+ public sealed class StopFilter:TokenFilter
+ {
+ private readonly CharArraySet stopWords;
+ private bool enablePositionIncrements = false;
+
+ private readonly ITermAttribute termAtt;
+ private readonly IPositionIncrementAttribute posIncrAtt;
+
+ /// <summary> Construct a token stream filtering the given input.
+ /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if
+ /// <c>makeStopSet()</c> was used to construct the set) it will be directly used
+ /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c>
+ /// directly controls case sensitivity.
+ /// <p/>
+ /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />,
+ /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be
+ /// used to specify the case sensitivity of that set.
+ /// </summary>
+ /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
+ /// <param name="input">Input TokenStream</param>
+ /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
/// <param name="ignoreCase">if true, all words are lower cased first</param>
public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase)
: base(input)
- {
- if (stopWords is CharArraySet)
- {
- this.stopWords = (CharArraySet) stopWords;
- }
- else
- {
- this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
- this.stopWords.AddAll(stopWords);
- }
- this.enablePositionIncrements = enablePositionIncrements;
- termAtt = AddAttribute<ITermAttribute>();
+ {
+ if (stopWords is CharArraySet)
+ {
+ this.stopWords = (CharArraySet) stopWords;
+ }
+ else
+ {
+ this.stopWords = new CharArraySet(stopWords.Count, ignoreCase);
+ this.stopWords.AddAll(stopWords);
+ }
+ this.enablePositionIncrements = enablePositionIncrements;
+ termAtt = AddAttribute<ITermAttribute>();
posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
- }
+ }
- /// <summary> Constructs a filter which removes words from the input
- /// TokenStream that are named in the Set.
- /// </summary>
- /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
- /// <param name="in">Input stream</param>
- /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
- /// <seealso cref="MakeStopSet(String[])"/>
- public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords)
- : this(enablePositionIncrements, @in, stopWords, false)
- { }
-
- /// <summary> Builds a Set from an array of stop words,
- /// appropriate for passing into the StopFilter constructor.
- /// This permits this stopWords construction to be cached once when
- /// an Analyzer is constructed.
- ///
- /// </summary>
- /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
- public static ISet<string> MakeStopSet(params string[] stopWords)
- {
- return MakeStopSet(stopWords, false);
- }
-
- /// <summary> Builds a Set from an array of stop words,
- /// appropriate for passing into the StopFilter constructor.
- /// This permits this stopWords construction to be cached once when
- /// an Analyzer is constructed.
- /// </summary>
- /// <param name="stopWords">A list of strings or char[] or any other ToString()-able list representing the stop words</param>
- /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
- public static ISet<string> MakeStopSet(IList<object> stopWords)
- {
- return MakeStopSet(stopWords, false);
- }
-
- /// <summary></summary>
- /// <param name="stopWords">An array of stopwords</param>
- /// <param name="ignoreCase">If true, all words are lower cased first.</param>
- /// <returns> a Set containing the words</returns>
- public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
- {
- var stopSet = new CharArraySet(stopWords.Length, ignoreCase);
- stopSet.AddAll(stopWords);
- return stopSet;
- }
-
- /// <summary> </summary>
+ /// <summary> Constructs a filter which removes words from the input
+ /// TokenStream that are named in the Set.
+ /// </summary>
+ /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param>
+ /// <param name="in">Input stream</param>
+ /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param>
+ /// <seealso cref="MakeStopSet(String[])"/>
+ public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords)
+ : this(enablePositionIncrements, @in, stopWords, false)
+ { }
+
+ /// <summary> Builds a Set from an array of stop words,
+ /// appropriate for passing into the StopFilter constructor.
+ /// This permits this stopWords construction to be cached once when
+ /// an Analyzer is constructed.
+ ///
+ /// </summary>
+ /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
+ public static ISet<string> MakeStopSet(params string[] stopWords)
+ {
+ return MakeStopSet(stopWords, false);
+ }
+
+ /// <summary> Builds a Set from an array of stop words,
+ /// appropriate for passing into the StopFilter constructor.
+ /// This permits this stopWords construction to be cached once when
+ /// an Analyzer is constructed.
+ /// </summary>
+ /// <param name="stopWords">A list of strings or char[] or any other ToString()-able list representing the stop words</param>
+ /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso>
+ public static ISet<string> MakeStopSet(IList<object> stopWords)
+ {
+ return MakeStopSet(stopWords, false);
+ }
+
+ /// <summary></summary>
+ /// <param name="stopWords">An array of stopwords</param>
+ /// <param name="ignoreCase">If true, all words are lower cased first.</param>
+ /// <returns> a Set containing the words</returns>
+ public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase)
+ {
+ var stopSet = new CharArraySet(stopWords.Length, ignoreCase);
+ stopSet.AddAll(stopWords);
+ return stopSet;
+ }
+
+ /// <summary> </summary>
/// <param name="stopWords">A List of Strings or char[] or any other toString()-able list representing the stopwords </param>
- /// <param name="ignoreCase">if true, all words are lower cased first</param>
- /// <returns>A Set (<see cref="CharArraySet"/>)containing the words</returns>
- public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
- {
- var stopSet = new CharArraySet(stopWords.Count, ignoreCase);
+ /// <param name="ignoreCase">if true, all words are lower cased first</param>
+ /// <returns>A Set (<see cref="CharArraySet"/>)containing the words</returns>
+ public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase)
+ {
+ var stopSet = new CharArraySet(stopWords.Count, ignoreCase);
foreach(var word in stopWords)
stopSet.Add(word.ToString());
- return stopSet;
- }
-
- /// <summary> Returns the next input Token whose term() is not a stop word.</summary>
- public override bool IncrementToken()
- {
- // return the first non-stop word found
- int skippedPositions = 0;
- while (input.IncrementToken())
- {
- if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength()))
- {
- if (enablePositionIncrements)
- {
- posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
- }
- return true;
- }
- skippedPositions += posIncrAtt.PositionIncrement;
- }
- // reached EOS -- return false
- return false;
- }
-
- /// <summary> Returns version-dependent default for enablePositionIncrements. Analyzers
- /// that embed StopFilter use this method when creating the StopFilter. Prior
- /// to 2.9, this returns false. On 2.9 or later, it returns true.
- /// </summary>
- public static bool GetEnablePositionIncrementsVersionDefault(Version matchVersion)
- {
+ return stopSet;
+ }
+
+ /// <summary> Returns the next input Token whose term() is not a stop word.</summary>
+ public override bool IncrementToken()
+ {
+ // return the first non-stop word found
+ int skippedPositions = 0;
+ while (input.IncrementToken())
+ {
+ if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength()))
+ {
+ if (enablePositionIncrements)
+ {
+ posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.PositionIncrement;
+ }
+ // reached EOS -- return false
+ return false;
+ }
+
+ /// <summary> Returns version-dependent default for enablePositionIncrements. Analyzers
+ /// that embed StopFilter use this method when creating the StopFilter. Prior
+ /// to 2.9, this returns false. On 2.9 or later, it returns true.
+ /// </summary>
+ public static bool GetEnablePositionIncrementsVersionDefault(Version matchVersion)
+ {
return matchVersion.OnOrAfter(Version.LUCENE_29);
- }
+ }
- /// <summary> If <c>true</c>, this StopFilter will preserve
- /// positions of the incoming tokens (ie, accumulate and
- /// set position increments of the removed stop tokens).
- /// Generally, <c>true</c> is best as it does not
- /// lose information (positions of the original tokens)
- /// during indexing.
- ///
- /// <p/> When set, when a token is stopped
- /// (omitted), the position increment of the following
- /// token is incremented.
- ///
- /// <p/> <b>NOTE</b>: be sure to also
- /// set <see cref="QueryParser.EnablePositionIncrements" /> if
- /// you use QueryParser to create queries.
- /// </summary>
- public bool EnablePositionIncrements
- {
- get { return enablePositionIncrements; }
- set { enablePositionIncrements = value; }
- }
- }
+ /// <summary> If <c>true</c>, this StopFilter will preserve
+ /// positions of the incoming tokens (ie, accumulate and
+ /// set position increments of the removed stop tokens).
+ /// Generally, <c>true</c> is best as it does not
+ /// lose information (positions of the original tokens)
+ /// during indexing.
+ ///
+ /// <p/> When set, when a token is stopped
+ /// (omitted), the position increment of the following
+ /// token is incremented.
+ ///
+ /// <p/> <b>NOTE</b>: be sure to also
+ /// set <see cref="QueryParser.EnablePositionIncrements" /> if
+ /// you use QueryParser to create queries.
+ /// </summary>
+ public bool EnablePositionIncrements
+ {
+ get { return enablePositionIncrements; }
+ set { enablePositionIncrements = value; }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/62f018ab/src/core/Analysis/TeeSinkTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/core/Analysis/TeeSinkTokenFilter.cs b/src/core/Analysis/TeeSinkTokenFilter.cs
index bec605e..6eb217f 100644
--- a/src/core/Analysis/TeeSinkTokenFilter.cs
+++ b/src/core/Analysis/TeeSinkTokenFilter.cs
@@ -22,245 +22,245 @@ using AttributeSource = Lucene.Net.Util.AttributeSource;
namespace Lucene.Net.Analysis
{
-
- /// <summary> This TokenFilter provides the ability to set aside attribute states
- /// that have already been analyzed. This is useful in situations where multiple fields share
- /// many common analysis steps and then go their separate ways.
- /// <p/>
- /// It is also useful for doing things like entity extraction or proper noun analysis as
- /// part of the analysis workflow and saving off those tokens for use in another field.
- ///
- /// <code>
- /// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
- /// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
- /// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
- /// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
- /// source2.addSinkTokenStream(sink1);
- /// source2.addSinkTokenStream(sink2);
- /// TokenStream final1 = new LowerCaseFilter(source1);
- /// TokenStream final2 = source2;
- /// TokenStream final3 = new EntityDetect(sink1);
- /// TokenStream final4 = new URLDetect(sink2);
- /// d.add(new Field("f1", final1));
- /// d.add(new Field("f2", final2));
- /// d.add(new Field("f3", final3));
- /// d.add(new Field("f4", final4));
- /// </code>
- /// In this example, <c>sink1</c> and <c>sink2</c> will both get tokens from both
- /// <c>reader1</c> and <c>reader2</c> after whitespace tokenizer
- /// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
- /// It is important, that tees are consumed before sinks (in the above example, the field names must be
- /// less the sink's field names). If you are not sure, which stream is consumed first, you can simply
- /// add another sink and then pass all tokens to the sinks at once using <see cref="ConsumeAllTokens" />.
- /// This TokenFilter is exhausted after this. In the above example, change
- /// the example above to:
- /// <code>
- /// ...
- /// TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
- /// TokenStream final2 = source2.newSinkTokenStream();
- /// sink1.consumeAllTokens();
- /// sink2.consumeAllTokens();
- /// ...
- /// </code>
- /// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
- /// <p/>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
- /// </summary>
- public sealed class TeeSinkTokenFilter:TokenFilter
- {
- public class AnonymousClassSinkFilter:SinkFilter
- {
- public override bool Accept(AttributeSource source)
- {
- return true;
- }
- }
- private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>();
-
- /// <summary> Instantiates a new TeeSinkTokenFilter.</summary>
- public TeeSinkTokenFilter(TokenStream input):base(input)
- {
- }
-
- /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream.</summary>
- public SinkTokenStream NewSinkTokenStream()
- {
- return NewSinkTokenStream(ACCEPT_ALL_FILTER);
- }
-
- /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream
- /// that pass the supplied filter.
- /// </summary>
- /// <seealso cref="SinkFilter">
- /// </seealso>
- public SinkTokenStream NewSinkTokenStream(SinkFilter filter)
- {
- var sink = new SinkTokenStream(this.CloneAttributes(), filter);
- sinks.AddLast(new WeakReference(sink));
- return sink;
- }
-
- /// <summary> Adds a <see cref="SinkTokenStream" /> created by another <c>TeeSinkTokenFilter</c>
- /// to this one. The supplied stream will also receive all consumed tokens.
- /// This method can be used to pass tokens from two different tees to one sink.
- /// </summary>
- public void AddSinkTokenStream(SinkTokenStream sink)
- {
- // check that sink has correct factory
- if (!this.Factory.Equals(sink.Factory))
- {
- throw new System.ArgumentException("The supplied sink is not compatible to this tee");
- }
- // add eventually missing attribute impls to the existing sink
+
+ /// <summary> This TokenFilter provides the ability to set aside attribute states
+ /// that have already been analyzed. This is useful in situations where multiple fields share
+ /// many common analysis steps and then go their separate ways.
+ /// <p/>
+ /// It is also useful for doing things like entity extraction or proper noun analysis as
+ /// part of the analysis workflow and saving off those tokens for use in another field.
+ ///
+ /// <code>
+ /// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
+ /// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+ /// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+ /// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
+ /// source2.addSinkTokenStream(sink1);
+ /// source2.addSinkTokenStream(sink2);
+ /// TokenStream final1 = new LowerCaseFilter(source1);
+ /// TokenStream final2 = source2;
+ /// TokenStream final3 = new EntityDetect(sink1);
+ /// TokenStream final4 = new URLDetect(sink2);
+ /// d.add(new Field("f1", final1));
+ /// d.add(new Field("f2", final2));
+ /// d.add(new Field("f3", final3));
+ /// d.add(new Field("f4", final4));
+ /// </code>
+ /// In this example, <c>sink1</c> and <c>sink2</c> will both get tokens from both
+ /// <c>reader1</c> and <c>reader2</c> after whitespace tokenizer
+ /// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ /// It is important that tees are consumed before sinks (in the above example, the tees' field names must be
+ /// less than the sinks' field names). If you are not sure which stream is consumed first, you can simply
+ /// add another sink and then pass all tokens to the sinks at once using <see cref="ConsumeAllTokens" />.
+ /// This TokenFilter is exhausted after this. In that case, change
+ /// the example above to:
+ /// <code>
+ /// ...
+ /// TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
+ /// TokenStream final2 = source2.newSinkTokenStream();
+ /// source1.consumeAllTokens();
+ /// source2.consumeAllTokens();
+ /// ...
+ /// </code>
+ /// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
+ /// <p/>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
+ /// </summary>
+ public sealed class TeeSinkTokenFilter:TokenFilter
+ {
+ public class AnonymousClassSinkFilter:SinkFilter
+ {
+ public override bool Accept(AttributeSource source)
+ {
+ return true;
+ }
+ }
+ private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>();
+
+ /// <summary> Instantiates a new TeeSinkTokenFilter.</summary>
+ public TeeSinkTokenFilter(TokenStream input):base(input)
+ {
+ }
+
+ /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream.</summary>
+ public SinkTokenStream NewSinkTokenStream()
+ {
+ return NewSinkTokenStream(ACCEPT_ALL_FILTER);
+ }
+
+ /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream
+ /// that pass the supplied filter.
+ /// </summary>
+ /// <seealso cref="SinkFilter">
+ /// </seealso>
+ public SinkTokenStream NewSinkTokenStream(SinkFilter filter)
+ {
+ var sink = new SinkTokenStream(this.CloneAttributes(), filter);
+ sinks.AddLast(new WeakReference(sink));
+ return sink;
+ }
+
+ /// <summary> Adds a <see cref="SinkTokenStream" /> created by another <c>TeeSinkTokenFilter</c>
+ /// to this one. The supplied stream will also receive all consumed tokens.
+ /// This method can be used to pass tokens from two different tees to one sink.
+ /// </summary>
+ public void AddSinkTokenStream(SinkTokenStream sink)
+ {
+ // check that sink has correct factory
+ if (!this.Factory.Equals(sink.Factory))
+ {
+ throw new System.ArgumentException("The supplied sink is not compatible to this tee");
+ }
+ // add any attribute impls that the existing sink may be missing
foreach (var impl in this.CloneAttributes().GetAttributeImplsIterator())
{
sink.AddAttributeImpl(impl);
}
- sinks.AddLast(new WeakReference(sink));
- }
-
- /// <summary> <c>TeeSinkTokenFilter</c> passes all tokens to the added sinks
- /// when itself is consumed. To be sure, that all tokens from the input
- /// stream are passed to the sinks, you can call this methods.
- /// This instance is exhausted after this, but all sinks are instant available.
- /// </summary>
- public void ConsumeAllTokens()
- {
+ sinks.AddLast(new WeakReference(sink));
+ }
+
+ /// <summary> <c>TeeSinkTokenFilter</c> passes all tokens to the added sinks
+ /// when it is itself consumed. To be sure that all tokens from the input
+ /// stream are passed to the sinks, you can call this method.
+ /// This instance is exhausted afterwards, but all sinks are instantly available.
+ /// </summary>
+ public void ConsumeAllTokens()
+ {
while (IncrementToken())
{
}
- }
-
- public override bool IncrementToken()
- {
- if (input.IncrementToken())
- {
- // capture state lazily - maybe no SinkFilter accepts this state
- State state = null;
- foreach(WeakReference wr in sinks)
- {
- var sink = (SinkTokenStream)wr.Target;
- if (sink != null)
- {
- if (sink.Accept(this))
- {
- if (state == null)
- {
- state = this.CaptureState();
- }
- sink.AddState(state);
- }
- }
- }
- return true;
- }
-
- return false;
- }
-
- public override void End()
- {
- base.End();
- State finalState = CaptureState();
- foreach(WeakReference wr in sinks)
- {
+ }
+
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ // capture state lazily - maybe no SinkFilter accepts this state
+ State state = null;
+ foreach(WeakReference wr in sinks)
+ {
+ var sink = (SinkTokenStream)wr.Target;
+ if (sink != null)
+ {
+ if (sink.Accept(this))
+ {
+ if (state == null)
+ {
+ state = this.CaptureState();
+ }
+ sink.AddState(state);
+ }
+ }
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ public override void End()
+ {
+ base.End();
+ State finalState = CaptureState();
+ foreach(WeakReference wr in sinks)
+ {
var sink = (SinkTokenStream)wr.Target;
- if (sink != null)
- {
- sink.SetFinalState(finalState);
- }
- }
- }
-
- /// <summary> A filter that decides which <see cref="AttributeSource" /> states to store in the sink.</summary>
- public abstract class SinkFilter
- {
- /// <summary> Returns true, iff the current state of the passed-in <see cref="AttributeSource" /> shall be stored
- /// in the sink.
- /// </summary>
- public abstract bool Accept(AttributeSource source);
-
- /// <summary> Called by <see cref="SinkTokenStream.Reset()" />. This method does nothing by default
- /// and can optionally be overridden.
- /// </summary>
- public virtual void Reset()
- {
- // nothing to do; can be overridden
- }
- }
-
- public sealed class SinkTokenStream : TokenStream
- {
+ if (sink != null)
+ {
+ sink.SetFinalState(finalState);
+ }
+ }
+ }
+
+ /// <summary> A filter that decides which <see cref="AttributeSource" /> states to store in the sink.</summary>
+ public abstract class SinkFilter
+ {
+ /// <summary> Returns true, iff the current state of the passed-in <see cref="AttributeSource" /> shall be stored
+ /// in the sink.
+ /// </summary>
+ public abstract bool Accept(AttributeSource source);
+
+ /// <summary> Called by <see cref="SinkTokenStream.Reset()" />. This method does nothing by default
+ /// and can optionally be overridden.
+ /// </summary>
+ public virtual void Reset()
+ {
+ // nothing to do; can be overridden
+ }
+ }
+
+ public sealed class SinkTokenStream : TokenStream
+ {
private readonly LinkedList<State> cachedStates = new LinkedList<State>();
- private State finalState;
- private IEnumerator<AttributeSource.State> it = null;
- private readonly SinkFilter filter;
+ private State finalState;
+ private IEnumerator<AttributeSource.State> it = null;
+ private readonly SinkFilter filter;
- internal SinkTokenStream(AttributeSource source, SinkFilter filter)
+ internal SinkTokenStream(AttributeSource source, SinkFilter filter)
: base(source)
- {
- this.filter = filter;
- }
-
- internal /*private*/ bool Accept(AttributeSource source)
- {
- return filter.Accept(source);
- }
-
- internal /*private*/ void AddState(AttributeSource.State state)
- {
- if (it != null)
- {
- throw new System.SystemException("The tee must be consumed before sinks are consumed.");
- }
- cachedStates.AddLast(state);
- }
-
- internal /*private*/ void SetFinalState(AttributeSource.State finalState)
- {
- this.finalState = finalState;
- }
-
- public override bool IncrementToken()
- {
- // lazy init the iterator
- if (it == null)
- {
- it = cachedStates.GetEnumerator();
- }
-
- if (!it.MoveNext())
- {
- return false;
- }
-
- State state = it.Current;
- RestoreState(state);
- return true;
- }
-
- public override void End()
- {
- if (finalState != null)
- {
- RestoreState(finalState);
- }
- }
-
- public override void Reset()
- {
- it = cachedStates.GetEnumerator();
- }
+ {
+ this.filter = filter;
+ }
+
+ internal /*private*/ bool Accept(AttributeSource source)
+ {
+ return filter.Accept(source);
+ }
+
+ internal /*private*/ void AddState(AttributeSource.State state)
+ {
+ if (it != null)
+ {
+ throw new System.SystemException("The tee must be consumed before sinks are consumed.");
+ }
+ cachedStates.AddLast(state);
+ }
+
+ internal /*private*/ void SetFinalState(AttributeSource.State finalState)
+ {
+ this.finalState = finalState;
+ }
+
+ public override bool IncrementToken()
+ {
+ // lazy init the iterator
+ if (it == null)
+ {
+ it = cachedStates.GetEnumerator();
+ }
+
+ if (!it.MoveNext())
+ {
+ return false;
+ }
+
+ State state = it.Current;
+ RestoreState(state);
+ return true;
+ }
+
+ public override void End()
+ {
+ if (finalState != null)
+ {
+ RestoreState(finalState);
+ }
+ }
+
+ public override void Reset()
+ {
+ it = cachedStates.GetEnumerator();
+ }
- protected override void Dispose(bool disposing)
- {
- // Do nothing.
- }
- }
-
- private static readonly SinkFilter ACCEPT_ALL_FILTER;
- static TeeSinkTokenFilter()
- {
- ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter();
- }
- }
+ protected override void Dispose(bool disposing)
+ {
+ // Do nothing.
+ }
+ }
+
+ private static readonly SinkFilter ACCEPT_ALL_FILTER;
+ static TeeSinkTokenFilter()
+ {
+ ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter();
+ }
+ }
}
\ No newline at end of file