You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:13 UTC
[09/34] lucenenet git commit: Raw porting of
Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleAnalyzerWrapper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleAnalyzerWrapper.cs b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleAnalyzerWrapper.cs
new file mode 100644
index 0000000..06c5e10
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleAnalyzerWrapper.cs
@@ -0,0 +1,182 @@
+namespace org.apache.lucene.analysis.shingle
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using StandardAnalyzer = org.apache.lucene.analysis.standard.StandardAnalyzer;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// A ShingleAnalyzerWrapper wraps a <seealso cref="ShingleFilter"/> around another <seealso cref="Analyzer"/>.
+ /// <para>
+ /// A shingle is another name for a token based n-gram.
+ /// </para>
+ /// </summary>
+ public sealed class ShingleAnalyzerWrapper : AnalyzerWrapper
+ {
+     // Analyzer whose token stream gets wrapped by the shingle filter.
+     private readonly Analyzer @delegate;
+
+     // Shingle configuration handed to every ShingleFilter created by wrapComponents.
+     private readonly int maxShingleSize;
+     private readonly int minShingleSize;
+     private readonly string tokenSeparator;
+     private readonly bool outputUnigrams;
+     private readonly bool outputUnigramsIfNoShingles;
+     private readonly string fillerToken;
+
+     /// <summary>
+     /// Wraps <paramref name="defaultAnalyzer"/> using the default maximum shingle size.
+     /// </summary>
+     public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer)
+         : this(defaultAnalyzer, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE)
+     {
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="defaultAnalyzer"/> using the default minimum shingle size
+     /// and the given maximum shingle size.
+     /// </summary>
+     public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize)
+         : this(defaultAnalyzer, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize)
+     {
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="defaultAnalyzer"/> with the given shingle size range, the
+     /// default token separator and filler token, unigram output enabled and
+     /// "unigrams if no shingles" disabled.
+     /// </summary>
+     public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize)
+         : this(defaultAnalyzer, minShingleSize, maxShingleSize, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, ShingleFilter.DEFAULT_FILLER_TOKEN)
+     {
+     }
+
+     /// <summary>
+     /// Creates a new ShingleAnalyzerWrapper
+     /// </summary>
+     /// <param name="delegate"> Analyzer whose TokenStream is to be filtered </param>
+     /// <param name="minShingleSize"> Min shingle (token ngram) size; must be &gt;= 2 and &lt;= maxShingleSize </param>
+     /// <param name="maxShingleSize"> Max shingle size; must be &gt;= 2 </param>
+     /// <param name="tokenSeparator"> Used to separate input stream tokens in output shingles;
+     ///        null is stored as the empty string </param>
+     /// <param name="outputUnigrams"> Whether or not the filter shall pass the original
+     ///        tokens to the output stream </param>
+     /// <param name="outputUnigramsIfNoShingles"> Overrides the behavior of outputUnigrams==false for those
+     ///        times when no shingles are available (because there are fewer than
+     ///        minShingleSize tokens in the input stream).
+     ///        Note that if outputUnigrams==true, then unigrams are always output,
+     ///        regardless of whether any shingles are available. </param>
+     /// <param name="fillerToken"> filler token to use when positionIncrement is more than 1 </param>
+     /// <exception cref="System.ArgumentException"> if a size constraint is violated </exception>
+     public ShingleAnalyzerWrapper(Analyzer @delegate, int minShingleSize, int maxShingleSize, string tokenSeparator, bool outputUnigrams, bool outputUnigramsIfNoShingles, string fillerToken)
+         : base(@delegate.ReuseStrategy)
+     {
+         // Checks run in a fixed order: max bound, min bound, then min against max.
+         if (maxShingleSize < 2)
+         {
+             throw new System.ArgumentException("Max shingle size must be >= 2");
+         }
+         if (minShingleSize < 2)
+         {
+             throw new System.ArgumentException("Min shingle size must be >= 2");
+         }
+         if (minShingleSize > maxShingleSize)
+         {
+             throw new System.ArgumentException("Min shingle size must be <= max shingle size");
+         }
+
+         this.@delegate = @delegate;
+         this.maxShingleSize = maxShingleSize;
+         this.minShingleSize = minShingleSize;
+         this.tokenSeparator = tokenSeparator ?? "";
+         this.outputUnigrams = outputUnigrams;
+         this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+         this.fillerToken = fillerToken;
+     }
+
+     /// <summary>
+     /// Wraps a new <seealso cref="StandardAnalyzer"/> using the default shingle sizes.
+     /// </summary>
+     public ShingleAnalyzerWrapper(Version matchVersion)
+         : this(matchVersion, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE)
+     {
+     }
+
+     /// <summary>
+     /// Wraps a new <seealso cref="StandardAnalyzer"/> using the given shingle sizes.
+     /// </summary>
+     public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize)
+         : this(new StandardAnalyzer(matchVersion), minShingleSize, maxShingleSize)
+     {
+     }
+
+     /// <summary>
+     /// The max shingle (token ngram) size
+     /// </summary>
+     public int MaxShingleSize
+     {
+         get { return maxShingleSize; }
+     }
+
+     /// <summary>
+     /// The min shingle (token ngram) size
+     /// </summary>
+     public int MinShingleSize
+     {
+         get { return minShingleSize; }
+     }
+
+     /// <summary>
+     /// The string placed between input tokens in an output shingle (never null).
+     /// </summary>
+     public string TokenSeparator
+     {
+         get { return tokenSeparator; }
+     }
+
+     /// <summary>
+     /// Whether the original tokens (unigrams) are passed to the output stream.
+     /// </summary>
+     public bool OutputUnigrams
+     {
+         get { return outputUnigrams; }
+     }
+
+     /// <summary>
+     /// Whether unigrams are output when no shingles are available.
+     /// </summary>
+     public bool OutputUnigramsIfNoShingles
+     {
+         get { return outputUnigramsIfNoShingles; }
+     }
+
+     /// <summary>
+     /// The filler token used when positionIncrement is more than 1.
+     /// </summary>
+     public string FillerToken
+     {
+         get { return fillerToken; }
+     }
+
+     /// <summary>
+     /// Returns the wrapped analyzer; the same delegate is used for every field.
+     /// </summary>
+     public override Analyzer getWrappedAnalyzer(string fieldName)
+     {
+         return @delegate;
+     }
+
+     /// <summary>
+     /// Appends a <seealso cref="ShingleFilter"/>, configured from this wrapper's settings,
+     /// to the token stream of <paramref name="components"/>.
+     /// </summary>
+     protected internal override TokenStreamComponents wrapComponents(string fieldName, TokenStreamComponents components)
+     {
+         // The initializer assigns the properties in the order listed.
+         ShingleFilter shingles = new ShingleFilter(components.TokenStream, minShingleSize, maxShingleSize)
+         {
+             MinShingleSize = minShingleSize,
+             MaxShingleSize = maxShingleSize,
+             TokenSeparator = tokenSeparator,
+             OutputUnigrams = outputUnigrams,
+             OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles,
+             FillerToken = fillerToken
+         };
+         return new TokenStreamComponents(components.Tokenizer, shingles);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
new file mode 100644
index 0000000..9bdc341
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
@@ -0,0 +1,724 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace org.apache.lucene.analysis.shingle
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+ using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+
+ /// <summary>
+ /// <para>A ShingleFilter constructs shingles (token n-grams) from a token stream.
+ /// In other words, it creates combinations of tokens as a single token.
+ ///
+ /// </para>
+ /// <para>For example, the sentence "please divide this sentence into shingles"
+ /// might be tokenized into shingles "please divide", "divide this",
+ /// "this sentence", "sentence into", and "into shingles".
+ ///
+ /// </para>
+ /// <para>This filter handles position increments > 1 by inserting filler tokens
+ /// (tokens with termtext "_"). It does not handle a position increment of 0.
+ /// </para>
+ /// </summary>
+ public sealed class ShingleFilter : TokenFilter
+ {
+
+ /// <summary>
+ /// filler token for when positionIncrement is more than 1
+ /// </summary>
+ public const string DEFAULT_FILLER_TOKEN = "_";
+
+ /// <summary>
+ /// default maximum shingle size is 2.
+ /// </summary>
+ public const int DEFAULT_MAX_SHINGLE_SIZE = 2;
+
+ /// <summary>
+ /// default minimum shingle size is 2.
+ /// </summary>
+ public const int DEFAULT_MIN_SHINGLE_SIZE = 2;
+
+ /// <summary>
+ /// default token type attribute value is "shingle"
+ /// </summary>
+ public const string DEFAULT_TOKEN_TYPE = "shingle";
+
+ /// <summary>
+ /// The default string to use when joining adjacent tokens to form a shingle
+ /// </summary>
+ public const string DEFAULT_TOKEN_SEPARATOR = " ";
+
+ /// <summary>
+ /// The sequence of input stream tokens (or filler tokens, if necessary)
+ /// that will be composed to form output shingles.
+ /// </summary>
+ private LinkedList<InputWindowToken> inputWindow = new LinkedList<InputWindowToken>();
+
+ /// <summary>
+ /// The number of input tokens in the next output token. This is the "n" in
+ /// "token n-grams".
+ /// </summary>
+ private CircularSequence gramSize;
+
+ /// <summary>
+ /// Shingle and unigram text is composed here.
+ /// </summary>
+ private StringBuilder gramBuilder = new StringBuilder();
+
+ /// <summary>
+ /// The token type attribute value to use - default is "shingle"
+ /// </summary>
+ private string tokenType = DEFAULT_TOKEN_TYPE;
+
+ /// <summary>
+ /// The string to use when joining adjacent tokens to form a shingle
+ /// </summary>
+ private string tokenSeparator = DEFAULT_TOKEN_SEPARATOR;
+
+ /// <summary>
+ /// The string to insert for each position at which there is no token
+ /// (i.e., when position increment is greater than one).
+ /// </summary>
+ private char[] fillerToken = DEFAULT_FILLER_TOKEN.ToCharArray();
+
+ /// <summary>
+ /// By default, we output unigrams (individual tokens) as well as shingles
+ /// (token n-grams).
+ /// </summary>
+ private bool outputUnigrams = true;
+
+ /// <summary>
+ /// By default, we don't override behavior of outputUnigrams.
+ /// </summary>
+ private bool outputUnigramsIfNoShingles = false;
+
+ /// <summary>
+ /// maximum shingle size (number of tokens)
+ /// </summary>
+ private int maxShingleSize;
+
+ /// <summary>
+ /// minimum shingle size (number of tokens)
+ /// </summary>
+ private int minShingleSize;
+
+ /// <summary>
+ /// The remaining number of filler tokens to be inserted into the input stream
+ /// from which shingles are composed, to handle position increments greater
+ /// than one.
+ /// </summary>
+ private int numFillerTokensToInsert;
+
+ /// <summary>
+ /// When the next input stream token has a position increment greater than
+ /// one, it is stored in this field until sufficient filler tokens have been
+ /// inserted to account for the position increment.
+ /// </summary>
+ private AttributeSource nextInputStreamToken;
+
+ /// <summary>
+ /// Whether or not there is a next input stream token.
+ /// </summary>
+ private bool isNextInputStreamToken = false;
+
+ /// <summary>
+ /// Whether at least one unigram or shingle has been output at the current
+ /// position.
+ /// </summary>
+ private bool isOutputHere = false;
+
+ /// <summary>
+ /// true if no shingles have been output yet (for outputUnigramsIfNoShingles).
+ /// </summary>
+ internal bool noShingleOutput = true;
+
+ /// <summary>
+ /// Holds the State after input.end() was called, so we can
+ /// restore it in our end() impl.
+ /// </summary>
+ private State endState;
+
+ // Attribute references are assigned in the primary constructor below. A C#
+ // field initializer may not call an instance method such as addAttribute
+ // (compiler error CS0236), so the calls live in the constructor body, which
+ // runs after the base constructor.
+ private readonly CharTermAttribute termAtt;
+ private readonly OffsetAttribute offsetAtt;
+ private readonly PositionIncrementAttribute posIncrAtt;
+ private readonly PositionLengthAttribute posLenAtt;
+ private readonly TypeAttribute typeAtt;
+
+
+ /// <summary>
+ /// Constructs a ShingleFilter with the specified shingle size from the
+ /// <seealso cref="TokenStream"/> <code>input</code>
+ /// </summary>
+ /// <param name="input"> input stream </param>
+ /// <param name="minShingleSize"> minimum shingle size produced by the filter. </param>
+ /// <param name="maxShingleSize"> maximum shingle size produced by the filter. </param>
+ /// <exception cref="System.ArgumentException"> if either size is below 2, or
+ ///         minShingleSize exceeds maxShingleSize </exception>
+ public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) : base(input)
+ {
+     termAtt = addAttribute(typeof(CharTermAttribute));
+     offsetAtt = addAttribute(typeof(OffsetAttribute));
+     posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+     posLenAtt = addAttribute(typeof(PositionLengthAttribute));
+     typeAtt = addAttribute(typeof(TypeAttribute));
+
+     // Max must be set first: the MinShingleSize setter validates against it.
+     MaxShingleSize = maxShingleSize;
+     MinShingleSize = minShingleSize;
+ }
+
+ /// <summary>
+ /// Constructs a ShingleFilter with the default minimum shingle size and the
+ /// specified maximum shingle size from the <seealso cref="TokenStream"/> <code>input</code>
+ /// </summary>
+ /// <param name="input"> input stream </param>
+ /// <param name="maxShingleSize"> maximum shingle size produced by the filter. </param>
+ public ShingleFilter(TokenStream input, int maxShingleSize) : this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize)
+ {
+ }
+
+ /// <summary>
+ /// Construct a ShingleFilter with default shingle size: 2.
+ /// </summary>
+ /// <param name="input"> input stream </param>
+ public ShingleFilter(TokenStream input) : this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE)
+ {
+ }
+
+ /// <summary>
+ /// Construct a ShingleFilter with the specified token type for shingle tokens
+ /// and the default shingle size: 2
+ /// </summary>
+ /// <param name="input"> input stream </param>
+ /// <param name="tokenType"> token type for shingle tokens </param>
+ public ShingleFilter(TokenStream input, string tokenType) : this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE)
+ {
+     TokenType = tokenType;
+ }
+
+ /// <summary>
+ /// Sets the type assigned to shingle tokens produced by this filter
+ /// (default: "shingle").
+ /// </summary>
+ public string TokenType
+ {
+     set { tokenType = value; }
+ }
+
+ /// <summary>
+ /// Sets whether the output stream contains the input tokens (unigrams) as
+ /// well as shingles (default: true). Assigning this property also rebuilds
+ /// the gram size sequence.
+ /// </summary>
+ public bool OutputUnigrams
+ {
+     set
+     {
+         outputUnigrams = value;
+         gramSize = new CircularSequence(this);
+     }
+ }
+
+ /// <summary>
+ /// <para>Sets whether the behavior of outputUnigrams==false is overridden for
+ /// those times when no shingles are available (because there are fewer than
+ /// minShingleSize tokens in the input stream). (default: false.)
+ /// </para>
+ /// <para>Note that if outputUnigrams==true, then unigrams are always output,
+ /// regardless of whether any shingles are available.
+ /// </para>
+ /// </summary>
+ public bool OutputUnigramsIfNoShingles
+ {
+     set { outputUnigramsIfNoShingles = value; }
+ }
+
+ /// <summary>
+ /// Sets the max shingle size (default: 2).
+ /// </summary>
+ /// <exception cref="System.ArgumentException"> if the value is below 2 </exception>
+ public int MaxShingleSize
+ {
+     set
+     {
+         if (value < 2)
+         {
+             throw new System.ArgumentException("Max shingle size must be >= 2");
+         }
+         maxShingleSize = value;
+     }
+ }
+
+ /// <summary>
+ /// <para>Sets the min shingle size (default: 2) and rebuilds the gram size
+ /// sequence.
+ /// </para>
+ /// <para>The value must not be greater than maxShingleSize, so make sure that
+ /// maxShingleSize is set before assigning this property.
+ /// </para>
+ /// <para>The unigram output option is independent of the min shingle size.
+ /// </para>
+ /// </summary>
+ /// <exception cref="System.ArgumentException"> if the value is below 2 or above
+ ///         the current max shingle size </exception>
+ public int MinShingleSize
+ {
+     set
+     {
+         if (value < 2)
+         {
+             throw new System.ArgumentException("Min shingle size must be >= 2");
+         }
+         if (value > maxShingleSize)
+         {
+             throw new System.ArgumentException("Min shingle size must be <= max shingle size");
+         }
+         minShingleSize = value;
+         gramSize = new CircularSequence(this);
+     }
+ }
+
+ /// <summary>
+ /// Sets the string to use when joining adjacent tokens to form a shingle;
+ /// null is stored as the empty string.
+ /// </summary>
+ public string TokenSeparator
+ {
+     set { tokenSeparator = value ?? ""; }
+ }
+
+ /// <summary>
+ /// Sets the string to insert for each position at which there is no token
+ /// (i.e., when position increment is greater than one); null is stored as
+ /// an empty character array.
+ /// </summary>
+ public string FillerToken
+ {
+     set
+     {
+         if (value == null)
+         {
+             fillerToken = new char[0];
+         }
+         else
+         {
+             fillerToken = value.ToCharArray();
+         }
+     }
+ }
+
+
+ /// <summary>
+ /// Produces the next output token (a unigram or a shingle) from the input
+ /// window, refilling or shifting the window when the gram size sequence is at
+ /// its minimum value or the window holds fewer tokens than the current gram size.
+ /// </summary>
+ /// <returns> true if an output token was written to this filter's attributes;
+ ///          false otherwise </returns>
+ public override bool incrementToken()
+ {
+     bool tokenAvailable = false;
+     int builtGramSize = 0;
+     if (gramSize.atMinValue() || inputWindow.Count < gramSize.Value)
+     {
+         shiftInputWindow();
+         gramBuilder.Length = 0;
+     }
+     else
+     {
+         // Continue extending the text already built for the previous gram size.
+         builtGramSize = gramSize.PreviousValue;
+     }
+     if (inputWindow.Count >= gramSize.Value)
+     {
+         bool isAllFiller = true;
+         InputWindowToken nextToken = null;
+         using (IEnumerator<InputWindowToken> iter = inputWindow.GetEnumerator())
+         {
+             // A .NET enumerator has no hasNext()/next(): MoveNext() both advances and
+             // reports availability, and Current is only valid after it returns true.
+             // MoveNext() is evaluated last so the enumerator only advances when the
+             // loop body is going to consume the element.
+             for (int gramNum = 1; builtGramSize < gramSize.Value && iter.MoveNext(); ++gramNum)
+             {
+                 nextToken = iter.Current;
+                 if (builtGramSize < gramNum)
+                 {
+                     if (builtGramSize > 0)
+                     {
+                         gramBuilder.Append(tokenSeparator);
+                     }
+                     gramBuilder.Append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
+                     ++builtGramSize;
+                 }
+                 if (isAllFiller && nextToken.isFiller)
+                 {
+                     // Every token so far is a filler: skip this gram size.
+                     if (gramNum == gramSize.Value)
+                     {
+                         gramSize.advance();
+                     }
+                 }
+                 else
+                 {
+                     isAllFiller = false;
+                 }
+             }
+         }
+         if (!isAllFiller && builtGramSize == gramSize.Value)
+         {
+             inputWindow.First.Value.attSource.copyTo(this);
+             // Only the first token output at a position advances the position.
+             posIncrAtt.PositionIncrement = isOutputHere ? 0 : 1;
+             termAtt.setEmpty().append(gramBuilder);
+             if (gramSize.Value > 1)
+             {
+                 typeAtt.Type = tokenType;
+                 noShingleOutput = false;
+             }
+             offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+             posLenAtt.PositionLength = builtGramSize;
+             isOutputHere = true;
+             gramSize.advance();
+             tokenAvailable = true;
+         }
+     }
+     return tokenAvailable;
+ }
+
+ private bool exhausted;
+
+ /// <summary>
+ /// <para>Gets the next token for the input window: a pending filler token, the
+ /// saved input stream token, or a fresh token read from <c>input</c>.
+ /// </para>
+ /// <para>If the next input token has <c>positionIncrement &gt; 1</c>, filler tokens
+ /// (at most <c>maxShingleSize - 1</c> of them) are returned first, built from
+ /// <c>fillerToken</c> with a zero-length offset.
+ /// </para>
+ /// </summary>
+ /// <param name="target"> Where to put the new token; if null, a new instance is created. </param>
+ /// <returns> On success, the populated token; null once no further token is available </returns>
+ /// <remarks> When the input stream is exhausted, <c>input.end()</c> is called and the
+ /// resulting state is captured so that <c>end()</c> can restore it. </remarks>
+ private InputWindowToken getNextToken(InputWindowToken target)
+ {
+ InputWindowToken newTarget = target;
+ if (numFillerTokensToInsert > 0) // filler tokens are still owed for a position gap
+ {
+ if (null == target)
+ {
+ newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
+ }
+ else
+ {
+ nextInputStreamToken.copyTo(target.attSource);
+ }
+ // A filler token occupies no space: start and end offset are both the start offset
+ newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
+ newTarget.isFiller = true;
+ --numFillerTokensToInsert;
+ }
+ else if (isNextInputStreamToken) // fillers are done; hand out the token that was held back
+ {
+ if (null == target)
+ {
+ newTarget = new InputWindowToken(this, nextInputStreamToken.cloneAttributes());
+ }
+ else
+ {
+ nextInputStreamToken.copyTo(target.attSource);
+ }
+ isNextInputStreamToken = false;
+ newTarget.isFiller = false;
+ }
+ else if (!exhausted) // nothing pending: read a fresh token from the input stream
+ {
+ if (input.incrementToken())
+ {
+ if (null == target)
+ {
+ newTarget = new InputWindowToken(this, cloneAttributes());
+ }
+ else
+ {
+ this.copyTo(target.attSource);
+ }
+ if (posIncrAtt.PositionIncrement > 1)
+ {
+ // Each output shingle must contain at least one input token,
+ // so no more than (maxShingleSize - 1) filler tokens will be inserted.
+ numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement - 1, maxShingleSize - 1);
+ // Save the current token as the next input stream token; it is returned after the fillers
+ if (null == nextInputStreamToken)
+ {
+ nextInputStreamToken = cloneAttributes();
+ }
+ else
+ {
+ this.copyTo(nextInputStreamToken);
+ }
+ isNextInputStreamToken = true;
+ // A filler token occupies no space; this call returns the first filler itself
+ newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+ newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.Length);
+ newTarget.isFiller = true;
+ --numFillerTokensToInsert;
+ }
+ else
+ {
+ newTarget.isFiller = false;
+ }
+ }
+ else
+ {
+ exhausted = true;
+ input.end();
+ endState = captureState(); // restored later by end()
+ numFillerTokensToInsert = Math.Min(posIncrAtt.PositionIncrement, maxShingleSize - 1); // NOTE(review): presumably the increment left by input.end() - confirm
+ if (numFillerTokensToInsert > 0)
+ {
+ nextInputStreamToken = new AttributeSource(AttributeFactory);
+ nextInputStreamToken.addAttribute(typeof(CharTermAttribute));
+ OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(typeof(OffsetAttribute));
+ newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
+ // Recurse just once: numFillerTokensToInsert > 0, so the call takes the filler branch above
+ return getNextToken(target);
+ }
+ else
+ {
+ newTarget = null;
+ }
+ }
+ }
+ else
+ {
+ newTarget = null;
+ }
+ return newTarget;
+ }
+
+ /// <summary>
+ /// Ends the stream. If the input was already exhausted while filling the
+ /// window, the state captured at that point is restored; otherwise the call
+ /// is forwarded to the base implementation.
+ /// </summary>
+ public override void end()
+ {
+     if (exhausted)
+     {
+         restoreState(endState);
+         return;
+     }
+     base.end();
+ }
+
+ /// <summary>
+ /// <para>Fills the input window with input stream tokens, if available,
+ /// shifting to the right if the window was previously full.
+ /// </para>
+ /// <para>Resets the gram size to its minimum value, after lowering that minimum
+ /// to 1 when unigrams must be output because no shingles can be formed.
+ /// </para>
+ /// </summary>
+ private void shiftInputWindow()
+ {
+     InputWindowToken firstToken = null;
+     if (inputWindow.Count > 0)
+     {
+         // LinkedList<T>.RemoveFirst() returns void (unlike Java's removeFirst()),
+         // so read the head value before unlinking it.
+         firstToken = inputWindow.First.Value;
+         inputWindow.RemoveFirst();
+     }
+     while (inputWindow.Count < maxShingleSize)
+     {
+         if (null != firstToken) // recycle the firstToken, if available
+         {
+             if (null != getNextToken(firstToken))
+             {
+                 inputWindow.AddLast(firstToken); // the firstToken becomes the last
+                 firstToken = null;
+             }
+             else
+             {
+                 break; // end of input stream
+             }
+         }
+         else
+         {
+             InputWindowToken nextToken = getNextToken(null);
+             if (null != nextToken)
+             {
+                 inputWindow.AddLast(nextToken);
+             }
+             else
+             {
+                 break; // end of input stream
+             }
+         }
+     }
+     // Too few tokens for any shingle and none output so far: fall back to unigrams.
+     if (outputUnigramsIfNoShingles && noShingleOutput && gramSize.minValue > 1 && inputWindow.Count < minShingleSize)
+     {
+         gramSize.minValue = 1;
+     }
+     gramSize.reset();
+     isOutputHere = false;
+ }
+
+ /// <summary>
+ /// Resets the base stream and clears all per-stream state of this filter:
+ /// the gram size sequence, the input window, pending filler/next-token
+ /// bookkeeping, output flags and the captured end state.
+ /// </summary>
+ public override void reset()
+ {
+     base.reset();
+     gramSize.reset();
+     inputWindow.Clear();
+
+     // Pending-token bookkeeping.
+     numFillerTokensToInsert = 0;
+     isNextInputStreamToken = false;
+     nextInputStreamToken = null;
+
+     // Output and end-of-stream flags.
+     noShingleOutput = true;
+     isOutputHere = false;
+     endState = null;
+     exhausted = false;
+
+     if (!outputUnigrams && outputUnigramsIfNoShingles)
+     {
+         // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
+         gramSize.minValue = minShingleSize;
+     }
+ }
+
+
+ /// <summary>
+ /// <para>An instance of this class is used to maintain the number of input
+ /// stream tokens that will be used to compose the next unigram or shingle:
+ /// <seealso cref="#gramSize"/>.
+ /// </para>
+ /// <para><code>gramSize</code> will take on values from the circular sequence
+ /// <b>{ [ 1, ] <seealso cref="#minShingleSize"/> [ , ... , <seealso cref="#maxShingleSize"/> ] }</b>.
+ /// </para>
+ /// <para>1 is included in the circular sequence only if
+ /// <seealso cref="#outputUnigrams"/> = true.
+ /// </para>
+ /// </summary>
+ private class CircularSequence
+ {
+     // Filter whose outputUnigrams/minShingleSize/maxShingleSize drive the sequence.
+     private readonly ShingleFilter outerInstance;
+
+     internal int value;
+     internal int previousValue;
+     internal int minValue;
+
+     /// <summary>
+     /// Creates a sequence that starts at 1 when the owning filter outputs
+     /// unigrams and at the filter's min shingle size otherwise.
+     /// </summary>
+     public CircularSequence(ShingleFilter outerInstance)
+     {
+         this.outerInstance = outerInstance;
+         if (outerInstance.outputUnigrams)
+         {
+             minValue = 1;
+         }
+         else
+         {
+             minValue = outerInstance.minShingleSize;
+         }
+         reset();
+     }
+
+     /// <summary>
+     /// The current value of the sequence.
+     /// </summary>
+     public virtual int Value
+     {
+         get { return value; }
+     }
+
+     /// <summary>
+     /// Moves to the next member of the circular sequence, remembering the
+     /// current value as the previous one: 1 is followed by the filter's min
+     /// shingle size, the max shingle size wraps around via <c>reset()</c>, and
+     /// any other value is incremented by one.
+     /// </summary>
+     public virtual void advance()
+     {
+         previousValue = value;
+         if (value == 1)
+         {
+             value = outerInstance.minShingleSize;
+             return;
+         }
+         if (value == outerInstance.maxShingleSize)
+         {
+             reset();
+             return;
+         }
+         ++value;
+     }
+
+     /// <summary>
+     /// Sets both the current and the previous value to the first member of the
+     /// circular sequence (<c>minValue</c>).
+     /// </summary>
+     public virtual void reset()
+     {
+         value = minValue;
+         previousValue = minValue;
+     }
+
+     /// <summary>
+     /// Tells whether the current value is the first member of the circular
+     /// sequence.
+     /// </summary>
+     /// <returns> true if the current value equals <c>minValue</c>; false otherwise </returns>
+     public virtual bool atMinValue()
+     {
+         return minValue == value;
+     }
+
+     /// <summary>
+     /// The value this instance had before the last advance() call.
+     /// </summary>
+     public virtual int PreviousValue
+     {
+         get { return previousValue; }
+     }
+ }
+
+ /// <summary>
+ /// One entry of the input window: an attribute source together with cached
+ /// references to its term and offset attributes, plus a flag marking filler
+ /// tokens.
+ /// </summary>
+ private class InputWindowToken
+ {
+     private readonly ShingleFilter outerInstance;
+
+     internal readonly AttributeSource attSource;
+     internal readonly CharTermAttribute termAtt;
+     internal readonly OffsetAttribute offsetAtt;
+
+     // True when this entry stands in for a position that has no input token.
+     internal bool isFiller;
+
+     /// <summary>
+     /// Wraps <paramref name="attSource"/> and looks up its term and offset attributes.
+     /// </summary>
+     public InputWindowToken(ShingleFilter outerInstance, AttributeSource attSource)
+     {
+         this.attSource = attSource;
+         this.outerInstance = outerInstance;
+         termAtt = attSource.getAttribute(typeof(CharTermAttribute));
+         offsetAtt = attSource.getAttribute(typeof(OffsetAttribute));
+     }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilterFactory.cs
new file mode 100644
index 0000000..429e9ce
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilterFactory.cs
@@ -0,0 +1,86 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.shingle
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="ShingleFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_shingle" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
+ /// outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" " fillerToken="_"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class ShingleFilterFactory : TokenFilterFactory
+ {
+     private readonly int minShingleSize;
+     private readonly int maxShingleSize;
+     private readonly bool outputUnigrams;
+     private readonly bool outputUnigramsIfNoShingles;
+     private readonly string tokenSeparator;
+     private readonly string fillerToken;
+
+     /// <summary>
+     /// Creates a new ShingleFilterFactory from the given arguments. </summary>
+     /// <param name="args"> factory arguments; recognized keys are maxShingleSize,
+     ///        minShingleSize, outputUnigrams, outputUnigramsIfNoShingles,
+     ///        tokenSeparator and fillerToken. Any entries still present after
+     ///        these have been read are reported as unknown. </param>
+     /// <exception cref="System.ArgumentException"> if a shingle size is invalid or
+     ///         unknown parameters remain </exception>
+     public ShingleFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         maxShingleSize = getInt(args, "maxShingleSize", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+         if (maxShingleSize < 2)
+         {
+             throw new System.ArgumentException("Invalid maxShingleSize (" + maxShingleSize + ") - must be at least 2");
+         }
+         minShingleSize = getInt(args, "minShingleSize", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
+         if (minShingleSize < 2)
+         {
+             throw new System.ArgumentException("Invalid minShingleSize (" + minShingleSize + ") - must be at least 2");
+         }
+         if (minShingleSize > maxShingleSize)
+         {
+             throw new System.ArgumentException("Invalid minShingleSize (" + minShingleSize + ") - must be no greater than maxShingleSize (" + maxShingleSize + ")");
+         }
+         outputUnigrams = getBoolean(args, "outputUnigrams", true);
+         outputUnigramsIfNoShingles = getBoolean(args, "outputUnigramsIfNoShingles", false);
+         tokenSeparator = get(args, "tokenSeparator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
+         fillerToken = get(args, "fillerToken", ShingleFilter.DEFAULT_FILLER_TOKEN);
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would print only its type name in
+             // .NET, so the leftover entries are formatted explicitly.
+             throw new System.ArgumentException("Unknown parameters: " + describeArgs(args));
+         }
+     }
+
+     /// <summary>
+     /// Creates a <seealso cref="ShingleFilter"/> over <paramref name="input"/> configured
+     /// with this factory's settings.
+     /// </summary>
+     // NOTE(review): a return type narrower than the base method's needs covariant
+     // return support (C# 9+) - confirm against TokenFilterFactory.create.
+     public override ShingleFilter create(TokenStream input)
+     {
+         ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
+         r.OutputUnigrams = outputUnigrams;
+         r.OutputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+         r.TokenSeparator = tokenSeparator;
+         r.FillerToken = fillerToken;
+         return r;
+     }
+
+     /// <summary>
+     /// Formats the entries of <paramref name="args"/> as <c>{key=value, key=value}</c>.
+     /// </summary>
+     private static string describeArgs(IDictionary<string, string> args)
+     {
+         System.Text.StringBuilder text = new System.Text.StringBuilder("{");
+         bool first = true;
+         foreach (KeyValuePair<string, string> entry in args)
+         {
+             if (!first)
+             {
+                 text.Append(", ");
+             }
+             text.Append(entry.Key).Append('=').Append(entry.Value);
+             first = false;
+         }
+         return text.Append('}').ToString();
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Sinks/DateRecognizerSinkFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Sinks/DateRecognizerSinkFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/DateRecognizerSinkFilter.cs
new file mode 100644
index 0000000..a04fd51
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/DateRecognizerSinkFilter.cs
@@ -0,0 +1,79 @@
+using System;
+
+namespace org.apache.lucene.analysis.sinks
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+ /// <summary>
+ /// Attempts to parse the <seealso cref="CharTermAttribute#buffer()"/> as a Date using a <seealso cref="java.text.DateFormat"/>.
+ /// If the value is a Date, it will add it to the sink.
+ /// <p/>
+ ///
+ ///
+ /// </summary>
+ public class DateRecognizerSinkFilter : TeeSinkTokenFilter.SinkFilter
+ {
+     /// <summary>Type name for date tokens; not referenced inside this class.</summary>
+     public const string DATE_TYPE = "date";
+
+     // NOTE(review): DateFormat, Locale and ParseException are Java names carried over by the raw port
+     // and are not declared in this file - confirm the intended .NET replacements.
+     protected internal DateFormat dateFormat;
+
+     // Looked up on the first call to accept() and reused afterwards.
+     protected internal CharTermAttribute termAtt;
+
+     /// <summary>
+     /// Uses <c>DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT)</c> as the date format.
+     /// </summary>
+     public DateRecognizerSinkFilter() : this(DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.ROOT))
+     {
+     }
+
+     /// <summary>
+     /// Creates a filter that recognizes dates with the supplied format.
+     /// </summary>
+     /// <param name="dateFormat"> the format used to attempt parsing of each term </param>
+     public DateRecognizerSinkFilter(DateFormat dateFormat)
+     {
+         this.dateFormat = dateFormat;
+     }
+
+     /// <summary>
+     /// Returns true when the current term text of <paramref name="source"/> parses with the configured format.
+     /// </summary>
+     /// <param name="source"> the attribute source whose term is inspected </param>
+     /// <returns> true if parsing succeeded, false if it raised a ParseException </returns>
+     public override bool accept(AttributeSource source)
+     {
+         if (termAtt == null)
+         {
+             termAtt = source.addAttribute(typeof(CharTermAttribute));
+         }
+
+         try
+         {
+             // Only success or failure of the parse matters; the parsed value itself is discarded.
+             // The result was previously stored in a DateTime and compared with null, which is always
+             // true for a value type, so a successful parse is reported directly.
+             dateFormat.parse(termAtt.ToString());
+             return true;
+         }
+         catch (ParseException)
+         {
+             // Deliberately not propagated: an unparsable term simply is not a date.
+             return false;
+         }
+     }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TeeSinkTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TeeSinkTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TeeSinkTokenFilter.cs
new file mode 100644
index 0000000..f6857d9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TeeSinkTokenFilter.cs
@@ -0,0 +1,300 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.sinks
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using AttributeImpl = org.apache.lucene.util.AttributeImpl;
+ using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+ /// <summary>
+ /// This TokenFilter provides the ability to set aside attribute states
+ /// that have already been analyzed. This is useful in situations where multiple fields share
+ /// many common analysis steps and then go their separate ways.
+ /// <p/>
+ /// It is also useful for doing things like entity extraction or proper noun analysis as
+ /// part of the analysis workflow and saving off those tokens for use in another field.
+ ///
+ /// <pre class="prettyprint">
+ /// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(version, reader1));
+ /// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
+ /// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
+ ///
+ /// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(version, reader2));
+ /// source2.addSinkTokenStream(sink1);
+ /// source2.addSinkTokenStream(sink2);
+ ///
+ /// TokenStream final1 = new LowerCaseFilter(version, source1);
+ /// TokenStream final2 = source2;
+ /// TokenStream final3 = new EntityDetect(sink1);
+ /// TokenStream final4 = new URLDetect(sink2);
+ ///
+ /// d.add(new TextField("f1", final1, Field.Store.NO));
+ /// d.add(new TextField("f2", final2, Field.Store.NO));
+ /// d.add(new TextField("f3", final3, Field.Store.NO));
+ /// d.add(new TextField("f4", final4, Field.Store.NO));
+ /// </pre>
+ /// In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
+ /// <code>reader1</code> and <code>reader2</code> after whitespace tokenizer
+ /// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
+ /// It is important that tees are consumed before sinks (in the above example, the field names of the tees must be
+ /// less than the sinks' field names). If you are not sure which stream is consumed first, you can simply
+ /// add another sink and then pass all tokens to the sinks at once using <see cref="consumeAllTokens"/>.
+ /// This TokenFilter is exhausted after this. For instance, the example above
+ /// can be changed to:
+ /// <pre class="prettyprint">
+ /// ...
+ /// TokenStream final1 = new LowerCaseFilter(version, source1.newSinkTokenStream());
+ /// TokenStream final2 = source2.newSinkTokenStream();
+ /// source1.consumeAllTokens();
+ /// source2.consumeAllTokens();
+ /// ...
+ /// </pre>
+ /// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
+ /// <para>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
+ /// </para>
+ /// </summary>
+ public sealed class TeeSinkTokenFilter : TokenFilter
+ {
+     // Sinks are referenced weakly; a sink whose target is gone is skipped when tokens are distributed.
+     // A List<T> backs the field because the BCL LinkedList<T> does not implement IList<T>.
+     private readonly IList<System.WeakReference<SinkTokenStream>> sinks = new List<System.WeakReference<SinkTokenStream>>();
+
+     /// <summary>
+     /// Instantiates a new TeeSinkTokenFilter.
+     /// </summary>
+     public TeeSinkTokenFilter(TokenStream input) : base(input)
+     {
+     }
+
+     /// <summary>
+     /// Returns a new <see cref="SinkTokenStream"/> that receives all tokens consumed by this stream.
+     /// </summary>
+     public SinkTokenStream newSinkTokenStream()
+     {
+         return newSinkTokenStream(ACCEPT_ALL_FILTER);
+     }
+
+     /// <summary>
+     /// Returns a new <see cref="SinkTokenStream"/> that receives all tokens consumed by this stream
+     /// that pass the supplied filter.
+     /// </summary>
+     /// <param name="filter"> decides which states are stored in the new sink </param>
+     /// <seealso cref="SinkFilter"/>
+     public SinkTokenStream newSinkTokenStream(SinkFilter filter)
+     {
+         SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
+         this.sinks.Add(new System.WeakReference<SinkTokenStream>(sink));
+         return sink;
+     }
+
+     /// <summary>
+     /// Adds a <see cref="SinkTokenStream"/> created by another <code>TeeSinkTokenFilter</code>
+     /// to this one. The supplied stream will also receive all consumed tokens.
+     /// This method can be used to pass tokens from two different tees to one sink.
+     /// </summary>
+     /// <param name="sink"> the sink to register with this tee </param>
+     /// <exception cref="System.ArgumentException"> if the sink's attribute factory differs from this tee's </exception>
+     public void addSinkTokenStream(SinkTokenStream sink)
+     {
+         // check that sink has correct factory
+         if (!this.AttributeFactory.Equals(sink.AttributeFactory))
+         {
+             throw new System.ArgumentException("The supplied sink is not compatible to this tee");
+         }
+         // add eventually missing attribute impls to the existing sink
+         IEnumerator<AttributeImpl> impls = this.cloneAttributes().AttributeImplsIterator;
+         while (impls.MoveNext())
+         {
+             sink.addAttributeImpl(impls.Current);
+         }
+         this.sinks.Add(new System.WeakReference<SinkTokenStream>(sink));
+     }
+
+     /// <summary>
+     /// <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
+     /// when it is itself consumed. To be sure that all tokens from the input
+     /// stream are passed to the sinks, you can call this method.
+     /// This instance is exhausted after this, but all sinks are instantly available.
+     /// </summary>
+     public void consumeAllTokens()
+     {
+         while (incrementToken())
+         {
+         }
+     }
+
+     /// <summary>
+     /// Advances the input by one token and offers the resulting state to every live sink.
+     /// </summary>
+     /// <returns> false once the input has no more tokens, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         // capture state lazily - maybe no SinkFilter accepts this state
+         AttributeSource.State state = null;
+         foreach (System.WeakReference<SinkTokenStream> sinkRef in sinks)
+         {
+             SinkTokenStream sink;
+             // TryGetTarget reports false when the weakly referenced sink is no longer available.
+             if (sinkRef.TryGetTarget(out sink) && sink.accept(this))
+             {
+                 if (state == null)
+                 {
+                     state = this.captureState();
+                 }
+                 sink.addState(state);
+             }
+         }
+         return true;
+     }
+
+     /// <summary>
+     /// Calls <c>base.end()</c>, captures the state afterwards and stores it as the final state
+     /// of every live sink.
+     /// </summary>
+     public override void end()
+     {
+         base.end();
+         AttributeSource.State finalState = captureState();
+         foreach (System.WeakReference<SinkTokenStream> sinkRef in sinks)
+         {
+             SinkTokenStream sink;
+             if (sinkRef.TryGetTarget(out sink))
+             {
+                 sink.FinalState = finalState;
+             }
+         }
+     }
+
+     /// <summary>
+     /// A filter that decides which <see cref="AttributeSource"/> states to store in the sink.
+     /// </summary>
+     public abstract class SinkFilter
+     {
+         /// <summary>
+         /// Returns true, iff the current state of the passed-in <see cref="AttributeSource"/> shall be stored
+         /// in the sink.
+         /// </summary>
+         public abstract bool accept(AttributeSource source);
+
+         /// <summary>
+         /// Does nothing by default and can optionally be overridden.
+         /// NOTE(review): documented upstream as being called by <c>SinkTokenStream.reset()</c>, but that
+         /// method as written in this file does not invoke it - confirm the intended caller.
+         /// </summary>
+         public virtual void reset()
+         {
+             // nothing to do; can be overridden
+         }
+     }
+
+     /// <summary>
+     /// TokenStream output from a tee with optional filtering.
+     /// </summary>
+     public sealed class SinkTokenStream : TokenStream
+     {
+         // States handed over by the tee, replayed in insertion order by incrementToken().
+         internal readonly IList<AttributeSource.State> cachedStates = new List<AttributeSource.State>();
+         // State restored by end(); stays null until a tee sets FinalState.
+         internal AttributeSource.State finalState;
+         // Replay cursor; null until the sink is first consumed or reset.
+         internal IEnumerator<AttributeSource.State> it = null;
+         internal SinkFilter filter;
+
+         internal SinkTokenStream(AttributeSource source, SinkFilter filter) : base(source)
+         {
+             this.filter = filter;
+         }
+
+         /// <summary>Delegates the decision to the sink's filter.</summary>
+         internal bool accept(AttributeSource source)
+         {
+             return filter.accept(source);
+         }
+
+         /// <summary>
+         /// Appends a state to the cache. Fails once replay has started, i.e. once the cursor exists.
+         /// </summary>
+         internal void addState(AttributeSource.State state)
+         {
+             if (it != null)
+             {
+                 throw new System.InvalidOperationException("The tee must be consumed before sinks are consumed.");
+             }
+             cachedStates.Add(state);
+         }
+
+         internal AttributeSource.State FinalState
+         {
+             set
+             {
+                 this.finalState = value;
+             }
+         }
+
+         /// <summary>
+         /// Restores the next cached state, if any.
+         /// </summary>
+         /// <returns> false when all cached states have been replayed, true otherwise </returns>
+         public override bool incrementToken()
+         {
+             // lazy init the iterator
+             if (it == null)
+             {
+                 it = cachedStates.GetEnumerator();
+             }
+
+             // MoveNext()/Current replace the Java hasNext()/next() pair.
+             if (!it.MoveNext())
+             {
+                 return false;
+             }
+
+             restoreState(it.Current);
+             return true;
+         }
+
+         /// <summary>Restores the final state if a tee has supplied one.</summary>
+         public override void end()
+         {
+             if (finalState != null)
+             {
+                 restoreState(finalState);
+             }
+         }
+
+         /// <summary>Restarts replay from the first cached state.</summary>
+         public override void reset()
+         {
+             it = cachedStates.GetEnumerator();
+         }
+     }
+
+     // Filter used by the parameterless newSinkTokenStream(): every state is accepted.
+     private static readonly SinkFilter ACCEPT_ALL_FILTER = new SinkFilterAnonymousInnerClassHelper();
+
+     private class SinkFilterAnonymousInnerClassHelper : SinkFilter
+     {
+         public SinkFilterAnonymousInnerClassHelper()
+         {
+         }
+
+         public override bool accept(AttributeSource source)
+         {
+             return true;
+         }
+     }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenRangeSinkFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenRangeSinkFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenRangeSinkFilter.cs
new file mode 100644
index 0000000..568fea6
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenRangeSinkFilter.cs
@@ -0,0 +1,73 @@
+namespace org.apache.lucene.analysis.sinks
+{
+
+ /// <summary>
+ /// Licensed to the Apache Software Foundation (ASF) under one or more
+ /// contributor license agreements. See the NOTICE file distributed with
+ /// this work for additional information regarding copyright ownership.
+ /// The ASF licenses this file to You under the Apache License, Version 2.0
+ /// (the "License"); you may not use this file except in compliance with
+ /// the License. You may obtain a copy of the License at
+ ///
+ /// http://www.apache.org/licenses/LICENSE-2.0
+ ///
+ /// Unless required by applicable law or agreed to in writing, software
+ /// distributed under the License is distributed on an "AS IS" BASIS,
+ /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ /// See the License for the specific language governing permissions and
+ /// limitations under the License.
+ /// </summary>
+
+ using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+ /// <summary>
+ /// Counts the tokens as they go by and saves to the internal list those between the range of lower and upper, exclusive of upper
+ ///
+ ///
+ /// </summary>
+ public class TokenRangeSinkFilter : TeeSinkTokenFilter.SinkFilter
+ {
+     // Bounds of the accepted window: lower is inclusive, upper is exclusive.
+     private readonly int lower;
+     private readonly int upper;
+     // Number of accept() calls seen since construction or the last reset().
+     private int count;
+
+     /// <summary>
+     /// Creates a filter accepting the calls whose zero-based ordinal lies in [lower, upper).
+     /// </summary>
+     /// <param name="lower"> inclusive lower bound; must be at least 1 </param>
+     /// <param name="upper"> exclusive upper bound; must not be smaller than <paramref name="lower"/> </param>
+     /// <exception cref="System.ArgumentException"> if either bound violates the rules above </exception>
+     public TokenRangeSinkFilter(int lower, int upper)
+     {
+         if (lower < 1)
+         {
+             throw new System.ArgumentException("lower must be greater than zero");
+         }
+         if (lower > upper)
+         {
+             throw new System.ArgumentException("lower must not be greater than upper");
+         }
+         this.lower = lower;
+         this.upper = upper;
+     }
+
+     /// <summary>
+     /// Reports whether the current call falls inside the window, then advances the counter.
+     /// The counter advances on every call, accepted or not.
+     /// </summary>
+     public override bool accept(AttributeSource source)
+     {
+         int ordinal = count++;
+         return ordinal >= lower && ordinal < upper;
+     }
+
+     /// <summary>Restarts counting from zero.</summary>
+     public override void reset()
+     {
+         count = 0;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenTypeSinkFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenTypeSinkFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenTypeSinkFilter.cs
new file mode 100644
index 0000000..f844a1c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Sinks/TokenTypeSinkFilter.cs
@@ -0,0 +1,50 @@
+namespace org.apache.lucene.analysis.sinks
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+ /// <summary>
+ /// Adds a token to the sink if it has a specific type.
+ /// </summary>
+ public class TokenTypeSinkFilter : TeeSinkTokenFilter.SinkFilter
+ {
+     // Type value a token must carry to be accepted.
+     private readonly string typeToMatch;
+     // Looked up on the first accept() call and reused afterwards.
+     private TypeAttribute typeAtt;
+
+     /// <summary>
+     /// Creates a filter that accepts only tokens whose type equals <paramref name="typeToMatch"/>.
+     /// </summary>
+     public TokenTypeSinkFilter(string typeToMatch)
+     {
+         this.typeToMatch = typeToMatch;
+     }
+
+     /// <summary>
+     /// Returns true when the type attribute's current value equals the configured type.
+     /// </summary>
+     public override bool accept(AttributeSource source)
+     {
+         if (typeAtt == null)
+         {
+             typeAtt = source.addAttribute(typeof(TypeAttribute));
+         }
+
+         // Compare the configured type against the value currently held by the attribute.
+         var observedType = typeAtt.type();
+         return typeToMatch.Equals(observedType);
+     }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballAnalyzer.cs
new file mode 100644
index 0000000..1ce0ffd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballAnalyzer.cs
@@ -0,0 +1,102 @@
+using System;
+
+namespace org.apache.lucene.analysis.snowball
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using org.apache.lucene.analysis;
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using EnglishPossessiveFilter = org.apache.lucene.analysis.en.EnglishPossessiveFilter;
+ using org.apache.lucene.analysis.standard;
+ using TurkishLowerCaseFilter = org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// Filters <seealso cref="StandardTokenizer"/> with <seealso cref="StandardFilter"/>, {@link
+ /// LowerCaseFilter}, <seealso cref="StopFilter"/> and <seealso cref="SnowballFilter"/>.
+ ///
+ /// Available stemmers are listed in org.tartarus.snowball.ext. The name of a
+ /// stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
+ /// <seealso cref="org.tartarus.snowball.ext.EnglishStemmer"/> is named "English".
+ ///
+ /// <para><b>NOTE</b>: This class uses the same <seealso cref="Version"/>
+ /// dependent settings as <seealso cref="StandardAnalyzer"/>, with the following addition:
+ /// <ul>
+ /// <li> As of 3.1, uses <seealso cref="TurkishLowerCaseFilter"/> for Turkish language.
+ /// </ul>
+ /// </para> </summary>
+ /// @deprecated (3.1) Use the language-specific analyzer in modules/analysis instead.
+ /// This analyzer will be removed in Lucene 5.0
+ [Obsolete("(3.1) Use the language-specific analyzer in modules/analysis instead.")]
+ public sealed class SnowballAnalyzer : Analyzer
+ {
+     // Stemmer name, e.g. "English"; handed to SnowballFilter unchanged.
+     private readonly string name;
+     // Optional stop words; null when the analyzer was built without any.
+     private readonly CharArraySet stopSet;
+     private readonly Version matchVersion;
+
+     /// <summary>
+     /// Builds the named analyzer with no stop words. </summary>
+     public SnowballAnalyzer(Version matchVersion, string name)
+     {
+         this.matchVersion = matchVersion;
+         this.name = name;
+     }
+
+     /// <summary>
+     /// Builds the named analyzer with the given stop words. The set is copied and wrapped as unmodifiable. </summary>
+     public SnowballAnalyzer(Version matchVersion, string name, CharArraySet stopWords) : this(matchVersion, name)
+     {
+         CharArraySet copy = CharArraySet.copy(matchVersion, stopWords);
+         stopSet = CharArraySet.unmodifiableSet(copy);
+     }
+
+     /// <summary>
+     /// Constructs a <see cref="StandardTokenizer"/> filtered by a <see cref="StandardFilter"/>,
+     /// a lowercase filter, a <see cref="StopFilter"/> (only when stop words were supplied)
+     /// and a <see cref="SnowballFilter"/>.
+     /// </summary>
+     public override TokenStreamComponents createComponents(string fieldName, Reader reader)
+     {
+         Tokenizer source = new StandardTokenizer(matchVersion, reader);
+         TokenStream chain = new StandardFilter(matchVersion, source);
+         bool atLeast31 = matchVersion.onOrAfter(Version.LUCENE_31);
+
+         // remove the possessive 's for english stemmers
+         bool englishFamily = atLeast31 && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins"));
+         if (englishFamily)
+         {
+             chain = new EnglishPossessiveFilter(chain);
+         }
+
+         // Use a special lowercase filter for turkish, the stemmer expects it.
+         bool turkish = atLeast31 && name.Equals("Turkish");
+         if (!turkish)
+         {
+             chain = new LowerCaseFilter(matchVersion, chain);
+         }
+         else
+         {
+             chain = new TurkishLowerCaseFilter(chain);
+         }
+
+         if (stopSet != null)
+         {
+             chain = new StopFilter(matchVersion, chain, stopSet);
+         }
+
+         return new TokenStreamComponents(source, new SnowballFilter(chain, name));
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballFilter.cs
new file mode 100644
index 0000000..58a8361
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballFilter.cs
@@ -0,0 +1,129 @@
+using System;
+
+namespace org.apache.lucene.analysis.snowball
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using TurkishLowerCaseFilter = org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
+ using SnowballProgram = org.tartarus.snowball.SnowballProgram;
+
+ /// <summary>
+ /// A filter that stems words using a Snowball-generated stemmer.
+ ///
+ /// Available stemmers are listed in <seealso cref="org.tartarus.snowball.ext"/>.
+ /// <para><b>NOTE</b>: SnowballFilter expects lowercased text.
+ /// <ul>
+ /// <li>For the Turkish language, see <seealso cref="TurkishLowerCaseFilter"/>.
+ /// <li>For other languages, see <seealso cref="LowerCaseFilter"/>.
+ /// </ul>
+ /// </para>
+ ///
+ /// <para>
+ /// Note: This filter is aware of the <seealso cref="KeywordAttribute"/>. To prevent
+ /// certain terms from being passed to the stemmer
+ /// <seealso cref="KeywordAttribute#isKeyword()"/> should be set to <code>true</code>
+ /// in a previous <seealso cref="TokenStream"/>.
+ ///
+ /// Note: For including the original term as well as the stemmed version, see
+ /// <seealso cref="org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory"/>
+ /// </para>
+ ///
+ ///
+ /// </summary>
+ public sealed class SnowballFilter : TokenFilter
+ {
+
+     private readonly SnowballProgram stemmer;
+
+     // Assigned in the constructor: a C# field initializer may not call an instance method such as addAttribute.
+     private readonly CharTermAttribute termAtt;
+     private readonly KeywordAttribute keywordAttr;
+
+     /// <summary>
+     /// Creates a filter that stems the tokens of <paramref name="input"/> with the given stemmer.
+     /// </summary>
+     /// <param name="input"> the input tokens to stem </param>
+     /// <param name="stemmer"> the Snowball program applied to each non-keyword term </param>
+     public SnowballFilter(TokenStream input, SnowballProgram stemmer) : base(input)
+     {
+         termAtt = addAttribute(typeof(CharTermAttribute));
+         keywordAttr = addAttribute(typeof(KeywordAttribute));
+         this.stemmer = stemmer;
+     }
+
+     /// <summary>
+     /// Construct the named stemming filter.
+     ///
+     /// The stemmer type is looked up as <c>org.tartarus.snowball.ext.&lt;name&gt;Stemmer</c>,
+     /// e.g. the name "English" selects <c>org.tartarus.snowball.ext.EnglishStemmer</c>.
+     /// </summary>
+     /// <param name="in"> the input tokens to stem </param>
+     /// <param name="name"> the name of a stemmer </param>
+     /// <exception cref="System.ArgumentException"> if no usable stemmer type exists for the name </exception>
+     public SnowballFilter(TokenStream @in, string name) : this(@in, createStemmer(name))
+     {
+     }
+
+     /// <summary>
+     /// Resolves and instantiates the stemmer for <paramref name="name"/>; any failure is wrapped
+     /// in an <see cref="System.ArgumentException"/>.
+     /// </summary>
+     private static SnowballProgram createStemmer(string name)
+     {
+         // Reflection-based lookup is kept for this constructor only; per the original note, the factory
+         // uses the other constructor so that the program is already loaded.
+         try
+         {
+             // throwOnError = true: an unknown type raises instead of returning null.
+             // NOTE(review): without an assembly-qualified name Type.GetType searches only the calling
+             // assembly and the core library - confirm the stemmers are compiled into this assembly.
+             Type stemClass = Type.GetType("org.tartarus.snowball.ext." + name + "Stemmer", true);
+             if (!typeof(SnowballProgram).IsAssignableFrom(stemClass))
+             {
+                 throw new InvalidCastException(stemClass + " is not a " + typeof(SnowballProgram));
+             }
+             return (SnowballProgram)Activator.CreateInstance(stemClass);
+         }
+         catch (Exception e)
+         {
+             throw new System.ArgumentException("Invalid stemmer class specified: " + name, e);
+         }
+     }
+
+     /// <summary>
+     /// Advances to the next input token and stems it unless it is marked as a keyword. </summary>
+     /// <returns> false when the input has no more tokens, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         if (!keywordAttr.Keyword)
+         {
+             char[] termBuffer = termAtt.buffer();
+             int length = termAtt.length();
+             stemmer.setCurrent(termBuffer, length);
+             stemmer.stem();
+             char[] finalTerm = stemmer.CurrentBuffer;
+             int newLength = stemmer.CurrentBufferLength;
+             if (finalTerm != termBuffer)
+             {
+                 // The stemmer returned a different array, so copy its content into the term.
+                 termAtt.copyBuffer(finalTerm, 0, newLength);
+             }
+             else
+             {
+                 // Same array as the term buffer: only the length needs updating.
+                 termAtt.Length = newLength;
+             }
+         }
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballPorterFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballPorterFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballPorterFilterFactory.cs
new file mode 100644
index 0000000..310391e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Snowball/SnowballPorterFilterFactory.cs
@@ -0,0 +1,101 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.snowball
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+ using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+ using SnowballProgram = org.tartarus.snowball.SnowballProgram;
+
+ /// <summary>
+ /// Factory for <seealso cref="SnowballFilter"/>, with configurable language
+ /// <para>
+ /// Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_snowballstem" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.StandardTokenizerFactory"/>
+ /// <filter class="solr.LowerCaseFilterFactory"/>
+ /// <filter class="solr.SnowballPorterFilterFactory" protected="protectedkeyword.txt" language="English"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </para>
+ /// </summary>
+ public class SnowballPorterFilterFactory : TokenFilterFactory, ResourceLoaderAware
+ {
+     /// <summary>Argument key naming the protected-words file(s).</summary>
+     public const string PROTECTED_TOKENS = "protected";
+
+     // Value of the "language" argument; "English" when the argument is absent.
+     private readonly string language;
+     // Value of the "protected" argument, or null when none was given.
+     private readonly string wordFiles;
+     // Concrete stemmer type, resolved by inform().
+     private Type stemClass;
+     // Loaded by inform() when wordFiles is set; otherwise stays null.
+     private CharArraySet protectedWords = null;
+
+     /// <summary>
+     /// Creates a new SnowballPorterFilterFactory </summary>
+     /// <exception cref="System.ArgumentException"> if entries remain in the map after the known arguments were read </exception>
+     public SnowballPorterFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         language = get(args, "language", "English");
+         wordFiles = get(args, PROTECTED_TOKENS);
+         if (args.Count > 0)
+         {
+             // A .NET dictionary's ToString() yields only its type name, so the leftover entries are listed explicitly.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+         }
+     }
+
+     /// <summary>
+     /// Resolves the stemmer type for the configured language through <paramref name="loader"/> and,
+     /// when a protected-words argument was given, loads that word set.
+     /// </summary>
+     public virtual void inform(ResourceLoader loader)
+     {
+         string className = "org.tartarus.snowball.ext." + language + "Stemmer";
+         // An instance is created only to learn its concrete type; create() makes a fresh one per filter.
+         stemClass = loader.newInstance(className, typeof(SnowballProgram)).GetType();
+
+         if (wordFiles != null)
+         {
+             protectedWords = getWordSet(loader, wordFiles, false);
+         }
+     }
+
+     /// <summary>
+     /// Builds a <see cref="SnowballFilter"/> over <paramref name="input"/> using a new stemmer instance.
+     /// When protected words are loaded, the input is first wrapped in a <see cref="SetKeywordMarkerFilter"/>.
+     /// </summary>
+     /// <exception cref="System.InvalidOperationException"> if the stemmer cannot be instantiated </exception>
+     public override TokenFilter create(TokenStream input)
+     {
+         SnowballProgram program;
+         try
+         {
+             program = (SnowballProgram)Activator.CreateInstance(stemClass);
+         }
+         catch (Exception e)
+         {
+             throw new InvalidOperationException("Error instantiating stemmer for language " + language + " from class " + stemClass, e);
+         }
+
+         if (protectedWords != null)
+         {
+             input = new SetKeywordMarkerFilter(input, protectedWords);
+         }
+         return new SnowballFilter(input, program);
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
new file mode 100644
index 0000000..f2387f1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
@@ -0,0 +1,161 @@
+using Lucene.Net.Analysis.Core;
+
+namespace org.apache.lucene.analysis.standard
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using org.apache.lucene.analysis;
+ using LowerCaseFilter = LowerCaseFilter;
+ using StopAnalyzer = StopAnalyzer;
+ using StopFilter = StopFilter;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+ using Version = org.apache.lucene.util.Version;
+
+
+ /// <summary>
+ /// Filters <seealso cref="ClassicTokenizer"/> with <seealso cref="ClassicFilter"/>,
+ /// <seealso cref="LowerCaseFilter"/> and <seealso cref="StopFilter"/>, using a list of
+ /// English stop words.
+ ///
+ /// <a name="version"/>
+ /// <para>You must specify the required <seealso cref="Version"/>
+ /// compatibility when creating ClassicAnalyzer:
+ /// <ul>
+ /// <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+ /// supplementary characters in stopwords</li>
+ /// <li> As of 2.9, StopFilter preserves position
+ /// increments</li>
+ /// <li> As of 2.4, Tokens incorrectly identified as acronyms
+ /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</li>
+ /// </ul>
+ ///
+ /// ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
+ /// As of 3.1, <seealso cref="StandardAnalyzer"/> implements Unicode text segmentation,
+ /// as specified by UAX#29.
+ /// </para>
+ /// </summary>
+ public sealed class ClassicAnalyzer : StopwordAnalyzerBase
+ {
+     /// <summary>
+     /// Token length limit used until <seealso cref="MaxTokenLength"/> is assigned. </summary>
+     public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+     /// <summary>
+     /// Stop word set used by the single-argument constructor; this is the very same
+     /// instance as <seealso cref="StopAnalyzer.ENGLISH_STOP_WORDS_SET"/>.
+     /// </summary>
+     public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+     private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+     /// <summary>
+     /// Builds an analyzer that uses <seealso cref="STOP_WORDS_SET"/>. </summary>
+     /// <param name="matchVersion"> Lucene version to match </param>
+     public ClassicAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer that uses the supplied stop words. </summary>
+     /// <param name="matchVersion"> Lucene version to match </param>
+     /// <param name="stopWords"> stop words handed to the base class </param>
+     public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer whose stop words are loaded from a reader via
+     /// <c>loadStopwordSet</c>. </summary>
+     /// <param name="matchVersion"> Lucene version to match </param>
+     /// <param name="stopwords"> reader the stop words are loaded from </param>
+     public ClassicAnalyzer(Version matchVersion, Reader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
+     {
+     }
+
+     /// <summary>
+     /// Maximum token length handed to the tokenizer. The value is pushed to the
+     /// <seealso cref="ClassicTokenizer"/> when the components are created and again every
+     /// time a reader is assigned to them, so a change affects streams started afterwards.
+     /// </summary>
+     public int MaxTokenLength
+     {
+         get
+         {
+             return maxTokenLength;
+         }
+         set
+         {
+             maxTokenLength = value;
+         }
+     }
+
+     /// <summary>
+     /// Assembles the chain ClassicTokenizer, ClassicFilter, LowerCaseFilter, StopFilter
+     /// over <paramref name="reader"/>. </summary>
+     /// <param name="fieldName"> field name; not consulted by this implementation </param>
+     /// <param name="reader"> character source for the tokenizer </param>
+     protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+     {
+         ClassicTokenizer tokenizer = new ClassicTokenizer(matchVersion, reader);
+         tokenizer.MaxTokenLength = maxTokenLength;
+
+         TokenStream normalized = new ClassicFilter(tokenizer);
+         TokenStream lowered = new LowerCaseFilter(matchVersion, normalized);
+         TokenStream filtered = new StopFilter(matchVersion, lowered, stopwords);
+
+         return new ClassicTokenStreamComponents(this, tokenizer, filtered, reader);
+     }
+
+     /// <summary>
+     /// Components that re-apply the analyzer's current token length limit to the
+     /// tokenizer whenever a new reader is assigned. </summary>
+     private class ClassicTokenStreamComponents : TokenStreamComponents
+     {
+         private readonly ClassicAnalyzer owner;
+
+         // Reader passed at construction time; stored but not read again in this class.
+         private Reader initialReader;
+         private ClassicTokenizer tokenizer;
+
+         public ClassicTokenStreamComponents(ClassicAnalyzer outerInstance, ClassicTokenizer src, TokenStream tok, Reader reader) : base(src, tok)
+         {
+             this.owner = outerInstance;
+             this.initialReader = reader;
+             this.tokenizer = src;
+         }
+
+         protected internal override Reader Reader
+         {
+             set
+             {
+                 // Refresh the limit first so the new input is tokenized with the current setting.
+                 tokenizer.MaxTokenLength = owner.maxTokenLength;
+                 base.Reader = value;
+             }
+         }
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
new file mode 100644
index 0000000..9ee4b32
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
@@ -0,0 +1,92 @@
+namespace org.apache.lucene.analysis.standard
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+ /// <summary>
+ /// Normalizes tokens extracted with <seealso cref="ClassicTokenizer"/>. </summary>
+
+ public class ClassicFilter : TokenFilter
+ {
+     private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+     private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+
+     // This filter reads the token type and rewrites the term text in place.
+     private readonly TypeAttribute typeAtt;
+     private readonly CharTermAttribute termAtt;
+
+     /// <summary>
+     /// Constructs a filter over <paramref name="in"/>. </summary>
+     /// <param name="in"> token stream to normalize </param>
+     public ClassicFilter(TokenStream @in) : base(@in)
+     {
+         // Attributes are registered here instead of in field initializers: a C# field
+         // initializer may not call an instance method (CS0236), and initializers would
+         // also run before the base constructor, unlike in the Java original.
+         typeAtt = addAttribute(typeof(TypeAttribute));
+         termAtt = addAttribute(typeof(CharTermAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream by one token and normalizes it.
+     /// <para>Removes a trailing <tt>'s</tt> or <tt>'S</tt> from tokens typed as apostrophe words.
+     /// </para>
+     /// <para>Removes dots from tokens typed as acronyms.
+     /// </para>
+     /// </summary>
+     /// <returns> <c>true</c> if a token is available, <c>false</c> once the wrapped stream is exhausted </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         char[] buffer = termAtt.buffer();
+         int bufferLength = termAtt.length();
+         string type = typeAtt.type();
+
+         if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
+         {
+             // Remove 's: strip the last two characters.
+             termAtt.Length = bufferLength - 2;
+         }
+         else if (type == ACRONYM_TYPE)
+         {
+             // Remove dots: compact the non-dot characters towards the front of the buffer.
+             int upto = 0;
+             for (int i = 0; i < bufferLength; i++)
+             {
+                 char c = buffer[i];
+                 if (c != '.')
+                 {
+                     buffer[upto++] = c;
+                 }
+             }
+             termAtt.Length = upto;
+         }
+
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
new file mode 100644
index 0000000..2107ccc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.standard
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="ClassicFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.ClassicTokenizerFactory"/>
+ /// <filter class="solr.ClassicFilterFactory"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class ClassicFilterFactory : TokenFilterFactory
+ {
+
+     /// <summary>
+     /// Creates a new ClassicFilterFactory. </summary>
+     /// <param name="args"> factory arguments; this factory defines none of its own </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> still contains
+     /// entries after the base constructor has run </exception>
+     public ClassicFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its .NET type name,
+             // so list the leftover entries explicitly (same shape as Java's Map.toString()).
+             System.Text.StringBuilder unknown = new System.Text.StringBuilder();
+             foreach (KeyValuePair<string, string> entry in args)
+             {
+                 if (unknown.Length > 0)
+                 {
+                     unknown.Append(", ");
+                 }
+                 unknown.Append(entry.Key).Append('=').Append(entry.Value);
+             }
+             throw new System.ArgumentException("Unknown parameters: {" + unknown + "}");
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a new <seealso cref="ClassicFilter"/>. </summary>
+     /// <param name="input"> token stream to wrap </param>
+     /// <returns> the new filter </returns>
+     public override TokenFilter create(TokenStream input)
+     {
+         return new ClassicFilter(input);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
new file mode 100644
index 0000000..a41f48d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.standard
+{
+
+
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// A grammar-based tokenizer constructed with JFlex
+ ///
+ /// <para> This should be a good tokenizer for most European-language documents:
+ ///
+ /// <ul>
+ /// <li>Splits words at punctuation characters, removing punctuation. However, a
+ /// dot that's not followed by whitespace is considered part of a token.</li>
+ /// <li>Splits words at hyphens, unless there's a number in the token, in which case
+ /// the whole token is interpreted as a product number and is not split.</li>
+ /// <li>Recognizes email addresses and internet hostnames as one token.</li>
+ /// </ul>
+ ///
+ /// </para>
+ /// <para>Many applications have specific tokenizer needs. If this tokenizer does
+ /// not suit your application, please consider copying this source code
+ /// directory to your project and maintaining your own grammar-based tokenizer.
+ ///
+ /// ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
+ /// As of 3.1, <seealso cref="StandardTokenizer"/> implements Unicode text segmentation,
+ /// as specified by UAX#29.
+ /// </para>
+ /// </summary>
+
+ public sealed class ClassicTokenizer : Tokenizer
+ {
+     /// <summary>
+     /// A private instance of the JFlex-constructed scanner </summary>
+     private StandardTokenizerInterface scanner;
+
+     // Token type ids; each one indexes into TOKEN_TYPES.
+     public const int ALPHANUM = 0;
+     public const int APOSTROPHE = 1;
+     public const int ACRONYM = 2;
+     public const int COMPANY = 3;
+     public const int EMAIL = 4;
+     public const int HOST = 5;
+     public const int NUM = 6;
+     public const int CJ = 7;
+
+     // Tokens scanned with this id are re-typed as HOST by incrementToken().
+     public const int ACRONYM_DEP = 8;
+
+     /// <summary>
+     /// String token types that correspond to token type int constants </summary>
+     public static readonly string[] TOKEN_TYPES = new string [] {"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
+
+     // Number of over-long tokens skipped since the last emitted token.
+     private int skippedPositions;
+
+     private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+     // This tokenizer populates four attributes: term, offset, position increment and type.
+     // They are assigned in init() rather than by field initializers, because a C# field
+     // initializer may not call the instance method addAttribute (CS0236); that is also
+     // why they cannot be readonly.
+     private CharTermAttribute termAtt;
+     private OffsetAttribute offsetAtt;
+     private PositionIncrementAttribute posIncrAtt;
+     private TypeAttribute typeAtt;
+
+     /// <summary>
+     /// Maximum allowed token length. Any token longer than this is skipped,
+     /// but still counted in the position increment of the next emitted token.
+     /// </summary>
+     /// <exception cref="System.ArgumentException"> if the assigned value is less than 1 </exception>
+     public int MaxTokenLength
+     {
+         set
+         {
+             if (value < 1)
+             {
+                 throw new System.ArgumentException("maxTokenLength must be greater than zero");
+             }
+             this.maxTokenLength = value;
+         }
+         get
+         {
+             return maxTokenLength;
+         }
+     }
+
+     /// <summary>
+     /// Creates a new instance of the <seealso cref="ClassicTokenizer"/>. Attaches
+     /// the <code>input</code> to the newly created JFlex scanner.
+     /// See http://issues.apache.org/jira/browse/LUCENE-1068
+     /// </summary>
+     /// <param name="matchVersion"> Lucene version to match </param>
+     /// <param name="input"> The input reader </param>
+     public ClassicTokenizer(Version matchVersion, Reader input) : base(input)
+     {
+         init(matchVersion);
+     }
+
+     /// <summary>
+     /// Creates a new ClassicTokenizer with a given <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>
+     /// </summary>
+     /// <param name="matchVersion"> Lucene version to match </param>
+     /// <param name="factory"> attribute factory passed to the base class </param>
+     /// <param name="input"> The input reader </param>
+     public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) : base(factory, input)
+     {
+         init(matchVersion);
+     }
+
+     /// <summary>
+     /// Shared constructor tail: registers the attributes and creates the scanner.
+     /// Runs after the base constructor. </summary>
+     /// <param name="matchVersion"> accepted for both constructors; not consulted here </param>
+     private void init(Version matchVersion)
+     {
+         termAtt = addAttribute(typeof(CharTermAttribute));
+         offsetAtt = addAttribute(typeof(OffsetAttribute));
+         posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+         typeAtt = addAttribute(typeof(TypeAttribute));
+
+         this.scanner = new ClassicTokenizerImpl(input);
+     }
+
+     /// <summary>
+     /// Scans forward to the next token that is not longer than
+     /// <seealso cref="MaxTokenLength"/> and publishes it through the attributes. </summary>
+     /// <returns> <c>true</c> if a token was produced, <c>false</c> when the scanner reports end of input </returns>
+     public override bool incrementToken()
+     {
+         clearAttributes();
+         skippedPositions = 0;
+
+         while (true)
+         {
+             int tokenType = scanner.NextToken;
+
+             if (tokenType == StandardTokenizerInterface_Fields.YYEOF)
+             {
+                 return false;
+             }
+
+             if (scanner.yylength() <= maxTokenLength)
+             {
+                 posIncrAtt.PositionIncrement = skippedPositions + 1;
+                 scanner.getText(termAtt);
+                 int start = scanner.yychar();
+                 offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
+
+                 if (tokenType == ClassicTokenizer.ACRONYM_DEP)
+                 {
+                     typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
+                     termAtt.Length = termAtt.length() - 1; // remove extra '.'
+                 }
+                 else
+                 {
+                     typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
+                 }
+                 return true;
+             }
+             else
+             {
+                 // When we skip a too-long term, we still increment the
+                 // position increment
+                 skippedPositions++;
+             }
+         }
+     }
+
+     /// <summary>
+     /// Publishes the final offset and folds any trailing skipped tokens into the
+     /// position increment. </summary>
+     public override void end()
+     {
+         base.end();
+         // set final offset
+         int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+         offsetAtt.setOffset(finalOffset, finalOffset);
+         // adjust any skipped tokens
+         posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
+     }
+
+     /// <summary>
+     /// Closes the tokenizer via the base class, then resets the scanner onto <c>input</c>. </summary>
+     public override void close()
+     {
+         base.close();
+         scanner.yyreset(input);
+     }
+
+     /// <summary>
+     /// Resets the base class, resets the scanner onto <c>input</c> and clears the
+     /// skipped-position counter. </summary>
+     public override void reset()
+     {
+         base.reset();
+         scanner.yyreset(input);
+         skippedPositions = 0;
+     }
+ }
+
+}
\ No newline at end of file