You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2015/04/15 01:32:29 UTC
[3/3] lucenenet git commit: More porting work
More porting work
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b4eaf2fc
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b4eaf2fc
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b4eaf2fc
Branch: refs/heads/master
Commit: b4eaf2fc441dfd5d32732eda844ef1e8e62588a1
Parents: 8d7a54f
Author: Itamar Syn-Hershko <it...@code972.com>
Authored: Wed Apr 15 02:32:11 2015 +0300
Committer: Itamar Syn-Hershko <it...@code972.com>
Committed: Wed Apr 15 02:32:11 2015 +0300
----------------------------------------------------------------------
.../Compound/CompoundWordTokenFilterBase.cs | 365 ++++++-----
.../Analysis/Core/UpperCaseFilter.cs | 114 ++--
.../Ngram/Lucene43EdgeNGramTokenizer.cs | 609 +++++++++----------
.../Analysis/Standard/ClassicAnalyzer.cs | 299 +++++----
.../Analysis/Standard/ClassicFilter.cs | 153 +++--
.../Analysis/Standard/ClassicFilterFactory.cs | 92 ++-
.../Analysis/Standard/ClassicTokenizer.cs | 369 ++++++-----
.../Analysis/Standard/ClassicTokenizerImpl.cs | 14 +-
.../Analysis/Standard/StandardAnalyzer.cs | 273 +++++----
.../Analysis/Standard/StandardFilter.cs | 167 +++--
.../Analysis/Standard/StandardFilterFactory.cs | 1 +
.../Analysis/Standard/StandardTokenizer.cs | 13 +-
.../Standard/StandardTokenizerFactory.cs | 18 +-
.../Analysis/Standard/StandardTokenizerImpl.cs | 2 -
.../Analysis/Standard/UAX29URLEmailAnalyzer.cs | 43 +-
.../Analysis/Util/CharArraySet.cs | 1 -
.../Analysis/Wikipedia/WikipediaTokenizer.cs | 23 +-
.../Wikipedia/WikipediaTokenizerFactory.cs | 1 +
src/Lucene.Net.Core/Util/StringHelper.cs | 10 +-
19 files changed, 1239 insertions(+), 1328 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
index ba8fd6c..c6bc4cd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/CompoundWordTokenFilterBase.cs
@@ -2,193 +2,192 @@
using System.Diagnostics;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Compound
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Base class for decomposition token filters.
- /// <para>
- ///
- /// <a name="version"></a>
- /// You must specify the required <seealso cref="LuceneVersion"/> compatibility when creating
- /// CompoundWordTokenFilterBase:
- /// <ul>
- /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
- /// supplementary characters in strings and char arrays provided as compound word
- /// dictionaries.
- /// <li>As of 4.4, <seealso cref="CompoundWordTokenFilterBase"/> doesn't update offsets.
- /// </ul>
- /// </para>
- /// </summary>
- public abstract class CompoundWordTokenFilterBase : TokenFilter
- {
- /// <summary>
- /// The default for minimal word length that gets decomposed
- /// </summary>
- public const int DEFAULT_MIN_WORD_SIZE = 5;
-
- /// <summary>
- /// The default for minimal length of subwords that get propagated to the output of this filter
- /// </summary>
- public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
-
- /// <summary>
- /// The default for maximal length of subwords that get propagated to the output of this filter
- /// </summary>
- public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
-
- protected internal readonly LuceneVersion matchVersion;
- protected internal readonly CharArraySet dictionary;
- protected internal readonly LinkedList<CompoundToken> tokens;
- protected internal readonly int minWordSize;
- protected internal readonly int minSubwordSize;
- protected internal readonly int maxSubwordSize;
- protected internal readonly bool onlyLongestMatch;
-
- protected internal readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
- protected internal readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
- private readonly PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
-
- private AttributeSource.State current;
-
- protected internal CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
- {
- }
-
- protected internal CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary) : this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false)
- {
- }
-
- protected internal CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) : base(input)
- {
- this.matchVersion = matchVersion;
- this.tokens = new LinkedList<CompoundToken>();
- if (minWordSize < 0)
- {
- throw new System.ArgumentException("minWordSize cannot be negative");
- }
- this.minWordSize = minWordSize;
- if (minSubwordSize < 0)
- {
- throw new System.ArgumentException("minSubwordSize cannot be negative");
- }
- this.minSubwordSize = minSubwordSize;
- if (maxSubwordSize < 0)
- {
- throw new System.ArgumentException("maxSubwordSize cannot be negative");
- }
- this.maxSubwordSize = maxSubwordSize;
- this.onlyLongestMatch = onlyLongestMatch;
- this.dictionary = dictionary;
- }
-
- public override bool IncrementToken()
- {
- if (tokens.Count > 0)
- {
- Debug.Assert(current != null);
- CompoundToken token = tokens.First.Value; tokens.RemoveFirst();
- RestoreState(current); // keep all other attributes untouched
- termAtt.SetEmpty().Append(token.txt);
- offsetAtt.SetOffset(token.startOffset, token.endOffset);
- posIncAtt.PositionIncrement = 0;
- return true;
- }
-
- current = null; // not really needed, but for safety
- if (input.incrementToken())
- {
- // Only words longer than minWordSize get processed
- if (termAtt.length() >= this.minWordSize)
- {
- decompose();
- // only capture the state if we really need it for producing new tokens
- if (tokens.Count > 0)
- {
- current = captureState();
- }
- }
- // return original token:
- return true;
- }
- else
- {
- return false;
- }
- }
-
- /// <summary>
- /// Decomposes the current <seealso cref="#termAtt"/> and places <seealso cref="CompoundToken"/> instances in the <seealso cref="#tokens"/> list.
- /// The original token may not be placed in the list, as it is automatically passed through this filter.
- /// </summary>
- protected internal abstract void decompose();
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
- public override void reset()
- {
- base.reset();
- tokens.Clear();
- current = null;
- }
-
- /// <summary>
- /// Helper class to hold decompounded token information
- /// </summary>
- protected internal class CompoundToken
- {
- private readonly CompoundWordTokenFilterBase outerInstance;
-
- public readonly string txt;
- public readonly int startOffset, endOffset;
-
- /// <summary>
- /// Construct the compound token based on a slice of the current <seealso cref="CompoundWordTokenFilterBase#termAtt"/>. </summary>
- public CompoundToken(CompoundWordTokenFilterBase outerInstance, int offset, int length)
- {
- this.outerInstance = outerInstance;
- this.txt = outerInstance.termAtt.subSequence(offset, offset + length);
-
- // offsets of the original word
- int startOff = outerInstance.offsetAtt.startOffset();
- int endOff = outerInstance.offsetAtt.endOffset();
-
- if (outerInstance.matchVersion.onOrAfter(LuceneVersion.LUCENE_44) || endOff - startOff != outerInstance.termAtt.length())
- {
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- this.startOffset = startOff;
- this.endOffset = endOff;
- }
- else
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int newStart = startOff + offset;
- int newStart = startOff + offset;
- this.startOffset = newStart;
- this.endOffset = newStart + length;
- }
- }
-
- }
- }
-
+ /// Base class for decomposition token filters.
+ /// <para>
+ ///
+ /// <a name="version"></a>
+ /// You must specify the required <seealso cref="LuceneVersion"/> compatibility when creating
+ /// CompoundWordTokenFilterBase:
+ /// <ul>
+ /// <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ /// supplementary characters in strings and char arrays provided as compound word
+ /// dictionaries.
+ /// <li>As of 4.4, <seealso cref="CompoundWordTokenFilterBase"/> doesn't update offsets.
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public abstract class CompoundWordTokenFilterBase : TokenFilter
+ {
+ /// <summary>
+ /// The default for minimal word length that gets decomposed
+ /// </summary>
+ public const int DEFAULT_MIN_WORD_SIZE = 5;
+
+ /// <summary>
+ /// The default for minimal length of subwords that get propagated to the output of this filter
+ /// </summary>
+ public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
+
+ /// <summary>
+ /// The default for maximal length of subwords that get propagated to the output of this filter
+ /// </summary>
+ public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
+
+ // Compatibility version given to the constructor; CompoundToken consults it
+ // to decide whether subword offsets are adjusted.
+ protected internal readonly LuceneVersion matchVersion;
+ // Dictionary given to the constructor. This base class only stores it.
+ protected internal readonly CharArraySet dictionary;
+ // Subword tokens queued by Decompose() and drained one per IncrementToken() call.
+ protected internal readonly LinkedList<CompoundToken> tokens;
+ protected internal readonly int minWordSize;
+ protected internal readonly int minSubwordSize;
+ protected internal readonly int maxSubwordSize;
+ protected internal readonly bool onlyLongestMatch;
+
+ protected internal readonly CharTermAttribute termAtt;
+ protected internal readonly IOffsetAttribute offsetAtt;
+ private readonly IPositionIncrementAttribute posIncAtt;
+
+ // State captured for the original token while its queued subwords are being emitted;
+ // null whenever the queue is empty.
+ private AttributeSource.State current;
+
+ /// <summary>
+ /// Creates the filter with the default word and subword size limits.
+ /// </summary>
+ protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch)
+ : this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch)
+ {
+ }
+
+ /// <summary>
+ /// Creates the filter with the default word and subword size limits and
+ /// <c>onlyLongestMatch</c> set to <c>false</c>.
+ /// </summary>
+ protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
+ : this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false)
+ {
+ }
+
+ /// <summary>
+ /// Creates the filter, registering the term, offset and position-increment attributes
+ /// and storing the supplied settings.
+ /// </summary>
+ /// <param name="matchVersion"> compatibility version, stored in <see cref="matchVersion"/> </param>
+ /// <param name="input"> the stream this filter wraps </param>
+ /// <param name="dictionary"> the word dictionary, stored in <see cref="dictionary"/> </param>
+ /// <param name="minWordSize"> tokens shorter than this are passed through without decomposition </param>
+ /// <param name="minSubwordSize"> stored in <see cref="minSubwordSize"/> for use by subclasses </param>
+ /// <param name="maxSubwordSize"> stored in <see cref="maxSubwordSize"/> for use by subclasses </param>
+ /// <param name="onlyLongestMatch"> stored in <see cref="onlyLongestMatch"/> for use by subclasses </param>
+ /// <exception cref="System.ArgumentException"> if any of the three sizes is negative </exception>
+ protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
+ : base(input)
+ {
+ // Direct cast instead of 'as': if the attribute implementation is not a
+ // CharTermAttribute, fail here with InvalidCastException rather than leaving
+ // termAtt null and hitting a NullReferenceException on the first token.
+ termAtt = (CharTermAttribute)AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+
+ this.matchVersion = matchVersion;
+ this.tokens = new LinkedList<CompoundToken>();
+ if (minWordSize < 0)
+ {
+ throw new System.ArgumentException("minWordSize cannot be negative");
+ }
+ this.minWordSize = minWordSize;
+ if (minSubwordSize < 0)
+ {
+ throw new System.ArgumentException("minSubwordSize cannot be negative");
+ }
+ this.minSubwordSize = minSubwordSize;
+ if (maxSubwordSize < 0)
+ {
+ throw new System.ArgumentException("maxSubwordSize cannot be negative");
+ }
+ this.maxSubwordSize = maxSubwordSize;
+ this.onlyLongestMatch = onlyLongestMatch;
+ this.dictionary = dictionary;
+ }
+
+ /// <summary>
+ /// Emits any queued subword token first (with a position increment of 0); otherwise
+ /// advances the wrapped stream, runs <see cref="Decompose"/> on sufficiently long terms,
+ /// and returns the original token.
+ /// </summary>
+ /// <returns> <c>true</c> if a token was produced, <c>false</c> once the wrapped stream is exhausted </returns>
+ public override bool IncrementToken()
+ {
+ if (tokens.Count > 0)
+ {
+ Debug.Assert(current != null);
+ CompoundToken token = tokens.First.Value;
+ tokens.RemoveFirst();
+ RestoreState(current); // keep all other attributes untouched
+ termAtt.SetEmpty().Append(token.txt);
+ offsetAtt.SetOffset(token.startOffset, token.endOffset);
+ posIncAtt.PositionIncrement = 0;
+ return true;
+ }
+
+ current = null; // not really needed, but for safety
+ if (input.IncrementToken())
+ {
+ // Only words of at least minWordSize characters get processed
+ if (termAtt.Length >= this.minWordSize)
+ {
+ Decompose();
+ // only capture the state if we really need it for producing new tokens
+ if (tokens.Count > 0)
+ {
+ current = CaptureState();
+ }
+ }
+ // return original token:
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+ }
+
+ /// <summary>
+ /// Decomposes the current <see cref="termAtt"/> and places <see cref="CompoundToken"/> instances in the <see cref="tokens"/> list.
+ /// The original token may not be placed in the list, as it is automatically passed through this filter.
+ /// </summary>
+ protected abstract void Decompose();
+
+ /// <summary>
+ /// Resets the base stream state, then discards queued subword tokens and the captured state.
+ /// </summary>
+ public override void Reset()
+ {
+ base.Reset();
+ tokens.Clear();
+ current = null;
+ }
+
+ /// <summary>
+ /// Helper class to hold decompounded token information
+ /// </summary>
+ protected internal class CompoundToken
+ {
+ public readonly ICharSequence txt;
+ public readonly int startOffset, endOffset;
+
+ /// <summary>
+ /// Construct the compound token based on a slice of the current <see cref="CompoundWordTokenFilterBase.termAtt"/>. </summary>
+ /// <param name="outerInstance"> the filter whose current term and offsets are read </param>
+ /// <param name="offset"> start of the slice within the current term </param>
+ /// <param name="length"> number of characters in the slice </param>
+ public CompoundToken(CompoundWordTokenFilterBase outerInstance, int offset, int length)
+ {
+ this.txt = outerInstance.termAtt.SubSequence(offset, offset + length);
+
+ // offsets of the original word
+ int startOff = outerInstance.offsetAtt.StartOffset();
+ int endOff = outerInstance.offsetAtt.EndOffset();
+
+ if (outerInstance.matchVersion.OnOrAfter(LuceneVersion.LUCENE_44) || endOff - startOff != outerInstance.termAtt.Length)
+ {
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ this.startOffset = startOff;
+ this.endOffset = endOff;
+ }
+ else
+ {
+ // pre-4.4 behaviour: narrow the offsets to the slice itself
+ int newStart = startOff + offset;
+ this.startOffset = newStart;
+ this.endOffset = newStart + length;
+ }
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
index 6b722ad..c8b5f5f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
@@ -4,65 +4,65 @@ using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Core
{
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Normalizes token text to UPPER CASE.
- /// <a name="version"/>
- /// <para>You must specify the required <seealso cref="LuceneVersion"/>
- /// compatibility when creating UpperCaseFilter
- ///
- /// </para>
- /// <para><b>NOTE:</b> In Unicode, this transformation may lose information when the
- /// upper case character represents more than one lower case character. Use this filter
- /// when you require uppercase tokens. Use the <seealso cref="LowerCaseFilter"/> for
- /// general search matching
- /// </para>
- /// </summary>
- public sealed class UpperCaseFilter : TokenFilter
- {
- private readonly CharacterUtils charUtils;
- private readonly ICharTermAttribute termAtt;;
+ /// Normalizes token text to UPPER CASE.
+ /// <a name="version"/>
+ /// <para>You must specify the required <seealso cref="LuceneVersion"/>
+ /// compatibility when creating UpperCaseFilter
+ ///
+ /// </para>
+ /// <para><b>NOTE:</b> In Unicode, this transformation may lose information when the
+ /// upper case character represents more than one lower case character. Use this filter
+ /// when you require uppercase tokens. Use the <seealso cref="LowerCaseFilter"/> for
+ /// general search matching
+ /// </para>
+ /// </summary>
+ public sealed class UpperCaseFilter : TokenFilter
+ {
+ private readonly CharacterUtils charUtils;
+ private readonly ICharTermAttribute termAtt;
- /// <summary>
- /// Create a new UpperCaseFilter, that normalizes token text to upper case.
- /// </summary>
- /// <param name="matchVersion"> See <a href="#version">above</a> </param>
- /// <param name="in"> TokenStream to filter </param>
- public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in) : base(@in)
- {
- termAtt = AddAttribute<ICharTermAttribute>();
- termAtt = AddAttribute<ICharTermAttribute>();
- charUtils = CharacterUtils.GetInstance(matchVersion);
- }
+ /// <summary>
+ /// Create a new UpperCaseFilter, that normalizes token text to upper case.
+ /// </summary>
+ /// <param name="matchVersion"> See <a href="#version">above</a> </param>
+ /// <param name="in"> TokenStream to filter </param>
+ public UpperCaseFilter(LuceneVersion matchVersion, TokenStream @in)
+ : base(@in)
+ {
+ // Register the term attribute once; the second identical assignment was redundant.
+ termAtt = AddAttribute<ICharTermAttribute>();
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ }
- public override bool IncrementToken()
- {
- if (input.IncrementToken())
- {
- charUtils.ToUpper(termAtt.Buffer(), 0, termAtt.Length);
- return true;
- }
- else
- {
- return false;
- }
- }
- }
+ /// <summary>
+ /// Advances the wrapped stream and, when a token is available, hands the used part of
+ /// its term buffer to <c>charUtils.ToUpper</c>.
+ /// </summary>
+ /// <returns> <c>true</c> if a token was produced, <c>false</c> when the wrapped stream is exhausted </returns>
+ public override bool IncrementToken()
+ {
+ // Nothing left upstream: report end of stream without touching the term.
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
+
+ // Only the first Length chars of the buffer belong to the current term.
+ charUtils.ToUpper(termAtt.Buffer(), 0, termAtt.Length);
+ return true;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
index 3827b36..c277918 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
@@ -1,323 +1,308 @@
using System;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
using Reader = System.IO.TextReader;
using Version = Lucene.Net.Util.LuceneVersion;
namespace Lucene.Net.Analysis.Ngram
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
/// <summary>
- /// Old version of <seealso cref="EdgeNGramTokenizer"/> which doesn't handle correctly
- /// supplementary characters.
- /// </summary>
- [Obsolete]
- public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
- {
- public const Side DEFAULT_SIDE = Side.FRONT;
- public const int DEFAULT_MAX_GRAM_SIZE = 1;
- public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
- private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
- private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
-
- /// <summary>
- /// Specifies which side of the input the n-gram should be generated from </summary>
- public enum Side
- {
-
- /// <summary>
- /// Get the n-gram from the front of the input </summary>
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- FRONT
- {
- public String getLabel() { return "front"
- }
- },
-
- /// <summary>
- /// Get the n-gram from the end of the input </summary>
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- BACK
- {
- public String getLabel()
- {
- return "back";
- }
- }
-
- public =
-
- // Get the appropriate Side from a string
- public static Side getSide(String sideName)
- {
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- if (FRONT.getLabel().equals(sideName))
- {
- return FRONT;
- }
-//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
- if (BACK.getLabel().equals(sideName))
- {
- return BACK;
- }
- return null;
- }
- }
-
- private int minGram;
- private int maxGram;
- private int gramSize;
- private Side side;
- private bool started;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private string inStr;
-
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) : base(input)
- {
- init(version, side, minGram, maxGram);
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) : base(factory, input)
- {
- init(version, side, minGram, maxGram);
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram) : this(version, input, Side.getSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- [Obsolete]
- public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, string sideLabel, int minGram, int maxGram) : this(version, factory, input, Side.getSide(sideLabel), minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) : this(version, input, Side.FRONT, minGram, maxGram)
- {
- }
-
- /// <summary>
- /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
- /// </summary>
- /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
- /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
- /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
- /// <param name="minGram"> the smallest n-gram to generate </param>
- /// <param name="maxGram"> the largest n-gram to generate </param>
- public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) : this(version, factory, input, Side.FRONT, minGram, maxGram)
- {
- }
-
- private void init(Version version, Side side, int minGram, int maxGram)
- {
- if (version == null)
- {
- throw new System.ArgumentException("version must not be null");
- }
-
- if (side == null)
- {
- throw new System.ArgumentException("sideLabel must be either front or back");
- }
-
- if (minGram < 1)
- {
- throw new System.ArgumentException("minGram must be greater than zero");
- }
-
- if (minGram > maxGram)
- {
- throw new System.ArgumentException("minGram must not be greater than maxGram");
- }
-
- if (version.onOrAfter(Version.LUCENE_44))
- {
- if (side == Side.BACK)
- {
- throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
- }
- }
- else
- {
- maxGram = Math.Min(maxGram, 1024);
- }
-
- this.minGram = minGram;
- this.maxGram = maxGram;
- this.side = side;
- }
-
- /// <summary>
- /// Returns the next token in the stream, or null at EOS. </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- clearAttributes();
- // if we are just starting, read the whole input
- if (!started)
- {
- started = true;
- gramSize = minGram;
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int limit = side == Side.FRONT ? maxGram : 1024;
- int limit = side == Side.FRONT ? maxGram : 1024;
- char[] chars = new char[Math.Min(1024, limit)];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- bool exhausted = false;
- while (charsRead < limit)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int inc = input.read(chars, charsRead, chars.length-charsRead);
- int inc = input.read(chars, charsRead, chars.Length - charsRead);
- if (inc == -1)
- {
- exhausted = true;
- break;
- }
- charsRead += inc;
- if (charsRead == chars.Length && charsRead < limit)
- {
- chars = ArrayUtil.grow(chars);
- }
- }
-
- inStr = new string(chars, 0, charsRead);
- inStr = inStr.Trim();
-
- if (!exhausted)
- {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- char[] throwaway = new char[1024];
- while (true)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int inc = input.read(throwaway, 0, throwaway.length);
- int inc = input.read(throwaway, 0, throwaway.Length);
- if (inc == -1)
- {
- break;
- }
- charsRead += inc;
- }
- }
-
- inLen = inStr.length();
- if (inLen == 0)
- {
- return false;
- }
- posIncrAtt.PositionIncrement = 1;
- }
- else
- {
- posIncrAtt.PositionIncrement = 0;
- }
-
- // if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen)
- {
- return false;
- }
-
- // if we have hit the end of our n-gram size range, quit
- if (gramSize > maxGram || gramSize > inLen)
- {
- return false;
- }
-
- // grab gramSize chars from front or back
- int start = side == Side.FRONT ? 0 : inLen - gramSize;
- int end = start + gramSize;
- termAtt.setEmpty().append(inStr, start, end);
- offsetAtt.setOffset(correctOffset(start), correctOffset(end));
- gramSize++;
- return true;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void end() throws java.io.IOException
- public override void end()
- {
- base.end();
- // set final offset
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int finalOffset = correctOffset(charsRead);
- int finalOffset = correctOffset(charsRead);
- this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
- public override void reset()
- {
- base.reset();
- started = false;
- }
-}
+ /// Old version of <seealso cref="EdgeNGramTokenizer"/> which doesn't handle correctly
+ /// supplementary characters.
+ /// </summary>
+ [Obsolete]
+ public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
+ {
+     public const Side DEFAULT_SIDE = Side.FRONT;
+     public const int DEFAULT_MAX_GRAM_SIZE = 1;
+     public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+     // Token attributes. They are registered through AddAttribute<T>() inside init(),
+     // which is an ordinary method shared by the constructors, so they cannot be readonly.
+     private ICharTermAttribute termAtt;
+     private IOffsetAttribute offsetAtt;
+     private IPositionIncrementAttribute posIncrAtt;
+
+     /// <summary>
+     /// Specifies which side of the input the n-gram should be generated from </summary>
+     public enum Side
+     {
+
+         /// <summary>
+         /// Get the n-gram from the front of the input </summary>
+         FRONT,
+
+         /// <summary>
+         /// Get the n-gram from the end of the input </summary>
+         BACK,
+     }
+
+     /// <summary>
+     /// Returns the textual label ("front" / "back") of a <see cref="Side"/>,
+     /// or null for a value that is neither FRONT nor BACK. </summary>
+     private static string GetSideLabel(Side side)
+     {
+         if (side == Side.FRONT) return "front";
+         if (side == Side.BACK) return "back";
+         return null;
+     }
+
+
+     /// <summary>
+     /// Gets the <see cref="Side"/> whose label equals <paramref name="sideName"/>,
+     /// or null when the name matches neither "front" nor "back". </summary>
+     internal static Side? GetSide(String sideName)
+     {
+         if (GetSideLabel(Side.FRONT).Equals(sideName))
+         {
+             return Side.FRONT;
+         }
+         if (GetSideLabel(Side.BACK).Equals(sideName))
+         {
+             return Side.BACK;
+         }
+         return null;
+     }
+
+     /// <summary>
+     /// Resolves a side label to a <see cref="Side"/> for the string-label constructors.
+     /// <see cref="GetSide"/> returns a nullable value, which cannot be passed where a
+     /// <see cref="Side"/> is expected, so the unknown-label case is rejected here. </summary>
+     /// <exception cref="System.ArgumentException"> if the label is neither "front" nor "back" </exception>
+     private static Side ParseSide(string sideLabel)
+     {
+         Side? side = GetSide(sideLabel);
+         if (!side.HasValue)
+         {
+             throw new System.ArgumentException("sideLabel must be either front or back");
+         }
+         return side.Value;
+     }
+
+     private int minGram;
+     private int maxGram;
+     private int gramSize;
+     private Side side;
+     private bool started;
+     private int inLen; // length of the input AFTER trim()
+     private int charsRead; // length of the input
+     private string inStr;
+
+
+     /// <summary>
+     /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+     /// </summary>
+     /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+     /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+     /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
+     /// <param name="minGram"> the smallest n-gram to generate </param>
+     /// <param name="maxGram"> the largest n-gram to generate </param>
+     [Obsolete]
+     public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram)
+         : base(input)
+     {
+         init(version, side, minGram, maxGram);
+     }
+
+     /// <summary>
+     /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+     /// </summary>
+     /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+     /// <param name="factory"> the <see cref="AttributeFactory"/> to use </param>
+     /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+     /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
+     /// <param name="minGram"> the smallest n-gram to generate </param>
+     /// <param name="maxGram"> the largest n-gram to generate </param>
+     [Obsolete]
+     public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram)
+         : base(factory, input)
+     {
+         init(version, side, minGram, maxGram);
+     }
+
+     /// <summary>
+     /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+     /// </summary>
+     /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+     /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+     /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
+     /// <param name="minGram"> the smallest n-gram to generate </param>
+     /// <param name="maxGram"> the largest n-gram to generate </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="sideLabel"/> is neither "front" nor "back" </exception>
+     [Obsolete]
+     public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram)
+         : this(version, input, ParseSide(sideLabel), minGram, maxGram)
+     {
+     }
+
+     /// <summary>
+     /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+     /// </summary>
+     /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+     /// <param name="factory"> the <see cref="AttributeFactory"/> to use </param>
+     /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+     /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
+     /// <param name="minGram"> the smallest n-gram to generate </param>
+     /// <param name="maxGram"> the largest n-gram to generate </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="sideLabel"/> is neither "front" nor "back" </exception>
+     [Obsolete]
+     public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, string sideLabel, int minGram, int maxGram)
+         : this(version, factory, input, ParseSide(sideLabel), minGram, maxGram)
+     {
+     }
+
+     /// <summary>
+     /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+     /// </summary>
+     /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+     /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+     /// <param name="minGram"> the smallest n-gram to generate </param>
+     /// <param name="maxGram"> the largest n-gram to generate </param>
+     public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram)
+         : this(version, input, Side.FRONT, minGram, maxGram)
+     {
+     }
+
+     /// <summary>
+     /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+     /// </summary>
+     /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+     /// <param name="factory"> the <see cref="AttributeFactory"/> to use </param>
+     /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+     /// <param name="minGram"> the smallest n-gram to generate </param>
+     /// <param name="maxGram"> the largest n-gram to generate </param>
+     public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram)
+         : this(version, factory, input, Side.FRONT, minGram, maxGram)
+     {
+     }
+
+     /// <summary>
+     /// Validates the arguments, stores the configuration and registers the token
+     /// attributes. Called by both primary constructors. </summary>
+     /// <exception cref="System.ArgumentException"> if version is null, minGram is below 1,
+     /// minGram exceeds maxGram, or Side.BACK is requested with version 4.4 or later </exception>
+     private void init(Version version, Side side, int minGram, int maxGram)
+     {
+         if (version == null)
+         {
+             throw new System.ArgumentException("version must not be null");
+         }
+
+         // 'side' is a non-nullable enum here, so it needs no null check; unknown
+         // side labels are rejected in ParseSide before this method is reached.
+
+         if (minGram < 1)
+         {
+             throw new System.ArgumentException("minGram must be greater than zero");
+         }
+
+         if (minGram > maxGram)
+         {
+             throw new System.ArgumentException("minGram must not be greater than maxGram");
+         }
+
+         if (version.OnOrAfter(Version.LUCENE_44))
+         {
+             if (side == Side.BACK)
+             {
+                 throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
+             }
+         }
+         else
+         {
+             // older match versions cap the gram size at 1024 chars
+             maxGram = Math.Min(maxGram, 1024);
+         }
+
+         this.minGram = minGram;
+         this.maxGram = maxGram;
+         this.side = side;
+
+         // Without these registrations the attribute fields stay null and the first
+         // IncrementToken()/End() call fails with a NullReferenceException.
+         this.termAtt = AddAttribute<ICharTermAttribute>();
+         this.offsetAtt = AddAttribute<IOffsetAttribute>();
+         this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+     }
+
+     /// <summary>
+     /// Advances to the next n-gram. On the first call after a reset the input is read
+     /// (up to maxGram chars for FRONT, up to 1024 for BACK) and trimmed; each call then
+     /// emits one gram, growing from minGram up to maxGram. </summary>
+     /// <returns> true if a gram was produced, false when the input is empty or
+     /// no further gram fits </returns>
+     public override bool IncrementToken()
+     {
+         ClearAttributes();
+         // if we are just starting, read the whole input
+         if (!started)
+         {
+             started = true;
+             gramSize = minGram;
+             int limit = side == Side.FRONT ? maxGram : 1024;
+             char[] chars = new char[Math.Min(1024, limit)];
+             charsRead = 0;
+             // TODO: refactor to a shared readFully somewhere:
+             bool exhausted = false;
+             while (charsRead < limit)
+             {
+                 int inc = input.Read(chars, charsRead, chars.Length - charsRead);
+                 if (inc <= 0)
+                 {
+                     exhausted = true;
+                     break;
+                 }
+                 charsRead += inc;
+                 if (charsRead == chars.Length && charsRead < limit)
+                 {
+                     chars = ArrayUtil.Grow(chars);
+                 }
+             }
+
+             inStr = new string(chars, 0, charsRead);
+             inStr = inStr.Trim();
+
+             if (!exhausted)
+             {
+                 // Read extra throwaway chars so that on End() we
+                 // report the correct offset:
+                 var throwaway = new char[1024];
+                 while (true)
+                 {
+                     int inc = input.Read(throwaway, 0, throwaway.Length);
+                     if (inc <= 0)
+                     {
+                         break;
+                     }
+                     charsRead += inc;
+                 }
+             }
+
+             inLen = inStr.Length;
+             if (inLen == 0)
+             {
+                 return false;
+             }
+             // the first gram of an input advances the position ...
+             posIncrAtt.PositionIncrement = 1;
+         }
+         else
+         {
+             // ... every further gram is stacked on the same position
+             posIncrAtt.PositionIncrement = 0;
+         }
+
+         // quit when the gram size range is exhausted or the (trimmed) input is
+         // too short to supply another gram
+         if (gramSize > maxGram || gramSize > inLen)
+         {
+             return false;
+         }
+
+         // grab gramSize chars from front or back
+         int start = side == Side.FRONT ? 0 : inLen - gramSize;
+         int end = start + gramSize;
+         termAtt.SetEmpty().Append(inStr, start, end);
+         offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+         gramSize++;
+         return true;
+     }
+
+     /// <summary>
+     /// Sets the final offset to the corrected number of chars read from the input. </summary>
+     public override void End()
+     {
+         base.End();
+         // set final offset
+         int finalOffset = CorrectOffset(charsRead);
+         this.offsetAtt.SetOffset(finalOffset, finalOffset);
+     }
+
+     /// <summary>
+     /// Resets the tokenizer so the next IncrementToken() call re-reads the input. </summary>
+     public override void Reset()
+     {
+         base.Reset();
+         started = false;
+     }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
index de32d23..0dd0529 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicAnalyzer.cs
@@ -1,164 +1,149 @@
using Lucene.Net.Analysis.Core;
-using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Util;
-using StopwordAnalyzerBase = Lucene.Net.Analysis.Util.StopwordAnalyzerBase;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- using org.apache.lucene.analysis;
- using LowerCaseFilter = LowerCaseFilter;
- using StopAnalyzer = StopAnalyzer;
- using StopFilter = StopFilter;
- using CharArraySet = CharArraySet;
- using StopwordAnalyzerBase = StopwordAnalyzerBase;
- using WordlistLoader = WordlistLoader;
- using Version = org.apache.lucene.util.Version;
-
-
- /// <summary>
- /// Filters <seealso cref="ClassicTokenizer"/> with <seealso cref="ClassicFilter"/>, {@link
- /// LowerCaseFilter} and <seealso cref="StopFilter"/>, using a list of
- /// English stop words.
- ///
- /// <a name="version"/>
- /// <para>You must specify the required <seealso cref="Version"/>
- /// compatibility when creating ClassicAnalyzer:
- /// <ul>
- /// <li> As of 3.1, StopFilter correctly handles Unicode 4.0
- /// supplementary characters in stopwords
- /// <li> As of 2.9, StopFilter preserves position
- /// increments
- /// <li> As of 2.4, Tokens incorrectly identified as acronyms
- /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
- /// </ul>
- ///
- /// ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
- /// As of 3.1, <seealso cref="StandardAnalyzer"/> implements Unicode text segmentation,
- /// as specified by UAX#29.
- /// </para>
- /// </summary>
- public sealed class ClassicAnalyzer : StopwordAnalyzerBase
- {
-
- /// <summary>
- /// Default maximum allowed token length </summary>
- public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
-
- private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
-
- /// <summary>
- /// An unmodifiable set containing some common English words that are usually not
- /// useful for searching.
- /// </summary>
- public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-
- /// <summary>
- /// Builds an analyzer with the given stop words. </summary>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- /// <param name="stopWords"> stop words </param>
- public ClassicAnalyzer(Version matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
- {
- }
-
- /// <summary>
- /// Builds an analyzer with the default stop words ({@link
- /// #STOP_WORDS_SET}). </summary>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- public ClassicAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
- {
- }
-
- /// <summary>
- /// Builds an analyzer with the stop words from the given reader. </summary>
- /// <seealso cref= WordlistLoader#getWordSet(Reader, Version) </seealso>
- /// <param name="matchVersion"> Lucene version to match See {@link
- /// <a href="#version">above</a>} </param>
- /// <param name="stopwords"> Reader to read stop words from </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public ClassicAnalyzer(org.apache.lucene.util.Version matchVersion, java.io.Reader stopwords) throws java.io.IOException
- public ClassicAnalyzer(Version matchVersion, Reader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
- {
- }
-
- /// <summary>
- /// Set maximum allowed token length. If a token is seen
- /// that exceeds this length then it is discarded. This
- /// setting only takes effect the next time tokenStream or
- /// tokenStream is called.
- /// </summary>
- public int MaxTokenLength
- {
- set
- {
- maxTokenLength = value;
- }
- get
- {
- return maxTokenLength;
- }
- }
-
-
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: @Override protected TokenStreamComponents createComponents(final String fieldName, final java.io.Reader reader)
- protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
- ClassicTokenizer src = new ClassicTokenizer(matchVersion, reader);
- src.MaxTokenLength = maxTokenLength;
- TokenStream tok = new ClassicFilter(src);
- tok = new LowerCaseFilter(matchVersion, tok);
- tok = new StopFilter(matchVersion, tok, stopwords);
- return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
- }
-
- private class TokenStreamComponentsAnonymousInnerClassHelper : TokenStreamComponents
- {
- private readonly ClassicAnalyzer outerInstance;
-
- private Reader reader;
- private org.apache.lucene.analysis.standard.ClassicTokenizer src;
-
- public TokenStreamComponentsAnonymousInnerClassHelper(ClassicAnalyzer outerInstance, org.apache.lucene.analysis.standard.ClassicTokenizer src, TokenStream tok, Reader reader) : base(src, tok)
- {
- this.outerInstance = outerInstance;
- this.reader = reader;
- this.src = src;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override protected void setReader(final java.io.Reader reader) throws java.io.IOException
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
- protected internal override Reader Reader
- {
- set
- {
- src.MaxTokenLength = outerInstance.maxTokenLength;
- base.Reader = value;
- }
- }
- }
- }
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Filters <see cref="ClassicTokenizer"/> with <see cref="ClassicFilter"/>,
+ /// <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>, using a list of
+ /// English stop words.
+ ///
+ /// <a name="version"/>
+ /// <para>You must specify the required <see cref="LuceneVersion"/>
+ /// compatibility when creating ClassicAnalyzer:
+ /// <ul>
+ /// <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+ /// supplementary characters in stopwords</li>
+ /// <li> As of 2.9, StopFilter preserves position
+ /// increments</li>
+ /// <li> As of 2.4, Tokens incorrectly identified as acronyms
+ /// are corrected (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</li>
+ /// </ul>
+ ///
+ /// ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1.
+ /// As of 3.1, <see cref="StandardAnalyzer"/> implements Unicode text segmentation,
+ /// as specified by UAX#29.
+ /// </para>
+ /// </summary>
+ public sealed class ClassicAnalyzer : StopwordAnalyzerBase
+ {
+
+ /// <summary>
+ /// Default maximum allowed token length </summary>
+ public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+ private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary>
+ /// An unmodifiable set containing some common English words that are usually not
+ /// useful for searching.
+ /// </summary>
+ public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words. </summary>
+ /// <param name="matchVersion"> Lucene version to match; see
+ /// <a href="#version">above</a> </param>
+ /// <param name="stopWords"> stop words </param>
+ public ClassicAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords)
+ : base(matchVersion, stopWords)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the default stop words
+ /// (<see cref="STOP_WORDS_SET"/>). </summary>
+ /// <param name="matchVersion"> Lucene version to match; see
+ /// <a href="#version">above</a> </param>
+ public ClassicAnalyzer(LuceneVersion matchVersion)
+ : this(matchVersion, STOP_WORDS_SET)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the stop words from the given reader. </summary>
+ /// <remarks> See also WordlistLoader#getWordSet(Reader, Version) in the Java original. </remarks>
+ /// <param name="matchVersion"> Lucene version to match; see
+ /// <a href="#version">above</a> </param>
+ /// <param name="stopwords"> Reader to read stop words from </param>
+ public ClassicAnalyzer(LuceneVersion matchVersion, Reader stopwords)
+ : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
+ {
+ }
+
+ /// <summary>
+ /// Gets or sets the maximum allowed token length. If a token is seen
+ /// that exceeds this length then it is discarded. A new value
+ /// only takes effect the next time components are created or
+ /// a new reader is set on existing components.
+ /// </summary>
+ public int MaxTokenLength
+ {
+ set
+ {
+ maxTokenLength = value;
+ }
+ get
+ {
+ return maxTokenLength;
+ }
+ }
+
+ // Builds the chain ClassicTokenizer (with maxTokenLength) -> ClassicFilter -> LowerCaseFilter -> StopFilter.
+ public override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
+ {
+ var src = new ClassicTokenizer(matchVersion, reader);
+ src.MaxTokenLength = maxTokenLength;
+ TokenStream tok = new ClassicFilter(src);
+ tok = new LowerCaseFilter(matchVersion, tok);
+ tok = new StopFilter(matchVersion, tok, stopwords);
+ return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
+ }
+ // Components that re-apply the analyzer's current maxTokenLength to the tokenizer whenever a reader is set.
+ private class TokenStreamComponentsAnonymousInnerClassHelper : TokenStreamComponents
+ {
+ private readonly ClassicAnalyzer outerInstance;
+
+ private Reader reader;
+ private ClassicTokenizer src;
+
+ public TokenStreamComponentsAnonymousInnerClassHelper(ClassicAnalyzer outerInstance, ClassicTokenizer src, TokenStream tok, Reader reader)
+ : base(src, tok)
+ {
+ this.outerInstance = outerInstance;
+ this.reader = reader;
+ this.src = src;
+ }
+ // Setter-only override: copies outerInstance.maxTokenLength onto src, then forwards the reader to the base.
+ protected override Reader Reader
+ {
+ set
+ {
+ src.MaxTokenLength = outerInstance.maxTokenLength;
+ base.Reader = value;
+ }
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
index 9ee4b32..60bd1dd 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilter.cs
@@ -1,92 +1,85 @@
-namespace org.apache.lucene.analysis.standard
-{
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+using Lucene.Net.Analysis.Tokenattributes;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+namespace Lucene.Net.Analysis.Standard
+{
- /// <summary>
- /// Normalizes tokens extracted with <seealso cref="ClassicTokenizer"/>. </summary>
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Normalizes tokens extracted with <seealso cref="ClassicTokenizer"/>. </summary>
- public class ClassicFilter : TokenFilter
- {
+ public class ClassicFilter : TokenFilter
+ {
- /// <summary>
- /// Construct filtering <i>in</i>. </summary>
- public ClassicFilter(TokenStream @in) : base(@in)
- {
- }
+ /// <summary>
+ /// Construct filtering <i>in</i>. </summary>
+ public ClassicFilter(TokenStream @in)
+ : base(@in)
+ {
+ typeAtt = AddAttribute<ITypeAttribute>();
+ termAtt = AddAttribute<ICharTermAttribute>();
+ }
- private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
- private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+ private static readonly string APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+ private static readonly string ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
- // this filters uses attribute type
- private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+ // this filter uses the type attribute (and the term attribute)
+ private readonly ITypeAttribute typeAtt;
+ private readonly ICharTermAttribute termAtt;
- /// <summary>
- /// Returns the next token in the stream, or null at EOS.
- /// <para>Removes <tt>'s</tt> from the end of words.
- /// </para>
- /// <para>Removes dots from acronyms.
- /// </para>
- /// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- if (!input.incrementToken())
- {
- return false;
- }
+ /// <summary>
+ /// Advances to the next token in the stream; returns false at EOS.
+ /// <para>Removes <tt>'s</tt> from the end of words.
+ /// </para>
+ /// <para>Removes dots from acronyms.
+ /// </para>
+ /// </summary>
+ public override bool IncrementToken()
+ {
+ if (!input.IncrementToken())
+ {
+ return false;
+ }
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final char[] buffer = termAtt.buffer();
- char[] buffer = termAtt.buffer();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int bufferLength = termAtt.length();
- int bufferLength = termAtt.length();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String type = typeAtt.type();
- string type = typeAtt.type();
+ char[] buffer = termAtt.Buffer();
+ int bufferLength = termAtt.Length;
+ string type = typeAtt.Type;
- if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) // remove 's
- {
- // Strip last 2 characters off
- termAtt.Length = bufferLength - 2;
- } // remove dots
- else if (type == ACRONYM_TYPE)
- {
- int upto = 0;
- for (int i = 0;i < bufferLength;i++)
- {
- char c = buffer[i];
- if (c != '.')
- {
- buffer[upto++] = c;
- }
- }
- termAtt.Length = upto;
- }
+ if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) // remove 's
+ {
+ // Strip last 2 characters off
+ termAtt.Length = bufferLength - 2;
+ } // remove dots
+ else if (type == ACRONYM_TYPE)
+ {
+ int upto = 0;
+ for (int i = 0; i < bufferLength; i++)
+ {
+ char c = buffer[i];
+ if (c != '.')
+ {
+ buffer[upto++] = c;
+ }
+ }
+ termAtt.Length = upto;
+ }
- return true;
- }
- }
+ return true;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
index 2107ccc..45d7cd0 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicFilterFactory.cs
@@ -1,55 +1,53 @@
using System.Collections.Generic;
-using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+using Lucene.Net.Analysis.Util;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Factory for <seealso cref="ClassicFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.ClassicTokenizerFactory"/>
+ /// <filter class="solr.ClassicFilterFactory"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class ClassicFilterFactory : TokenFilterFactory
+ {
- using TokenFilterFactory = TokenFilterFactory;
+ /// <summary>
+ /// Creates a new ClassicFilterFactory. </summary>
+ /// <param name="args"> factory arguments; forwarded to the base constructor before being checked here </param>
+ /// <exception cref="System.ArgumentException"> if <paramref name="args"/> still contains entries after the base constructor has run </exception>
+ public ClassicFilterFactory(IDictionary<string, string> args)
+ : base(args)
+ {
+ if (args.Count > 0)
+ {
+ // Concatenating the dictionary itself would only print its type name in .NET,
+ // so list the offending entries explicitly (rendered as "[key, value]").
+ throw new System.ArgumentException("Unknown parameters: " + string.Join<KeyValuePair<string, string>>(", ", args));
+ }
+ }
- /// <summary>
- /// Factory for <seealso cref="ClassicFilter"/>.
- /// <pre class="prettyprint">
- /// <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
- /// <analyzer>
- /// <tokenizer class="solr.ClassicTokenizerFactory"/>
- /// <filter class="solr.ClassicFilterFactory"/>
- /// </analyzer>
- /// </fieldType></pre>
- /// </summary>
- public class ClassicFilterFactory : TokenFilterFactory
- {
-
- /// <summary>
- /// Creates a new ClassicFilterFactory </summary>
- public ClassicFilterFactory(IDictionary<string, string> args) : base(args)
- {
- if (args.Count > 0)
- {
- throw new System.ArgumentException("Unknown parameters: " + args);
- }
- }
-
- public override TokenFilter create(TokenStream input)
- {
- return new ClassicFilter(input);
- }
- }
+ /// <summary>
+ /// Wraps <paramref name="input"/> in a new <see cref="ClassicFilter"/>. </summary>
+ /// <param name="input"> the token stream to be filtered </param>
+ /// <returns> the newly created <see cref="ClassicFilter"/> </returns>
+ public override TokenStream Create(TokenStream input)
+ {
+ TokenStream filtered = new ClassicFilter(input);
+ return filtered;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
index f9c680e..3ef7a9e 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizer.cs
@@ -15,198 +15,185 @@
* limitations under the License.
*/
-using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
-namespace org.apache.lucene.analysis.standard
+namespace Lucene.Net.Analysis.Standard
{
-
-
- using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
- using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- using Version = org.apache.lucene.util.Version;
-
- /// <summary>
- /// A grammar-based tokenizer constructed with JFlex
- ///
- /// <para> This should be a good tokenizer for most European-language documents:
- ///
- /// <ul>
- /// <li>Splits words at punctuation characters, removing punctuation. However, a
- /// dot that's not followed by whitespace is considered part of a token.
- /// <li>Splits words at hyphens, unless there's a number in the token, in which case
- /// the whole token is interpreted as a product number and is not split.
- /// <li>Recognizes email addresses and internet hostnames as one token.
- /// </ul>
- ///
- /// </para>
- /// <para>Many applications have specific tokenizer needs. If this tokenizer does
- /// not suit your application, please consider copying this source code
- /// directory to your project and maintaining your own grammar-based tokenizer.
- ///
- /// ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
- /// As of 3.1, <seealso cref="StandardTokenizer"/> implements Unicode text segmentation,
- /// as specified by UAX#29.
- /// </para>
- /// </summary>
-
- public sealed class ClassicTokenizer : Tokenizer
- {
- /// <summary>
- /// A private instance of the JFlex-constructed scanner </summary>
- private StandardTokenizerInterface scanner;
-
- public const int ALPHANUM = 0;
- public const int APOSTROPHE = 1;
- public const int ACRONYM = 2;
- public const int COMPANY = 3;
- public const int EMAIL = 4;
- public const int HOST = 5;
- public const int NUM = 6;
- public const int CJ = 7;
-
- public const int ACRONYM_DEP = 8;
-
- /// <summary>
- /// String token types that correspond to token type int constants </summary>
- public static readonly string[] TOKEN_TYPES = new string [] {"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
-
- private int skippedPositions;
-
- private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
-
- /// <summary>
- /// Set the max allowed token length. Any token longer
- /// than this is skipped.
- /// </summary>
- public int MaxTokenLength
- {
- set
- {
- if (value < 1)
- {
- throw new System.ArgumentException("maxTokenLength must be greater than zero");
- }
- this.maxTokenLength = value;
- }
- get
- {
- return maxTokenLength;
- }
- }
-
-
- /// <summary>
- /// Creates a new instance of the <seealso cref="ClassicTokenizer"/>. Attaches
- /// the <code>input</code> to the newly created JFlex scanner.
- /// </summary>
- /// <param name="input"> The input reader
- ///
- /// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
- public ClassicTokenizer(Version matchVersion, Reader input) : base(input)
- {
- init(matchVersion);
- }
-
- /// <summary>
- /// Creates a new ClassicTokenizer with a given <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>
- /// </summary>
- public ClassicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) : base(factory, input)
- {
- init(matchVersion);
- }
-
- private void init(Version matchVersion)
- {
- this.scanner = new ClassicTokenizerImpl(input);
- }
-
- // this tokenizer generates three attributes:
- // term offset, positionIncrement and type
- private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
- private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
- private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
- private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
- public override bool incrementToken()
- {
- clearAttributes();
- skippedPositions = 0;
-
- while (true)
- {
- int tokenType = scanner.NextToken;
-
- if (tokenType == StandardTokenizerInterface_Fields.YYEOF)
- {
- return false;
- }
-
- if (scanner.yylength() <= maxTokenLength)
- {
- posIncrAtt.PositionIncrement = skippedPositions + 1;
- scanner.getText(termAtt);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int start = scanner.yychar();
- int start = scanner.yychar();
- offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
-
- if (tokenType == ClassicTokenizer.ACRONYM_DEP)
- {
- typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
- termAtt.Length = termAtt.length() - 1; // remove extra '.'
- }
- else
- {
- typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
- }
- return true;
- }
- else
- // When we skip a too-long term, we still increment the
- // position increment
- {
- skippedPositions++;
- }
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public final void end() throws java.io.IOException
- public override void end()
- {
- base.end();
- // set final offset
- int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
- offsetAtt.setOffset(finalOffset, finalOffset);
- // adjust any skipped tokens
- posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void close() throws java.io.IOException
- public override void close()
- {
- base.close();
- scanner.yyreset(input);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
- public override void reset()
- {
- base.reset();
- scanner.yyreset(input);
- skippedPositions = 0;
- }
- }
+ /// <summary>
+ /// A grammar-based tokenizer constructed with JFlex.
+ ///
+ /// <para> This should be a good tokenizer for most European-language documents:
+ ///
+ /// <list type="bullet">
+ /// <item><description>Splits words at punctuation characters, removing punctuation. However, a
+ /// dot that's not followed by whitespace is considered part of a token.</description></item>
+ /// <item><description>Splits words at hyphens, unless there's a number in the token, in which case
+ /// the whole token is interpreted as a product number and is not split.</description></item>
+ /// <item><description>Recognizes email addresses and internet hostnames as one token.</description></item>
+ /// </list>
+ ///
+ /// </para>
+ /// <para>Many applications have specific tokenizer needs. If this tokenizer does
+ /// not suit your application, please consider copying this source code
+ /// directory to your project and maintaining your own grammar-based tokenizer.
+ ///
+ /// ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1.
+ /// As of 3.1, <see cref="StandardTokenizer"/> implements Unicode text segmentation,
+ /// as specified by UAX#29.
+ /// </para>
+ /// </summary>
+ public sealed class ClassicTokenizer : Tokenizer
+ {
+ /// <summary>
+ /// A private instance of the JFlex-constructed scanner </summary>
+ private StandardTokenizerInterface scanner;
+
+ // Token type codes; each one is also the index of its name in TOKEN_TYPES.
+ public const int ALPHANUM = 0;
+ public const int APOSTROPHE = 1;
+ public const int ACRONYM = 2;
+ public const int COMPANY = 3;
+ public const int EMAIL = 4;
+ public const int HOST = 5;
+ public const int NUM = 6;
+ public const int CJ = 7;
+
+ // Tokens of this type are re-typed as HOST by IncrementToken() and lose their last character.
+ public const int ACRONYM_DEP = 8;
+
+ /// <summary>
+ /// String token types that correspond to token type int constants </summary>
+ public static readonly string[] TOKEN_TYPES = new string[] { "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>" };
+
+ // Number of over-long tokens skipped since the last token was emitted.
+ private int skippedPositions;
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+ /// <summary>
+ /// Gets or sets the max allowed token length. Any token longer
+ /// than this is skipped.
+ /// </summary>
+ /// <exception cref="System.ArgumentException"> if the value being set is less than 1 </exception>
+ public int MaxTokenLength
+ {
+ get
+ {
+ return maxTokenLength;
+ }
+ set
+ {
+ if (value < 1)
+ {
+ throw new System.ArgumentException("maxTokenLength must be greater than zero");
+ }
+ this.maxTokenLength = value;
+ }
+ }
+
+ /// <summary>
+ /// Creates a new instance of the <see cref="ClassicTokenizer"/>. Attaches
+ /// the <code>input</code> to the newly created JFlex scanner.
+ /// <para>See http://issues.apache.org/jira/browse/LUCENE-1068</para>
+ /// </summary>
+ /// <param name="matchVersion"> Lucene compatibility version; accepted but not read by this class </param>
+ /// <param name="input"> The input reader </param>
+ public ClassicTokenizer(LuceneVersion matchVersion, Reader input)
+ : base(input)
+ {
+ Init(matchVersion);
+ }
+
+ /// <summary>
+ /// Creates a new ClassicTokenizer with a given <see cref="AttributeSource.AttributeFactory"/>
+ /// </summary>
+ /// <param name="matchVersion"> Lucene compatibility version; accepted but not read by this class </param>
+ /// <param name="factory"> attribute factory handed to the base tokenizer </param>
+ /// <param name="input"> The input reader </param>
+ public ClassicTokenizer(LuceneVersion matchVersion, AttributeFactory factory, Reader input)
+ : base(factory, input)
+ {
+ Init(matchVersion);
+ }
+
+ /// <summary>
+ /// Shared constructor logic: creates the scanner over the current input and
+ /// registers the attributes this tokenizer populates.
+ /// </summary>
+ private void Init(LuceneVersion matchVersion)
+ {
+ this.scanner = new ClassicTokenizerImpl(input);
+
+ // The attribute fields must be assigned before IncrementToken()/End() run;
+ // leaving them unassigned makes every use below throw a NullReferenceException.
+ // NOTE(review): assumes the I*Attribute interfaces live in Lucene.Net.Analysis.Tokenattributes
+ // alongside ICharTermAttribute - confirm against Lucene.Net.Core.
+ termAtt = AddAttribute<ICharTermAttribute>();
+ offsetAtt = AddAttribute<IOffsetAttribute>();
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ typeAtt = AddAttribute<ITypeAttribute>();
+ }
+
+ // this tokenizer generates four attributes:
+ // term, offset, positionIncrement and type
+ // (not readonly: they are assigned in Init(), which runs from both constructors)
+ private ICharTermAttribute termAtt;
+ private IOffsetAttribute offsetAtt;
+ private IPositionIncrementAttribute posIncrAtt;
+ private ITypeAttribute typeAtt;
+
+ /// <summary>
+ /// Advances to the next token whose length does not exceed <see cref="MaxTokenLength"/>,
+ /// filling in the term, offset, position increment and type attributes.
+ /// </summary>
+ /// <returns> <c>true</c> if a token was produced; <c>false</c> once the scanner reports end of input </returns>
+ public override bool IncrementToken()
+ {
+ ClearAttributes();
+ skippedPositions = 0;
+
+ while (true)
+ {
+ int tokenType = scanner.NextToken;
+
+ if (tokenType == StandardTokenizerInterface_Fields.YYEOF)
+ {
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength)
+ {
+ posIncrAtt.PositionIncrement = skippedPositions + 1;
+ scanner.getText(termAtt);
+
+ // Offsets are computed from the full scanned text, before any trimming below.
+ int start = scanner.yychar();
+ offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
+
+ if (tokenType == ClassicTokenizer.ACRONYM_DEP)
+ {
+ typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
+ termAtt.Length = termAtt.Length - 1; // remove extra '.'
+ }
+ else
+ {
+ typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
+ }
+ return true;
+ }
+ else
+ {
+ // When we skip a too-long term, we still increment the
+ // position increment
+ skippedPositions++;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Sets the final offset to the end of the last scanned text and adds any
+ /// positions skipped since the last emitted token to the position increment.
+ /// </summary>
+ public override void End()
+ {
+ base.End();
+ // set final offset
+ int finalOffset = CorrectOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.SetOffset(finalOffset, finalOffset);
+ // adjust any skipped tokens
+ posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
+ }
+
+ /// <summary>
+ /// Disposes the base tokenizer, then resets the scanner onto the current input.
+ /// </summary>
+ public override void Dispose()
+ {
+ base.Dispose();
+ scanner.yyreset(input);
+ }
+
+ /// <summary>
+ /// Resets the base tokenizer, re-attaches the scanner to the current input and
+ /// clears the skipped-position counter.
+ /// </summary>
+ public override void Reset()
+ {
+ base.Reset();
+ scanner.yyreset(input);
+ skippedPositions = 0;
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b4eaf2fc/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
index 4d30289..f2ad424 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Standard/ClassicTokenizerImpl.cs
@@ -1,7 +1,9 @@
/* The following code was generated by JFlex 1.5.1 */
using System;
using System.IO;
+using Lucene.Net.Analysis.Tokenattributes;
using org.apache.lucene.analysis.standard;
+using Reader = System.IO.TextReader;
namespace Lucene.Net.Analysis.Standard
{
@@ -286,9 +288,9 @@ namespace Lucene.Net.Analysis.Standard
/// <summary>
/// Fills CharTermAttribute with the current token text.
/// </summary>
- public void getText(CharTermAttribute t)
+ public void getText(ICharTermAttribute t)
{
- t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+ t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
}
@@ -359,7 +361,7 @@ namespace Lucene.Net.Analysis.Standard
}
/* finally: fill the buffer with new input */
- int numRead = zzReader.read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead);
+ int numRead = zzReader.Read(zzBuffer, zzEndRead, zzBuffer.Length - zzEndRead);
if (numRead > 0)
{
@@ -369,7 +371,7 @@ namespace Lucene.Net.Analysis.Standard
// unlikely but not impossible: read 0 characters, but not at end of stream
if (numRead == 0)
{
- int c = zzReader.read();
+ int c = zzReader.Read();
if (c == -1)
{
return true;
@@ -389,8 +391,6 @@ namespace Lucene.Net.Analysis.Standard
/// <summary>
/// Closes the input stream.
/// </summary>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public final void yyclose() throws java.io.IOException
public void yyclose()
{
zzAtEOF = true; // indicate end of file
@@ -398,7 +398,7 @@ namespace Lucene.Net.Analysis.Standard
if (zzReader != null)
{
- zzReader.close();
+ zzReader.Close();
}
}