You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by mh...@apache.org on 2013/09/24 20:33:13 UTC
[37/50] [abbrv] git commit: Finish up Lucene.Net.Analysis.Core
Finish up Lucene.Net.Analysis.Core
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/98e877d5
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/98e877d5
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/98e877d5
Branch: refs/heads/branch_4x
Commit: 98e877d50c803e381fcc92250068f366c1dc6c4c
Parents: d72f5c1
Author: Paul Irwin <pa...@gmail.com>
Authored: Wed Aug 7 15:02:40 2013 -0400
Committer: Paul Irwin <pa...@gmail.com>
Committed: Wed Aug 7 15:02:40 2013 -0400
----------------------------------------------------------------------
src/contrib/Analyzers/Contrib.Analyzers.csproj | 136 ++++++-------------
src/contrib/Analyzers/Core/LetterTokenizer.cs | 4 +-
src/contrib/Analyzers/Core/LowerCaseFilter.cs | 34 +++++
.../Analyzers/Core/LowerCaseFilterFactory.cs | 31 +++++
.../Analyzers/Core/LowerCaseTokenizer.cs | 27 ++++
.../Analyzers/Core/LowerCaseTokenizerFactory.cs | 32 +++++
src/contrib/Analyzers/Core/SimpleAnalyzer.cs | 23 ++++
src/contrib/Analyzers/Core/StopAnalyzer.cs | 55 ++++++++
src/contrib/Analyzers/Core/StopFilter.cs | 53 ++++++++
src/contrib/Analyzers/Core/StopFilterFactory.cs | 81 +++++++++++
src/contrib/Analyzers/Core/TypeTokenFilter.cs | 34 +++++
.../Analyzers/Core/TypeTokenFilterFactory.cs | 63 +++++++++
.../Analyzers/Core/WhitespaceAnalyzer.cs | 23 ++++
.../Analyzers/Core/WhitespaceTokenizer.cs | 28 ++++
.../Core/WhitespaceTokenizerFactory.cs | 26 ++++
src/contrib/Analyzers/Support/AbstractSet.cs | 2 +-
.../Analyzers/Util/AbstractAnalysisFactory.cs | 11 +-
src/contrib/Analyzers/Util/CharArrayMap.cs | 73 +++++++++-
src/contrib/Analyzers/Util/CharArraySet.cs | 17 ++-
src/contrib/Analyzers/Util/CharTokenizer.cs | 4 +-
src/contrib/Analyzers/Util/CharacterUtils.cs | 2 +-
.../Analyzers/Util/FilteringTokenFilter.cs | 77 +++++++++++
.../Analyzers/Util/IMultiTermAwareComponent.cs | 12 ++
.../Analyzers/Util/IResourceLoaderAware.cs | 12 ++
.../Analyzers/Util/StopwordAnalyzerBase.cs | 10 +-
.../Analyzers/Util/TokenFilterFactory.cs | 44 ++++++
src/contrib/Analyzers/Util/WordlistLoader.cs | 4 +-
27 files changed, 792 insertions(+), 126 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Contrib.Analyzers.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 8613c88..74b0f63 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -103,116 +103,39 @@
<Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" />
</ItemGroup>
<ItemGroup>
- <Compile Include="AR\ArabicAnalyzer.cs" />
- <Compile Include="AR\ArabicLetterTokenizer.cs" />
- <Compile Include="AR\ArabicNormalizationFilter.cs" />
- <Compile Include="AR\ArabicNormalizer.cs" />
- <Compile Include="AR\ArabicStemFilter.cs" />
- <Compile Include="AR\ArabicStemmer.cs" />
- <Compile Include="BR\BrazilianAnalyzer.cs" />
- <Compile Include="BR\BrazilianStemFilter.cs" />
- <Compile Include="BR\BrazilianStemmer.cs" />
- <Compile Include="CJK\CJKAnalyzer.cs" />
- <Compile Include="CJK\CJKTokenizer.cs" />
- <Compile Include="Cn\ChineseAnalyzer.cs" />
- <Compile Include="Cn\ChineseFilter.cs" />
- <Compile Include="Cn\ChineseTokenizer.cs" />
- <Compile Include="Compound\CompoundWordTokenFilterBase.cs" />
- <Compile Include="Compound\DictionaryCompoundWordTokenFilter.cs" />
- <Compile Include="Compound\HyphenationCompoundWordTokenFilter.cs" />
- <Compile Include="Compound\Hyphenation\ByteVector.cs" />
- <Compile Include="Compound\Hyphenation\CharVector.cs" />
- <Compile Include="Compound\Hyphenation\Hyphen.cs" />
- <Compile Include="Compound\Hyphenation\Hyphenation.cs" />
- <Compile Include="Compound\Hyphenation\HyphenationException.cs" />
- <Compile Include="Compound\Hyphenation\HyphenationTree.cs" />
- <Compile Include="Compound\Hyphenation\PatternConsumer.cs" />
- <Compile Include="Compound\Hyphenation\PatternParser.cs" />
- <Compile Include="Compound\Hyphenation\TernaryTree.cs" />
<Compile Include="Core\KeywordAnalyzer.cs" />
<Compile Include="Core\KeywordTokenizer.cs" />
<Compile Include="Core\KeywordTokenizerFactory.cs" />
<Compile Include="Core\LetterTokenizer.cs" />
<Compile Include="Core\LetterTokenizerFactory.cs" />
- <Compile Include="Cz\CzechAnalyzer.cs" />
- <Compile Include="De\GermanAnalyzer.cs" />
- <Compile Include="De\GermanStemFilter.cs" />
- <Compile Include="De\GermanStemmer.cs" />
- <Compile Include="De\GermanDIN2Stemmer.cs" />
- <Compile Include="El\GreekAnalyzer.cs" />
- <Compile Include="El\GreekLowerCaseFilter.cs" />
- <Compile Include="Fa\PersianAnalyzer.cs" />
- <Compile Include="Fa\PersianNormalizationFilter.cs" />
- <Compile Include="Fa\PersianNormalizer.cs" />
- <Compile Include="Fr\ElisionFilter.cs" />
- <Compile Include="Fr\FrenchAnalyzer.cs" />
- <Compile Include="Fr\FrenchStemFilter.cs" />
- <Compile Include="Fr\FrenchStemmer.cs" />
- <Compile Include="Hunspell\HunspellAffix.cs" />
- <Compile Include="Hunspell\HunspellDictionary.cs" />
- <Compile Include="Hunspell\HunspellStem.cs" />
- <Compile Include="Hunspell\HunspellStemFilter.cs" />
- <Compile Include="Hunspell\HunspellStemmer.cs" />
- <Compile Include="Hunspell\HunspellWord.cs" />
- <Compile Include="Miscellaneous\EmptyTokenStream.cs" />
- <Compile Include="Miscellaneous\InjectablePrefixAwareTokenFilter.cs" />
- <Compile Include="Miscellaneous\PatternAnalyzer.cs" />
- <Compile Include="Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" />
- <Compile Include="Miscellaneous\PrefixAwareTokenStream.cs" />
- <Compile Include="Miscellaneous\SingleTokenTokenStream.cs" />
- <Compile Include="NGram\EdgeNGramTokenFilter.cs" />
- <Compile Include="NGram\EdgeNGramTokenizer.cs" />
- <Compile Include="NGram\NGramTokenFilter.cs" />
- <Compile Include="NGram\NGramTokenizer.cs" />
- <Compile Include="Nl\DutchAnalyzer.cs" />
- <Compile Include="Nl\DutchStemFilter.cs" />
- <Compile Include="Nl\DutchStemmer.cs" />
- <Compile Include="Payloads\AbstractEncoder.cs" />
- <Compile Include="Payloads\DelimitedPayloadTokenFilter.cs" />
- <Compile Include="Payloads\FloatEncoder.cs" />
- <Compile Include="Payloads\IdentityEncoder.cs" />
- <Compile Include="Payloads\IntegerEncoder.cs" />
- <Compile Include="Payloads\NumericPayloadTokenFilter.cs" />
- <Compile Include="Payloads\PayloadEncoder.cs" />
- <Compile Include="Payloads\PayloadHelper.cs" />
- <Compile Include="Payloads\TokenOffsetPayloadTokenFilter.cs" />
- <Compile Include="Payloads\TypeAsPayloadTokenFilter.cs" />
- <Compile Include="Position\PositionFilter.cs" />
- <Compile Include="Query\QueryAutoStopWordAnalyzer.cs" />
- <Compile Include="Reverse\ReverseStringFilter.cs" />
- <Compile Include="Ru\RussianAnalyzer.cs" />
- <Compile Include="Ru\RussianLetterTokenizer.cs" />
- <Compile Include="Ru\RussianLowerCaseFilter.cs" />
- <Compile Include="Ru\RussianStemFilter.cs" />
- <Compile Include="Ru\RussianStemmer.cs" />
+ <Compile Include="Core\LowerCaseFilter.cs" />
+ <Compile Include="Core\LowerCaseFilterFactory.cs" />
+ <Compile Include="Core\LowerCaseTokenizer.cs" />
+ <Compile Include="Core\LowerCaseTokenizerFactory.cs" />
+ <Compile Include="Core\SimpleAnalyzer.cs" />
+ <Compile Include="Core\StopAnalyzer.cs" />
+ <Compile Include="Core\StopFilter.cs" />
+ <Compile Include="Core\StopFilterFactory.cs" />
+ <Compile Include="Core\TypeTokenFilter.cs" />
+ <Compile Include="Core\TypeTokenFilterFactory.cs" />
+ <Compile Include="Core\WhitespaceAnalyzer.cs" />
+ <Compile Include="Core\WhitespaceTokenizer.cs" />
+ <Compile Include="Core\WhitespaceTokenizerFactory.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
- <Compile Include="Shingle\Matrix\Column.cs" />
- <Compile Include="Shingle\Matrix\Matrix.cs" />
- <Compile Include="Shingle\Matrix\MatrixPermutationIterator.cs" />
- <Compile Include="Shingle\Matrix\Row.cs" />
- <Compile Include="Shingle\ShingleAnalyzerWrapper.cs" />
- <Compile Include="Shingle\ShingleFilter.cs" />
- <Compile Include="Shingle\ShingleMatrixFilter.cs" />
- <Compile Include="Shingle\TokenPositioner.cs" />
- <Compile Include="Shingle\Codec\OneDimensionalNonWeightedTokenSettingsCodec.cs" />
- <Compile Include="Shingle\Codec\SimpleThreeDimensionalTokenSettingsCodec.cs" />
- <Compile Include="Shingle\Codec\TokenSettingsCodec.cs" />
- <Compile Include="Shingle\Codec\TwoDimensionalNonWeightedSynonymTokenSettingsCodec.cs" />
- <Compile Include="Sinks\DateRecognizerSinkFilter.cs" />
- <Compile Include="Sinks\TokenRangeSinkFilter.cs" />
- <Compile Include="Sinks\TokenTypeSinkFilter.cs" />
<Compile Include="Support\AbstractSet.cs" />
<Compile Include="Support\StringExtensions.cs" />
- <Compile Include="Th\ThaiAnalyzer.cs" />
- <Compile Include="Th\ThaiWordFilter.cs" />
<Compile Include="Util\AbstractAnalysisFactory.cs" />
<Compile Include="Util\AnalysisSPILoader.cs" />
<Compile Include="Util\CharacterUtils.cs" />
<Compile Include="Util\CharArrayMap.cs" />
<Compile Include="Util\CharArraySet.cs" />
<Compile Include="Util\CharTokenizer.cs" />
+ <Compile Include="Util\FilteringTokenFilter.cs" />
+ <Compile Include="Util\IMultiTermAwareComponent.cs" />
<Compile Include="Util\IResourceLoader.cs" />
+ <Compile Include="Util\IResourceLoaderAware.cs" />
<Compile Include="Util\StopwordAnalyzerBase.cs" />
+ <Compile Include="Util\TokenFilterFactory.cs" />
<Compile Include="Util\TokenizerFactory.cs" />
<Compile Include="Util\WordlistLoader.cs" />
<Compile Include="WordlistLoader.cs" />
@@ -227,9 +150,32 @@
</ProjectReference>
</ItemGroup>
<ItemGroup>
- <None Include="Compound\Hyphenation\hyphenation.dtd" />
<None Include="Lucene.Net.snk" />
</ItemGroup>
+ <ItemGroup>
+ <Folder Include="BR\" />
+ <Folder Include="CJK\" />
+ <Folder Include="Cn\" />
+ <Folder Include="Compound\Hyphenation\" />
+ <Folder Include="Cz\" />
+ <Folder Include="De\" />
+ <Folder Include="El\" />
+ <Folder Include="Fa\" />
+ <Folder Include="Fr\" />
+ <Folder Include="Hunspell\" />
+ <Folder Include="Miscellaneous\" />
+ <Folder Include="NGram\" />
+ <Folder Include="Nl\" />
+ <Folder Include="Payloads\" />
+ <Folder Include="Position\" />
+ <Folder Include="Query\" />
+ <Folder Include="Reverse\" />
+ <Folder Include="Ru\" />
+ <Folder Include="Shingle\Codec\" />
+ <Folder Include="Shingle\Matrix\" />
+ <Folder Include="Sinks\" />
+ <Folder Include="Th\" />
+ </ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/LetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LetterTokenizer.cs b/src/contrib/Analyzers/Core/LetterTokenizer.cs
index 669d8dc..a4e4938 100644
--- a/src/contrib/Analyzers/Core/LetterTokenizer.cs
+++ b/src/contrib/Analyzers/Core/LetterTokenizer.cs
@@ -10,12 +10,12 @@ namespace Lucene.Net.Analysis.Core
{
public class LetterTokenizer : CharTokenizer
{
- public LetterTokenizer(Version matchVersion, TextReader input)
+ public LetterTokenizer(Version? matchVersion, TextReader input)
: base(matchVersion, input)
{
}
- public LetterTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
+ public LetterTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
: base(matchVersion, factory, input)
{
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/LowerCaseFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LowerCaseFilter.cs b/src/contrib/Analyzers/Core/LowerCaseFilter.cs
new file mode 100644
index 0000000..d0157f5
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LowerCaseFilter.cs
@@ -0,0 +1,34 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class LowerCaseFilter : TokenFilter
+ {
+ private readonly CharacterUtils charUtils;
+ private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
+
+ public LowerCaseFilter(Version? matchVersion, TokenStream input)
+ : base(input)
+ {
+ charUtils = CharacterUtils.GetInstance(matchVersion);
+ termAtt = AddAttribute<ICharTermAttribute>();
+ }
+
+ public override bool IncrementToken()
+ {
+ if (input.IncrementToken())
+ {
+ charUtils.ToLowerCase(termAtt.Buffer, 0, termAtt.Length);
+ return true;
+ }
+ else
+ return false;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/LowerCaseFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LowerCaseFilterFactory.cs b/src/contrib/Analyzers/Core/LowerCaseFilterFactory.cs
new file mode 100644
index 0000000..6ea42e9
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LowerCaseFilterFactory.cs
@@ -0,0 +1,31 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public class LowerCaseFilterFactory : TokenFilterFactory, IMultiTermAwareComponent
+ {
+ public LowerCaseFilterFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ AssureMatchVersion();
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override TokenStream Create(TokenStream input)
+ {
+ return new LowerCaseFilter(luceneMatchVersion, input);
+ }
+
+ public AbstractAnalysisFactory MultiTermComponent
+ {
+ get { return this; }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/LowerCaseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LowerCaseTokenizer.cs b/src/contrib/Analyzers/Core/LowerCaseTokenizer.cs
new file mode 100644
index 0000000..34d4a23
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LowerCaseTokenizer.cs
@@ -0,0 +1,27 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class LowerCaseTokenizer : LetterTokenizer
+ {
+ public LowerCaseTokenizer(Version? matchVersion, TextReader input)
+ : base(matchVersion, input)
+ {
+ }
+
+ public LowerCaseTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
+ : base(matchVersion, factory, input)
+ {
+ }
+
+ protected override int Normalize(int c)
+ {
+ return (int)char.ToLower((char)c);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/LowerCaseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/LowerCaseTokenizerFactory.cs b/src/contrib/Analyzers/Core/LowerCaseTokenizerFactory.cs
new file mode 100644
index 0000000..316f775
--- /dev/null
+++ b/src/contrib/Analyzers/Core/LowerCaseTokenizerFactory.cs
@@ -0,0 +1,32 @@
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public class LowerCaseTokenizerFactory : TokenizerFactory, IMultiTermAwareComponent
+ {
+ public LowerCaseTokenizerFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ AssureMatchVersion();
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+ {
+ return new LowerCaseTokenizer(luceneMatchVersion, factory, input);
+ }
+
+ public AbstractAnalysisFactory MultiTermComponent
+ {
+ get { return new LowerCaseFilterFactory(new HashMap<String, String>(OriginalArgs)); }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/SimpleAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/SimpleAnalyzer.cs b/src/contrib/Analyzers/Core/SimpleAnalyzer.cs
new file mode 100644
index 0000000..2b2b97d
--- /dev/null
+++ b/src/contrib/Analyzers/Core/SimpleAnalyzer.cs
@@ -0,0 +1,23 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class SimpleAnalyzer : Analyzer
+ {
+ private readonly Version? matchVersion;
+
+ public SimpleAnalyzer(Version? matchVersion)
+ {
+ this.matchVersion = matchVersion;
+ }
+
+ public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader)
+ {
+ return new TokenStreamComponents(new LowerCaseTokenizer(matchVersion, reader));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/StopAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/StopAnalyzer.cs b/src/contrib/Analyzers/Core/StopAnalyzer.cs
new file mode 100644
index 0000000..ed41f02
--- /dev/null
+++ b/src/contrib/Analyzers/Core/StopAnalyzer.cs
@@ -0,0 +1,55 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class StopAnalyzer : StopwordAnalyzerBase
+ {
+ public static readonly CharArraySet ENGLISH_STOP_WORDS_SET;
+
+ static StopAnalyzer()
+ {
+ string[] stopWords = new string[] {
+ "a", "an", "and", "are", "as", "at", "be", "but", "by",
+ "for", "if", "in", "into", "is", "it",
+ "no", "not", "of", "on", "or", "such",
+ "that", "the", "their", "then", "there", "these",
+ "they", "this", "to", "was", "will", "with"
+ };
+ CharArraySet stopSet = new CharArraySet(Version.LUCENE_CURRENT, stopWords, false);
+ ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet);
+ }
+
+ public StopAnalyzer(Version? matchVersion)
+ : this(matchVersion, ENGLISH_STOP_WORDS_SET)
+ {
+ }
+
+ public StopAnalyzer(Version? matchVersion, CharArraySet stopWords)
+ : base(matchVersion, stopWords)
+ {
+ }
+
+ public StopAnalyzer(Version? matchVersion, Stream stopwordsFile)
+ : this(matchVersion, LoadStopwordSet(stopwordsFile, matchVersion))
+ {
+ }
+
+ public StopAnalyzer(Version? matchVersion, TextReader stopwords)
+ : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
+ {
+ }
+
+ public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+ {
+ Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
+ return new TokenStreamComponents(source, new StopFilter(matchVersion,
+ source, stopwords));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/StopFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/StopFilter.cs b/src/contrib/Analyzers/Core/StopFilter.cs
new file mode 100644
index 0000000..c9a193b
--- /dev/null
+++ b/src/contrib/Analyzers/Core/StopFilter.cs
@@ -0,0 +1,53 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class StopFilter : FilteringTokenFilter
+ {
+ private readonly CharArraySet stopWords;
+ private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
+
+ public StopFilter(Version? matchVersion, TokenStream input, CharArraySet stopWords)
+ : base(true, input)
+ {
+ this.stopWords = stopWords;
+ termAtt = AddAttribute<ICharTermAttribute>();
+ }
+
+ public static CharArraySet MakeStopSet(Version? matchVersion, params String[] stopWords)
+ {
+ return MakeStopSet(matchVersion, stopWords, false);
+ }
+
+ public static CharArraySet MakeStopSet(Version? matchVersion, List<object> stopWords)
+ {
+ return MakeStopSet(matchVersion, stopWords, false);
+ }
+
+ public static CharArraySet MakeStopSet(Version? matchVersion, String[] stopWords, bool ignoreCase)
+ {
+ CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.Length, ignoreCase);
+ stopSet.AddAll(stopWords);
+ return stopSet;
+ }
+
+ public static CharArraySet MakeStopSet(Version? matchVersion, List<object> stopWords, bool ignoreCase)
+ {
+ CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.Count, ignoreCase);
+ stopSet.AddAll(stopWords);
+ return stopSet;
+ }
+
+ protected override bool Accept()
+ {
+ return !stopWords.Contains(termAtt.Buffer, 0, termAtt.Length);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/StopFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/StopFilterFactory.cs b/src/contrib/Analyzers/Core/StopFilterFactory.cs
new file mode 100644
index 0000000..907c383
--- /dev/null
+++ b/src/contrib/Analyzers/Core/StopFilterFactory.cs
@@ -0,0 +1,81 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public class StopFilterFactory : TokenFilterFactory, IResourceLoaderAware
+ {
+ private CharArraySet stopWords;
+ private readonly String stopWordFiles;
+ private readonly String format;
+ private readonly bool ignoreCase;
+ private readonly bool enablePositionIncrements;
+
+ public StopFilterFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ AssureMatchVersion();
+ stopWordFiles = Get(args, "words");
+ format = Get(args, "format");
+ ignoreCase = GetBoolean(args, "ignoreCase", false);
+ enablePositionIncrements = GetBoolean(args, "enablePositionIncrements", false);
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public void Inform(IResourceLoader loader)
+ {
+ if (stopWordFiles != null)
+ {
+ if ("snowball".EqualsIgnoreCase(format))
+ {
+ stopWords = GetSnowballWordSet(loader, stopWordFiles, ignoreCase);
+ }
+ else
+ {
+ stopWords = GetWordSet(loader, stopWordFiles, ignoreCase);
+ }
+ }
+ else
+ {
+ stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
+ }
+ }
+
+ public bool IsEnablePositionIncrements
+ {
+ get
+ {
+ return enablePositionIncrements;
+ }
+ }
+
+ public bool IsIgnoreCase
+ {
+ get
+ {
+ return ignoreCase;
+ }
+ }
+
+ public CharArraySet StopWords
+ {
+ get
+ {
+ return stopWords;
+ }
+ }
+
+ public override TokenStream Create(TokenStream input)
+ {
+ StopFilter stopFilter = new StopFilter(luceneMatchVersion, input, stopWords);
+ stopFilter.EnablePositionIncrements = enablePositionIncrements;
+ return stopFilter;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/TypeTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/TypeTokenFilter.cs b/src/contrib/Analyzers/Core/TypeTokenFilter.cs
new file mode 100644
index 0000000..f6ea7dd
--- /dev/null
+++ b/src/contrib/Analyzers/Core/TypeTokenFilter.cs
@@ -0,0 +1,34 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class TypeTokenFilter : FilteringTokenFilter
+ {
+ private readonly ISet<String> stopTypes;
+ private readonly ITypeAttribute typeAttribute; // = addAttribute(TypeAttribute.class);
+ private readonly bool useWhiteList;
+
+ public TypeTokenFilter(bool enablePositionIncrements, TokenStream input, ISet<String> stopTypes, bool useWhiteList)
+ : base(enablePositionIncrements, input)
+ {
+ this.stopTypes = stopTypes;
+ this.useWhiteList = useWhiteList;
+ typeAttribute = AddAttribute<ITypeAttribute>();
+ }
+
+ public TypeTokenFilter(bool enablePositionIncrements, TokenStream input, ISet<String> stopTypes)
+ : this(enablePositionIncrements, input, stopTypes, false)
+ {
+ }
+
+ protected override bool Accept()
+ {
+ return useWhiteList == stopTypes.Contains(typeAttribute.Type);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/TypeTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/TypeTokenFilterFactory.cs b/src/contrib/Analyzers/Core/TypeTokenFilterFactory.cs
new file mode 100644
index 0000000..1552375
--- /dev/null
+++ b/src/contrib/Analyzers/Core/TypeTokenFilterFactory.cs
@@ -0,0 +1,63 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public class TypeTokenFilterFactory : TokenFilterFactory, IResourceLoaderAware
+ {
+ private readonly bool useWhitelist;
+ private readonly bool enablePositionIncrements;
+ private readonly String stopTypesFiles;
+ private ISet<String> stopTypes;
+
+ public TypeTokenFilterFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ stopTypesFiles = Require(args, "types");
+ enablePositionIncrements = GetBoolean(args, "enablePositionIncrements", false);
+ useWhitelist = GetBoolean(args, "useWhitelist", false);
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public void Inform(IResourceLoader loader)
+ {
+ IList<String> files = SplitFileNames(stopTypesFiles);
+ if (files.Count > 0)
+ {
+ stopTypes = new HashSet<String>();
+ foreach (String file in files)
+ {
+ IList<String> typesLines = GetLines(loader, file.Trim());
+ stopTypes.UnionWith(typesLines);
+ }
+ }
+ }
+
+ public bool IsEnablePositionIncrements
+ {
+ get
+ {
+ return enablePositionIncrements;
+ }
+ }
+
+ public ISet<String> StopTypes
+ {
+ get
+ {
+ return stopTypes;
+ }
+ }
+
+ public override TokenStream Create(TokenStream input)
+ {
+ return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/WhitespaceAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/WhitespaceAnalyzer.cs b/src/contrib/Analyzers/Core/WhitespaceAnalyzer.cs
new file mode 100644
index 0000000..180329e
--- /dev/null
+++ b/src/contrib/Analyzers/Core/WhitespaceAnalyzer.cs
@@ -0,0 +1,23 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class WhitespaceAnalyzer : Analyzer
+ {
+ private readonly Version? matchVersion;
+
+ public WhitespaceAnalyzer(Version? matchVersion)
+ {
+ this.matchVersion = matchVersion;
+ }
+
+ public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader)
+ {
+ return new TokenStreamComponents(new WhitespaceTokenizer(matchVersion, reader));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/WhitespaceTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/WhitespaceTokenizer.cs b/src/contrib/Analyzers/Core/WhitespaceTokenizer.cs
new file mode 100644
index 0000000..87909a2
--- /dev/null
+++ b/src/contrib/Analyzers/Core/WhitespaceTokenizer.cs
@@ -0,0 +1,28 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public sealed class WhitespaceTokenizer : CharTokenizer
+ {
+ public WhitespaceTokenizer(Version? matchVersion, TextReader input)
+ : base(matchVersion, input)
+ {
+ }
+
+ public WhitespaceTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
+ : base(matchVersion, factory, input)
+ {
+ }
+
+ protected override bool IsTokenChar(int c)
+ {
+ return !char.IsWhiteSpace((char)c);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Core/WhitespaceTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Core/WhitespaceTokenizerFactory.cs b/src/contrib/Analyzers/Core/WhitespaceTokenizerFactory.cs
new file mode 100644
index 0000000..378d30f
--- /dev/null
+++ b/src/contrib/Analyzers/Core/WhitespaceTokenizerFactory.cs
@@ -0,0 +1,26 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Core
+{
+ public class WhitespaceTokenizerFactory : TokenizerFactory
+ {
+ public WhitespaceTokenizerFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ AssureMatchVersion();
+ if (args.Count > 0)
+ {
+ throw new ArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+ {
+ return new WhitespaceTokenizer(luceneMatchVersion, factory, input);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Support/AbstractSet.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Support/AbstractSet.cs b/src/contrib/Analyzers/Support/AbstractSet.cs
index f732d08..a9249d5 100644
--- a/src/contrib/Analyzers/Support/AbstractSet.cs
+++ b/src/contrib/Analyzers/Support/AbstractSet.cs
@@ -5,7 +5,7 @@ using System.Text;
namespace Lucene.Net.Analysis.Support
{
- public class AbstractSet<T> : ISet<T>
+ public abstract class AbstractSet<T> : ISet<T>
{
public virtual bool Add(T item)
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/AbstractAnalysisFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/AbstractAnalysisFactory.cs b/src/contrib/Analyzers/Util/AbstractAnalysisFactory.cs
index ab0b117..b815eb6 100644
--- a/src/contrib/Analyzers/Util/AbstractAnalysisFactory.cs
+++ b/src/contrib/Analyzers/Util/AbstractAnalysisFactory.cs
@@ -1,4 +1,5 @@
-using Lucene.Net.Support;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
@@ -15,7 +16,7 @@ namespace Lucene.Net.Analysis.Util
private readonly IDictionary<string, string> originalArgs;
- protected readonly Lucene.Net.Util.Version luceneMatchVersion;
+ protected readonly Lucene.Net.Util.Version? luceneMatchVersion;
private bool isExplicitLuceneMatchVersion = false;
@@ -23,7 +24,7 @@ namespace Lucene.Net.Analysis.Util
{
originalArgs = new HashMap<String, String>(args);
String version = Get(args, LUCENE_MATCH_VERSION_PARAM);
- luceneMatchVersion = version == null ? (Lucene.Net.Util.Version)null : version.ParseLeniently();
+ luceneMatchVersion = version == null ? (Lucene.Net.Util.Version?)null : version.ParseLeniently();
args.Remove(CLASS_NAME); // consume the class arg
}
@@ -44,7 +45,7 @@ namespace Lucene.Net.Analysis.Util
}
}
- public Lucene.Net.Util.Version LuceneMatchVersion
+ public Lucene.Net.Util.Version? LuceneMatchVersion
{
get
{
@@ -274,7 +275,7 @@ namespace Lucene.Net.Analysis.Util
foreach (String file in files)
{
IList<String> wlist = GetLines(loader, file.Trim());
- words.UnionWith(StopFilter.MakeStopSet(luceneMatchVersion, wlist,
+ words.UnionWith(StopFilter.MakeStopSet(luceneMatchVersion, wlist.Cast<object>().ToList(),
ignoreCase));
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/CharArrayMap.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharArrayMap.cs b/src/contrib/Analyzers/Util/CharArrayMap.cs
index e124451..fb7ee13 100644
--- a/src/contrib/Analyzers/Util/CharArrayMap.cs
+++ b/src/contrib/Analyzers/Util/CharArrayMap.cs
@@ -16,11 +16,11 @@ namespace Lucene.Net.Analysis.Util
private readonly CharacterUtils charUtils;
private bool ignoreCase;
private int count;
- internal readonly Lucene.Net.Util.Version matchVersion; // package private because used in CharArraySet
+ internal readonly Lucene.Net.Util.Version? matchVersion; // package private because used in CharArraySet
internal char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
internal V[] values; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
- public CharArrayMap(Lucene.Net.Util.Version matchVersion, int startSize, bool ignoreCase)
+ public CharArrayMap(Lucene.Net.Util.Version? matchVersion, int startSize, bool ignoreCase)
{
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
@@ -28,11 +28,11 @@ namespace Lucene.Net.Analysis.Util
size <<= 1;
keys = new char[size][];
values = new V[size];
- this.charUtils = CharacterUtils.GetInstance(matchVersion);
+ this.charUtils = CharacterUtils.GetInstance(matchVersion.GetValueOrDefault());
this.matchVersion = matchVersion;
}
- public CharArrayMap(Lucene.Net.Util.Version matchVersion, IDictionary<object, V> c, bool ignoreCase)
+ public CharArrayMap(Lucene.Net.Util.Version? matchVersion, IDictionary<object, V> c, bool ignoreCase)
: this(matchVersion, c.Count, ignoreCase)
{
foreach (var kvp in c)
@@ -367,7 +367,7 @@ namespace Lucene.Net.Analysis.Util
if (keySet == null)
{
// prevent adding of entries
- keySet = new AnonymousCharArraySet(this);
+ keySet = new AnonymousCharArraySet(new CharArrayMap<object>(matchVersion, this.ToDictionary(i => (object)i.Key, i => (object)i.Value), ignoreCase));
}
return keySet;
@@ -376,7 +376,7 @@ namespace Lucene.Net.Analysis.Util
private sealed class AnonymousCharArraySet : CharArraySet
{
- public AnonymousCharArraySet(CharArrayMap<V> map)
+ public AnonymousCharArraySet(CharArrayMap<object> map)
: base(map)
{
}
@@ -581,6 +581,65 @@ namespace Lucene.Net.Analysis.Util
parent.Clear();
}
}
+
+ public void Add(object key, V value)
+ {
+ Put(key, value);
+ }
+
+ bool IDictionary<object, V>.Remove(object key)
+ {
+ Remove(key);
+ return true;
+ }
+
+ public bool TryGetValue(object key, out V value)
+ {
+ value = Get(key);
+
+ return value != null;
+ }
+
+ public ICollection<V> Values
+ {
+ get { return values; }
+ }
+
+ public void Add(KeyValuePair<object, V> item)
+ {
+ Put(item.Key, item.Value);
+ }
+
+ public bool Contains(KeyValuePair<object, V> item)
+ {
+ return ContainsKey(item.Key);
+ }
+
+ public void CopyTo(KeyValuePair<object, V>[] array, int arrayIndex)
+ {
+ throw new NotImplementedException();
+ }
+
+ public bool IsReadOnly
+ {
+ get { return false; }
+ }
+
+ public bool Remove(KeyValuePair<object, V> item)
+ {
+ Remove(item.Key);
+ return true;
+ }
+
+ public IEnumerator<KeyValuePair<object, V>> GetEnumerator()
+ {
+ return GetEntrySet().GetEnumerator();
+ }
+
+ System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+ {
+ return GetEnumerator();
+ }
}
 // .NET Port: non-generic static class to hold nested types and static methods
@@ -597,7 +656,7 @@ namespace Lucene.Net.Analysis.Util
return new UnmodifiableCharArrayMap<V>(map);
}
- public static CharArrayMap<V> Copy<V>(Lucene.Net.Util.Version matchVersion, IDictionary<object, V> map)
+ public static CharArrayMap<V> Copy<V>(Lucene.Net.Util.Version? matchVersion, IDictionary<object, V> map)
{
if (map == CharArrayMap<V>.EMPTY_MAP)
return EmptyMap<V>();
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/CharArraySet.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharArraySet.cs b/src/contrib/Analyzers/Util/CharArraySet.cs
index 522bcaa..23eb0ea 100644
--- a/src/contrib/Analyzers/Util/CharArraySet.cs
+++ b/src/contrib/Analyzers/Util/CharArraySet.cs
@@ -14,12 +14,12 @@ namespace Lucene.Net.Analysis.Util
private readonly CharArrayMap<object> map;
- public CharArraySet(Lucene.Net.Util.Version matchVersion, int startSize, bool ignoreCase)
+ public CharArraySet(Lucene.Net.Util.Version? matchVersion, int startSize, bool ignoreCase)
: this(new CharArrayMap<Object>(matchVersion, startSize, ignoreCase))
{
}
- public CharArraySet(Lucene.Net.Util.Version matchVersion, ICollection<object> c, bool ignoreCase)
+ public CharArraySet(Lucene.Net.Util.Version? matchVersion, ICollection<object> c, bool ignoreCase)
: this(matchVersion, c.Count, ignoreCase)
{
AddAll(c);
@@ -55,17 +55,17 @@ namespace Lucene.Net.Analysis.Util
return map.Put(o, PLACEHOLDER) == null;
}
- public bool Add(ICharSequence text)
+ public virtual bool Add(ICharSequence text)
{
return map.Put(text, PLACEHOLDER) == null;
}
- public bool Add(string text)
+ public virtual bool Add(string text)
{
return map.Put(text, PLACEHOLDER) == null;
}
- public bool Add(char[] text)
+ public virtual bool Add(char[] text)
{
return map.Put(text, PLACEHOLDER) == null;
}
@@ -86,7 +86,7 @@ namespace Lucene.Net.Analysis.Util
return new CharArraySet(CharArrayMap.UnmodifiableMap(set.map));
}
- public static CharArraySet Copy(Lucene.Net.Util.Version matchVersion, ICollection<object> set)
+ public static CharArraySet Copy(Lucene.Net.Util.Version? matchVersion, ICollection<object> set)
{
if (set == EMPTY_SET)
return EMPTY_SET;
@@ -121,5 +121,10 @@ namespace Lucene.Net.Analysis.Util
}
return sb.Append(']').ToString();
}
+
+ public override bool Remove(object item)
+ {
+ throw new NotImplementedException();
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/CharTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharTokenizer.cs b/src/contrib/Analyzers/Util/CharTokenizer.cs
index b0029fa..0a31781 100644
--- a/src/contrib/Analyzers/Util/CharTokenizer.cs
+++ b/src/contrib/Analyzers/Util/CharTokenizer.cs
@@ -11,7 +11,7 @@ namespace Lucene.Net.Analysis.Util
{
public abstract class CharTokenizer : Tokenizer
{
- public CharTokenizer(Version matchVersion, TextReader input)
+ public CharTokenizer(Version? matchVersion, TextReader input)
: base(input)
{
charUtils = CharacterUtils.GetInstance(matchVersion);
@@ -19,7 +19,7 @@ namespace Lucene.Net.Analysis.Util
offsetAtt = AddAttribute<IOffsetAttribute>();
}
- public CharTokenizer(Version matchVersion, AttributeFactory factory, TextReader input)
+ public CharTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
: base(factory, input)
{
charUtils = CharacterUtils.GetInstance(matchVersion);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/CharacterUtils.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/CharacterUtils.cs b/src/contrib/Analyzers/Util/CharacterUtils.cs
index 223d8f0..5fdc78f 100644
--- a/src/contrib/Analyzers/Util/CharacterUtils.cs
+++ b/src/contrib/Analyzers/Util/CharacterUtils.cs
@@ -16,7 +16,7 @@ namespace Lucene.Net.Analysis.Util
// .NET Port: we never changed how we handle strings and chars :-)
private static readonly DotNetCharacterUtils DOTNET = new DotNetCharacterUtils();
- public static CharacterUtils GetInstance(Lucene.Net.Util.Version matchVersion)
+ public static CharacterUtils GetInstance(Lucene.Net.Util.Version? matchVersion)
{
//return matchVersion.OnOrAfter(Lucene.Net.Util.Version.LUCENE_31) ? JAVA_5 : JAVA_4;
return DOTNET;
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/FilteringTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/FilteringTokenFilter.cs b/src/contrib/Analyzers/Util/FilteringTokenFilter.cs
new file mode 100644
index 0000000..d06af92
--- /dev/null
+++ b/src/contrib/Analyzers/Util/FilteringTokenFilter.cs
@@ -0,0 +1,77 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Util
+{
+ public abstract class FilteringTokenFilter : TokenFilter
+ {
+ private readonly IPositionIncrementAttribute posIncrAtt; // = addAttribute(PositionIncrementAttribute.class);
+ private bool enablePositionIncrements; // no init needed, as ctor enforces setting value!
+ private bool first = true; // only used when not preserving gaps
+
+ public FilteringTokenFilter(bool enablePositionIncrements, TokenStream input)
+ : base(input)
+ {
+ this.enablePositionIncrements = enablePositionIncrements;
+ posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+ }
+
+ protected abstract bool Accept();
+
+ public override bool IncrementToken()
+ {
+ if (enablePositionIncrements)
+ {
+ int skippedPositions = 0;
+ while (input.IncrementToken())
+ {
+ if (Accept())
+ {
+ if (skippedPositions != 0)
+ {
+ posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
+ }
+ return true;
+ }
+ skippedPositions += posIncrAtt.PositionIncrement;
+ }
+ }
+ else
+ {
+ while (input.IncrementToken())
+ {
+ if (Accept())
+ {
+ if (first)
+ {
+ // first token having posinc=0 is illegal.
+ if (posIncrAtt.PositionIncrement == 0)
+ {
+ posIncrAtt.PositionIncrement = 1;
+ }
+ first = false;
+ }
+ return true;
+ }
+ }
+ }
+ // reached EOS -- return false
+ return false;
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ first = true;
+ }
+
+ public bool EnablePositionIncrements
+ {
+ get { return enablePositionIncrements; }
+ set { enablePositionIncrements = value; }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/IMultiTermAwareComponent.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/IMultiTermAwareComponent.cs b/src/contrib/Analyzers/Util/IMultiTermAwareComponent.cs
new file mode 100644
index 0000000..3e256c3
--- /dev/null
+++ b/src/contrib/Analyzers/Util/IMultiTermAwareComponent.cs
@@ -0,0 +1,12 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Util
+{
+ public interface IMultiTermAwareComponent
+ {
+ AbstractAnalysisFactory MultiTermComponent { get; }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/IResourceLoaderAware.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/IResourceLoaderAware.cs b/src/contrib/Analyzers/Util/IResourceLoaderAware.cs
new file mode 100644
index 0000000..8ff35bf
--- /dev/null
+++ b/src/contrib/Analyzers/Util/IResourceLoaderAware.cs
@@ -0,0 +1,12 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Util
+{
+ public interface IResourceLoaderAware
+ {
+ void Inform(IResourceLoader loader);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs b/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
index f6e9194..a041e65 100644
--- a/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
+++ b/src/contrib/Analyzers/Util/StopwordAnalyzerBase.cs
@@ -12,7 +12,7 @@ namespace Lucene.Net.Analysis.Util
{
protected readonly CharArraySet stopwords;
- protected readonly Version matchVersion;
+ protected readonly Version? matchVersion;
public CharArraySet StopwordSet
{
@@ -22,7 +22,7 @@ namespace Lucene.Net.Analysis.Util
}
}
- protected StopwordAnalyzerBase(Version version, CharArraySet stopwords)
+ protected StopwordAnalyzerBase(Version? version, CharArraySet stopwords)
{
matchVersion = version;
// analyzers should use char array set for stopwords!
@@ -30,7 +30,7 @@ namespace Lucene.Net.Analysis.Util
.UnmodifiableSet(CharArraySet.Copy(version, stopwords));
}
- protected StopwordAnalyzerBase(Version version)
+ protected StopwordAnalyzerBase(Version? version)
: this(version, null)
{
}
@@ -49,7 +49,7 @@ namespace Lucene.Net.Analysis.Util
}
}
- protected static CharArraySet LoadStopwordSet(Stream stopwords, Version matchVersion)
+ protected static CharArraySet LoadStopwordSet(Stream stopwords, Version? matchVersion)
{
TextReader reader = null;
try
@@ -63,7 +63,7 @@ namespace Lucene.Net.Analysis.Util
}
}
- protected static CharArraySet LoadStopwordSet(TextReader stopwords, Version matchVersion)
+ protected static CharArraySet LoadStopwordSet(TextReader stopwords, Version? matchVersion)
{
try
{
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/TokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/TokenFilterFactory.cs b/src/contrib/Analyzers/Util/TokenFilterFactory.cs
new file mode 100644
index 0000000..fcb674e
--- /dev/null
+++ b/src/contrib/Analyzers/Util/TokenFilterFactory.cs
@@ -0,0 +1,44 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Util
+{
+ public abstract class TokenFilterFactory : AbstractAnalysisFactory
+ {
+ private static readonly AnalysisSPILoader<TokenFilterFactory> loader =
+ new AnalysisSPILoader<TokenFilterFactory>(typeof(TokenFilterFactory),
+ new String[] { "TokenFilterFactory", "FilterFactory" });
+
+ public static TokenFilterFactory ForName(String name, IDictionary<String, String> args)
+ {
+ return loader.NewInstance(name, args);
+ }
+
+ public static Type LookupClass(String name)
+ {
+ return loader.LookupClass(name);
+ }
+
+ public static ICollection<String> AvailableTokenFilters
+ {
+ get
+ {
+ return loader.AvailableServices;
+ }
+ }
+
+ public static void ReloadTokenFilters()
+ {
+ loader.Reload();
+ }
+
+ protected TokenFilterFactory(IDictionary<String, String> args)
+ : base(args)
+ {
+ }
+
+ public abstract TokenStream Create(TokenStream input);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/98e877d5/src/contrib/Analyzers/Util/WordlistLoader.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Util/WordlistLoader.cs b/src/contrib/Analyzers/Util/WordlistLoader.cs
index e78ea9b..c2430d0 100644
--- a/src/contrib/Analyzers/Util/WordlistLoader.cs
+++ b/src/contrib/Analyzers/Util/WordlistLoader.cs
@@ -31,12 +31,12 @@ namespace Lucene.Net.Analysis.Util
return result;
}
- public static CharArraySet GetWordSet(TextReader reader, Lucene.Net.Util.Version matchVersion)
+ public static CharArraySet GetWordSet(TextReader reader, Lucene.Net.Util.Version? matchVersion)
{
return GetWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
}
- public static CharArraySet GetWordSet(TextReader reader, String comment, Lucene.Net.Util.Version matchVersion)
+ public static CharArraySet GetWordSet(TextReader reader, String comment, Lucene.Net.Util.Version? matchVersion)
{
return GetWordSet(reader, comment, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
}