You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:33 UTC
[29/34] lucenenet git commit: Raw porting of
Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizerFactory.cs
new file mode 100644
index 0000000..51960b8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizerFactory.cs
@@ -0,0 +1,58 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Core;
+using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
+
+namespace org.apache.lucene.analysis.core
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenizerFactory = TokenizerFactory;
+ using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+
+ /// <summary>
+ /// Factory for <see cref="WhitespaceTokenizer"/>.
+ /// <code>
+ /// &lt;fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</code>
+ /// </summary>
+ public class WhitespaceTokenizerFactory : TokenizerFactory
+ {
+
+     /// <summary>
+     /// Creates a new WhitespaceTokenizerFactory.
+     /// </summary>
+     /// <param name="args"> factory arguments; they are handed to the base constructor and
+     /// any entries still present afterwards are reported as unknown
+     /// (NOTE(review): presumably the base class removes the entries it understands - confirm). </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty
+     /// once the base constructor and the match-version check have run </exception>
+     public WhitespaceTokenizerFactory(IDictionary<string, string> args) : base(args)
+     {
+         assureMatchVersion();
+         if (args.Count > 0)
+         {
+             // The standard .NET dictionaries do not override ToString(), so
+             // concatenating the dictionary would only print its type name.
+             // Spell out the leftover entries so the caller can see what was rejected.
+             System.Text.StringBuilder unknown = new System.Text.StringBuilder();
+             foreach (KeyValuePair<string, string> entry in args)
+             {
+                 if (unknown.Length > 0)
+                 {
+                     unknown.Append(", ");
+                 }
+                 unknown.Append(entry.Key).Append('=').Append(entry.Value);
+             }
+             throw new System.ArgumentException("Unknown parameters: {" + unknown + "}");
+         }
+     }
+
+     /// <summary>
+     /// Creates a <see cref="WhitespaceTokenizer"/> over the given input, using this
+     /// factory's match version.
+     /// </summary>
+     /// <param name="factory"> attribute factory passed through to the tokenizer </param>
+     /// <param name="input"> the text source to tokenize </param>
+     /// <returns> a new <see cref="WhitespaceTokenizer"/> </returns>
+     public override WhitespaceTokenizer create(AttributeFactory factory, Reader input)
+     {
+         return new WhitespaceTokenizer(luceneMatchVersion, factory, input);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechAnalyzer.cs
new file mode 100644
index 0000000..230ecfd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechAnalyzer.cs
@@ -0,0 +1,161 @@
+using System;
+
+namespace org.apache.lucene.analysis.cz
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using Version = org.apache.lucene.util.Version;
+
+
+ /// <summary>
+ /// <see cref="Analyzer"/> for Czech language.
+ /// <para>
+ /// Supports an external list of stopwords (words that will not be indexed at
+ /// all). A default set of stopwords is used unless an alternative list is
+ /// specified.
+ /// </para>
+ /// <para>
+ /// You must specify the required <see cref="Version"/> compatibility when creating
+ /// CzechAnalyzer:
+ /// <list type="bullet">
+ /// <item><description>As of 3.1, words are stemmed with <see cref="CzechStemFilter"/></description></item>
+ /// <item><description>As of 2.9, StopFilter preserves position increments</description></item>
+ /// <item><description>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
+ /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</description></item>
+ /// </list>
+ /// </para>
+ /// </summary>
+ public sealed class CzechAnalyzer : StopwordAnalyzerBase
+ {
+     /// <summary>
+     /// File containing default Czech stopwords. </summary>
+     public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+     /// <summary>
+     /// Returns a set of default Czech-stopwords
+     /// </summary>
+     /// <returns> a set of default Czech-stopwords </returns>
+     public static CharArraySet DefaultStopSet
+     {
+         get
+         {
+             return DefaultSetHolder.DEFAULT_SET;
+         }
+     }
+
+     /// <summary>
+     /// Holder for the default stop word set. Its static constructor, which runs the
+     /// first time the holder is accessed, reads <see cref="DEFAULT_STOPWORD_FILE"/>.
+     /// </summary>
+     private class DefaultSetHolder
+     {
+         internal static readonly CharArraySet DEFAULT_SET;
+
+         static DefaultSetHolder()
+         {
+             try
+             {
+                 DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(typeof(CzechAnalyzer), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), "#", Version.LUCENE_CURRENT);
+             }
+             catch (System.IO.IOException e)
+             {
+                 // The default set should always be present as it ships with the
+                 // distribution. Keep the I/O failure as the inner exception so
+                 // the root cause is not lost.
+                 throw new Exception("Unable to load default stopword set", e);
+             }
+         }
+     }
+
+
+     // Terms in this set are marked as keywords in createComponents so that they are not stemmed.
+     private readonly CharArraySet stemExclusionTable;
+
+     /// <summary>
+     /// Builds an analyzer with the default stop words (<see cref="DefaultStopSet"/>).
+     /// </summary>
+     /// <param name="matchVersion"> Lucene version to match; see the version notes in the class summary </param>
+     public CzechAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words.
+     /// </summary>
+     /// <param name="matchVersion"> Lucene version to match; see the version notes in the class summary </param>
+     /// <param name="stopwords"> a stopword set </param>
+     public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words and a set of words to be
+     /// excluded from the <see cref="CzechStemFilter"/>.
+     /// </summary>
+     /// <param name="matchVersion"> Lucene version to match; see the version notes in the class summary </param>
+     /// <param name="stopwords"> a stopword set </param>
+     /// <param name="stemExclusionTable"> a stemming exclusion set; it is passed through
+     /// <c>CharArraySet.copy</c> and <c>CharArraySet.unmodifiableSet</c> before being stored </param>
+     public CzechAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) : base(matchVersion, stopwords)
+     {
+         this.stemExclusionTable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
+     }
+
+     /// <summary>
+     /// Creates the TokenStreamComponents used to tokenize all the text in the
+     /// provided reader.
+     /// </summary>
+     /// <param name="fieldName"> name of the field being analyzed (not used by this implementation) </param>
+     /// <param name="reader"> the text source handed to the <see cref="StandardTokenizer"/> </param>
+     /// <returns> TokenStreamComponents
+     /// built from a <see cref="StandardTokenizer"/> filtered with
+     /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>
+     /// , and <see cref="CzechStemFilter"/> (only if version is >= LUCENE_31). If
+     /// a version is >= LUCENE_31 and a non-empty stem exclusion set was provided, a
+     /// <see cref="SetKeywordMarkerFilter"/> is added before
+     /// <see cref="CzechStemFilter"/>. </returns>
+     protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+     {
+         Tokenizer source = new StandardTokenizer(matchVersion, reader);
+         TokenStream result = new StandardFilter(matchVersion, source);
+         result = new LowerCaseFilter(matchVersion, result);
+         result = new StopFilter(matchVersion, result, stopwords);
+         if (matchVersion.onOrAfter(Version.LUCENE_31))
+         {
+             // Mark excluded terms as keywords before the stemmer sees them.
+             if (!this.stemExclusionTable.Empty)
+             {
+                 result = new SetKeywordMarkerFilter(result, stemExclusionTable);
+             }
+             result = new CzechStemFilter(result);
+         }
+         return new TokenStreamComponents(source, result);
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilter.cs
new file mode 100644
index 0000000..598cb86
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilter.cs
@@ -0,0 +1,67 @@
+namespace org.apache.lucene.analysis.cz
+{
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // for javadoc
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that applies <see cref="CzechStemmer"/> to stem Czech words.
+ /// <para>
+ /// To prevent terms from being stemmed use an instance of
+ /// <see cref="SetKeywordMarkerFilter"/> or a custom <see cref="TokenFilter"/> that sets
+ /// the <see cref="KeywordAttribute"/> before this <see cref="TokenStream"/>.
+ /// </para>
+ /// <para><b>NOTE</b>: Input is expected to be in lowercase,
+ /// but with diacritical marks</para> </summary>
+ /// <seealso cref="SetKeywordMarkerFilter"/>
+ public sealed class CzechStemFilter : TokenFilter
+ {
+     private readonly CzechStemmer stemmer = new CzechStemmer();
+     private readonly CharTermAttribute termAtt;
+     private readonly KeywordAttribute keywordAttr;
+
+     /// <summary>
+     /// Creates a stem filter reading tokens from <paramref name="input"/>.
+     /// </summary>
+     /// <param name="input"> the token stream whose terms are stemmed </param>
+     public CzechStemFilter(TokenStream input) : base(input)
+     {
+         // C# field initializers may not call instance members (CS0236), so the
+         // attributes are registered here instead of at the field declarations.
+         termAtt = addAttribute(typeof(CharTermAttribute));
+         keywordAttr = addAttribute(typeof(KeywordAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream by one token and, unless the token is flagged
+     /// as a keyword, shortens the term to the length returned by the stemmer.
+     /// </summary>
+     /// <returns> <c>true</c> if a token was produced, <c>false</c> once the wrapped stream is exhausted </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         if (!keywordAttr.Keyword)
+         {
+             // The stemmer rewrites the term buffer in place and reports the new length.
+             int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+             termAtt.Length = newlen;
+         }
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilterFactory.cs
new file mode 100644
index 0000000..7152da8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cz
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <see cref="CzechStemFilter"/>.
+ /// <code>
+ /// &lt;fieldType name="text_czstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ ///     &lt;filter class="solr.CzechStemFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</code>
+ /// </summary>
+ public class CzechStemFilterFactory : TokenFilterFactory
+ {
+
+     /// <summary>
+     /// Creates a new CzechStemFilterFactory.
+     /// </summary>
+     /// <param name="args"> factory arguments; they are handed to the base constructor and
+     /// any entries still present afterwards are reported as unknown
+     /// (NOTE(review): presumably the base class removes the entries it understands - confirm). </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty
+     /// once the base constructor has run </exception>
+     public CzechStemFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             // The standard .NET dictionaries do not override ToString(), so
+             // concatenating the dictionary would only print its type name.
+             // Spell out the leftover entries so the caller can see what was rejected.
+             System.Text.StringBuilder unknown = new System.Text.StringBuilder();
+             foreach (KeyValuePair<string, string> entry in args)
+             {
+                 if (unknown.Length > 0)
+                 {
+                     unknown.Append(", ");
+                 }
+                 unknown.Append(entry.Key).Append('=').Append(entry.Value);
+             }
+             throw new System.ArgumentException("Unknown parameters: {" + unknown + "}");
+         }
+     }
+
+     /// <summary>
+     /// Wraps the given stream in a <see cref="CzechStemFilter"/>.
+     /// </summary>
+     /// <param name="input"> the token stream to stem </param>
+     /// <returns> a new <see cref="CzechStemFilter"/> reading from <paramref name="input"/> </returns>
+     public override TokenStream create(TokenStream input)
+     {
+         return new CzechStemFilter(input);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemmer.cs
new file mode 100644
index 0000000..49ebc43
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cz/CzechStemmer.cs
@@ -0,0 +1,157 @@
+namespace org.apache.lucene.analysis.cz
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using org.apache.lucene.analysis.util;
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+// import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+ /// <summary>
+ /// Light Stemmer for Czech.
+ /// <para>
+ /// Implements the algorithm described in:
+ /// <i>
+ /// Indexing and stemming approaches for the Czech language
+ /// </i>
+ /// http://portal.acm.org/citation.cfm?id=1598600
+ /// </para>
+ /// </summary>
+ public class CzechStemmer
+ {
+     // Case endings grouped by length; removeCase tries the longest group first.
+     private static readonly string[] CASE_ENDINGS_4 = { "ětem", "etem", "atům" };
+
+     private static readonly string[] CASE_ENDINGS_3 =
+     {
+         "ech", "ich", "ích", "ého", "ěmi", "emi", "ému", "ěte", "ete",
+         "ěti", "eti", "ího", "iho", "ími", "ímu", "imu", "ách", "ata",
+         "aty", "ých", "ama", "ami", "ové", "ovi", "ými"
+     };
+
+     private static readonly string[] CASE_ENDINGS_2 =
+     {
+         "em", "es", "ém", "ím", "ům", "at", "ám", "os", "us", "ým", "mi", "ou"
+     };
+
+     private static readonly string[] POSSESSIVE_ENDINGS = { "ov", "in", "ův" };
+
+     /// <summary>
+     /// Stem an input buffer of Czech text. The buffer may be modified in place.
+     /// </summary>
+     /// <param name="s"> input buffer </param>
+     /// <param name="len"> length of input buffer </param>
+     /// <returns> length of input buffer after normalization
+     ///
+     /// <para><b>NOTE</b>: Input is expected to be in lowercase,
+     /// but with diacritical marks</para> </returns>
+     public virtual int stem(char[] s, int len)
+     {
+         len = removeCase(s, len);
+         len = removePossessives(s, len);
+         // normalize indexes s[len - 1], so it must not run on an empty term.
+         if (len > 0)
+         {
+             len = normalize(s, len);
+         }
+         return len;
+     }
+
+     /// <summary>
+     /// Returns true if the first <paramref name="len"/> characters of
+     /// <paramref name="s"/> end with any of the given suffixes.
+     /// </summary>
+     private static bool endsWithAny(char[] s, int len, string[] suffixes)
+     {
+         foreach (string suffix in suffixes)
+         {
+             if (StemmerUtil.EndsWith(s, len, suffix))
+             {
+                 return true;
+             }
+         }
+         return false;
+     }
+
+     /// <summary>
+     /// Strips at most one case ending. Longer endings are tried first and each
+     /// ending length has its own minimum term length.
+     /// </summary>
+     private int removeCase(char[] s, int len)
+     {
+         if (len > 7 && StemmerUtil.EndsWith(s, len, "atech"))
+         {
+             return len - 5;
+         }
+
+         if (len > 6 && endsWithAny(s, len, CASE_ENDINGS_4))
+         {
+             return len - 4;
+         }
+
+         if (len > 5 && endsWithAny(s, len, CASE_ENDINGS_3))
+         {
+             return len - 3;
+         }
+
+         if (len > 4 && endsWithAny(s, len, CASE_ENDINGS_2))
+         {
+             return len - 2;
+         }
+
+         if (len > 3)
+         {
+             // Single trailing vowel.
+             switch (s[len - 1])
+             {
+                 case 'a':
+                 case 'e':
+                 case 'i':
+                 case 'o':
+                 case 'u':
+                 case 'ů':
+                 case 'y':
+                 case 'á':
+                 case 'é':
+                 case 'í':
+                 case 'ý':
+                 case 'ě':
+                     return len - 1;
+             }
+         }
+
+         return len;
+     }
+
+     /// <summary>
+     /// Strips a trailing possessive ending ("ov", "in" or "ův") from terms longer
+     /// than five characters.
+     /// </summary>
+     private int removePossessives(char[] s, int len)
+     {
+         if (len > 5 && endsWithAny(s, len, POSSESSIVE_ENDINGS))
+         {
+             return len - 2;
+         }
+
+         return len;
+     }
+
+     /// <summary>
+     /// Rewrites the end of the stem in place; only the first matching rule is applied.
+     /// Callers must pass <paramref name="len"/> greater than zero.
+     /// </summary>
+     private int normalize(char[] s, int len)
+     {
+         if (StemmerUtil.EndsWith(s, len, "čt")) // čt -> ck
+         {
+             s[len - 2] = 'c';
+             s[len - 1] = 'k';
+             return len;
+         }
+
+         if (StemmerUtil.EndsWith(s, len, "št")) // št -> sk
+         {
+             s[len - 2] = 's';
+             s[len - 1] = 'k';
+             return len;
+         }
+
+         switch (s[len - 1])
+         {
+             case 'c': // [cč] -> k
+             case 'č':
+                 s[len - 1] = 'k';
+                 return len;
+             case 'z': // [zž] -> h
+             case 'ž':
+                 s[len - 1] = 'h';
+                 return len;
+         }
+
+         if (len > 1 && s[len - 2] == 'e')
+         {
+             s[len - 2] = s[len - 1]; // e* > *
+             return len - 1;
+         }
+
+         if (len > 2 && s[len - 2] == 'ů')
+         {
+             s[len - 2] = 'o'; // *ů* -> *o*
+             return len;
+         }
+
+         return len;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Da/DanishAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Da/DanishAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Da/DanishAnalyzer.cs
new file mode 100644
index 0000000..b02657b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Da/DanishAnalyzer.cs
@@ -0,0 +1,139 @@
+using System;
+
+namespace org.apache.lucene.analysis.da
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using Version = org.apache.lucene.util.Version;
+ using DanishStemmer = org.tartarus.snowball.ext.DanishStemmer;
+
+ /// <summary>
+ /// <see cref="Analyzer"/> for Danish.
+ /// </summary>
+ public sealed class DanishAnalyzer : StopwordAnalyzerBase
+ {
+     // Terms in this set are marked as keywords in createComponents so that they are not stemmed.
+     private readonly CharArraySet stemExclusionSet;
+
+     /// <summary>
+     /// File containing default Danish stopwords. </summary>
+     public const string DEFAULT_STOPWORD_FILE = "danish_stop.txt";
+
+     /// <summary>
+     /// Returns the default stop words set. </summary>
+     /// <returns> default stop words set. </returns>
+     public static CharArraySet DefaultStopSet
+     {
+         get
+         {
+             return DefaultSetHolder.DEFAULT_STOP_SET;
+         }
+     }
+
+     /// <summary>
+     /// Loads DEFAULT_STOP_SET lazily: the static constructor runs the first time
+     /// the outer class accesses the set.
+     /// </summary>
+     private class DefaultSetHolder
+     {
+         internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+         static DefaultSetHolder()
+         {
+             try
+             {
+                 DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+             }
+             catch (System.IO.IOException e)
+             {
+                 // The default set should always be present as it ships with the
+                 // distribution. Keep the I/O failure as the inner exception so
+                 // the root cause is not lost.
+                 throw new Exception("Unable to load default stopword set", e);
+             }
+         }
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
+     /// </summary>
+     /// <param name="matchVersion"> lucene compatibility version </param>
+     public DanishAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words.
+     /// </summary>
+     /// <param name="matchVersion"> lucene compatibility version </param>
+     /// <param name="stopwords"> a stopword set </param>
+     public DanishAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+     /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
+     /// stemming.
+     /// </summary>
+     /// <param name="matchVersion"> lucene compatibility version </param>
+     /// <param name="stopwords"> a stopword set </param>
+     /// <param name="stemExclusionSet"> a set of terms not to be stemmed; it is passed through
+     /// <c>CharArraySet.copy</c> and <c>CharArraySet.unmodifiableSet</c> before being stored </param>
+     public DanishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+     {
+         this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+     }
+
+     /// <summary>
+     /// Creates the TokenStreamComponents which tokenize all the text in the
+     /// provided reader.
+     /// </summary>
+     /// <param name="fieldName"> name of the field being analyzed (not used by this implementation) </param>
+     /// <param name="reader"> the text source handed to the <see cref="StandardTokenizer"/> </param>
+     /// <returns> TokenStreamComponents
+     /// built from a <see cref="StandardTokenizer"/> filtered with
+     /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>
+     /// , <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
+     /// provided and <see cref="SnowballFilter"/>. </returns>
+     protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+     {
+         Tokenizer source = new StandardTokenizer(matchVersion, reader);
+         TokenStream result = new StandardFilter(matchVersion, source);
+         result = new LowerCaseFilter(matchVersion, result);
+         result = new StopFilter(matchVersion, result, stopwords);
+         // Mark excluded terms as keywords before the stemmer sees them.
+         if (!stemExclusionSet.Empty)
+         {
+             result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+         }
+         result = new SnowballFilter(result, new DanishStemmer());
+         return new TokenStreamComponents(source, result);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanAnalyzer.cs
new file mode 100644
index 0000000..13da913
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanAnalyzer.cs
@@ -0,0 +1,185 @@
+using System;
+
+namespace org.apache.lucene.analysis.de
+{
+ // This file is encoded in UTF-8
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+ using StandardAnalyzer = org.apache.lucene.analysis.standard.StandardAnalyzer;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using Version = org.apache.lucene.util.Version;
+ using German2Stemmer = org.tartarus.snowball.ext.German2Stemmer;
+
+ /// <summary>
+ /// <see cref="Analyzer"/> for German language.
+ /// <para>
+ /// Supports an external list of stopwords (words that
+ /// will not be indexed at all) and an external list of exclusions (words that will
+ /// not be stemmed, but indexed).
+ /// A default set of stopwords is used unless an alternative list is specified, but the
+ /// exclusion list is empty by default.
+ /// </para>
+ /// <para>You must specify the required <see cref="Version"/>
+ /// compatibility when creating GermanAnalyzer:
+ /// <list type="bullet">
+ /// <item><description>As of 3.6, GermanLightStemFilter is used for less aggressive stemming.</description></item>
+ /// <item><description>As of 3.1, Snowball stemming is done with SnowballFilter, and
+ /// Snowball stopwords are used by default.</description></item>
+ /// <item><description>As of 2.9, StopFilter preserves position
+ /// increments</description></item>
+ /// </list>
+ /// </para>
+ /// <para><b>NOTE</b>: This class uses the same <see cref="Version"/>
+ /// dependent settings as <see cref="StandardAnalyzer"/>.</para>
+ /// </summary>
+ public sealed class GermanAnalyzer : StopwordAnalyzerBase
+ {
+
+     // Legacy built-in stop word list, used only for match versions before LUCENE_31.
+     [Obsolete("in 3.1, remove in Lucene 5.0 (index bw compat)")]
+     private static readonly string[] GERMAN_STOP_WORDS = new string[]
+     {
+         "einer", "eine", "eines", "einem", "einen", "der", "die", "das", "dass", "daß",
+         "du", "er", "sie", "es", "was", "wer", "wie", "wir", "und", "oder",
+         "ohne", "mit", "am", "im", "in", "aus", "auf", "ist", "sein", "war",
+         "wird", "ihr", "ihre", "ihres", "als", "für", "von", "mit", "dich", "dir",
+         "mich", "mir", "mein", "sein", "kein", "durch", "wegen", "wird"
+     };
+
+     /// <summary>
+     /// File containing default German stopwords. </summary>
+     public const string DEFAULT_STOPWORD_FILE = "german_stop.txt";
+
+     /// <summary>
+     /// Returns a set of default German-stopwords </summary>
+     /// <returns> a set of default German-stopwords </returns>
+     public static CharArraySet DefaultStopSet
+     {
+         get
+         {
+             return DefaultSetHolder.DEFAULT_SET;
+         }
+     }
+
+     /// <summary>
+     /// Holder for the default stop word sets. Its static constructor, which runs the
+     /// first time the holder is accessed, reads <see cref="DEFAULT_STOPWORD_FILE"/>.
+     /// </summary>
+     private class DefaultSetHolder
+     {
+         // Stop word set built from GERMAN_STOP_WORDS for match versions before LUCENE_31.
+         [Obsolete("in 3.1, remove in Lucene 5.0 (index bw compat)")]
+         internal static readonly CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
+         internal static readonly CharArraySet DEFAULT_SET;
+         static DefaultSetHolder()
+         {
+             try
+             {
+                 DEFAULT_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+             }
+             catch (System.IO.IOException e)
+             {
+                 // The default set should always be present as it ships with the
+                 // distribution. Keep the I/O failure as the inner exception so
+                 // the root cause is not lost.
+                 throw new Exception("Unable to load default stopword set", e);
+             }
+         }
+     }
+
+     // The stop words used with the StopFilter are not stored here; they are
+     // passed to the base class constructor.
+
+     /// <summary>
+     /// Contains words that should be indexed but not stemmed.
+     /// </summary>
+     private readonly CharArraySet exclusionSet;
+
+     /// <summary>
+     /// Builds an analyzer with the default stop words: the set loaded from
+     /// <see cref="DEFAULT_STOPWORD_FILE"/> for match versions on or after LUCENE_31,
+     /// otherwise the legacy built-in list.
+     /// </summary>
+     /// <param name="matchVersion">
+     ///          lucene compatibility version </param>
+     public GermanAnalyzer(Version matchVersion) : this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET : DefaultSetHolder.DEFAULT_SET_30)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words
+     /// </summary>
+     /// <param name="matchVersion">
+     ///          lucene compatibility version </param>
+     /// <param name="stopwords">
+     ///          a stopword set </param>
+     public GermanAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words and stem exclusion set
+     /// </summary>
+     /// <param name="matchVersion">
+     ///          lucene compatibility version </param>
+     /// <param name="stopwords">
+     ///          a stopword set </param>
+     /// <param name="stemExclusionSet">
+     ///          a stemming exclusion set; it is passed through <c>CharArraySet.copy</c> and
+     ///          <c>CharArraySet.unmodifiableSet</c> before being stored </param>
+     public GermanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+     {
+         exclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+     }
+
+     /// <summary>
+     /// Creates the TokenStreamComponents used to tokenize all the text in the
+     /// provided reader.
+     /// </summary>
+     /// <param name="fieldName"> name of the field being analyzed (not used by this implementation) </param>
+     /// <param name="reader"> the text source handed to the <see cref="StandardTokenizer"/> </param>
+     /// <returns> TokenStreamComponents
+     /// built from a <see cref="StandardTokenizer"/> filtered with
+     /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>
+     /// and <see cref="SetKeywordMarkerFilter"/>, followed by a version dependent stemmer:
+     /// <see cref="GermanNormalizationFilter"/> and <see cref="GermanLightStemFilter"/> for
+     /// LUCENE_36 and later, a <see cref="SnowballFilter"/> with the German2 stemmer for
+     /// LUCENE_31 and later, otherwise <see cref="GermanStemFilter"/>. </returns>
+     protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+     {
+         Tokenizer source = new StandardTokenizer(matchVersion, reader);
+         TokenStream result = new StandardFilter(matchVersion, source);
+         result = new LowerCaseFilter(matchVersion, result);
+         result = new StopFilter(matchVersion, result, stopwords);
+         // The keyword marker is always added here, even when the exclusion set is empty.
+         result = new SetKeywordMarkerFilter(result, exclusionSet);
+         if (matchVersion.onOrAfter(Version.LUCENE_36))
+         {
+             result = new GermanNormalizationFilter(result);
+             result = new GermanLightStemFilter(result);
+         }
+         else if (matchVersion.onOrAfter(Version.LUCENE_31))
+         {
+             result = new SnowballFilter(result, new German2Stemmer());
+         }
+         else
+         {
+             result = new GermanStemFilter(result);
+         }
+         return new TokenStreamComponents(source, result);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilter.cs
new file mode 100644
index 0000000..57997f8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that applies <see cref="GermanLightStemmer"/> to stem German
+ /// words.
+ /// <para>
+ /// Tokens whose <see cref="KeywordAttribute"/> is set are passed through untouched. To
+ /// protect terms from stemming, place a <see cref="SetKeywordMarkerFilter"/> (or a custom
+ /// <see cref="TokenFilter"/> that sets the attribute) before this <see cref="TokenStream"/>.
+ /// </para>
+ /// </summary>
+ public sealed class GermanLightStemFilter : TokenFilter
+ {
+     private readonly GermanLightStemmer stemmer = new GermanLightStemmer();
+     private readonly CharTermAttribute termAtt;
+     private readonly KeywordAttribute keywordAttr;
+
+     /// <summary>
+     /// Creates a filter that stems the terms produced by <paramref name="input"/>.
+     /// </summary>
+     /// <param name="input"> the upstream token stream </param>
+     public GermanLightStemFilter(TokenStream input) : base(input)
+     {
+         // Attributes are registered here instead of in field initializers: a C# field
+         // initializer may not call an instance method (CS0236) and would run before the
+         // base constructor.
+         termAtt = addAttribute(typeof(CharTermAttribute));
+         keywordAttr = addAttribute(typeof(KeywordAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and, unless the token is flagged as a keyword,
+     /// shortens the term to the length returned by the stemmer.
+     /// </summary>
+     /// <returns> true if a token was produced, false once the input is exhausted </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         if (!keywordAttr.Keyword)
+         {
+             // The stemmer edits the term buffer in place and reports the new length.
+             int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+             termAtt.Length = newlen;
+         }
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilterFactory.cs
new file mode 100644
index 0000000..0f8746e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <see cref="GermanLightStemFilter"/>.
+ /// <para>Example Solr configuration:</para>
+ /// <code>
+ /// &lt;fieldType name="text_delgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ ///     &lt;filter class="solr.GermanLightStemFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;
+ /// </code>
+ /// </summary>
+ public class GermanLightStemFilterFactory : TokenFilterFactory
+ {
+     /// <summary>
+     /// Creates a new GermanLightStemFilterFactory.
+     /// </summary>
+     /// <param name="args"> configuration arguments; this factory accepts none of its own, so
+     /// any entry still present after the base constructor has run is rejected </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty
+     /// once the base constructor has returned </exception>
+     public GermanLightStemFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             throw new System.ArgumentException("Unknown parameters: " + formatArgs(args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a <see cref="GermanLightStemFilter"/>.
+     /// </summary>
+     public override TokenStream create(TokenStream input)
+     {
+         return new GermanLightStemFilter(input);
+     }
+
+     // Renders the entries as "{key=value, ...}". Concatenating the dictionary itself
+     // relies on ToString(), which for the standard .NET dictionaries yields only the
+     // type name rather than the offending parameters.
+     private static string formatArgs(IDictionary<string, string> args)
+     {
+         List<string> pairs = new List<string>(args.Count);
+         foreach (KeyValuePair<string, string> pair in args)
+         {
+             pairs.Add(pair.Key + "=" + pair.Value);
+         }
+         return "{" + string.Join(", ", pairs.ToArray()) + "}";
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemmer.cs
new file mode 100644
index 0000000..87307ec
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanLightStemmer.cs
@@ -0,0 +1,177 @@
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+ /*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ /// <summary>
+ /// Light stemmer for German.
+ /// <para>
+ /// Implements the "UniNE" algorithm described in
+ /// <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
+ /// by Jacques Savoy.
+ /// </para>
+ /// </summary>
+ public class GermanLightStemmer
+ {
+     // Consonants after which a trailing "s" / "st" is treated as a removable ending.
+     private const string StEndingConsonants = "bdfghklmnt";
+
+     /// <summary>
+     /// Stems the first <paramref name="len"/> characters of <paramref name="s"/> in place.
+     /// </summary>
+     /// <param name="s"> term buffer; accented vowels inside the first
+     /// <paramref name="len"/> characters are overwritten with their base vowel </param>
+     /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
+     /// <returns> the length of the stem, which is never greater than <paramref name="len"/> </returns>
+     public virtual int stem(char[] s, int len)
+     {
+         for (int pos = 0; pos < len; pos++)
+         {
+             s[pos] = foldAccent(s[pos]);
+         }
+
+         int afterFirstPass = step1(s, len);
+         return step2(s, afterFirstPass);
+     }
+
+     // Maps an umlauted/accented a, o, i or u to the plain vowel; other characters pass through.
+     private static char foldAccent(char ch)
+     {
+         if ("äàáâ".IndexOf(ch) >= 0)
+         {
+             return 'a';
+         }
+         if ("öòóô".IndexOf(ch) >= 0)
+         {
+             return 'o';
+         }
+         if ("ïìíî".IndexOf(ch) >= 0)
+         {
+             return 'i';
+         }
+         if ("üùúû".IndexOf(ch) >= 0)
+         {
+             return 'u';
+         }
+         return ch;
+     }
+
+     // True when ch is one of the consonants listed in StEndingConsonants.
+     private bool stEnding(char ch)
+     {
+         return StEndingConsonants.IndexOf(ch) >= 0;
+     }
+
+     // First pass: removes "ern", then "em"/"en"/"er"/"es", then "e", then "s" after an
+     // st-ending consonant. Each rule has its own minimum length; the first match wins.
+     private int step1(char[] s, int len)
+     {
+         if (len > 5 && s[len - 1] == 'n' && s[len - 2] == 'r' && s[len - 3] == 'e')
+         {
+             return len - 3;
+         }
+
+         if (len > 4 && s[len - 2] == 'e')
+         {
+             char tail = s[len - 1];
+             if (tail == 'm' || tail == 'n' || tail == 'r' || tail == 's')
+             {
+                 return len - 2;
+             }
+         }
+
+         if (len > 3)
+         {
+             char tail = s[len - 1];
+             if (tail == 'e')
+             {
+                 return len - 1;
+             }
+             if (tail == 's' && stEnding(s[len - 2]))
+             {
+                 return len - 1;
+             }
+         }
+
+         return len;
+     }
+
+     // Second pass: removes "est", then "er"/"en", then "st" after an st-ending consonant.
+     private int step2(char[] s, int len)
+     {
+         if (len > 5 && s[len - 1] == 't' && s[len - 2] == 's' && s[len - 3] == 'e')
+         {
+             return len - 3;
+         }
+
+         if (len > 4)
+         {
+             char tail = s[len - 1];
+             char beforeTail = s[len - 2];
+
+             if (beforeTail == 'e' && (tail == 'r' || tail == 'n'))
+             {
+                 return len - 2;
+             }
+             if (beforeTail == 's' && tail == 't' && stEnding(s[len - 3]))
+             {
+                 return len - 2;
+             }
+         }
+
+         return len;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilter.cs
new file mode 100644
index 0000000..ca93a4e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that applies <see cref="GermanMinimalStemmer"/> to stem German
+ /// words.
+ /// <para>
+ /// Tokens whose <see cref="KeywordAttribute"/> is set are passed through untouched. To
+ /// protect terms from stemming, place a <see cref="SetKeywordMarkerFilter"/> (or a custom
+ /// <see cref="TokenFilter"/> that sets the attribute) before this <see cref="TokenStream"/>.
+ /// </para>
+ /// </summary>
+ public sealed class GermanMinimalStemFilter : TokenFilter
+ {
+     private readonly GermanMinimalStemmer stemmer = new GermanMinimalStemmer();
+     private readonly CharTermAttribute termAtt;
+     private readonly KeywordAttribute keywordAttr;
+
+     /// <summary>
+     /// Creates a filter that stems the terms produced by <paramref name="input"/>.
+     /// </summary>
+     /// <param name="input"> the upstream token stream </param>
+     public GermanMinimalStemFilter(TokenStream input) : base(input)
+     {
+         // Attributes are registered here instead of in field initializers: a C# field
+         // initializer may not call an instance method (CS0236) and would run before the
+         // base constructor.
+         termAtt = addAttribute(typeof(CharTermAttribute));
+         keywordAttr = addAttribute(typeof(KeywordAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and, unless the token is flagged as a keyword,
+     /// shortens the term to the length returned by the stemmer.
+     /// </summary>
+     /// <returns> true if a token was produced, false once the input is exhausted </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         if (!keywordAttr.Keyword)
+         {
+             // The stemmer edits the term buffer in place and reports the new length.
+             int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+             termAtt.Length = newlen;
+         }
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilterFactory.cs
new file mode 100644
index 0000000..bb72f4b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <see cref="GermanMinimalStemFilter"/>.
+ /// <para>Example Solr configuration:</para>
+ /// <code>
+ /// &lt;fieldType name="text_deminstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ ///     &lt;filter class="solr.GermanMinimalStemFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;
+ /// </code>
+ /// </summary>
+ public class GermanMinimalStemFilterFactory : TokenFilterFactory
+ {
+     /// <summary>
+     /// Creates a new GermanMinimalStemFilterFactory.
+     /// </summary>
+     /// <param name="args"> configuration arguments; this factory accepts none of its own, so
+     /// any entry still present after the base constructor has run is rejected </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty
+     /// once the base constructor has returned </exception>
+     public GermanMinimalStemFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             throw new System.ArgumentException("Unknown parameters: " + formatArgs(args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a <see cref="GermanMinimalStemFilter"/>.
+     /// </summary>
+     public override TokenStream create(TokenStream input)
+     {
+         return new GermanMinimalStemFilter(input);
+     }
+
+     // Renders the entries as "{key=value, ...}". Concatenating the dictionary itself
+     // relies on ToString(), which for the standard .NET dictionaries yields only the
+     // type name rather than the offending parameters.
+     private static string formatArgs(IDictionary<string, string> args)
+     {
+         List<string> pairs = new List<string>(args.Count);
+         foreach (KeyValuePair<string, string> pair in args)
+         {
+             pairs.Add(pair.Key + "=" + pair.Value);
+         }
+         return "{" + string.Join(", ", pairs.ToArray()) + "}";
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemmer.cs
new file mode 100644
index 0000000..a1e109d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanMinimalStemmer.cs
@@ -0,0 +1,151 @@
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+ /*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ /// <summary>
+ /// Minimal stemmer for German.
+ /// <para>
+ /// Implements the algorithm from
+ /// <i>Morphologie et recherche d'information</i>
+ /// by Jacques Savoy.
+ /// </para>
+ /// </summary>
+ public class GermanMinimalStemmer
+ {
+     /// <summary>
+     /// Stems the first <paramref name="len"/> characters of <paramref name="s"/> in place.
+     /// </summary>
+     /// <param name="s"> term buffer; for terms of five or more characters, 'ä', 'ö' and 'ü'
+     /// inside the first <paramref name="len"/> characters are overwritten with 'a', 'o', 'u' </param>
+     /// <param name="len"> number of valid characters in <paramref name="s"/> </param>
+     /// <returns> the length of the stem; terms shorter than five characters are returned
+     /// unchanged </returns>
+     public virtual int stem(char[] s, int len)
+     {
+         if (len < 5)
+         {
+             return len;
+         }
+
+         for (int pos = 0; pos < len; pos++)
+         {
+             char ch = s[pos];
+             if (ch == 'ä')
+             {
+                 s[pos] = 'a';
+             }
+             else if (ch == 'ö')
+             {
+                 s[pos] = 'o';
+             }
+             else if (ch == 'ü')
+             {
+                 s[pos] = 'u';
+             }
+         }
+
+         // len >= 5 here, so the last three characters can be read safely.
+         char last = s[len - 1];
+         char secondLast = s[len - 2];
+
+         // "-nen" on longer words.
+         if (len > 6 && last == 'n' && secondLast == 'e' && s[len - 3] == 'n')
+         {
+             return len - 3;
+         }
+
+         // Two-letter endings: "en", "se", "es", "er".
+         if (len > 5)
+         {
+             bool twoLetterEnding;
+             if (last == 'e')
+             {
+                 twoLetterEnding = secondLast == 's';
+             }
+             else
+             {
+                 twoLetterEnding = secondLast == 'e' && (last == 'n' || last == 's' || last == 'r');
+             }
+
+             if (twoLetterEnding)
+             {
+                 return len - 2;
+             }
+         }
+
+         // Single-letter endings.
+         if (last == 'n' || last == 'e' || last == 's' || last == 'r')
+         {
+             return len - 1;
+         }
+
+         return len;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilter.cs
new file mode 100644
index 0000000..19fcbf7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilter.cs
@@ -0,0 +1,130 @@
+using System;
+
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using StemmerUtil = org.apache.lucene.analysis.util.StemmerUtil;
+
+ /// <summary>
+ /// Normalizes German characters according to the heuristics of the
+ /// <a href="http://snowball.tartarus.org/algorithms/german2/stemmer.html">German2 snowball algorithm</a>.
+ /// It allows for the fact that ä, ö and ü are sometimes written as ae, oe and ue.
+ /// <list type="bullet">
+ /// <item><description>'ß' is replaced by 'ss'.</description></item>
+ /// <item><description>'ä', 'ö', 'ü' are replaced by 'a', 'o', 'u', respectively.</description></item>
+ /// <item><description>'ae' and 'oe' are replaced by 'a' and 'o', respectively.</description></item>
+ /// <item><description>'ue' is replaced by 'u', when not following a vowel or q.</description></item>
+ /// </list>
+ /// <para>
+ /// This is useful if you want this normalization without using
+ /// the German2 stemmer, or perhaps no stemming at all.
+ /// </para>
+ /// </summary>
+ public sealed class GermanNormalizationFilter : TokenFilter
+ {
+     // FSM with 3 states:
+     private const int N = 0; // ordinary state
+     private const int V = 1; // stops 'u' from entering umlaut state
+     private const int U = 2; // umlaut state, allows e-deletion
+
+     private readonly CharTermAttribute termAtt;
+
+     /// <summary>
+     /// Creates a filter that normalizes the terms produced by <paramref name="input"/>.
+     /// </summary>
+     /// <param name="input"> the upstream token stream </param>
+     public GermanNormalizationFilter(TokenStream input) : base(input)
+     {
+         // Registered here instead of in a field initializer: a C# field initializer may
+         // not call an instance method (CS0236) and would run before the base constructor.
+         termAtt = addAttribute(typeof(CharTermAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and rewrites the term buffer in a single left-to-right
+     /// pass driven by the three-state machine above.
+     /// </summary>
+     /// <returns> true if a token was produced, false once the input is exhausted </returns>
+     public override bool incrementToken()
+     {
+         if (input.incrementToken())
+         {
+             int state = N;
+             char[] buffer = termAtt.buffer();
+             int length = termAtt.length();
+             for (int i = 0; i < length; i++)
+             {
+                 char c = buffer[i];
+                 switch (c)
+                 {
+                     case 'a':
+                     case 'o':
+                         // A following 'e' may be dropped (ae -> a, oe -> o).
+                         state = U;
+                         break;
+                     case 'u':
+                         // Only a 'u' reached from the ordinary state allows e-deletion.
+                         state = (state == N) ? U : V;
+                         break;
+                     case 'e':
+                         if (state == U)
+                         {
+                             // Presumably removes the character at i and returns the new
+                             // length (confirm against StemmerUtil); i is stepped back so
+                             // the character that moved into this slot is examined next.
+                             length = StemmerUtil.delete(buffer, i--, length);
+                         }
+                         state = V;
+                         break;
+                     case 'i':
+                     case 'q':
+                     case 'y':
+                         state = V;
+                         break;
+                     case 'ä':
+                         buffer[i] = 'a';
+                         state = V;
+                         break;
+                     case 'ö':
+                         buffer[i] = 'o';
+                         state = V;
+                         break;
+                     case 'ü':
+                         buffer[i] = 'u';
+                         state = V;
+                         break;
+                     case 'ß':
+                         // Write the first 's' over 'ß', grow the buffer by one, shift the
+                         // remainder of the term one slot right and write the second 's'.
+                         buffer[i++] = 's';
+                         buffer = termAtt.resizeBuffer(1 + length);
+                         if (i < length)
+                         {
+                             Array.Copy(buffer, i, buffer, i + 1, (length - i));
+                         }
+                         buffer[i] = 's';
+                         length++;
+                         state = N;
+                         break;
+                     default:
+                         state = N;
+                         break;
+                 }
+             }
+             termAtt.Length = length;
+             return true;
+         }
+         else
+         {
+             return false;
+         }
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilterFactory.cs
new file mode 100644
index 0000000..0229746
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanNormalizationFilterFactory.cs
@@ -0,0 +1,65 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using AbstractAnalysisFactory = org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+ using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <see cref="GermanNormalizationFilter"/>.
+ /// <para>Example Solr configuration:</para>
+ /// <code>
+ /// &lt;fieldType name="text_denorm" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ ///     &lt;filter class="solr.GermanNormalizationFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;
+ /// </code>
+ /// </summary>
+ public class GermanNormalizationFilterFactory : TokenFilterFactory, MultiTermAwareComponent
+ {
+     /// <summary>
+     /// Creates a new GermanNormalizationFilterFactory.
+     /// </summary>
+     /// <param name="args"> configuration arguments; this factory accepts none of its own, so
+     /// any entry still present after the base constructor has run is rejected </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty
+     /// once the base constructor has returned </exception>
+     public GermanNormalizationFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             throw new System.ArgumentException("Unknown parameters: " + formatArgs(args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a <see cref="GermanNormalizationFilter"/>.
+     /// </summary>
+     public override TokenStream create(TokenStream input)
+     {
+         return new GermanNormalizationFilter(input);
+     }
+
+     /// <summary>
+     /// The component to use for multi-term analysis; this factory returns itself.
+     /// </summary>
+     public virtual AbstractAnalysisFactory MultiTermComponent
+     {
+         get
+         {
+             return this;
+         }
+     }
+
+     // Renders the entries as "{key=value, ...}". Concatenating the dictionary itself
+     // relies on ToString(), which for the standard .NET dictionaries yields only the
+     // type name rather than the offending parameters.
+     private static string formatArgs(IDictionary<string, string> args)
+     {
+         List<string> pairs = new List<string>(args.Count);
+         foreach (KeyValuePair<string, string> pair in args)
+         {
+             pairs.Add(pair.Key + "=" + pair.Value);
+         }
+         return "{" + string.Join(", ", pairs.ToArray()) + "}";
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilter.cs
new file mode 100644
index 0000000..203c990
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilter.cs
@@ -0,0 +1,96 @@
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+ /// <summary>
+ /// A <see cref="TokenFilter"/> that stems German words.
+ /// <para>
+ /// The stemmer used can be replaced after the filter has been created, as long as the
+ /// replacement is a <see cref="GermanStemmer"/>.
+ /// </para>
+ /// <para>
+ /// Tokens whose <see cref="KeywordAttribute"/> is set are passed through untouched. To
+ /// protect terms from stemming, place a <see cref="SetKeywordMarkerFilter"/> (or a custom
+ /// <see cref="TokenFilter"/> that sets the attribute) before this <see cref="TokenStream"/>.
+ /// </para>
+ /// </summary>
+ /// <seealso cref="SetKeywordMarkerFilter"/>
+ public sealed class GermanStemFilter : TokenFilter
+ {
+     /// <summary>
+     /// The stemmer applied to each non-keyword term.
+     /// </summary>
+     private GermanStemmer stemmer = new GermanStemmer();
+
+     private readonly CharTermAttribute termAtt;
+     private readonly KeywordAttribute keywordAttr;
+
+     /// <summary>
+     /// Creates a <see cref="GermanStemFilter"/> instance. </summary>
+     /// <param name="in"> the source <see cref="TokenStream"/> </param>
+     public GermanStemFilter(TokenStream @in) : base(@in)
+     {
+         // Attributes are registered here instead of in field initializers: a C# field
+         // initializer may not call an instance method (CS0236) and would run before the
+         // base constructor.
+         termAtt = addAttribute(typeof(CharTermAttribute));
+         keywordAttr = addAttribute(typeof(KeywordAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and replaces the term with its stem unless the token
+     /// is flagged as a keyword.
+     /// </summary>
+     /// <returns> Returns true for next token in the stream, or false at EOS </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         if (!keywordAttr.Keyword)
+         {
+             string term = termAtt.ToString();
+             string s = stemmer.stem(term);
+             // If not stemmed, don't waste the time adjusting the token.
+             if ((s != null) && !s.Equals(term))
+             {
+                 termAtt.setEmpty().append(s);
+             }
+         }
+         return true;
+     }
+
+     /// <summary>
+     /// Sets an alternative/custom <see cref="GermanStemmer"/> for this filter.
+     /// Assigning null is ignored and leaves the current stemmer in place.
+     /// </summary>
+     public GermanStemmer Stemmer
+     {
+         set
+         {
+             if (value != null)
+             {
+                 this.stemmer = value;
+             }
+         }
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilterFactory.cs
new file mode 100644
index 0000000..4f5c136
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemFilterFactory.cs
@@ -0,0 +1,56 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.de
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <see cref="GermanStemFilter"/>.
+ /// <pre class="prettyprint">
+ /// &lt;fieldType name="text_destem" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ ///     &lt;filter class="solr.GermanStemFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</pre>
+ /// </summary>
+ public class GermanStemFilterFactory : TokenFilterFactory
+ {
+
+     /// <summary>
+     /// Creates a new GermanStemFilterFactory. </summary>
+     /// <param name="args"> factory arguments; any entry still present after the
+     /// base constructor has run is rejected </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is
+     /// not empty after the base constructor has run </exception>
+     public GermanStemFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             throw new System.ArgumentException("Unknown parameters: " + describe(args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps the given stream in a new <see cref="GermanStemFilter"/>. </summary>
+     /// <param name="in"> the stream to be stemmed </param>
+     /// <returns> a new filter reading from <paramref name="in"/> </returns>
+     // NOTE(review): narrowing the return type on an override needs covariant-return
+     // support (C# 9+) - confirm against the base create() signature.
+     public override GermanStemFilter create(TokenStream @in)
+     {
+         return new GermanStemFilter(@in);
+     }
+
+     /// <summary>
+     /// Renders the dictionary as <c>{key=value, key=value}</c>. Concatenating the
+     /// dictionary itself would only print its type name in .NET.
+     /// </summary>
+     private static string describe(IDictionary<string, string> args)
+     {
+         List<string> pairs = new List<string>(args.Count);
+         foreach (KeyValuePair<string, string> entry in args)
+         {
+             pairs.Add(entry.Key + "=" + entry.Value);
+         }
+         return "{" + string.Join(", ", pairs) + "}";
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemmer.cs
new file mode 100644
index 0000000..9b63922
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/De/GermanStemmer.cs
@@ -0,0 +1,308 @@
+using System.Text;
+
+namespace org.apache.lucene.analysis.de
+{
+
+ // This file is encoded in UTF-8
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A stemmer for German words.
+ /// <para>
+ /// The algorithm is based on the report
+ /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
+ /// Caumanns (joerg.caumanns at isst.fhg.de).
+ /// </para>
+ /// <para>
+ /// Each call to <c>stem</c> works on instance-level scratch state (a shared
+ /// buffer and a substitution counter), so concurrent calls on the same
+ /// instance would interfere with each other.
+ /// </para>
+ /// </summary>
+ public class GermanStemmer
+ {
+     /// <summary>
+     /// Buffer for the terms while stemming them.
+     /// </summary>
+     private readonly StringBuilder sb = new StringBuilder();
+
+     /// <summary>
+     /// Amount of characters that are removed with <c>substitute()</c> while stemming.
+     /// </summary>
+     private int substCount = 0;
+
+     /// <summary>
+     /// Culture used for lower-casing terms (German, Germany).
+     /// </summary>
+     private static readonly System.Globalization.CultureInfo locale = new System.Globalization.CultureInfo("de-DE");
+
+     /// <summary>
+     /// Stems the given term to a unique <c>discriminator</c>.
+     /// </summary>
+     /// <param name="term"> The term that should be stemmed. </param>
+     /// <returns> Discriminator for <c>term</c>; the lower-cased term itself when it
+     /// contains a non-letter character. </returns>
+     protected internal virtual string stem(string term)
+     {
+         // Use lowercase for medium stemming.
+         term = term.ToLower(locale);
+         if (!isStemmable(term))
+         {
+             return term;
+         }
+
+         // Reset the shared buffer and load the term into it.
+         sb.Length = 0;
+         sb.Append(term);
+
+         // Stemming starts here; the order of the steps matters because later
+         // steps operate on the marker characters written by substitute().
+         substitute(sb);
+         strip(sb);
+         optimize(sb);
+         resubstitute(sb);
+         removeParticleDenotion(sb);
+         return sb.ToString();
+     }
+
+     /// <summary>
+     /// Checks if a term could be stemmed.
+     /// </summary>
+     /// <returns> true if, and only if, the given term consists only of letters. </returns>
+     private bool isStemmable(string term)
+     {
+         foreach (char ch in term)
+         {
+             if (!char.IsLetter(ch))
+             {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     /// <summary>
+     /// Tells whether the buffer currently ends with the given suffix, without
+     /// allocating a substring.
+     /// </summary>
+     private static bool endsWith(StringBuilder buffer, string suffix)
+     {
+         int offset = buffer.Length - suffix.Length;
+         if (offset < 0)
+         {
+             return false;
+         }
+         for (int i = 0; i < suffix.Length; i++)
+         {
+             if (buffer[offset + i] != suffix[i])
+             {
+                 return false;
+             }
+         }
+         return true;
+     }
+
+     /// <summary>
+     /// Suffix stripping (stemming) on the current term. The stripping is reduced
+     /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
+     /// from which all regular suffixes are built. The simplification causes
+     /// some overstemming, and way more irregular stems, but still provides unique
+     /// discriminators in most of those cases.
+     /// The algorithm is context free, except for the length restrictions.
+     /// </summary>
+     private void strip(StringBuilder buffer)
+     {
+         bool doMore = true;
+         while (doMore && buffer.Length > 3)
+         {
+             // The length checks count the characters substitute() removed, so the
+             // thresholds refer to the length of the original term.
+             if ((buffer.Length + substCount > 5) && endsWith(buffer, "nd"))
+             {
+                 buffer.Remove(buffer.Length - 2, 2);
+             }
+             else if ((buffer.Length + substCount > 4) && (endsWith(buffer, "em") || endsWith(buffer, "er")))
+             {
+                 buffer.Remove(buffer.Length - 2, 2);
+             }
+             else
+             {
+                 switch (buffer[buffer.Length - 1])
+                 {
+                     case 'e':
+                     case 's':
+                     case 'n':
+                     case 't': // "t" occurs only as suffix of verbs.
+                         buffer.Remove(buffer.Length - 1, 1);
+                         break;
+                     default:
+                         doMore = false;
+                         break;
+                 }
+             }
+         }
+     }
+
+     /// <summary>
+     /// Does some optimizations on the term. These optimizations are
+     /// contextual.
+     /// </summary>
+     private void optimize(StringBuilder buffer)
+     {
+         // Additional step for female plurals of professions and inhabitants.
+         // The trailing '*' is the doubled-letter marker written by substitute().
+         if (buffer.Length > 5 && endsWith(buffer, "erin*"))
+         {
+             buffer.Remove(buffer.Length - 1, 1);
+             strip(buffer);
+         }
+         // Additional step for irregular plural nouns like "Matrizen -> Matrix".
+         // NOTE: this length constraint is probably not a great value, it just
+         // prevents an out-of-range access on empty terms.
+         if (buffer.Length > 0 && buffer[buffer.Length - 1] == 'z')
+         {
+             buffer[buffer.Length - 1] = 'x';
+         }
+     }
+
+     /// <summary>
+     /// Removes a particle denotion ("ge") from a term: the first occurrence of
+     /// "gege" is shortened to "ge".
+     /// </summary>
+     private void removeParticleDenotion(StringBuilder buffer)
+     {
+         if (buffer.Length <= 4)
+         {
+             return;
+         }
+         for (int c = 0; c < buffer.Length - 3; c++)
+         {
+             if (buffer[c] == 'g' && buffer[c + 1] == 'e' && buffer[c + 2] == 'g' && buffer[c + 3] == 'e')
+             {
+                 buffer.Remove(c, 2);
+                 return;
+             }
+         }
+     }
+
+     /// <summary>
+     /// Returns the marker that masks the two-character combination
+     /// <paramref name="first"/><paramref name="second"/>
+     /// (ch/ei/ie/ig/st -> §/%/&amp;/#/!), or '\0' when the pair is not masked.
+     /// </summary>
+     private static char digraphMarker(char first, char second)
+     {
+         switch (first)
+         {
+             case 'c':
+                 return second == 'h' ? '§' : '\0';
+             case 'e':
+                 return second == 'i' ? '%' : '\0';
+             case 'i':
+                 return second == 'e' ? '&' : (second == 'g' ? '#' : '\0');
+             case 's':
+                 return second == 't' ? '!' : '\0';
+             default:
+                 return '\0';
+         }
+     }
+
+     /// <summary>
+     /// Do some substitutions for the term to reduce overstemming:
+     ///
+     /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
+     ///   "ß" is substituted by "ss"
+     /// - Substitute a second char of a pair of equal characters with
+     ///   an asterisk: ?? -> ?*
+     /// - Substitute some common character combinations with a token:
+     ///   sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!
+     /// </summary>
+     private void substitute(StringBuilder buffer)
+     {
+         substCount = 0;
+         for (int c = 0; c < buffer.Length; c++)
+         {
+             char current = buffer[c];
+             // Replace the second char of a pair of the equal characters with an asterisk
+             if (c > 0 && current == buffer[c - 1])
+             {
+                 buffer[c] = '*';
+             }
+             // Substitute Umlauts.
+             else if (current == 'ä')
+             {
+                 buffer[c] = 'a';
+             }
+             else if (current == 'ö')
+             {
+                 buffer[c] = 'o';
+             }
+             else if (current == 'ü')
+             {
+                 buffer[c] = 'u';
+             }
+             // 'ß' becomes "ss", also at the end of a word; the inserted second 's'
+             // is turned into '*' by the pair rule on the next iteration.
+             else if (current == 'ß')
+             {
+                 buffer[c] = 's';
+                 buffer.Insert(c + 1, 's');
+                 substCount++;
+             }
+
+             // Masking needs at least one more character to the right of the current one.
+             if (c < buffer.Length - 1)
+             {
+                 if ((c < buffer.Length - 2) && buffer[c] == 's' && buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
+                 {
+                     buffer[c] = '$';
+                     // Drop exactly the two masked characters "ch".
+                     buffer.Remove(c + 1, 2);
+                     // NOTE(review): the original statement is `substCount = + 2`, i.e. it
+                     // assigns 2 instead of adding 2 like the other branches increment.
+                     // Kept as an assignment so produced stems do not change; confirm the
+                     // intended behavior before turning this into `+=`.
+                     substCount = 2;
+                 }
+                 else
+                 {
+                     char marker = digraphMarker(buffer[c], buffer[c + 1]);
+                     if (marker != '\0')
+                     {
+                         buffer[c] = marker;
+                         buffer.Remove(c + 1, 1);
+                         substCount++;
+                     }
+                 }
+             }
+         }
+     }
+
+     /// <summary>
+     /// Undoes the changes made by substitute(). That are character pairs and
+     /// character combinations. Umlauts will remain as their corresponding vowel,
+     /// as "ß" remains as "ss".
+     /// </summary>
+     private void resubstitute(StringBuilder buffer)
+     {
+         for (int c = 0; c < buffer.Length; c++)
+         {
+             switch (buffer[c])
+             {
+                 case '*':
+                     // substitute() only writes '*' at positions greater than zero.
+                     buffer[c] = buffer[c - 1];
+                     break;
+                 case '$':
+                     buffer[c] = 's';
+                     buffer.Insert(c + 1, "ch");
+                     break;
+                 case '§':
+                     buffer[c] = 'c';
+                     buffer.Insert(c + 1, 'h');
+                     break;
+                 case '%':
+                     buffer[c] = 'e';
+                     buffer.Insert(c + 1, 'i');
+                     break;
+                 case '&':
+                     buffer[c] = 'i';
+                     buffer.Insert(c + 1, 'e');
+                     break;
+                 case '#':
+                     buffer[c] = 'i';
+                     buffer.Insert(c + 1, 'g');
+                     break;
+                 case '!':
+                     buffer[c] = 's';
+                     buffer.Insert(c + 1, 't');
+                     break;
+             }
+         }
+     }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/El/GreekAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/El/GreekAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekAnalyzer.cs
new file mode 100644
index 0000000..9ad4f94
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekAnalyzer.cs
@@ -0,0 +1,139 @@
+using System;
+
+namespace org.apache.lucene.analysis.el
+{
+
+ /*
+  * Copyright 2005 The Apache Software Foundation
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  * http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using StandardAnalyzer = org.apache.lucene.analysis.standard.StandardAnalyzer;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// <see cref="Analyzer"/> for the Greek language.
+ /// <para>
+ /// Supports an external list of stopwords (words
+ /// that will not be indexed at all).
+ /// A default set of stopwords is used unless an alternative list is specified.
+ /// </para>
+ ///
+ /// <a name="version"/>
+ /// <para>You must specify the required <see cref="Version"/>
+ /// compatibility when creating GreekAnalyzer:
+ /// <ul>
+ /// <li> As of 3.1, StandardFilter and GreekStemmer are used by default.</li>
+ /// <li> As of 2.9, StopFilter preserves position increments.</li>
+ /// </ul>
+ /// </para>
+ /// <para><b>NOTE</b>: This class uses the same <see cref="Version"/>
+ /// dependent settings as <see cref="StandardAnalyzer"/>.</para>
+ /// </summary>
+ public sealed class GreekAnalyzer : StopwordAnalyzerBase
+ {
+     /// <summary>
+     /// File containing default Greek stopwords. </summary>
+     public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+     /// <summary>
+     /// Returns a set of default Greek-stopwords. </summary>
+     /// <returns> a set of default Greek-stopwords </returns>
+     public static CharArraySet DefaultStopSet
+     {
+         get
+         {
+             return DefaultSetHolder.DEFAULT_SET;
+         }
+     }
+
+     /// <summary>
+     /// Holds the default stopword set, which is loaded in this holder's static
+     /// constructor.
+     /// </summary>
+     private static class DefaultSetHolder
+     {
+         internal static readonly CharArraySet DEFAULT_SET;
+
+         static DefaultSetHolder()
+         {
+             try
+             {
+                 DEFAULT_SET = loadStopwordSet(false, typeof(GreekAnalyzer), DEFAULT_STOPWORD_FILE, "#");
+             }
+             catch (System.IO.IOException ex)
+             {
+                 // The default set should always be present as it is part of the
+                 // distribution; keep the I/O failure as the inner exception so the
+                 // root cause is not lost.
+                 throw new InvalidOperationException("Unable to load default stopword set", ex);
+             }
+         }
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the default stop words. </summary>
+     /// <param name="matchVersion"> Lucene compatibility version,
+     ///   See <a href="#version">above</a> </param>
+     public GreekAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
+     {
+     }
+
+     /// <summary>
+     /// Builds an analyzer with the given stop words.
+     /// <para>
+     /// <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
+     /// <see cref="GreekLowerCaseFilter"/> for best results.
+     /// </para>
+     /// </summary>
+     /// <param name="matchVersion"> Lucene compatibility version,
+     ///   See <a href="#version">above</a> </param>
+     /// <param name="stopwords"> a stopword set </param>
+     public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) : base(matchVersion, stopwords)
+     {
+     }
+
+     /// <summary>
+     /// Creates
+     /// <see cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+     /// used to tokenize all the text in the provided <see cref="Reader"/>.
+     /// </summary>
+     /// <param name="fieldName"> name of the field being analyzed; not used when
+     ///   building the chain </param>
+     /// <param name="reader"> source of the text handed to the tokenizer </param>
+     /// <returns> <see cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+     ///         built from a <see cref="StandardTokenizer"/> filtered with
+     ///         <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/>,
+     ///         <see cref="StopFilter"/>, and <see cref="GreekStemFilter"/>; the
+     ///         standard filter and the stem filter are only added for
+     ///         <c>LUCENE_31</c> or later </returns>
+     protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+     {
+         Tokenizer source = new StandardTokenizer(matchVersion, reader);
+         TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
+
+         bool atLeast31 = matchVersion.onOrAfter(Version.LUCENE_31);
+         if (atLeast31)
+         {
+             result = new StandardFilter(matchVersion, result);
+         }
+
+         result = new StopFilter(matchVersion, result, stopwords);
+
+         if (atLeast31)
+         {
+             result = new GreekStemFilter(result);
+         }
+         return new TokenStreamComponents(source, result);
+     }
+ }
+
+}
\ No newline at end of file