You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:22 UTC
[18/34] lucenenet git commit: Raw porting of
Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs
new file mode 100644
index 0000000..6392329
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.lv
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <see cref="LatvianStemFilter"/>.
+ /// <para>Example Solr configuration:</para>
+ /// <code>
+ /// &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ ///     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;
+ /// </code>
+ /// </summary>
+ public class LatvianStemFilterFactory : TokenFilterFactory
+ {
+
+     /// <summary>
+     /// Creates a new LatvianStemFilterFactory.
+     /// </summary>
+     /// <param name="args">
+     /// Factory arguments. They are handed to the base constructor first; this
+     /// factory defines no parameters of its own, so the map must be empty afterwards.
+     /// </param>
+     /// <exception cref="System.ArgumentException">
+     /// Thrown when <paramref name="args"/> still contains entries after the base
+     /// constructor has run.
+     /// </exception>
+     public LatvianStemFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             // A dictionary's ToString() yields only its type name in .NET, so the
+             // entries are joined explicitly to name the offending parameters.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a <see cref="LatvianStemFilter"/>.
+     /// </summary>
+     /// <param name="input">the token stream to be stemmed</param>
+     /// <returns>a new <see cref="LatvianStemFilter"/> reading from <paramref name="input"/></returns>
+     public override TokenStream create(TokenStream input)
+     {
+         return new LatvianStemFilter(input);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs
new file mode 100644
index 0000000..d3e5ea8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs
@@ -0,0 +1,198 @@
+namespace org.apache.lucene.analysis.lv
+{
+
+ using org.apache.lucene.analysis.util;
+// NOTE: the Java original used "import static org.apache.lucene.analysis.util.StemmerUtil.*".
+// C# has no counterpart in use here, so StemmerUtil members are called with the class name (e.g. StemmerUtil.EndsWith).
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Light stemmer for Latvian.
+ /// <para>
+ /// This is a light version of the algorithm in Karlis Kreslin's PhD thesis
+ /// <i>A stemming algorithm for Latvian</i> with the following modifications:
+ /// <ul>
+ /// <li>Only explicitly stems noun and adjective morphology.</li>
+ /// <li>Stricter length/vowel checks for the resulting stems (verb etc. suffix stripping is removed).</li>
+ /// <li>Removes only the primary inflectional suffixes: case and number for nouns;
+ /// case, number, gender, and definiteness for adjectives.</li>
+ /// <li>Palatalization is only handled when a declension II, V, VI noun suffix is removed.</li>
+ /// </ul>
+ /// </para>
+ /// </summary>
+ public class LatvianStemmer
+ {
+ /// <summary>
+ /// Stems a Latvian word in place and returns the new, adjusted length.
+ /// </summary>
+ /// <param name="s">buffer holding the word; may be modified when palatalization is undone</param>
+ /// <param name="len">number of valid characters in <paramref name="s"/></param>
+ /// <returns>the length of the stem, or <paramref name="len"/> unchanged when no suffix applies</returns>
+ public virtual int stem(char[] s, int len)
+ {
+     int vowelCount = numVowels(s, len);
+
+     foreach (Affix candidate in affixes)
+     {
+         char[] suffix = candidate.affix;
+
+         // A suffix applies only if the word has more vowels than the suffix's
+         // threshold, at least three characters remain, and the word ends with it.
+         bool applies = vowelCount > candidate.vc
+             && len >= suffix.Length + 3
+             && StemmerUtil.EndsWith(s, len, suffix);
+         if (!applies)
+         {
+             continue;
+         }
+
+         // First match wins.
+         int stemLength = len - suffix.Length;
+         if (candidate.palatalizes)
+         {
+             return unpalatalize(s, stemLength);
+         }
+         return stemLength;
+     }
+
+     return len;
+ }
+
+ /// <summary>
+ /// Suffix table consulted by <c>stem</c>, which strips the first entry that matches.
+ /// Longer suffixes are listed ahead of the shorter suffixes they end with, so the
+ /// longest applicable suffix is the one removed. Each entry gives the suffix, the
+ /// vowel count the word must exceed, and whether palatalization is undone afterwards.
+ /// </summary>
+ internal static readonly Affix[] affixes =
+ {
+     new Affix("ajiem", 3, false), new Affix("ajai", 3, false),
+     new Affix("ajam", 2, false), new Affix("ajām", 2, false),
+     new Affix("ajos", 2, false), new Affix("ajās", 2, false),
+     new Affix("iem", 2, true), new Affix("ajā", 2, false),
+     new Affix("ais", 2, false), new Affix("ai", 2, false),
+     new Affix("ei", 2, false), new Affix("ām", 1, false),
+     new Affix("am", 1, false), new Affix("ēm", 1, false),
+     new Affix("īm", 1, false), new Affix("im", 1, false),
+     new Affix("um", 1, false), new Affix("us", 1, true),
+     new Affix("as", 1, false), new Affix("ās", 1, false),
+     new Affix("es", 1, false), new Affix("os", 1, true),
+     new Affix("ij", 1, false), new Affix("īs", 1, false),
+     new Affix("ēs", 1, false), new Affix("is", 1, false),
+     new Affix("ie", 1, false), new Affix("u", 1, true),
+     new Affix("a", 1, true), new Affix("i", 1, true),
+     new Affix("e", 1, false), new Affix("ā", 1, false),
+     new Affix("ē", 1, false), new Affix("ī", 1, false),
+     new Affix("ū", 1, false), new Affix("o", 1, false),
+     new Affix("s", 0, false), new Affix("š", 0, false)
+ };
+
+ /// <summary>
+ /// One removable suffix together with the conditions attached to removing it.
+ /// </summary>
+ internal class Affix
+ {
+     /// <summary>The suffix characters.</summary>
+     internal char[] affix;
+
+     /// <summary>Vowel count of the suffix; a word must contain more vowels than this for the suffix to be stripped.</summary>
+     internal int vc;
+
+     /// <summary>True if the palatalization rules should fire after the suffix is stripped.</summary>
+     internal bool palatalizes;
+
+     /// <summary>
+     /// Builds an entry from the suffix text and its two conditions.
+     /// </summary>
+     /// <param name="affix">the suffix text, stored as a character array</param>
+     /// <param name="vc">vowel count of the suffix</param>
+     /// <param name="palatalizes">whether palatalization is undone after removal</param>
+     internal Affix(string affix, int vc, bool palatalizes)
+     {
+         this.palatalizes = palatalizes;
+         this.vc = vc;
+         this.affix = affix.ToCharArray();
+     }
+ }
+
+ /// <summary>
+ /// Undoes palatalization at the end of a stem whose suffix has just been removed.
+ /// Most cases are handled except for the ambiguous ones:
+ /// <ul>
+ /// <li>s -&gt; š</li>
+ /// <li>t -&gt; š</li>
+ /// <li>d -&gt; ž</li>
+ /// <li>z -&gt; ž</li>
+ /// </ul>
+ /// </summary>
+ /// <param name="s">buffer holding the word, rewritten in place; <c>s[len]</c> is read as the first removed character</param>
+ /// <param name="len">length of the stem after the suffix was removed</param>
+ /// <returns>the adjusted stem length: <c>len - 1</c>, <c>len</c> or <c>len + 1</c></returns>
+ private int unpalatalize(char[] s, int len)
+ {
+     // Look at the character that was removed: if it is -u then this is a
+     // 2,5, or 6 gen pl., and only then can the next two rules apply.
+     bool removedU = s[len] == 'u';
+
+     if (removedU && StemmerUtil.EndsWith(s, len, "kš"))
+     {
+         // kš -> kst: the stem grows by one, reusing the slot of the removed 'u'
+         s[len - 1] = 's';
+         s[len] = 't';
+         return len + 1;
+     }
+
+     if (removedU && StemmerUtil.EndsWith(s, len, "ņņ"))
+     {
+         // ņņ -> nn
+         s[len - 1] = 'n';
+         s[len - 2] = 'n';
+         return len;
+     }
+
+     // All remaining rules apply regardless of the removed character.
+
+     // labial consonant followed by j: drop the j
+     bool labialPlusJ = StemmerUtil.EndsWith(s, len, "pj")
+         || StemmerUtil.EndsWith(s, len, "bj")
+         || StemmerUtil.EndsWith(s, len, "mj")
+         || StemmerUtil.EndsWith(s, len, "vj");
+     if (labialPlusJ)
+     {
+         return len - 1;
+     }
+
+     // two-character clusters: both characters are rewritten
+     if (StemmerUtil.EndsWith(s, len, "šņ"))
+     {
+         s[len - 1] = 'n';
+         s[len - 2] = 's';
+         return len;
+     }
+     if (StemmerUtil.EndsWith(s, len, "žņ"))
+     {
+         s[len - 1] = 'n';
+         s[len - 2] = 'z';
+         return len;
+     }
+     if (StemmerUtil.EndsWith(s, len, "šļ"))
+     {
+         s[len - 1] = 'l';
+         s[len - 2] = 's';
+         return len;
+     }
+     if (StemmerUtil.EndsWith(s, len, "žļ"))
+     {
+         s[len - 1] = 'l';
+         s[len - 2] = 'z';
+         return len;
+     }
+     if (StemmerUtil.EndsWith(s, len, "ļņ"))
+     {
+         s[len - 1] = 'n';
+         s[len - 2] = 'l';
+         return len;
+     }
+     if (StemmerUtil.EndsWith(s, len, "ļļ"))
+     {
+         s[len - 1] = 'l';
+         s[len - 2] = 'l';
+         return len;
+     }
+
+     // single trailing palatalized consonant
+     switch (s[len - 1])
+     {
+         case 'č':
+             s[len - 1] = 'c';
+             break;
+         case 'ļ':
+             s[len - 1] = 'l';
+             break;
+         case 'ņ':
+             s[len - 1] = 'n';
+             break;
+     }
+
+     return len;
+ }
+
+ /// <summary>
+ /// Counts the vowels among the first <paramref name="len"/> characters of
+ /// <paramref name="s"/>; we always require at least one in the remaining
+ /// stem to accept it.
+ /// </summary>
+ /// <param name="s">buffer holding the word</param>
+ /// <param name="len">number of characters to examine</param>
+ /// <returns>how many of the examined characters are one of a, e, i, o, u, ā, ī, ē, ū</returns>
+ private int numVowels(char[] s, int len)
+ {
+     const string vowels = "aeiouāīēū";
+
+     int count = 0;
+     int pos = 0;
+     while (pos < len)
+     {
+         // IndexOf(char) compares ordinally, so this matches exactly the listed characters
+         if (vowels.IndexOf(s[pos]) >= 0)
+         {
+             count++;
+         }
+         pos++;
+     }
+     return count;
+ }
+ }
+
+}
\ No newline at end of file