You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:22 UTC
[18/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs
new file mode 100644
index 0000000..6392329
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.lv
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <see cref="LatvianStemFilter"/>.
+	/// <code>
+	/// &lt;fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+	///     &lt;filter class="solr.LatvianStemFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</code>
+	/// </summary>
+	public class LatvianStemFilterFactory : TokenFilterFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new LatvianStemFilterFactory.
+	  /// </summary>
+	  /// <param name="args"> configuration parameters; they are handed to the base
+	  /// constructor first, and any entry still present afterwards is rejected
+	  /// (presumably the base class removes the ones it understands - confirm
+	  /// against TokenFilterFactory) </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is
+	  /// not empty once the base constructor has run </exception>
+	  public LatvianStemFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  // A .NET dictionary's ToString() yields only its type name, so the
+		  // leftover entries are spelled out to keep the message useful.
+		  throw new System.ArgumentException("Unknown parameters: " + describe(args));
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a <see cref="LatvianStemFilter"/>.
+	  /// </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		return new LatvianStemFilter(input);
+	  }
+
+	  /// <summary>
+	  /// Renders the dictionary as <c>{key=value, key=value}</c> for error messages.
+	  /// </summary>
+	  private static string describe(IDictionary<string, string> args)
+	  {
+		List<string> entries = new List<string>(args.Count);
+		foreach (KeyValuePair<string, string> entry in args)
+		{
+		  entries.Add(entry.Key + "=" + entry.Value);
+		}
+		return "{" + string.Join(", ", entries) + "}";
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs
new file mode 100644
index 0000000..d3e5ea8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Lv/LatvianStemmer.cs
@@ -0,0 +1,198 @@
+namespace org.apache.lucene.analysis.lv
+{
+
+	using org.apache.lucene.analysis.util;
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+//	import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	/// <summary>
+	/// Light stemmer for Latvian.
+	/// <para>
+	/// This is a light version of the algorithm in Karlis Kreslin's PhD thesis
+	/// <i>A stemming algorithm for Latvian</i> with the following modifications:
+	/// <list type="bullet">
+	///   <item><description>Only explicitly stems noun and adjective morphology</description></item>
+	///   <item><description>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)</description></item>
+	///   <item><description>Removes only the primary inflectional suffixes: case and number for nouns;
+	///       case, number, gender, and definitiveness for adjectives.</description></item>
+	///   <item><description>Palatalization is only handled when a declension II,V,VI noun suffix is removed.</description></item>
+	/// </list>
+	/// </para>
+	/// </summary>
+	public class LatvianStemmer
+	{
+	  /// <summary>
+	  /// Stem a Latvian word in place.
+	  /// </summary>
+	  /// <param name="s"> buffer holding the word; its characters may be rewritten
+	  /// when a palatalization rule fires </param>
+	  /// <param name="len"> number of characters of <paramref name="s"/> that belong to the word </param>
+	  /// <returns> the new adjusted length </returns>
+	  public virtual int stem(char[] s, int len)
+	  {
+		int vowelCount = numVowels(s, len);
+
+		// Affixes are tried in table order and the first one that matches wins.
+		foreach (Affix candidate in affixes)
+		{
+		  // The word must hold more vowels than the suffix and leave a stem
+		  // of at least three characters once the suffix is gone.
+		  if (vowelCount > candidate.vc && len >= candidate.affix.Length + 3 && StemmerUtil.EndsWith(s, len, candidate.affix))
+		  {
+			len -= candidate.affix.Length;
+			return candidate.palatalizes ? unpalatalize(s, len) : len;
+		  }
+		}
+
+		return len;
+	  }
+
+	  // Suffix table: (suffix, vowel count of the suffix, fires palatalization rules).
+	  // Longer suffixes are listed before the shorter ones they end with, so that
+	  // stem() removes the longest applicable suffix.
+	  internal static readonly Affix[] affixes = {
+		new Affix("ajiem", 3, false), new Affix("ajai",  3, false),
+		new Affix("ajam",  2, false), new Affix("ajām",  2, false),
+		new Affix("ajos",  2, false), new Affix("ajās",  2, false),
+		new Affix("iem",   2, true),  new Affix("ajā",   2, false),
+		new Affix("ais",   2, false), new Affix("ai",    2, false),
+		new Affix("ei",    2, false), new Affix("ām",    1, false),
+		new Affix("am",    1, false), new Affix("ēm",    1, false),
+		new Affix("īm",    1, false), new Affix("im",    1, false),
+		new Affix("um",    1, false), new Affix("us",    1, true),
+		new Affix("as",    1, false), new Affix("ās",    1, false),
+		new Affix("es",    1, false), new Affix("os",    1, true),
+		new Affix("ij",    1, false), new Affix("īs",    1, false),
+		new Affix("ēs",    1, false), new Affix("is",    1, false),
+		new Affix("ie",    1, false), new Affix("u",     1, true),
+		new Affix("a",     1, true),  new Affix("i",     1, true),
+		new Affix("e",     1, false), new Affix("ā",     1, false),
+		new Affix("ē",     1, false), new Affix("ī",     1, false),
+		new Affix("ū",     1, false), new Affix("o",     1, false),
+		new Affix("s",     0, false), new Affix("š",     0, false)
+	  };
+
+	  /// <summary>
+	  /// One removable suffix together with the conditions attached to it.
+	  /// </summary>
+	  internal class Affix
+	  {
+		internal char[] affix; // suffix
+		internal int vc; // vowel count of the suffix
+		internal bool palatalizes; // true if we should fire palatalization rules.
+
+		internal Affix(string affix, int vc, bool palatalizes)
+		{
+		  this.affix = affix.ToCharArray();
+		  this.vc = vc;
+		  this.palatalizes = palatalizes;
+		}
+	  }
+
+	  /// <summary>
+	  /// Undoes palatalization at the end of the stem.
+	  /// Most cases are handled except for the ambiguous ones:
+	  /// <list type="bullet">
+	  ///  <item><description> s -> š</description></item>
+	  ///  <item><description> t -> š</description></item>
+	  ///  <item><description> d -> ž</description></item>
+	  ///  <item><description> z -> ž</description></item>
+	  /// </list>
+	  /// </summary>
+	  /// <param name="s"> word buffer; <c>s[len]</c> is still the first character of
+	  /// the suffix that <see cref="stem"/> has just cut off </param>
+	  /// <param name="len"> length of the stem after suffix removal </param>
+	  /// <returns> the stem length after the rewrite (it can grow or shrink by one) </returns>
+	  private int unpalatalize(char[] s, int len)
+	  {
+		// we check the character removed: if its -u then 
+		// its 2,5, or 6 gen pl., and these two can only apply then.
+		if (s[len] == 'u')
+		{
+		  // kš -> kst
+		  if (StemmerUtil.EndsWith(s, len, "kš"))
+		  {
+			// the stem grows by one character, reusing the slot of the removed 'u'
+			len++;
+			s[len - 2] = 's';
+			s[len - 1] = 't';
+			return len;
+		  }
+		  // ņņ -> nn
+		  if (StemmerUtil.EndsWith(s, len, "ņņ"))
+		  {
+			s[len - 2] = 'n';
+			s[len - 1] = 'n';
+			return len;
+		  }
+		}
+
+		// otherwise all other rules
+		if (StemmerUtil.EndsWith(s, len, "pj") || StemmerUtil.EndsWith(s, len, "bj") || StemmerUtil.EndsWith(s, len, "mj") || StemmerUtil.EndsWith(s, len, "vj"))
+		{
+		  // labial consonant
+		  return len - 1;
+		}
+		else if (StemmerUtil.EndsWith(s, len, "šņ"))
+		{
+		  s[len - 2] = 's';
+		  s[len - 1] = 'n';
+		  return len;
+		}
+		else if (StemmerUtil.EndsWith(s, len, "žņ"))
+		{
+		  s[len - 2] = 'z';
+		  s[len - 1] = 'n';
+		  return len;
+		}
+		else if (StemmerUtil.EndsWith(s, len, "šļ"))
+		{
+		  s[len - 2] = 's';
+		  s[len - 1] = 'l';
+		  return len;
+		}
+		else if (StemmerUtil.EndsWith(s, len, "žļ"))
+		{
+		  s[len - 2] = 'z';
+		  s[len - 1] = 'l';
+		  return len;
+		}
+		else if (StemmerUtil.EndsWith(s, len, "ļņ"))
+		{
+		  s[len - 2] = 'l';
+		  s[len - 1] = 'n';
+		  return len;
+		}
+		else if (StemmerUtil.EndsWith(s, len, "ļļ"))
+		{
+		  s[len - 2] = 'l';
+		  s[len - 1] = 'l';
+		  return len;
+		}
+		else if (s[len - 1] == 'č')
+		{
+		  s[len - 1] = 'c';
+		  return len;
+		}
+		else if (s[len - 1] == 'ļ')
+		{
+		  s[len - 1] = 'l';
+		  return len;
+		}
+		else if (s[len - 1] == 'ņ')
+		{
+		  s[len - 1] = 'n';
+		  return len;
+		}
+
+		return len;
+	  }
+
+	  /// <summary>
+	  /// Count the vowels in the string, we always require at least
+	  /// one in the remaining stem to accept it.
+	  /// </summary>
+	  /// <param name="s"> word buffer </param>
+	  /// <param name="len"> number of characters of <paramref name="s"/> to inspect </param>
+	  /// <returns> how many of the first <paramref name="len"/> characters are vowels </returns>
+	  private int numVowels(char[] s, int len)
+	  {
+		int n = 0;
+		for (int i = 0; i < len; i++)
+		{
+		  switch (s[i])
+		  {
+			case 'a':
+			case 'e':
+			case 'i':
+			case 'o':
+			case 'u':
+			case 'ā':
+			case 'ī':
+			case 'ē':
+			case 'ū':
+			  n++;
+			  break;
+		  }
+		}
+		return n;
+	  }
+	}
+
+}
\ No newline at end of file