You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/03/23 03:11:44 UTC

svn commit: r1304164 - in /incubator/lucene.net/trunk: src/contrib/Analyzers/ src/contrib/Analyzers/De/ test/contrib/Analyzers/ test/contrib/Analyzers/De/

Author: ccurrens
Date: Fri Mar 23 02:11:43 2012
New Revision: 1304164

URL: http://svn.apache.org/viewvc?rev=1304164&view=rev
Log:
[LUCENENET-466] - added a DIN-5007-2 stemmer to GermanAnalyzer, as well as new constructors to specify its use if desired.  TestGermanStemFilter's TestStemming is now renamed to TestDin1Stemming, and TestDin2Stemming has been added for GermanStemmerDIN2

Added:
    incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
Modified:
    incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
    incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
    incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
    incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
    incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Fri Mar 23 02:11:43 2012
@@ -100,6 +100,7 @@
     <Compile Include="De\GermanAnalyzer.cs" />
     <Compile Include="De\GermanStemFilter.cs" />
     <Compile Include="De\GermanStemmer.cs" />
+    <Compile Include="De\GermanStemmerDIN2.cs" />
     <Compile Include="El\GreekAnalyzer.cs" />
     <Compile Include="El\GreekLowerCaseFilter.cs" />
     <Compile Include="Fa\PersianAnalyzer.cs" />

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs Fri Mar 23 02:11:43 2012
@@ -88,6 +88,7 @@ namespace Lucene.Net.Analysis.De
         private ISet<string> exclusionSet;
 
         private Version matchVersion;
+        private readonly bool _useDin2Stemmer;
 
         /// <summary>
         /// Builds an analyzer with the default stop words:
@@ -95,7 +96,7 @@ namespace Lucene.Net.Analysis.De
         /// </summary>
         [Obsolete("Use GermanAnalyzer(Version) instead")]
         public GermanAnalyzer()
-            : this(Version.LUCENE_23)
+            : this(Version.LUCENE_CURRENT)
         {
         }
 
@@ -108,7 +109,15 @@ namespace Lucene.Net.Analysis.De
         { }
 
         /// <summary>
-        /// Builds an analyzer with the given stop words. 
+        /// Builds an analyzer with the default stop words:
+        /// <see cref="GetDefaultStopSet"/>
+        ///  </summary>
+        public GermanAnalyzer(Version matchVersion, bool useDin2Stemmer)
+            : this(matchVersion, DefaultSetHolder.DEFAULT_SET, useDin2Stemmer)
+        { }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version</param>
         /// <param name="stopwords">a stopword set</param>
@@ -120,15 +129,41 @@ namespace Lucene.Net.Analysis.De
         /// <summary>
         /// Builds an analyzer with the given stop words
         /// </summary>
+        /// <param name="matchVersion">Lucene compatibility version</param>
+        /// <param name="stopwords">a stopword set</param>
+        /// <param name="useDin2Stemmer">Specifies if the DIN-5007-2 style stemmer should be used.  Commonly referred to as
+        /// phone book sorting, since it was defined to be used with names, rather than words</param>
+        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, bool useDin2Stemmer)
+            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, useDin2Stemmer)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
+        /// </summary>
         /// <param name="matchVersion">lucene compatibility version</param>
         /// <param name="stopwords">a stopword set</param>
         /// <param name="stemExclusionSet">a stemming exclusion set</param>
         public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
+            : this(matchVersion, stopwords, stemExclusionSet, false)
+        { }
+
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words
+        /// </summary>
+        /// <param name="matchVersion">lucene compatibility version</param>
+        /// <param name="stopwords">a stopword set</param>
+        /// <param name="stemExclusionSet">a stemming exclusion set</param>
+        /// <param name="useDin2Stemmer">Specifies if the DIN-5007-2 style stemmer should be used.  Commonly referred to as
+        /// phone book sorting, since it was defined to be used with names, rather than words</param>
+        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool useDin2Stemmer)
         {
             stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
             exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
-            SetOverridesTokenStreamMethod<GermanAnalyzer>();
             this.matchVersion = matchVersion;
+            _useDin2Stemmer = useDin2Stemmer;
+            SetOverridesTokenStreamMethod<GermanAnalyzer>();
         }
 
         /// <summary>
@@ -202,7 +237,7 @@ namespace Lucene.Net.Analysis.De
             result = new StandardFilter(result);
             result = new LowerCaseFilter(result);
             result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
-            result = new GermanStemFilter(result, exclusionSet);
+            result = new GermanStemFilter(result, exclusionSet, _useDin2Stemmer);
             return result;
         }
     }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs Fri Mar 23 02:11:43 2012
@@ -43,11 +43,12 @@ namespace Lucene.Net.Analysis.De
         private TermAttribute termAtt;
 
         public GermanStemFilter(TokenStream _in)
-            : base(_in)
-        {
-            stemmer = new GermanStemmer();
-            termAtt = AddAttribute<TermAttribute>();
-        }
+            : this(_in, false)
+        { }
+
+        public GermanStemFilter(TokenStream _in, bool useDin2Stemmer)
+            : this(_in, null, useDin2Stemmer)
+        { }
 
         /// <summary>
         /// Builds a GermanStemFilter that uses an exclusiontable. 
@@ -55,9 +56,22 @@ namespace Lucene.Net.Analysis.De
         /// <param name="_in"></param>
         /// <param name="exclusiontable"></param>
         public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable)
-            : this(_in)
+            : this(_in, exclusiontable, false)
+        { }
+
+        /// <summary>
+        /// Builds a GermanStemFilter that uses an exclusiontable. 
+        /// </summary>
+        /// <param name="_in"></param>
+        /// <param name="exclusiontable"></param>
+        /// <param name="useDin2Stemmer">Specifies whether to use the DIN-5007-2 (names) 
+        /// stemmer instead of the default DIN-5007-1 (words) stemmer</param>
+        public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable, bool useDin2Stemmer)
+            : base(_in)
         {
             exclusionSet = exclusiontable;
+            stemmer = useDin2Stemmer ? new GermanStemmerDIN2() : new GermanStemmer();
+            termAtt = AddAttribute<TermAttribute>();
         }
 
         /// <returns>

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs Fri Mar 23 02:11:43 2012
@@ -41,7 +41,7 @@ namespace Lucene.Net.Analysis.De
 		/// <summary>
 		/// Amount of characters that are removed with <tt>Substitute()</tt> while stemming.
 		/// </summary>
-		private int substCount = 0;
+		protected int substCount = 0;
 
 		/// <summary>
 		/// Stemms the given term to an unique <tt>discriminator</tt>.
@@ -187,32 +187,16 @@ namespace Lucene.Net.Analysis.De
 			for ( int c = 0; c < buffer.Length; c++ ) 
 			{
 				// Replace the second char of a pair of the equal characters with an asterisk
-				if ( c > 0 && buffer[c] == buffer[c - 1]) 
-				{
-					buffer[c] = '*';
-				}
-					// Substitute Umlauts.
-				else if ( buffer[c] == 'ä' ) 
-				{
-					buffer[c] = 'a';
-				}
-				else if ( buffer[c] == 'ö' ) 
-				{
-					buffer[c] = 'o';
-				}
-				else if ( buffer[c] == 'ü' ) 
-				{
-					buffer[c] = 'u';
-				}
-				// Fix bug so that 'ß' at the end of a word is replaced.
-				else if ( buffer[c] == 'ß' ) 
-				{
-				
-					buffer[c] = 's';
-					buffer.Insert(c + 1, 's');
-					substCount++;
-				}
-				// Take care that at least one character is left left side from the current one
+                if (c > 0 && buffer[c] == buffer[c - 1])
+                {
+                    buffer[c] = '*';
+                }
+                // Substitute Umlauts.
+                else
+                {
+                    SubstituteUmlauts(buffer, c);
+                }
+			    // Take care that at least one character remains to the right of the current one
 				if ( c < buffer.Length - 1 ) 
 				{
 					// Masking several common character combinations with an token
@@ -257,7 +241,30 @@ namespace Lucene.Net.Analysis.De
 			}
 		}
 
-		/// <summary>
+	    protected virtual void SubstituteUmlauts(StringBuilder buffer, int c)
+	    {
+	        if (buffer[c] == 'ä')
+	        {
+	            buffer[c] = 'a';
+	        }
+	        else if (buffer[c] == 'ö')
+	        {
+	            buffer[c] = 'o';
+	        }
+	        else if (buffer[c] == 'ü')
+	        {
+	            buffer[c] = 'u';
+	        }
+	            // Fix bug so that 'ß' at the end of a word is replaced.
+	        else if (buffer[c] == 'ß')
+	        {
+	            buffer[c] = 's';
+	            buffer.Insert(c + 1, 's');
+	            substCount++;
+	        }
+	    }
+
+	    /// <summary>
 		/// Undoes the changes made by Substitute(). That are character pairs and
 		/// character combinations. Umlauts will remain as their corresponding vowel,
 		/// as "?" remains as "ss".

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs?rev=1304164&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs Fri Mar 23 02:11:43 2012
@@ -0,0 +1,41 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.De
+{
+    /// <summary>
+    /// A stemmer for the German language that uses the
+    /// DIN-5007-2 "Phone Book" rules for handling
+    /// umlaut characters.
+    /// </summary>
+    public sealed class GermanStemmerDIN2 : GermanStemmer
+    {
+        protected override void SubstituteUmlauts(StringBuilder buffer, int c)
+        {
+            if (buffer[c] == 'ä')
+            {
+                buffer[c] = 'a';
+                buffer.Insert(c + 1, 'e');
+            }
+            else if (buffer[c] == 'ö')
+            {
+                buffer[c] = 'o';
+                buffer.Insert(c + 1, 'e');
+            }
+            else if (buffer[c] == 'ü')
+            {
+                buffer[c] = 'u';
+                buffer.Insert(c + 1, 'e');
+            }
+            // Fix bug so that 'ß' at the end of a word is replaced.
+            else if (buffer[c] == 'ß')
+            {
+                buffer[c] = 's';
+                buffer.Insert(c + 1, 's');
+                substCount++;
+            }
+        }
+    }
+}

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Fri Mar 23 02:11:43 2012
@@ -170,6 +170,9 @@
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </Content>
     <Compile Include="Fa\TestPersianNormalizationFilter.cs" />
+    <Content Include="De\data_din2.txt">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
     <Content Include="Nl\customStemDict.txt">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
     </Content>

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs Fri Mar 23 02:11:43 2012
@@ -39,12 +39,12 @@ namespace Lucene.Net.Analyzers.De
     public class TestGermanStemFilter : BaseTokenStreamTestCase
     {
         [Test]
-        public void TestStemming()
+        public void TestDin1Stemming()
         {
             // read test cases from external file:
-            string testFile = @"De\data.txt";
-            using (FileStream fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
-            using (StreamReader breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
+            const string testFile = @"De\data.txt";
+            using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
+            using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
             {
                 while (true)
                 {
@@ -56,7 +56,28 @@ namespace Lucene.Net.Analyzers.De
                         continue; // ignore comments and empty lines
                     String[] parts = line.Split(';');
                     //System.out.println(parts[0] + " -- " + parts[1]);
-                    Check(parts[0], parts[1]);
+                    Check(parts[0], parts[1], false);
+                }
+            }
+        }
+
+        [Test]
+        public void TestDin2Stemming()
+        {
+            // read test cases from external file:
+            const string testFile = @"De\data_din2.txt";
+            using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
+            using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
+            {
+                string line;
+                while ((line = breader.ReadLine()) != null)
+                {
+                    line = line.Trim();
+                    if (line.StartsWith("#") || string.IsNullOrEmpty(line))
+                        continue; // ignore comments and empty lines
+
+                    var parts = line.Split(';');
+                    Check(parts[0], parts[1], true);
                 }
             }
         }
@@ -73,7 +94,7 @@ namespace Lucene.Net.Analyzers.De
         /**
          * subclass that acts just like whitespace analyzer for testing
          */
-        private class GermanSubclassAnalyzer : GermanAnalyzer
+        private sealed class GermanSubclassAnalyzer : GermanAnalyzer
         {
             public GermanSubclassAnalyzer(Version matchVersion)
                 : base(matchVersion)
@@ -99,15 +120,15 @@ namespace Lucene.Net.Analyzers.De
         [Test]
         public void TestExclusionTableReuse()
         {
-            GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+            var a = new GermanAnalyzer(Version.LUCENE_CURRENT);
             CheckReuse(a, "tischen", "tisch");
-            a.SetStemExclusionTable(new String[] { "tischen" });
+            a.SetStemExclusionTable(new[] { "tischen" });
             CheckReuse(a, "tischen", "tischen");
         }
 
-        private void Check(String input, String expected)
+        private void Check(String input, String expected, bool useDin2)
         {
-            CheckOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
+            CheckOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT, useDin2), input, expected);
         }
 
         private void CheckReuse(Analyzer a, String input, String expected)

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt?rev=1304164&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt Fri Mar 23 02:11:43 2012
@@ -0,0 +1,50 @@
+# German special characters are replaced:
+häufig;haeufig
+üor;ueor
+björk;bjoerk
+
+# here the stemmer works okay, it maps related words to the same stem:
+abschließen;abschliess
+abschließender;abschliess
+abschließendes;abschliess
+abschließenden;abschliess
+
+Tisch;tisch
+Tische;tisch
+Tischen;tisch
+
+Haus;hau
+Hauses;hau
+Häuser;haeu
+Häusern;haeu
+# here's a case where overstemming occurs, i.e. a word is 
+# mapped to the same stem as unrelated words:
+hauen;hau
+
+# here's a case where understemming occurs, i.e. two related words
+# are not mapped to the same stem. This is the case with basically
+# all irregular forms:
+Drama;drama
+Dramen;dram
+
+# replace "ß" with 'ss':
+Ausmaß;ausmass
+
+# fake words to test if suffixes are cut off:
+xxxxxe;xxxxx
+xxxxxs;xxxxx
+xxxxxn;xxxxx
+xxxxxt;xxxxx
+xxxxxem;xxxxx
+xxxxxer;xxxxx
+xxxxxnd;xxxxx
+# the suffixes are also removed when combined:
+xxxxxetende;xxxxx
+
+# words that are shorter than four characters are not changed:
+xxe;xxe
+# -em and -er are not removed from words shorter than five characters:
+xxem;xxem
+xxer;xxer
+# -nd is not removed from words shorter than six characters:
+xxxnd;xxxnd