You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/03/27 18:40:55 UTC
svn commit: r1305896 - in /incubator/lucene.net/trunk: src/contrib/Analyzers/De/ test/contrib/Analyzers/De/

Author: ccurrens
Date: Tue Mar 27 16:40:54 2012
New Revision: 1305896

URL: http://svn.apache.org/viewvc?rev=1305896&view=rev
Log:
[LUCENENET-466] - Fixed German DIN2 stemmer corrupting word roots.

Modified:
    incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs
    incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
    incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs Tue Mar 27 16:40:54 2012
@@ -12,30 +12,29 @@ namespace Lucene.Net.Analysis.De
     /// </summary>
     public sealed class GermanDIN2Stemmer : GermanStemmer
     {
-        protected override void SubstituteUmlauts(StringBuilder buffer, int c)
+        /// <summary>
+        /// Removes an 'e' that directly follows 'a', 'o' or 'u' in
+        /// <paramref name="buffer"/>, then runs the base class substitution.
+        /// </summary>
+        /// <param name="buffer">The word being stemmed; modified in place.</param>
+        protected override void Substitute(StringBuilder buffer)
         {
-            if (buffer[c] == 'Ã¤')
+            // Start at 1: buffer[c - 1] does not exist for c == 0, so a word beginning with 'e' would index out of range.
+            for (int c = 1; c < buffer.Length; c++)
             {
-                buffer[c] = 'a';
-                buffer.Insert(c + 1, 'e');
-            }
-            else if (buffer[c] == 'Ã¶')
-            {
-                buffer[c] = 'o';
-                buffer.Insert(c + 1, 'e');
-            }
-            else if (buffer[c] == 'Ã¼')
-            {
-                buffer[c] = 'u';
-                buffer.Insert(c + 1, 'e');
-            }
-            // Fix bug so that 'Ã' at the end of a word is replaced.
-            else if (buffer[c] == 'Ã')
-            {
-                buffer[c] = 's';
-                buffer.Insert(c + 1, 's');
-                substCount++;
+                if (buffer[c] == 'e')
+                {
+                    switch (buffer[c - 1])
+                    {
+                        case 'a':
+                        case 'o':
+                        case 'u':
+                            buffer.Remove(c, 1);
+                            break;
+                    }
+                }
             }
+            base.Substitute(buffer);
         }
     }
 }

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs Tue Mar 27 16:40:54 2012
@@ -181,7 +181,7 @@ namespace Lucene.Net.Analysis.De
 		/// - Substitute some common character combinations with a token:
         ///   sch/ch/ei/ie/ig/st -&gt; $/Ð&#167;/%/&amp;/#/!
 		/// </summary>
-		private void Substitute( StringBuilder buffer )
+		protected virtual void Substitute( StringBuilder buffer )
 		{
 			substCount = 0;
 			for ( int c = 0; c < buffer.Length; c++ ) 
@@ -192,10 +192,26 @@ namespace Lucene.Net.Analysis.De
                     buffer[c] = '*';
                 }
                 // Substitute Umlauts.
-                else
+                else if (buffer[c] == 'Ã¤')
                 {
-                    SubstituteUmlauts(buffer, c);
+                    buffer[c] = 'a';
                 }
+                else if (buffer[c] == 'Ã¶')
+                {
+                    buffer[c] = 'o';
+                }
+                else if (buffer[c] == 'Ã¼')
+                {
+                    buffer[c] = 'u';
+                }
+                // Fix bug so that 'Ã' at the end of a word is replaced.
+                else if (buffer[c] == 'Ã')
+                {
+                    buffer[c] = 's';
+                    buffer.Insert(c + 1, 's');
+                    substCount++;
+                }
+
 			    // Take care that at least one character is left left side from the current one
 				if ( c < buffer.Length - 1 ) 
 				{
@@ -241,29 +257,6 @@ namespace Lucene.Net.Analysis.De
 			}
 		}
 
-	    protected virtual void SubstituteUmlauts(StringBuilder buffer, int c)
-	    {
-	        if (buffer[c] == 'Ã¤')
-	        {
-	            buffer[c] = 'a';
-	        }
-	        else if (buffer[c] == 'Ã¶')
-	        {
-	            buffer[c] = 'o';
-	        }
-	        else if (buffer[c] == 'Ã¼')
-	        {
-	            buffer[c] = 'u';
-	        }
-	            // Fix bug so that 'Ã' at the end of a word is replaced.
-	        else if (buffer[c] == 'Ã')
-	        {
-	            buffer[c] = 's';
-	            buffer.Insert(c + 1, 's');
-	            substCount++;
-	        }
-	    }
-
 	    /// <summary>
 		/// Undoes the changes made by Substitute(). That are character pairs and
 		/// character combinations. Umlauts will remain as their corresponding vowel,

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs Tue Mar 27 16:40:54 2012
@@ -38,12 +38,14 @@ namespace Lucene.Net.Analyzers.De
     [TestFixture]
     public class TestGermanStemFilter : BaseTokenStreamTestCase
     {
+        const string TestFile = @"De\data.txt";
+        const string TestFileDin2 = @"De\data_din2.txt";
+
         [Test]
         public void TestDin1Stemming()
         {
             // read test cases from external file:
-            const string testFile = @"De\data.txt";
-            using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
+            using (var fis = new FileStream(TestFile, FileMode.Open, FileAccess.Read, FileShare.Read))
             using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
             {
                 while (true)
@@ -64,20 +66,22 @@ namespace Lucene.Net.Analyzers.De
         [Test]
         public void TestDin2Stemming()
         {
-            // read test cases from external file:
-            const string testFile = @"De\data_din2.txt";
-            using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
-            using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
+            // Read test cases from both external files: the cases in TestFile first, then those in TestFileDin2.
+            foreach (var file in new[] { TestFile, TestFileDin2 })
             {
-                string line;
-                while ((line = breader.ReadLine()) != null)
+                using (var fis = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
+                using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
                 {
-                    line = line.Trim();
-                    if (line.StartsWith("#") || string.IsNullOrEmpty(line))
-                        continue; // ignore comments and empty lines
-
-                    var parts = line.Split(';');
-                    Check(parts[0], parts[1], true);
+                    string line;
+                    while ((line = breader.ReadLine()) != null)
+                    {
+                        line = line.Trim();
+                        if (line.StartsWith("#") || string.IsNullOrEmpty(line))
+                            continue; // ignore comment lines (leading '#') and empty lines
+
+                        var parts = line.Split(';'); // each remaining line holds two ';'-separated fields, e.g. "Haus;hau"
+                        Check(parts[0], parts[1], true); // NOTE(review): the 'true' flag presumably selects the DIN2 stemmer - confirm against Check
+                    }
                 }
             }
         }

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt Tue Mar 27 16:40:54 2012
@@ -12,6 +12,7 @@ abschließenden;abschliess
 Tisch;tisch
 Tische;tisch
 Tischen;tisch
+geheimtür;geheimtur
 
 Haus;hau
 Hauses;hau

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt Tue Mar 27 16:40:54 2012
@@ -1,50 +1,8 @@
-ï»¿# German special characters are replaced:
-hÃ¤ufig;haeufig
-Ã¼or;ueor
-bjÃ¶rk;bjoerk
-
-# here the stemmer works okay, it maps related words to the same stem:
-abschlieÃen;abschliess
-abschlieÃender;abschliess
-abschlieÃendes;abschliess
-abschlieÃenden;abschliess
-
-Tisch;tisch
-Tische;tisch
-Tischen;tisch
-
+ï»¿# Test cases for words with ae, ue, or oe in them
 Haus;hau
 Hauses;hau
-HÃ¤user;haeu
-HÃ¤usern;haeu
-# here's a case where overstemming occurs, i.e. a word is 
-# mapped to the same stem as unrelated words:
-hauen;hau
-
-# here's a case where understemming occurs, i.e. two related words
-# are not mapped to the same stem. This is the case with basically
-# all irregular forms:
-Drama;drama
-Dramen;dram
-
-# replace "Ã" with 'ss':
-AusmaÃ;ausmass
-
-# fake words to test if suffixes are cut off:
-xxxxxe;xxxxx
-xxxxxs;xxxxx
-xxxxxn;xxxxx
-xxxxxt;xxxxx
-xxxxxem;xxxxx
-xxxxxer;xxxxx
-xxxxxnd;xxxxx
-# the suffixes are also removed when combined:
-xxxxxetende;xxxxx
-
-# words that are shorter than four charcters are not changed:
-xxe;xxe
-# -em and -er are not removed from words shorter than five characters:
-xxem;xxem
-xxer;xxer
-# -nd is not removed from words shorter than six characters:
-xxxnd;xxxnd
+Haeuser;hau
+Haeusern;hau
+steuer;steur
+rueckwaerts;ruckwar
+geheimtuer;geheimtur
\ No newline at end of file