You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/03/27 18:40:55 UTC
svn commit: r1305896 - in /incubator/lucene.net/trunk:
src/contrib/Analyzers/De/ test/contrib/Analyzers/De/
Author: ccurrens
Date: Tue Mar 27 16:40:54 2012
New Revision: 1305896
URL: http://svn.apache.org/viewvc?rev=1305896&view=rev
Log:
[LUCENENET-466] - Fixed German DIN2 stemmer corrupting word roots.
Modified:
incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs
incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs Tue Mar 27 16:40:54 2012
@@ -12,30 +12,23 @@ namespace Lucene.Net.Analysis.De
/// </summary>
public sealed class GermanDIN2Stemmer : GermanStemmer
{
- protected override void SubstituteUmlauts(StringBuilder buffer, int c)
+ protected override void Substitute(StringBuilder buffer)
{
- if (buffer[c] == 'ä')
+ for (int c = 0; c < buffer.Length; c++)
{
- buffer[c] = 'a';
- buffer.Insert(c + 1, 'e');
- }
- else if (buffer[c] == 'ö')
- {
- buffer[c] = 'o';
- buffer.Insert(c + 1, 'e');
- }
- else if (buffer[c] == 'ü')
- {
- buffer[c] = 'u';
- buffer.Insert(c + 1, 'e');
- }
- // Fix bug so that 'ß' at the end of a word is replaced.
- else if (buffer[c] == 'ß')
- {
- buffer[c] = 's';
- buffer.Insert(c + 1, 's');
- substCount++;
+ if (buffer[c] == 'e')
+ {
+ switch (buffer[c - 1])
+ {
+ case 'a':
+ case 'o':
+ case 'u':
+ buffer.Remove(c, 1);
+ break;
+ }
+ }
}
+ base.Substitute(buffer);
}
}
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs Tue Mar 27 16:40:54 2012
@@ -181,7 +181,7 @@ namespace Lucene.Net.Analysis.De
/// - Substitute some common character combinations with a token:
/// sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
/// </summary>
- private void Substitute( StringBuilder buffer )
+ protected virtual void Substitute( StringBuilder buffer )
{
substCount = 0;
for ( int c = 0; c < buffer.Length; c++ )
@@ -192,10 +192,26 @@ namespace Lucene.Net.Analysis.De
buffer[c] = '*';
}
// Substitute Umlauts.
- else
+ else if (buffer[c] == 'ä')
{
- SubstituteUmlauts(buffer, c);
+ buffer[c] = 'a';
}
+ else if (buffer[c] == 'ö')
+ {
+ buffer[c] = 'o';
+ }
+ else if (buffer[c] == 'ü')
+ {
+ buffer[c] = 'u';
+ }
+ // Fix bug so that 'ß' at the end of a word is replaced.
+ else if (buffer[c] == 'ß')
+ {
+ buffer[c] = 's';
+ buffer.Insert(c + 1, 's');
+ substCount++;
+ }
+
// Take care that at least one character is left left side from the current one
if ( c < buffer.Length - 1 )
{
@@ -241,29 +257,6 @@ namespace Lucene.Net.Analysis.De
}
}
- protected virtual void SubstituteUmlauts(StringBuilder buffer, int c)
- {
- if (buffer[c] == 'ä')
- {
- buffer[c] = 'a';
- }
- else if (buffer[c] == 'ö')
- {
- buffer[c] = 'o';
- }
- else if (buffer[c] == 'ü')
- {
- buffer[c] = 'u';
- }
- // Fix bug so that 'ß' at the end of a word is replaced.
- else if (buffer[c] == 'ß')
- {
- buffer[c] = 's';
- buffer.Insert(c + 1, 's');
- substCount++;
- }
- }
-
/// <summary>
/// Undoes the changes made by Substitute(). That are character pairs and
/// character combinations. Umlauts will remain as their corresponding vowel,
Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs Tue Mar 27 16:40:54 2012
@@ -38,12 +38,14 @@ namespace Lucene.Net.Analyzers.De
[TestFixture]
public class TestGermanStemFilter : BaseTokenStreamTestCase
{
+ const string TestFile = @"De\data.txt";
+ const string TestFileDin2 = @"De\data_din2.txt";
+
[Test]
public void TestDin1Stemming()
{
// read test cases from external file:
- const string testFile = @"De\data.txt";
- using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
+ using (var fis = new FileStream(TestFile, FileMode.Open, FileAccess.Read, FileShare.Read))
using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
{
while (true)
@@ -64,20 +66,22 @@ namespace Lucene.Net.Analyzers.De
[Test]
public void TestDin2Stemming()
{
- // read test cases from external file:
- const string testFile = @"De\data_din2.txt";
- using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
- using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
+ // read test cases from external file(s):
+ foreach (var file in new[] { TestFile, TestFileDin2 })
{
- string line;
- while ((line = breader.ReadLine()) != null)
+ using (var fis = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
+ using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
{
- line = line.Trim();
- if (line.StartsWith("#") || string.IsNullOrEmpty(line))
- continue; // ignore comments and empty lines
-
- var parts = line.Split(';');
- Check(parts[0], parts[1], true);
+ string line;
+ while ((line = breader.ReadLine()) != null)
+ {
+ line = line.Trim();
+ if (line.StartsWith("#") || string.IsNullOrEmpty(line))
+ continue; // ignore comments and empty lines
+
+ var parts = line.Split(';');
+ Check(parts[0], parts[1], true);
+ }
}
}
}
Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data.txt Tue Mar 27 16:40:54 2012
@@ -12,6 +12,7 @@ abschließenden;abschliess
Tisch;tisch
Tische;tisch
Tischen;tisch
+geheimtür;geheimtur
Haus;hau
Hauses;hau
Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt?rev=1305896&r1=1305895&r2=1305896&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt Tue Mar 27 16:40:54 2012
@@ -1,50 +1,8 @@
-# German special characters are replaced:
-häufig;haeufig
-üor;ueor
-björk;bjoerk
-
-# here the stemmer works okay, it maps related words to the same stem:
-abschließen;abschliess
-abschließender;abschliess
-abschließendes;abschliess
-abschließenden;abschliess
-
-Tisch;tisch
-Tische;tisch
-Tischen;tisch
-
+# Test cases for words with ae, ue, or oe in them
Haus;hau
Hauses;hau
-Häuser;haeu
-Häusern;haeu
-# here's a case where overstemming occurs, i.e. a word is
-# mapped to the same stem as unrelated words:
-hauen;hau
-
-# here's a case where understemming occurs, i.e. two related words
-# are not mapped to the same stem. This is the case with basically
-# all irregular forms:
-Drama;drama
-Dramen;dram
-
-# replace "ß" with 'ss':
-Ausmaß;ausmass
-
-# fake words to test if suffixes are cut off:
-xxxxxe;xxxxx
-xxxxxs;xxxxx
-xxxxxn;xxxxx
-xxxxxt;xxxxx
-xxxxxem;xxxxx
-xxxxxer;xxxxx
-xxxxxnd;xxxxx
-# the suffixes are also removed when combined:
-xxxxxetende;xxxxx
-
-# words that are shorter than four charcters are not changed:
-xxe;xxe
-# -em and -er are not removed from words shorter than five characters:
-xxem;xxem
-xxer;xxer
-# -nd is not removed from words shorter than six characters:
-xxxnd;xxxnd
+Haeuser;hau
+Haeusern;hau
+steuer;steur
+rueckwaerts;ruckwar
+geheimtuer;geheimtur
\ No newline at end of file