You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/03/23 03:11:44 UTC
svn commit: r1304164 - in /incubator/lucene.net/trunk:
src/contrib/Analyzers/ src/contrib/Analyzers/De/ test/contrib/Analyzers/
test/contrib/Analyzers/De/
Author: ccurrens
Date: Fri Mar 23 02:11:43 2012
New Revision: 1304164
URL: http://svn.apache.org/viewvc?rev=1304164&view=rev
Log:
[LUCENENET-466] - added a DIN-5007-2 stemmer to GermanAnalyzer, as well as new constructors to specify its use if desired. TestGermanStemFilter's TestStemming is now renamed to TestDin1Stemming, and TestDin2Stemming has been added for GermanStemmerDIN2
Added:
incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
Modified:
incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Fri Mar 23 02:11:43 2012
@@ -100,6 +100,7 @@
<Compile Include="De\GermanAnalyzer.cs" />
<Compile Include="De\GermanStemFilter.cs" />
<Compile Include="De\GermanStemmer.cs" />
+ <Compile Include="De\GermanStemmerDIN2.cs" />
<Compile Include="El\GreekAnalyzer.cs" />
<Compile Include="El\GreekLowerCaseFilter.cs" />
<Compile Include="Fa\PersianAnalyzer.cs" />
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanAnalyzer.cs Fri Mar 23 02:11:43 2012
@@ -88,6 +88,7 @@ namespace Lucene.Net.Analysis.De
private ISet<string> exclusionSet;
private Version matchVersion;
+ private readonly bool _useDin2Stemmer;
/// <summary>
/// Builds an analyzer with the default stop words:
@@ -95,7 +96,7 @@ namespace Lucene.Net.Analysis.De
/// </summary>
[Obsolete("Use GermanAnalyzer(Version) instead")]
public GermanAnalyzer()
- : this(Version.LUCENE_23)
+ : this(Version.LUCENE_CURRENT)
{
}
@@ -108,7 +109,15 @@ namespace Lucene.Net.Analysis.De
{ }
/// <summary>
- /// Builds an analyzer with the given stop words.
+ /// Builds an analyzer with the default stop words:
+ /// <see cref="GetDefaultStopSet"/>
+ /// </summary>
+ public GermanAnalyzer(Version matchVersion, bool useDin2Stemmer)
+ : this(matchVersion, DefaultSetHolder.DEFAULT_SET, useDin2Stemmer)
+ { }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
@@ -120,15 +129,41 @@ namespace Lucene.Net.Analysis.De
/// <summary>
/// Builds an analyzer with the given stop words
/// </summary>
+ /// <param name="matchVersion">Lucene compatibility version</param>
+ /// <param name="stopwords">a stopword set</param>
+ /// <param name="useDin2Stemmer">Specifies if the DIN-5007-2 style stemmer should be used. Commonly referred to as
+ /// phone book sorting, since it was defined to be used with names, rather than words</param>
+ public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, bool useDin2Stemmer)
+ : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, useDin2Stemmer)
+ {
+ }
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
+ /// </summary>
/// <param name="matchVersion">lucene compatibility version</param>
/// <param name="stopwords">a stopword set</param>
/// <param name="stemExclusionSet">a stemming exclusion set</param>
public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
+ : this(matchVersion, stopwords, stemExclusionSet, false)
+ { }
+
+
+ /// <summary>
+ /// Builds an analyzer with the given stop words
+ /// </summary>
+ /// <param name="matchVersion">lucene compatibility version</param>
+ /// <param name="stopwords">a stopword set</param>
+ /// <param name="stemExclusionSet">a stemming exclusion set</param>
+ /// <param name="useDin2Stemmer">Specifies if the DIN-5007-2 style stemmer should be used. Commonly referred to as
+ /// phone book sorting, since it was defined to be used with names, rather than words</param>
+ public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool useDin2Stemmer)
{
stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
- SetOverridesTokenStreamMethod<GermanAnalyzer>();
this.matchVersion = matchVersion;
+ _useDin2Stemmer = useDin2Stemmer;
+ SetOverridesTokenStreamMethod<GermanAnalyzer>();
}
/// <summary>
@@ -202,7 +237,7 @@ namespace Lucene.Net.Analysis.De
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
- result = new GermanStemFilter(result, exclusionSet);
+ result = new GermanStemFilter(result, exclusionSet, _useDin2Stemmer);
return result;
}
}
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemFilter.cs Fri Mar 23 02:11:43 2012
@@ -43,11 +43,12 @@ namespace Lucene.Net.Analysis.De
private TermAttribute termAtt;
public GermanStemFilter(TokenStream _in)
- : base(_in)
- {
- stemmer = new GermanStemmer();
- termAtt = AddAttribute<TermAttribute>();
- }
+ : this(_in, false)
+ { }
+
+ public GermanStemFilter(TokenStream _in, bool useDin2Stemmer)
+ : this(_in, null, useDin2Stemmer)
+ { }
/// <summary>
/// Builds a GermanStemFilter that uses an exclusiontable.
@@ -55,9 +56,22 @@ namespace Lucene.Net.Analysis.De
/// <param name="_in"></param>
/// <param name="exclusiontable"></param>
public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable)
- : this(_in)
+ : this(_in, exclusiontable, false)
+ { }
+
+ /// <summary>
+ /// Builds a GermanStemFilter that uses an exclusiontable.
+ /// </summary>
+ /// <param name="_in"></param>
+ /// <param name="exclusiontable"></param>
+ /// <param name="useDin2Stemmer">Specifies whether to use the DIN-5007-2 (names)
+ /// stemmer instead of the default DIN-5007-1 (words) stemmer</param>
+ public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable, bool useDin2Stemmer)
+ : base(_in)
{
exclusionSet = exclusiontable;
+ stemmer = useDin2Stemmer ? new GermanStemmerDIN2() : new GermanStemmer();
+ termAtt = AddAttribute<TermAttribute>();
}
/// <returns>
Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmer.cs Fri Mar 23 02:11:43 2012
@@ -41,7 +41,7 @@ namespace Lucene.Net.Analysis.De
/// <summary>
/// Amount of characters that are removed with <tt>Substitute()</tt> while stemming.
/// </summary>
- private int substCount = 0;
+ protected int substCount = 0;
/// <summary>
/// Stemms the given term to an unique <tt>discriminator</tt>.
@@ -187,32 +187,16 @@ namespace Lucene.Net.Analysis.De
for ( int c = 0; c < buffer.Length; c++ )
{
// Replace the second char of a pair of the equal characters with an asterisk
- if ( c > 0 && buffer[c] == buffer[c - 1])
- {
- buffer[c] = '*';
- }
- // Substitute Umlauts.
- else if ( buffer[c] == 'ä' )
- {
- buffer[c] = 'a';
- }
- else if ( buffer[c] == 'ö' )
- {
- buffer[c] = 'o';
- }
- else if ( buffer[c] == 'ü' )
- {
- buffer[c] = 'u';
- }
- // Fix bug so that 'ß' at the end of a word is replaced.
- else if ( buffer[c] == 'ß' )
- {
-
- buffer[c] = 's';
- buffer.Insert(c + 1, 's');
- substCount++;
- }
- // Take care that at least one character is left left side from the current one
+ if (c > 0 && buffer[c] == buffer[c - 1])
+ {
+ buffer[c] = '*';
+ }
+ // Substitute Umlauts.
+ else
+ {
+ SubstituteUmlauts(buffer, c);
+ }
+ // Take care that at least one character is left on the left side of the current one
if ( c < buffer.Length - 1 )
{
// Masking several common character combinations with an token
@@ -257,7 +241,30 @@ namespace Lucene.Net.Analysis.De
}
}
- /// <summary>
+ protected virtual void SubstituteUmlauts(StringBuilder buffer, int c)
+ {
+ if (buffer[c] == 'ä')
+ {
+ buffer[c] = 'a';
+ }
+ else if (buffer[c] == 'ö')
+ {
+ buffer[c] = 'o';
+ }
+ else if (buffer[c] == 'ü')
+ {
+ buffer[c] = 'u';
+ }
+ // Fix bug so that 'ß' at the end of a word is replaced.
+ else if (buffer[c] == 'ß')
+ {
+ buffer[c] = 's';
+ buffer.Insert(c + 1, 's');
+ substCount++;
+ }
+ }
+
+ /// <summary>
/// Undoes the changes made by Substitute(). That are character pairs and
/// character combinations. Umlauts will remain as their corresponding vowel,
/// as "?" remains as "ss".
Added: incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs?rev=1304164&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/De/GermanStemmerDIN2.cs Fri Mar 23 02:11:43 2012
@@ -0,0 +1,41 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.De
+{
+ /// <summary>
+ /// A stemmer for the german language that uses the
+ /// DIN-5007-2 "Phone Book" rules for handling
+ /// umlaut characters.
+ /// </summary>
+ public sealed class GermanStemmerDIN2 : GermanStemmer
+ {
+ protected override void SubstituteUmlauts(StringBuilder buffer, int c)
+ {
+ if (buffer[c] == 'ä')
+ {
+ buffer[c] = 'a';
+ buffer.Insert(c + 1, 'e');
+ }
+ else if (buffer[c] == 'ö')
+ {
+ buffer[c] = 'o';
+ buffer.Insert(c + 1, 'e');
+ }
+ else if (buffer[c] == 'ü')
+ {
+ buffer[c] = 'u';
+ buffer.Insert(c + 1, 'e');
+ }
+ // Fix bug so that 'ß' at the end of a word is replaced.
+ else if (buffer[c] == 'ß')
+ {
+ buffer[c] = 's';
+ buffer.Insert(c + 1, 's');
+ substCount++;
+ }
+ }
+ }
+}
Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Fri Mar 23 02:11:43 2012
@@ -170,6 +170,9 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Compile Include="Fa\TestPersianNormalizationFilter.cs" />
+ <Content Include="De\data_din2.txt">
+ <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+ </Content>
<Content Include="Nl\customStemDict.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs?rev=1304164&r1=1304163&r2=1304164&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/TestGermanStemFilter.cs Fri Mar 23 02:11:43 2012
@@ -39,12 +39,12 @@ namespace Lucene.Net.Analyzers.De
public class TestGermanStemFilter : BaseTokenStreamTestCase
{
[Test]
- public void TestStemming()
+ public void TestDin1Stemming()
{
// read test cases from external file:
- string testFile = @"De\data.txt";
- using (FileStream fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
- using (StreamReader breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
+ const string testFile = @"De\data.txt";
+ using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
+ using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
{
while (true)
{
@@ -56,7 +56,28 @@ namespace Lucene.Net.Analyzers.De
continue; // ignore comments and empty lines
String[] parts = line.Split(';');
//System.out.println(parts[0] + " -- " + parts[1]);
- Check(parts[0], parts[1]);
+ Check(parts[0], parts[1], false);
+ }
+ }
+ }
+
+ [Test]
+ public void TestDin2Stemming()
+ {
+ // read test cases from external file:
+ const string testFile = @"De\data_din2.txt";
+ using (var fis = new FileStream(testFile, FileMode.Open, FileAccess.Read, FileShare.Read))
+ using (var breader = new StreamReader(fis, Encoding.GetEncoding("iso-8859-1")))
+ {
+ string line;
+ while ((line = breader.ReadLine()) != null)
+ {
+ line = line.Trim();
+ if (line.StartsWith("#") || string.IsNullOrEmpty(line))
+ continue; // ignore comments and empty lines
+
+ var parts = line.Split(';');
+ Check(parts[0], parts[1], true);
}
}
}
@@ -73,7 +94,7 @@ namespace Lucene.Net.Analyzers.De
/**
* subclass that acts just like whitespace analyzer for testing
*/
- private class GermanSubclassAnalyzer : GermanAnalyzer
+ private sealed class GermanSubclassAnalyzer : GermanAnalyzer
{
public GermanSubclassAnalyzer(Version matchVersion)
: base(matchVersion)
@@ -99,15 +120,15 @@ namespace Lucene.Net.Analyzers.De
[Test]
public void TestExclusionTableReuse()
{
- GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ var a = new GermanAnalyzer(Version.LUCENE_CURRENT);
CheckReuse(a, "tischen", "tisch");
- a.SetStemExclusionTable(new String[] { "tischen" });
+ a.SetStemExclusionTable(new[] { "tischen" });
CheckReuse(a, "tischen", "tischen");
}
- private void Check(String input, String expected)
+ private void Check(String input, String expected, bool useDin2)
{
- CheckOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
+ CheckOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT, useDin2), input, expected);
}
private void CheckReuse(Analyzer a, String input, String expected)
Added: incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt?rev=1304164&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/De/data_din2.txt Fri Mar 23 02:11:43 2012
@@ -0,0 +1,50 @@
+# German special characters are replaced:
+häufig;haeufig
+üor;ueor
+björk;bjoerk
+
+# here the stemmer works okay, it maps related words to the same stem:
+abschließen;abschliess
+abschließender;abschliess
+abschließendes;abschliess
+abschließenden;abschliess
+
+Tisch;tisch
+Tische;tisch
+Tischen;tisch
+
+Haus;hau
+Hauses;hau
+Häuser;haeu
+Häusern;haeu
+# here's a case where overstemming occurs, i.e. a word is
+# mapped to the same stem as unrelated words:
+hauen;hau
+
+# here's a case where understemming occurs, i.e. two related words
+# are not mapped to the same stem. This is the case with basically
+# all irregular forms:
+Drama;drama
+Dramen;dram
+
+# replace "ß" with 'ss':
+Ausmaß;ausmass
+
+# fake words to test if suffixes are cut off:
+xxxxxe;xxxxx
+xxxxxs;xxxxx
+xxxxxn;xxxxx
+xxxxxt;xxxxx
+xxxxxem;xxxxx
+xxxxxer;xxxxx
+xxxxxnd;xxxxx
+# the suffixes are also removed when combined:
+xxxxxetende;xxxxx
+
+# words that are shorter than four characters are not changed:
+xxe;xxe
+# -em and -er are not removed from words shorter than five characters:
+xxem;xxem
+xxer;xxer
+# -nd is not removed from words shorter than six characters:
+xxxnd;xxxnd